<a href="https://colab.research.google.com/github/chenchongsong/udacity-cs344-colab/blob/main/notebook/udacity_cs344_hw4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Homework 4 for Udacity CS344 Course, Intro to Parallel Programming
# clone the code repo,
!git clone https://github.com/chenchongsong/udacity-cs344-colab
!pip install git+git://github.com/depctg/nvcc4jupyter.git

# load cuda plugin
%config NVCCPluginV2.static_dir = True
%config NVCCPluginV2.relative_dir = "udacity-cs344-colab/src/HW4"
%load_ext nvcc_plugin

# change to work directory, generate makefiles
!mkdir udacity-cs344-colab/build
%cd udacity-cs344-colab/build
!cmake ../src

In [None]:
%%cuda --name student_func.cu

//Udacity HW 4
//Radix Sorting

#include "utils.h"
#include <thrust/device_vector.h>

/* Red Eye Removal
   ===============

   For this assignment we are implementing red eye removal.  This is
   accomplished by first creating a score for every pixel that tells us how
   likely it is to be a red eye pixel.  We have already done this for you - you
   are receiving the scores and need to sort them in ascending order so that we
   know which pixels to alter to remove the red eye.

   Note: ascending order == smallest to largest

   Each score is associated with a position, when you sort the scores, you must
   also move the positions accordingly.

   Implementing Parallel Radix Sort with CUDA
   ==========================================

   The basic idea is to construct a histogram on each pass of how many of each
   "digit" there are.   Then we scan this histogram so that we know where to put
   the output of each digit.  For example, the first 1 must come after all the
   0s so we have to know how many 0s there are to be able to start moving 1s
   into the correct position.

   1) Histogram of the number of occurrences of each digit
   2) Exclusive Prefix Sum of Histogram
   3) Determine relative offset of each digit
        For example [0 0 1 1 0 0 1]
                ->  [0 1 0 1 2 3 2]
   4) Combine the results of steps 2 & 3 to determine the final
      output location for each element and move it there

   LSB Radix sort is an out-of-place sort and you will need to ping-pong values
   between the input and output buffers we have provided.  Make sure the final
   sorted results end up in the output buffer!  Hint: You may need to do a copy
   at the end.

 */

// #define USE_THRUST_SORT

#ifdef USE_THRUST_SORT
// 13ms
#include <thrust/sort.h>
void your_sort(unsigned int* const d_inputVals,
               unsigned int* const d_inputPos,
               unsigned int* const d_outputVals,
               unsigned int* const d_outputPos,
               const size_t numElems) {
  // Thrust-backed variant: duplicate both input buffers into the output
  // buffers, then sort the output buffers in place with the values as keys
  // and the positions as the payload that travels with them.
  // (numElems was observed to be 220480 for the course image.)
  //
  // Alternative kept for reference -- sort temporary device_vector copies and
  // copy the results out afterwards:
  //   thrust::device_vector<unsigned int> d_inputVals_vec(d_inputVals, d_inputVals + numElems);  // copy
  //   thrust::device_vector<unsigned int> d_inputPos_vec(d_inputPos, d_inputPos + numElems);     // copy
  //   thrust::sort_by_key(d_inputVals_vec.begin(), d_inputVals_vec.end(), d_inputPos_vec.begin());
  //   cudaMemcpy(d_outputVals, thrust::raw_pointer_cast(&d_inputVals_vec[0]), ..., cudaMemcpyDeviceToDevice);
  //   cudaMemcpy(d_outputPos,  thrust::raw_pointer_cast(&d_inputPos_vec[0]),  ..., cudaMemcpyDeviceToDevice);
  const size_t numBytes = numElems * sizeof(unsigned int);

  checkCudaErrors(cudaMemcpy(d_outputVals, d_inputVals, numBytes, cudaMemcpyDeviceToDevice));
  checkCudaErrors(cudaMemcpy(d_outputPos, d_inputPos, numBytes, cudaMemcpyDeviceToDevice));

  // Wrap the raw device pointers so Thrust dispatches to the device backend.
  thrust::device_ptr<unsigned int> keysBegin(d_outputVals);
  thrust::device_ptr<unsigned int> payloadBegin(d_outputPos);
  thrust::device_ptr<unsigned int> keysEnd = keysBegin + numElems;

  // In-place sort; arguments are key_start, key_end, value_start.
  thrust::sort_by_key(keysBegin, keysEnd, payloadBegin);
}
#else
// self-implementation of Radix Sort (with the aid of thrust library)
// 131ms
#include <thrust/scan.h>
// Threads per block used for every kernel launch in this file.
const int BLOCK_SIZE = 1024;

// Extracts one bit from every key: d_pred[i] = (d_in[i] >> bit) & 1.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid must
// supply at least numElems threads; surplus threads return without writing.
//
//   d_pred    output, at least numElems entries; each written entry is 0 or 1
//   d_in      input keys, at least numElems entries
//   numElems  number of keys to process
//   bit       bit position to test (must be in [0, 31] for a 32-bit key)
__global__ void predicate(unsigned int* d_pred, const unsigned int* d_in, size_t numElems, int bit) {
  // Build the index in size_t: the 32-bit product blockDim.x * blockIdx.x can
  // wrap, and a signed int index would be converted when compared to numElems.
  const size_t pos = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (pos >= numElems) return;
  d_pred[pos] = (d_in[pos] >> bit) & 1u;
}

// Logically inverts a predicate array in place: every nonzero entry becomes 0
// and every zero entry becomes 1.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid must
// supply at least numElems threads; surplus threads return without writing.
//
//   d_pred    array to invert in place, at least numElems entries
//   numElems  number of entries to invert; entries at or beyond this index
//             are left untouched
__global__ void negatePredicate(unsigned int* d_pred, size_t numElems) {
  // Build the index in size_t: the 32-bit product blockDim.x * blockIdx.x can
  // wrap, and a signed int index would be converted when compared to numElems.
  const size_t pos = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (pos >= numElems) return;
  d_pred[pos] = d_pred[pos] ? 0u : 1u;
}

// Scatters d_in into d_out for one radix pass.
//
// For element i the destination index is
//   histo_0 + d_scan_false[i]   when d_pred[i] is nonzero
//   histo_1 + d_scan_true[i]    when d_pred[i] is zero
// your_sort launches this kernel after negatePredicate, so there a nonzero
// d_pred[i] marks a key whose current bit is 0.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid must
// supply at least numElems threads; surplus threads return without writing.
//
//   d_out         destination buffer, at least numElems entries
//   d_in          source buffer, at least numElems entries (must not alias d_out,
//                 since threads read and write different indices concurrently)
//   histo_0       base offset for elements whose d_pred entry is nonzero
//   histo_1       base offset for elements whose d_pred entry is zero
//   d_pred        per-element selector
//   d_scan_true   per-element offset used when d_pred is zero
//   d_scan_false  per-element offset used when d_pred is nonzero
//   numElems      number of elements to move
__global__ void moveElements(unsigned int* d_out, const unsigned int* d_in, const unsigned int histo_0, const unsigned int histo_1,
                const unsigned int* d_pred, const unsigned int* d_scan_true, const unsigned int* d_scan_false, size_t numElems) {
  // Build the index in size_t: the 32-bit product blockDim.x * blockIdx.x can
  // wrap, and a signed int index would be converted when compared to numElems.
  const size_t pos = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (pos >= numElems) return;

  // Destination is computed in size_t as well, so the sum cannot wrap or turn
  // negative before it is range-checked.
  const size_t newPos = d_pred[pos]
      ? static_cast<size_t>(histo_0) + d_scan_false[pos]
      : static_cast<size_t>(histo_1) + d_scan_true[pos];

  // Defensive guard: never write outside the destination buffer.
  if (newPos >= numElems) return;
  d_out[newPos] = d_in[pos];
}

// Writes the exclusive prefix sum of d_pred_vec into d_scan_vec and returns
// the last scan output.
//
// Because the scan is exclusive, the returned value is the sum of every input
// element EXCEPT the last one. your_sort relies on this by appending one
// trailing 0 entry to the predicate so the return value is the full count.
//
//   d_scan_vec  output; must hold at least d_pred_vec.size() elements
//   d_pred_vec  input values to scan (not modified)
//
// Returns 0 for an empty input.
//
// NOTE(review): reading a single element of a device_vector from host code
// copies it device-to-host and blocks until the scan has finished; that is a
// likely contributor to the slowness the original author observed here.
unsigned int thrustScan(thrust::device_vector<unsigned int>& d_scan_vec, thrust::device_vector<unsigned int>& d_pred_vec) {
  // Nothing to scan; also avoids reading the last element of an empty vector.
  if (d_pred_vec.empty()) return 0;

  thrust::exclusive_scan(d_pred_vec.begin(), d_pred_vec.end(), d_scan_vec.begin());

  // Read the last element the scan actually wrote, which is d_scan_vec.back()
  // whenever both vectors have the same length.
  const unsigned int finalSum = d_scan_vec[d_pred_vec.size() - 1];
  return finalSum;
}

// LSB radix sort (1 bit per pass, 32 passes) of d_inputVals in ascending
// order, carrying d_inputPos along with the keys.
//
// Each pass:
//   1. predicate        -> d_pred[i] = current bit of key i
//   2. exclusive scan   -> d_scan_true  (offset of each key among the 1-keys)
//   3. negatePredicate  -> d_pred[i] = 1 where the bit is 0
//   4. exclusive scan   -> d_scan_false (offset among the 0-keys) and numFalse
//   5. moveElements     -> 0-keys go to [0, numFalse), 1-keys to [numFalse, n)
//
//   d_inputVals / d_inputPos    device buffers with the keys and their positions;
//                               used as ping-pong scratch, so their contents are
//                               overwritten
//   d_outputVals / d_outputPos  device buffers that receive the sorted result
//   numElems                    number of elements in each buffer
//
// All kernels run on the default stream; the function returns after the final
// blocking device-to-device copies.
void your_sort(unsigned int* const d_inputVals,
               unsigned int* const d_inputPos,
               unsigned int* const d_outputVals,
               unsigned int* const d_outputPos,
               size_t numElems) {
  // Nothing to sort; also avoids launching kernels with an empty grid.
  if (numElems == 0) return;

  const int NUM_BITS = 8 * sizeof(unsigned int);

  // Integer ceil-div: a float division loses precision once numElems exceeds
  // 2^24 and could under-count the blocks.
  const unsigned int numBlock =
      static_cast<unsigned int>((numElems + BLOCK_SIZE - 1) / BLOCK_SIZE);

  // One extra trailing slot per vector. The kernels only write indices below
  // numElems, so the trailing predicate entry stays at its initial 0 and the
  // last element of each exclusive scan is the total count of set entries.
  thrust::device_vector<unsigned int> d_pred_vec(numElems + 1);
  thrust::device_vector<unsigned int> d_scan_true_vec(numElems + 1);
  thrust::device_vector<unsigned int> d_scan_false_vec(numElems + 1);
  unsigned int* d_pred = thrust::raw_pointer_cast(&d_pred_vec[0]);
  unsigned int* d_scan_true = thrust::raw_pointer_cast(&d_scan_true_vec[0]);
  unsigned int* d_scan_false = thrust::raw_pointer_cast(&d_scan_false_vec[0]);

  // Ping-pong buffers: each pass reads from src* and scatters into dst*.
  unsigned int* srcVals = d_inputVals;
  unsigned int* srcPos = d_inputPos;
  unsigned int* dstVals = d_outputVals;
  unsigned int* dstPos = d_outputPos;

  for (int bit = 0; bit < NUM_BITS; ++bit) {
    // Compute predicate
    predicate<<<numBlock, BLOCK_SIZE>>>(d_pred, srcVals, numElems, bit);
    checkCudaErrors(cudaGetLastError());

    // Compute offset of positives. Only the per-element offsets are needed,
    // so the scan is called directly and the total is not read back.
    thrust::exclusive_scan(d_pred_vec.begin(), d_pred_vec.end(), d_scan_true_vec.begin());

    // Flip bits
    negatePredicate<<<numBlock, BLOCK_SIZE>>>(d_pred, numElems);
    checkCudaErrors(cudaGetLastError());

    // Compute offset of negatives
    const unsigned int numFalse = thrustScan(d_scan_false_vec, d_pred_vec);

    // Exclusive Prefix Sum of 2-bins histogram is: [0, numFalse].
    const unsigned int histo_0 = 0;
    const unsigned int histo_1 = numFalse;

    // Moving elements and indices
    moveElements<<<numBlock, BLOCK_SIZE>>>(dstVals, srcVals, histo_0, histo_1,
                                           d_pred, d_scan_true, d_scan_false, numElems);
    checkCudaErrors(cudaGetLastError());
    moveElements<<<numBlock, BLOCK_SIZE>>>(dstPos, srcPos, histo_0, histo_1,
                                           d_pred, d_scan_true, d_scan_false, numElems);
    checkCudaErrors(cudaGetLastError());

    // The freshly written buffers become the source of the next pass.
    unsigned int* const tmpVals = srcVals;
    srcVals = dstVals;
    dstVals = tmpVals;
    unsigned int* const tmpPos = srcPos;
    srcPos = dstPos;
    dstPos = tmpPos;
  }

  // After the last swap the sorted data is in src*. With an even number of
  // passes that is the input buffers, so copy the result into the outputs.
  if (srcVals != d_outputVals) {
    checkCudaErrors(cudaMemcpy(d_outputVals, srcVals, numElems * sizeof(unsigned int), cudaMemcpyDeviceToDevice));
    checkCudaErrors(cudaMemcpy(d_outputPos, srcPos, numElems * sizeof(unsigned int), cudaMemcpyDeviceToDevice));
  }
}
#endif

In [None]:
# make the cuda project
# Builds the HW4 target with the makefiles that `cmake ../src` generated in the setup cell.
!make HW4
print("\n====== RESULT OF HW4 =======\n")
# Run the freshly built binary, passing the .gold file and the template image as arguments.
# NOTE(review): how the program interprets these two arguments is defined by the HW4
# driver source, which is not part of this notebook -- confirm there.
!bin/HW4 ../src/HW4/red_eye_effect.gold ../src/HW4/red_eye_effect_template_5.jpg

In [None]:
# plot output images
import matplotlib.pyplot as plt

# One row, two panels: the source photo on the left, the program output on the right.
_, ax = plt.subplots(1, 2, dpi=150)

panels = (
    ("../src/HW4/red_eye_effect_5.jpg", "original"),
    ("HW4_output.png", "output"),
)
for axis, (image_path, title) in zip(ax, panels):
    axis.imshow(plt.imread(image_path))
    axis.set_title(title)
    axis.grid(False)  # no grid lines over the image

plt.show()