In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [0]:
code = """
#include<iostream>
#include<math.h>
#include<cstdio>
#include<cstdlib>

#define n 8

using namespace std;

// Reduces input[] in place to its minimum; the result ends up in input[0].
//
// Launch layout: a single block of blockDim.x threads working on
// 2 * blockDim.x elements (main launches it as <<<1, n/2>>>). Every pass
// halves the number of active threads and doubles the distance between the
// two slots a thread compares. The pairing only visits every element when the
// element count is a power of two.
__global__ void minimum(int *input) {
    int tid = threadIdx.x;
    int step_size = 1;
    int number_of_threads = blockDim.x;

    // Debug trace: executed by every thread of the block, no trailing newline.
    printf("No of threads = %d", number_of_threads);

    while(number_of_threads>0) {
        if(tid < number_of_threads) {
            int first = tid*step_size*2;
            int second = first + step_size;
            if(input[second] < input[first])
              input[first] = input[second];
        }
        // Barrier between passes: the next pass reads slots that other
        // threads wrote in this one. It sits outside the divergent branch and
        // the loop condition is uniform, so every thread of the block reaches it.
        __syncthreads();
        step_size <<= 1;
        number_of_threads >>= 1;
    }
}

// Reduces input[] in place to its maximum; the result ends up in input[0].
//
// Launch layout: a single block of blockDim.x threads working on
// 2 * blockDim.x elements (main launches it as <<<1, n/2>>>). Every pass
// halves the number of active threads and doubles the distance between the
// two slots a thread compares. The pairing only visits every element when the
// element count is a power of two.
__global__ void maximum(int *input) {
    int tid = threadIdx.x;
    int step_size = 1;
    int number_of_threads = blockDim.x;

    while(number_of_threads>0) {
        if(tid < number_of_threads) {
            int first = tid*step_size*2;
            int second = first + step_size;
            if(input[second] > input[first])
              input[first] = input[second];
        }
        // Barrier between passes: the next pass reads slots that other
        // threads wrote in this one. It sits outside the divergent branch and
        // the loop condition is uniform, so every thread of the block reaches it.
        __syncthreads();
        step_size <<= 1;
        number_of_threads >>= 1;
    }
}

// Reduces input[] in place to the sum of its elements; the total ends up in
// input[0] and the remaining slots hold partial sums.
//
// Launch layout: a single block of blockDim.x threads working on
// 2 * blockDim.x elements (main launches it as <<<1, n/2>>>). Every pass
// halves the number of active threads and doubles the distance between the
// two slots a thread combines. The pairing only visits every element when the
// element count is a power of two.
__global__ void sum(int *input) {
    const int tid = threadIdx.x;
    int step_size = 1;
    int number_of_threads = blockDim.x;

    while(number_of_threads > 0) {
        if(tid < number_of_threads) {
            int first = tid * step_size * 2;
            int second = first + step_size;

            input[first] += input[second];
        }
        // Barrier between passes: the next pass adds partial sums that other
        // threads wrote in this one. It sits outside the divergent branch and
        // the loop condition is uniform, so every thread of the block reaches it.
        __syncthreads();
        step_size <<= 1;
        number_of_threads >>= 1;
    }
}

// Replaces each element with its squared deviation from `mean`.
// One thread handles one element, selected by threadIdx.x alone; main
// launches it as <<<1, n>>>.
__global__ void mean_diff_sq(float *input, float mean) {
    const unsigned int idx = threadIdx.x;
    const float deviation = input[idx] - mean;
    input[idx] = deviation * deviation;
}

// Reduces input[] in place to the sum of its elements; the total ends up in
// input[0] and the remaining slots hold partial sums.
//
// Launch layout: a single block of blockDim.x threads working on
// 2 * blockDim.x elements (main launches it as <<<1, n/2>>>). Every pass
// halves the number of active threads and doubles the distance between the
// two slots a thread combines. The pairing only visits every element when the
// element count is a power of two.
__global__ void sum_floats(float *input) {
    int tid = threadIdx.x;
    int step_size = 1;
    int number_of_threads = blockDim.x;

    while(number_of_threads > 0) {
        if(tid < number_of_threads) {
            int first = tid * step_size * 2;
            int second = first + step_size;

            input[first] += input[second];
        }
        // Barrier between passes: the next pass adds partial sums that other
        // threads wrote in this one. It sits outside the divergent branch and
        // the loop condition is uniform, so every thread of the block reaches it.
        __syncthreads();
        step_size <<= 1;
        number_of_threads >>= 1;
    }
}

// Converts the first `size` ints of src to float and stores them in dest.
// Nothing is written when size is zero or negative.
void copy_int_to_float(float *dest, int *src, int size){
    const int *in = src;
    float *out = dest;
    for(int remaining = size; remaining > 0; --remaining)
        *out++ = static_cast<float>(*in++);
}

// Fills input[0..size) with values in [0, 99] drawn from rand(), echoing each
// value to stdout followed by two spaces, then ends the line.
void random_ints(int *input, int size) {
    int idx = 0;
    while(idx < size) {
        const int value = rand() % 100;
        input[idx] = value;
        cout << value << "  ";
        ++idx;
    }
    cout << endl;
}

// Stops the program with a diagnostic when a CUDA runtime call or a kernel
// launch reported an error; `what` names the step for the message.
static void check_cuda(cudaError_t status, const char *what) {
    if(status != cudaSuccess) {
        cerr<<"CUDA error during "<<what<<": "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills an n-element array with random values, then uses the
// reduction kernels to report its minimum, maximum, sum, mean, variance and
// standard deviation.
//
// Each reduction overwrites the device buffer, so the original input is copied
// to the device again before every kernel. The blocking cudaMemcpy that reads
// a result back runs after the preceding kernel on the default stream, so no
// separate synchronize call is needed.
//
// Returns 0 on success and 1 when n is unsupported or host allocation fails;
// CUDA failures terminate through check_cuda.
int main() {
    // The kernels pair elements in a halving tree driven by n/2 threads; the
    // tree only visits every element when n is a power of two.
    if(n < 2 || (n & (n - 1)) != 0) {
        cerr<<"n must be a power of two (>= 2), got "<<n<<endl;
        return 1;
    }

    int size = n*sizeof(int); //calculate no. of bytes for array

    int *arr;
    int *arr_d, result;

    arr = (int *)malloc(size);
    if(arr == NULL) {
        cerr<<"Host allocation of the input array failed"<<endl;
        return 1;
    }
    random_ints(arr, n);

    check_cuda(cudaMalloc((void **)&arr_d, size), "device allocation of the int buffer");

    //MIN
    check_cuda(cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice), "input upload for minimum");

    minimum<<<1,n/2>>>(arr_d);
    check_cuda(cudaGetLastError(), "minimum kernel launch");

    check_cuda(cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost), "minimum download");

    cout<<"The minimum element is "<<result<<endl;

    //MAX
    check_cuda(cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice), "input upload for maximum");

    maximum<<<1,n/2>>>(arr_d);
    check_cuda(cudaGetLastError(), "maximum kernel launch");

    check_cuda(cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost), "maximum download");

    cout<<"The maximum element is "<<result<<endl;

    //SUM
    check_cuda(cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice), "input upload for sum");

    sum<<<1,n/2>>>(arr_d);
    check_cuda(cudaGetLastError(), "sum kernel launch");

    check_cuda(cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost), "sum download");

    cout<<"The sum is "<<result<<endl;

    //AVERAGE
    // result still holds the sum computed above.
    float mean = float(result)/n;
    cout<<"The mean is "<<mean<<endl;

    //STANDARD DEVIATION
    float *arr_float;
    float *arr_std, stdValue;

    arr_float = (float *)malloc(n*sizeof(float));
    if(arr_float == NULL) {
        cerr<<"Host allocation of the float array failed"<<endl;
        cudaFree(arr_d);
        free(arr);
        return 1;
    }
    check_cuda(cudaMalloc((void **)&arr_std, n*sizeof(float)), "device allocation of the float buffer");

    copy_int_to_float(arr_float, arr, n);

    check_cuda(cudaMemcpy(arr_std, arr_float, n*sizeof(float), cudaMemcpyHostToDevice), "float input upload");

    // Square the deviations first (one thread per element), then add them up.
    mean_diff_sq <<<1,n>>>(arr_std, mean);
    check_cuda(cudaGetLastError(), "mean_diff_sq kernel launch");
    sum_floats<<<1,n/2>>>(arr_std);
    check_cuda(cudaGetLastError(), "sum_floats kernel launch");

    check_cuda(cudaMemcpy(&stdValue, arr_std, sizeof(float), cudaMemcpyDeviceToHost), "squared-deviation sum download");

    // Dividing by n gives the population variance.
    stdValue = stdValue / n;
    cout<<"The variance is "<<stdValue<<endl;
    stdValue = sqrt(stdValue);

    cout<<"The standard deviation is "<<stdValue<<endl;

    // Release every buffer, device and host.
    check_cuda(cudaFree(arr_std), "release of the float buffer");
    check_cuda(cudaFree(arr_d), "release of the int buffer");
    free(arr_float);
    free(arr);

    return 0;
}
"""

In [0]:
# Write the CUDA source held in `code` to assign1.cu; the context manager
# closes the file even if the write raises.
with open("assign1.cu", "w") as text_file:
    text_file.write(code)

In [0]:
!nvcc assign1.cu

In [16]:
!./a.out

96  44  42  49  11  93  82  21  
No of threads = 4No of threads = 4No of threads = 4No of threads = 4The minimum element is 11
The maximum element is 96
The sum is 438
The mean is 54.75
The variance is 908.938
The standard deviation is 30.1486


In [0]:
!nvprof ./a.out

Enter the no of elements : 
9295  2008  8678  8725  418  2377  12675  13271  4747  2307  
==207== NVPROF is profiling process 207, command: ./a.out
The minimum element is 418
The minimum element using CPU is 418
The maximum element is 13271
The sum of elements is 57447
The average of elements is 5744
==207== Profiling application: ./a.out
==207== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   25.29%  9.0560us         4  2.2640us  2.1120us  2.4320us  [CUDA memcpy DtoH]
                   19.93%  7.1360us         4  1.7840us  1.5680us  2.3360us  [CUDA memcpy HtoD]
                   14.39%  5.1520us         1  5.1520us  5.1520us  5.1520us  minimum(int*)
                   14.21%  5.0880us         1  5.0880us  5.0880us  5.0880us  average(int*)
                   13.23%  4.7360us         1  4.7360us  4.7360us  4.7360us  maximum(int*)
                   12.96%  4.6400us         1  4.6400us  4.6400us  4.6400us  sum(int*)