### Check the GPU driver and CUDA compiler version

In [None]:
# Report the NVIDIA driver and the GPUs visible to this runtime.
!nvidia-smi

In [None]:
# Print the version of the CUDA compiler (nvcc) found on PATH.
!nvcc --version

In [None]:
# Clone NVIDIA's CUDA samples only when the directory is not already present,
# so re-running this cell does not fail on an existing checkout.
![ -d cuda-samples ] || git clone https://github.com/NVIDIA/cuda-samples.git

In [None]:
# Build the deviceQuery sample by running make inside its directory.
# NOTE(review): the clone above is not pinned to a release -- confirm that the
# checked-out revision still ships a Makefile in this directory.
!cd cuda-samples/Samples/1_Utilities/deviceQuery && make


In [None]:
# List the sample directory, then run the deviceQuery binary located there.
!ls cuda-samples/Samples/1_Utilities/deviceQuery
!cuda-samples/Samples/1_Utilities/deviceQuery/deviceQuery

## nvcc for Jupyter notebook

In [None]:
!pip install nvcc4jupyter

In [None]:
# Load the nvcc4jupyter IPython extension installed in the previous cell.
%load_ext nvcc4jupyter

# Problem One - Taylor Series - Sine Approximation

## CPU Implementation

In [None]:
%%writefile cpu_sin.cu
#include <stdio.h>
#include <math.h>
#include <time.h>

/*
 * Approximate sin(x) for every element of `array` using the first `p` terms
 * of the series x - x^3/3! + x^5/5! - ...
 *
 *   array   : N input values
 *   results : N output values; results[i] receives the approximation for array[i]
 *   N       : number of elements to process
 *   p       : number of series terms to sum (for p <= 1 the result is x itself)
 */
void cpu_sin_approximation(float *array, float *results, int N, int p) {
    for (int idx = 0; idx < N; ++idx) {
        const float x = array[idx];
        float current = x;  /* k = 0 term of the series */
        float sum = x;

        int k = 1;
        while (k < p) {
            /* Each term is the previous one times -x^2 / ((2k)(2k+1)). */
            const int denom = (2 * k) * (2 * k + 1);
            current *= -x * x / denom;
            sum += current;
            ++k;
        }

        results[idx] = sum;
    }
}


## GPU Implementation

In [None]:
%%writefile gpu_sin.cu
// %%writefile copies every line below the magic into gpu_sin.cu verbatim, so
// only CUDA C source may appear in this cell.
#include <stdio.h>
#include <cuda.h>

/*
 * Kernel: each thread approximates sin(x) for one element of `array` using
 * the first `p` terms of the series x - x^3/3! + x^5/5! - ...
 *
 *   array   : N input values
 *   results : N output values; results[i] receives the approximation for array[i]
 *   N       : number of valid elements; threads whose index is >= N do nothing
 *   p       : number of series terms to sum (for p <= 1 the result is x itself)
 */
__global__ void gpu_sin_approximation(float *array, float *results, int N, int p) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the data have no element to process.
    if (idx >= N) {
        return;
    }

    const float x = array[idx];
    float current = x;  // k = 0 term of the series
    float sum = x;

    int k = 1;
    while (k < p) {
        // Each term is the previous one times -x^2 / ((2k)(2k+1)).
        const int denom = (2 * k) * (2 * k + 1);
        current *= -x * x / denom;
        sum += current;
        ++k;
    }

    results[idx] = sum;
}

/*
 * Host wrapper: copies `array` to the device, runs gpu_sin_approximation over
 * N elements with p series terms, and copies the output back into `results`.
 *
 *   array   : N host input values
 *   results : host buffer with room for N outputs
 *   N       : number of elements; N <= 0 is a no-op
 *   p       : number of series terms, forwarded to the kernel
 *
 * If any CUDA call fails, the error is reported on stderr, the remaining
 * steps are skipped and `results` is left without valid output. Device
 * buffers are released on every path.
 */
void call_gpu_sin_approximation(float *array, float *results, int N, int p) {
    // Nothing to compute, and a launch with zero blocks is rejected by CUDA.
    if (N <= 0) {
        return;
    }

    const size_t bytes = (size_t)N * sizeof(float);
    const int blockSize = 256;
    const int numBlocks = (N + blockSize - 1) / blockSize;  // ceil(N / blockSize)

    float *d_array = NULL;
    float *d_results = NULL;

    cudaError_t err = cudaMalloc((void**)&d_array, bytes);
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&d_results, bytes);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_array, array, bytes, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        gpu_sin_approximation<<<numBlocks, blockSize>>>(d_array, d_results, N, p);
        // Kernel launches do not return a status; query it explicitly.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // The blocking copy waits for the kernel to finish before reading.
        err = cudaMemcpy(results, d_results, bytes, cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "call_gpu_sin_approximation failed: %s\n", cudaGetErrorString(err));
    }

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(d_array);
    cudaFree(d_results);
}


In [None]:
%%writefile main_program.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <math.h>

/*
 * Kernel: each thread approximates sin(x) for one element of `array` using
 * the first `p` terms of the series x - x^3/3! + x^5/5! - ...
 *
 *   array   : N input values
 *   results : N output values; results[i] receives the approximation for array[i]
 *   N       : number of valid elements; threads whose index is >= N do nothing
 *   p       : number of series terms to sum (for p <= 1 the result is x itself)
 */
__global__ void gpu_sin_approximation(float *array, float *results, int N, int p) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the data have no element to process.
    if (idx >= N) {
        return;
    }

    const float x = array[idx];
    float current = x;  // k = 0 term of the series
    float sum = x;

    int k = 1;
    while (k < p) {
        // Each term is the previous one times -x^2 / ((2k)(2k+1)).
        const int denom = (2 * k) * (2 * k + 1);
        current *= -x * x / denom;
        sum += current;
        ++k;
    }

    results[idx] = sum;
}

/*
 * CPU reference: approximate sin(x) for every element of `array` using the
 * first `p` terms of the series x - x^3/3! + x^5/5! - ...
 *
 *   array   : N input values
 *   results : N output values; results[i] receives the approximation for array[i]
 *   N       : number of elements to process
 *   p       : number of series terms to sum (for p <= 1 the result is x itself)
 */
void cpu_sin_approximation(float *array, float *results, int N, int p) {
    for (int idx = 0; idx < N; ++idx) {
        const float x = array[idx];
        float current = x;  // k = 0 term of the series
        float sum = x;

        int k = 1;
        while (k < p) {
            // Each term is the previous one times -x^2 / ((2k)(2k+1)).
            const int denom = (2 * k) * (2 * k + 1);
            current *= -x * x / denom;
            sum += current;
            ++k;
        }

        results[idx] = sum;
    }
}

/*
 * Host wrapper: copies `array` to the device, runs gpu_sin_approximation over
 * N elements with p series terms, and copies the output back into `results`.
 *
 *   array   : N host input values
 *   results : host buffer with room for N outputs
 *   N       : number of elements; N <= 0 is a no-op
 *   p       : number of series terms, forwarded to the kernel
 *
 * If any CUDA call fails, the error is reported on stderr, the remaining
 * steps are skipped and `results` is left without valid output. Device
 * buffers are released on every path.
 */
void call_gpu_sin_approximation(float *array, float *results, int N, int p) {
    // Nothing to compute, and a launch with zero blocks is rejected by CUDA.
    if (N <= 0) {
        return;
    }

    const size_t bytes = (size_t)N * sizeof(float);
    const int blockSize = 256;
    const int numBlocks = (N + blockSize - 1) / blockSize;  // ceil(N / blockSize)

    float *d_array = NULL;
    float *d_results = NULL;

    cudaError_t err = cudaMalloc((void**)&d_array, bytes);
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&d_results, bytes);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_array, array, bytes, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        gpu_sin_approximation<<<numBlocks, blockSize>>>(d_array, d_results, N, p);
        // Kernel launches do not return a status; query it explicitly.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // The blocking copy waits for the kernel to finish before reading.
        err = cudaMemcpy(results, d_results, bytes, cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "call_gpu_sin_approximation failed: %s\n", cudaGetErrorString(err));
    }

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(d_array);
    cudaFree(d_results);
}

/*
 * Benchmark driver: for every combination of problem size N and series length
 * p, times cpu_sin_approximation and call_gpu_sin_approximation on the same
 * random input and prints one line with both durations.
 *
 * Returns 0 on success, 1 if CUDA cannot be initialised or a host allocation
 * fails.
 */
int main() {
    int N_values[] = {8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304}; // Powers of 2
    int p_values[] = {3, 5, 10, 20, 50, 100}; // Values of p
    int num_N = sizeof(N_values) / sizeof(N_values[0]);
    int num_p = sizeof(p_values) / sizeof(p_values[0]);

    // Touch the CUDA runtime once before any timing so that one-time
    // initialisation is not charged to the first GPU measurement.
    cudaError_t init_status = cudaFree(0);
    if (init_status != cudaSuccess) {
        fprintf(stderr, "CUDA initialization failed: %s\n", cudaGetErrorString(init_status));
        return 1;
    }

    for (int n = 0; n < num_N; n++) {
        int N = N_values[n];
        size_t bytes = (size_t)N * sizeof(float);
        float *array = (float *)malloc(bytes);
        float *results_cpu = (float *)malloc(bytes);
        float *results_gpu = (float *)malloc(bytes);

        if (array == NULL || results_cpu == NULL || results_gpu == NULL) {
            fprintf(stderr, "Host allocation failed for N = %d\n", N);
            free(array);
            free(results_cpu);
            free(results_gpu);
            return 1;
        }

        // Random inputs in [0, 2*pi]; rand() is left unseeded, so every run
        // uses the same sequence.
        for (int i = 0; i < N; i++) {
            array[i] = ((float)rand() / (float)(RAND_MAX)) * 2 * M_PI;
        }

        for (int p = 0; p < num_p; p++) {
            int terms = p_values[p];
            clock_t start, end;

            // CPU Timing
            start = clock();
            cpu_sin_approximation(array, results_cpu, N, terms);
            end = clock();
            double cpu_time = ((double)(end - start)) / CLOCKS_PER_SEC;

            // GPU Timing: covers the whole wrapper call, i.e. device
            // allocation and both host<->device copies as well as the kernel.
            start = clock();
            call_gpu_sin_approximation(array, results_gpu, N, terms);
            end = clock();
            double gpu_time = ((double)(end - start)) / CLOCKS_PER_SEC;

            printf("N = %d, p = %d: CPU Time = %f seconds, GPU Time = %f seconds\n", N, terms, cpu_time, gpu_time);
        }

        free(array);
        free(results_cpu);
        free(results_gpu);
    }

    return 0;
}



In [None]:
# Compile main_program.cu into the sin_approximation executable.
# NOTE(review): no optimization flag is passed here -- confirm whether the
# host (CPU) code is meant to be benchmarked without one.
!nvcc main_program.cu -o sin_approximation

In [None]:
# Run the benchmark; it prints one CPU/GPU timing line per (N, p) pair.
!./sin_approximation


In [None]:
ls