In [None]:
%%writefile v2.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#define max(a,b) ((a) > (b) ? (a) : (b))
#define INPUT_SIZE 784
#define HIDDEN_SIZE 128
#define OUTPUT_SIZE 10
#define LEARNING_RATE 0.01
#define EPOCHS 3
#define BATCH_SIZE 64
#define NUM_CLASSES 10  // Digits 0-9
// Abort with file/line context if the CUDA runtime reports a pending error.
// cudaGetLastError() returns (and clears) the most recent runtime error, so
// place this directly after the API call or kernel launch being checked.
// The body is wrapped in do { } while (0) so that `cudaCheckError();` expands
// to exactly one statement and stays safe inside unbraced if/else bodies.
#define cudaCheckError() do {                                         \
    cudaError_t cuda_check_err_ = cudaGetLastError();                 \
    if (cuda_check_err_ != cudaSuccess) {                             \
        fprintf(stderr, "CUDA Error %s:%d: %s\n", __FILE__, __LINE__, \
                cudaGetErrorString(cuda_check_err_));                 \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)


// Seconds elapsed since `start`, measured with clock() and scaled by
// CLOCKS_PER_SEC.
double get_time(clock_t start) {
    const clock_t elapsed_ticks = clock() - start;
    return (double)elapsed_ticks / CLOCKS_PER_SEC;
}

// Allocate a rows x cols matrix of doubles as a table of independently
// malloc'd rows (release it with freeMatrix). Elements are left uninitialized.
//
// Terminates the program with a diagnostic when a dimension is negative or an
// allocation fails, so callers never receive a partially built matrix.
double** allocateMatrix(int rows, int cols) {
    if (rows < 0 || cols < 0) {
        fprintf(stderr, "allocateMatrix: invalid dimensions %d x %d\n", rows, cols);
        exit(EXIT_FAILURE);
    }

    // Size arithmetic is done in size_t so large dimensions cannot overflow int.
    double** mat = (double**)malloc((size_t)rows * sizeof(double*));
    // malloc(0) may legitimately return NULL, so only a non-empty request counts as a failure.
    if (mat == NULL && rows > 0) {
        fprintf(stderr, "allocateMatrix: out of memory for %d row pointers\n", rows);
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < rows; i++) {
        mat[i] = (double*)malloc((size_t)cols * sizeof(double));
        if (mat[i] == NULL && cols > 0) {
            fprintf(stderr, "allocateMatrix: out of memory for row %d (%d columns)\n", i, cols);
            exit(EXIT_FAILURE);
        }
    }
    return mat;
}

// Release a matrix made of separately allocated rows: each of the `rows` row
// buffers is freed first, then the table of row pointers itself.
void freeMatrix(double** mat, int rows) {
    int r = 0;
    while (r < rows) {
        free(mat[r]);
        ++r;
    }
    free(mat);
}
// Host-side parameters of the two-layer network. Shapes below are the ones
// createNetwork() allocates; every buffer is ordinary host heap memory.
typedef struct {
    // Hidden-layer weights: HIDDEN_SIZE row pointers, each row INPUT_SIZE long
    // (indexed W1[hidden][input]).
    double** W1;
    // Output-layer weights: OUTPUT_SIZE row pointers, each row HIDDEN_SIZE long
    // (indexed W2[output][hidden]).
    double** W2;
    // Hidden-layer biases, HIDDEN_SIZE entries.
    double* b1;
    // Output-layer biases, OUTPUT_SIZE entries.
    double* b2;
} NeuralNetwork;

// Flattened copy of the network parameters. copyHostToDevice() fills each
// field with a cudaMalloc'd buffer, so the four pointers refer to device
// memory and must not be dereferenced by host code.
typedef struct {
    // Hidden-layer weights, row-major: element (i, j) of the HIDDEN_SIZE x
    // INPUT_SIZE matrix is W1_flat[i * INPUT_SIZE + j].
    double* W1_flat;
    // Output-layer weights, row-major: element (i, j) of the OUTPUT_SIZE x
    // HIDDEN_SIZE matrix is W2_flat[i * HIDDEN_SIZE + j].
    double* W2_flat;
    // Hidden-layer biases, HIDDEN_SIZE entries.
    double* b1;
    // Output-layer biases, OUTPUT_SIZE entries.
    double* b2;
} NeuralNetworkDevice;

// Build a host-side network: W1 is HIDDEN_SIZE x INPUT_SIZE, W2 is
// OUTPUT_SIZE x HIDDEN_SIZE, both filled with rand()-based values in
// [0, 0.01]; the bias vectors are zero-initialized by calloc.
//
// rand() is seeded from the wall clock only on the first call. Reseeding on
// every call would restart the generator, so two networks created within the
// same second would receive identical weights.
//
// Returns a heap-allocated network that the caller releases with
// freeNetwork(). Terminates the program if memory cannot be obtained.
NeuralNetwork* createNetwork() {
    static int rng_seeded = 0;

    NeuralNetwork* net = (NeuralNetwork*)malloc(sizeof(NeuralNetwork));
    if (net == NULL) {
        fprintf(stderr, "createNetwork: out of memory\n");
        exit(EXIT_FAILURE);
    }

    net->W1 = allocateMatrix(HIDDEN_SIZE, INPUT_SIZE);
    net->W2 = allocateMatrix(OUTPUT_SIZE, HIDDEN_SIZE);
    net->b1 = (double*)calloc(HIDDEN_SIZE, sizeof(double));
    net->b2 = (double*)calloc(OUTPUT_SIZE, sizeof(double));
    if (net->W1 == NULL || net->W2 == NULL || net->b1 == NULL || net->b2 == NULL) {
        fprintf(stderr, "createNetwork: out of memory\n");
        exit(EXIT_FAILURE);
    }

    if (!rng_seeded) {
        srand((unsigned int)time(NULL));
        rng_seeded = 1;
    }

    for (int i = 0; i < HIDDEN_SIZE; i++)
        for (int j = 0; j < INPUT_SIZE; j++)
            net->W1[i][j] = ((double)rand() / RAND_MAX) * 0.01;

    for (int i = 0; i < OUTPUT_SIZE; i++)
        for (int j = 0; j < HIDDEN_SIZE; j++)
            net->W2[i][j] = ((double)rand() / RAND_MAX) * 0.01;

    return net;
}

// ReLU activation for device code: strictly positive inputs pass through
// unchanged; every other input yields 0.0.
__device__ double relu(double x) {
    if (x > 0.0) {
        return x;
    }
    return 0.0;
}
// ---------- Forward kernel (row-pointer weight layout) ----------
//
// Forward pass for one sample using the NeuralNetwork layout, where W1 and W2
// are tables of row pointers.
//
// Launch contract: a single block with at least HIDDEN_SIZE threads. The
// hidden activations are handed from the first phase to the second through
// __syncthreads(), which only orders threads of the same block.
//
// Memory contract: everything dereferenced here must be addressable from the
// device - `net` itself, net->b1, net->b2, the W1/W2 pointer tables and every
// row they point to, plus input/hidden/output. A NeuralNetwork produced by
// createNetwork() lives in ordinary host heap memory and does not satisfy
// this on systems where the GPU cannot access pageable host memory; use
// forward_kernel with the flat device layout in that case.
//
// input   INPUT_SIZE values of the sample.
// hidden  receives HIDDEN_SIZE ReLU activations.
// output  receives OUTPUT_SIZE raw scores (no softmax applied).
__global__ void forward_kernell(NeuralNetwork* net, double* input, double* hidden, double* output) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Hidden layer: one thread per hidden unit
    if (idx < HIDDEN_SIZE) {
        double sum = net->b1[idx];
        for (int j = 0; j < INPUT_SIZE; j++) {
            sum += net->W1[idx][j] * input[j];
        }
        hidden[idx] = relu(sum);
    }

    // Reached by every thread of the block (it sits outside the branches), so
    // all hidden activations are written before any thread reads them.
    __syncthreads();

    // Output layer: one thread per class score
    if (idx < OUTPUT_SIZE) {
        double sum = net->b2[idx];
        for (int j = 0; j < HIDDEN_SIZE; j++) {
            sum += net->W2[idx][j] * hidden[j];
        }
        // No per-thread printf here: device printf is serialized and this
        // kernel runs once per sample.
        output[idx] = sum;
    }
}
// ---------- Forward kernel (flat device layout) ----------
//
// Forward pass for one sample using the flattened row-major weights of a
// NeuralNetworkDevice.
//
// Launch contract: a single block with at least HIDDEN_SIZE threads. The
// hidden activations are handed from the first phase to the second through
// __syncthreads(), which only orders threads of the same block.
//
// Memory contract: `net` is dereferenced inside the kernel, so the struct
// itself (not only the buffers it points to) must be device-accessible.
//
// input   INPUT_SIZE values of the sample.
// hidden  receives HIDDEN_SIZE ReLU activations.
// output  receives OUTPUT_SIZE raw scores (no softmax applied).
__global__ void forward_kernel(NeuralNetworkDevice* net, double* input, double* hidden, double* output) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Hidden layer: one thread per hidden unit, walking its own weight row
    if (idx < HIDDEN_SIZE) {
        const double* w1_row = net->W1_flat + idx * INPUT_SIZE;
        double sum = net->b1[idx];
        for (int j = 0; j < INPUT_SIZE; j++) {
            sum += w1_row[j] * input[j];
        }
        hidden[idx] = relu(sum);
    }

    // Reached by every thread of the block (it sits outside the branches), so
    // all hidden activations are written before any thread reads them.
    __syncthreads();

    // Output layer: one thread per class score
    if (idx < OUTPUT_SIZE) {
        const double* w2_row = net->W2_flat + idx * HIDDEN_SIZE;
        double sum = net->b2[idx];
        for (int j = 0; j < HIDDEN_SIZE; j++) {
            sum += w2_row[j] * hidden[j];
        }
        // No per-thread printf here: device printf is serialized and this
        // kernel runs once per sample.
        output[idx] = sum;
    }
}

// ---------- Softmax kernel ----------
//
// In-place softmax over x[0..size). The largest element is subtracted before
// exponentiating; this leaves the resulting probabilities mathematically
// unchanged but keeps exp() from overflowing on large scores.
//
// Launch contract: a single block with at least `size` threads. The phases
// are separated by __syncthreads(), which only orders threads of one block.
//
// x        device buffer of `size` scores, overwritten with probabilities.
// size     number of elements.
// sum_out  device scalar used as scratch for the normalizer; on return it
//          holds the sum of exp(x[i] - max(x)).
__global__ void softmax_kernel(double* x, int size, double* sum_out) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Step 1: every thread scans for the maximum while x is still untouched.
    double max_val = 0.0;
    if (size > 0) {
        max_val = x[0];
        for (int i = 1; i < size; i++) {
            max_val = fmax(max_val, x[i]);
        }
    }

    // All reads of the raw scores must finish before anyone overwrites them.
    __syncthreads();

    // Step 2: shifted exponentiation
    if (idx < size) {
        x[idx] = exp(x[idx] - max_val);
    }

    __syncthreads();

    // Step 3: thread 0 accumulates the normalizer (size is small)
    if (idx == 0) {
        double sum = 0.0;
        for (int i = 0; i < size; i++) {
            sum += x[i];
        }
        *sum_out = sum;
    }

    __syncthreads();

    // Step 4: normalize
    double sum_val = *sum_out;
    if (idx < size && sum_val != 0.0) {
        x[idx] /= sum_val;
    }
}

// Host-side backpropagation for one sample followed by a plain gradient
// descent step with step size LEARNING_RATE.
//
// net     network whose weights and biases are updated in place.
// input   INPUT_SIZE input values of the sample.
// hidden  HIDDEN_SIZE hidden activations from the forward pass.
// output  OUTPUT_SIZE network outputs from the forward pass.
// target  OUTPUT_SIZE target values.
void backward(NeuralNetwork* net, double* input, double* hidden, double* output, double* target) {
    double out_err[OUTPUT_SIZE];
    double hid_err[HIDDEN_SIZE];

    // Output error: prediction minus target
    for (int o = 0; o < OUTPUT_SIZE; ++o) {
        out_err[o] = output[o] - target[o];
    }

    // Hidden error: propagate through W2 (still un-updated at this point) and
    // mask out units whose activation is not positive.
    for (int h = 0; h < HIDDEN_SIZE; ++h) {
        double acc = 0;
        for (int o = 0; o < OUTPUT_SIZE; ++o) {
            acc += net->W2[o][h] * out_err[o];
        }
        hid_err[h] = acc * (hidden[h] > 0);
    }

    // Output layer: weights row by row, then the matching bias
    for (int o = 0; o < OUTPUT_SIZE; ++o) {
        double* w2_row = net->W2[o];
        for (int h = 0; h < HIDDEN_SIZE; ++h) {
            w2_row[h] -= LEARNING_RATE * out_err[o] * hidden[h];
        }
        net->b2[o] -= LEARNING_RATE * out_err[o];
    }

    // Hidden layer: weights row by row, then the matching bias
    for (int h = 0; h < HIDDEN_SIZE; ++h) {
        double* w1_row = net->W1[h];
        for (int p = 0; p < INPUT_SIZE; ++p) {
            w1_row[p] -= LEARNING_RATE * hid_err[h] * input[p];
        }
        net->b1[h] -= LEARNING_RATE * hid_err[h];
    }
}

 // Print the parameters held in a NeuralNetworkDevice from host code.
 //
 // The struct's pointers refer to device memory (copyHostToDevice fills them
 // with cudaMalloc), so they cannot be dereferenced on the host. Everything is
 // first copied into one host staging buffer and printed from there. Rows
 // follow the row-major layout written by copyHostToDevice: W1 is
 // HIDDEN_SIZE x INPUT_SIZE and W2 is OUTPUT_SIZE x HIDDEN_SIZE.
 void printNeuralNetworkDevice(NeuralNetworkDevice nn) {
    const size_t count_W1 = (size_t)HIDDEN_SIZE * INPUT_SIZE;
    const size_t count_W2 = (size_t)OUTPUT_SIZE * HIDDEN_SIZE;
    const size_t count_all = count_W1 + count_W2 + HIDDEN_SIZE + OUTPUT_SIZE;

    double* staging = (double*)malloc(count_all * sizeof(double));
    if (staging == NULL) {
        fprintf(stderr, "printNeuralNetworkDevice: out of host memory\n");
        return;
    }
    double* w1 = staging;
    double* w2 = w1 + count_W1;
    double* b1 = w2 + count_W2;
    double* b2 = b1 + HIDDEN_SIZE;

    cudaMemcpy(w1, nn.W1_flat, count_W1 * sizeof(double), cudaMemcpyDeviceToHost);
    cudaCheckError();
    cudaMemcpy(w2, nn.W2_flat, count_W2 * sizeof(double), cudaMemcpyDeviceToHost);
    cudaCheckError();
    cudaMemcpy(b1, nn.b1, HIDDEN_SIZE * sizeof(double), cudaMemcpyDeviceToHost);
    cudaCheckError();
    cudaMemcpy(b2, nn.b2, OUTPUT_SIZE * sizeof(double), cudaMemcpyDeviceToHost);
    cudaCheckError();

    printf("Device W1:\n");
    for (int i = 0; i < HIDDEN_SIZE; i++) {
        for (int j = 0; j < INPUT_SIZE; j++) {
            printf("%.2f ", w1[i * INPUT_SIZE + j]);
        }
        printf("\n");
    }

    printf("\nDevice W2:\n");
    for (int i = 0; i < OUTPUT_SIZE; i++) {
        for (int j = 0; j < HIDDEN_SIZE; j++) {
            printf("%.2f ", w2[i * HIDDEN_SIZE + j]);
        }
        printf("\n");
    }

    printf("\nDevice b1:\n");
    for (int i = 0; i < HIDDEN_SIZE; i++) {
        printf("%.2f ", b1[i]);
    }
    printf("\n");

    printf("\nDevice b2:\n");
    for (int i = 0; i < OUTPUT_SIZE; i++) {
        printf("%.2f ", b2[i]);
    }
    printf("\n");

    free(staging);
}

// Print a host-side NeuralNetwork with two decimals per value.
//
// The loops follow the shapes createNetwork() allocates: W1 has HIDDEN_SIZE
// rows of INPUT_SIZE values and W2 has OUTPUT_SIZE rows of HIDDEN_SIZE
// values. Iterating with the dimensions swapped would index row pointers
// that were never allocated.
void printNeuralNetwork(NeuralNetwork* nn) {
    printf("W1:\n");
    for (int i = 0; i < HIDDEN_SIZE; i++) {
        for (int j = 0; j < INPUT_SIZE; j++) {
            printf("%.2f ", nn->W1[i][j]);
        }
        printf("\n");
    }

    printf("\nW2:\n");
    for (int i = 0; i < OUTPUT_SIZE; i++) {
        for (int j = 0; j < HIDDEN_SIZE; j++) {
            printf("%.2f ", nn->W2[i][j]);
        }
        printf("\n");
    }

    printf("\nb1:\n");
    for (int i = 0; i < HIDDEN_SIZE; i++) {
        printf("%.2f ", nn->b1[i]);
    }
    printf("\n");

    printf("\nb2:\n");
    for (int i = 0; i < OUTPUT_SIZE; i++) {
        printf("%.2f ", nn->b2[i]);
    }
    printf("\n");
}

// Upload a host network into freshly allocated device buffers.
//
// host    source network (row-pointer layout, shapes as in createNetwork).
// device  host-resident struct whose four pointer fields are overwritten
//         with new cudaMalloc'd buffers; the function writes the fields
//         directly, so `device` itself must be host memory. Release the
//         buffers with freeDeviceNeuralNetwork().
//
// Weights are flattened row-major: W1_flat[i * INPUT_SIZE + j] = W1[i][j]
// and W2_flat[i * HIDDEN_SIZE + j] = W2[i][j].
void copyHostToDevice(NeuralNetwork* host, NeuralNetworkDevice* device) {
    const size_t size_W1 = (size_t)HIDDEN_SIZE * INPUT_SIZE * sizeof(double);
    const size_t size_W2 = (size_t)OUTPUT_SIZE * HIDDEN_SIZE * sizeof(double);
    const size_t size_b1 = (size_t)HIDDEN_SIZE * sizeof(double);
    const size_t size_b2 = (size_t)OUTPUT_SIZE * sizeof(double);

    // Stage the flattened weights on the host first, so a host allocation
    // failure is detected before any device memory is reserved.
    double* W1_flat_host = (double*)malloc(size_W1);
    double* W2_flat_host = (double*)malloc(size_W2);
    if (W1_flat_host == NULL || W2_flat_host == NULL) {
        fprintf(stderr, "copyHostToDevice: out of host memory\n");
        free(W1_flat_host);
        free(W2_flat_host);
        exit(EXIT_FAILURE);
    }

    // Flatten W1
    for (int i = 0; i < HIDDEN_SIZE; i++) {
        for (int j = 0; j < INPUT_SIZE; j++) {
            W1_flat_host[i * INPUT_SIZE + j] = host->W1[i][j];
        }
    }

    // Flatten W2
    for (int i = 0; i < OUTPUT_SIZE; i++) {
        for (int j = 0; j < HIDDEN_SIZE; j++) {
            W2_flat_host[i * HIDDEN_SIZE + j] = host->W2[i][j];
        }
    }

    // Allocate device memory
    cudaMalloc((void**)&device->W1_flat, size_W1);
    cudaCheckError();
    cudaMalloc((void**)&device->W2_flat, size_W2);
    cudaCheckError();
    cudaMalloc((void**)&device->b1, size_b1);
    cudaCheckError();
    cudaMalloc((void**)&device->b2, size_b2);
    cudaCheckError();

    // Copy to device
    cudaMemcpy(device->W1_flat, W1_flat_host, size_W1, cudaMemcpyHostToDevice);
    cudaCheckError();
    cudaMemcpy(device->W2_flat, W2_flat_host, size_W2, cudaMemcpyHostToDevice);
    cudaCheckError();
    cudaMemcpy(device->b1, host->b1, size_b1, cudaMemcpyHostToDevice);
    cudaCheckError();
    cudaMemcpy(device->b2, host->b2, size_b2, cudaMemcpyHostToDevice);
    cudaCheckError();

    // Both staging buffers are temporary; release them.
    free(W1_flat_host);
    free(W2_flat_host);
}

// Copy the parameters from device buffers back into a host network.
//
// device  host-resident struct holding the device buffers.
// host    destination network; its existing row buffers are overwritten.
//
// This is the exact inverse of copyHostToDevice: W1 is unpacked as
// HIDDEN_SIZE rows of INPUT_SIZE values and W2 as OUTPUT_SIZE rows of
// HIDDEN_SIZE values. Unpacking with the dimensions swapped would write
// through row pointers that createNetwork() never allocated.
void copyDeviceToHost(NeuralNetworkDevice* device, NeuralNetwork* host) {
    const size_t size_W1 = (size_t)HIDDEN_SIZE * INPUT_SIZE * sizeof(double);
    const size_t size_W2 = (size_t)OUTPUT_SIZE * HIDDEN_SIZE * sizeof(double);
    const size_t size_b1 = (size_t)HIDDEN_SIZE * sizeof(double);
    const size_t size_b2 = (size_t)OUTPUT_SIZE * sizeof(double);

    // Temp arrays
    double* W1_flat_host = (double*)malloc(size_W1);
    double* W2_flat_host = (double*)malloc(size_W2);
    if (W1_flat_host == NULL || W2_flat_host == NULL) {
        fprintf(stderr, "copyDeviceToHost: out of host memory\n");
        free(W1_flat_host);
        free(W2_flat_host);
        exit(EXIT_FAILURE);
    }

    // Copy from device
    cudaMemcpy(W1_flat_host, device->W1_flat, size_W1, cudaMemcpyDeviceToHost);
    cudaCheckError();
    cudaMemcpy(W2_flat_host, device->W2_flat, size_W2, cudaMemcpyDeviceToHost);
    cudaCheckError();
    cudaMemcpy(host->b1, device->b1, size_b1, cudaMemcpyDeviceToHost);
    cudaCheckError();
    cudaMemcpy(host->b2, device->b2, size_b2, cudaMemcpyDeviceToHost);
    cudaCheckError();

    // Reconstruct 2D arrays (row-major, same indexing as the upload)
    for (int i = 0; i < HIDDEN_SIZE; i++) {
        for (int j = 0; j < INPUT_SIZE; j++) {
            host->W1[i][j] = W1_flat_host[i * INPUT_SIZE + j];
        }
    }
    for (int i = 0; i < OUTPUT_SIZE; i++) {
        for (int j = 0; j < HIDDEN_SIZE; j++) {
            host->W2[i][j] = W2_flat_host[i * HIDDEN_SIZE + j];
        }
    }

    free(W1_flat_host);
    free(W2_flat_host);
}

// Release the four device buffers referenced by *device.
//
// device  address of a pointer to a host-resident NeuralNetworkDevice. The
//         struct itself is not freed. A NULL `device` or NULL `*device` is a
//         no-op. Each field is reset to NULL after its buffer is released so
//         a repeated call cannot free the same buffer twice.
void freeDeviceNeuralNetwork(NeuralNetworkDevice** device) {
    if (device == NULL || *device == NULL) {
        return;
    }
    NeuralNetworkDevice* dev = *device;

    cudaFree(dev->W1_flat);
    dev->W1_flat = NULL;
    cudaFree(dev->W2_flat);
    dev->W2_flat = NULL;
    cudaFree(dev->b1);
    dev->b1 = NULL;
    cudaFree(dev->b2);
    dev->b2 = NULL;
}

// Single-sample backpropagation plus gradient-descent update on flattened
// row-major weights.
//
// Indexing uses threadIdx.x only and the error terms are exchanged through
// shared memory, so the kernel is meant for one block that has at least
// max(HIDDEN_SIZE, OUTPUT_SIZE) threads (backwardCUDA launches it that way).
//
// W1_flat/W2_flat/b1/b2  parameters, updated in place.
// input/hidden/output    values of the forward pass for this sample.
// target                 OUTPUT_SIZE target values.
__global__ void backwardKernel(
    double* W1_flat, double* W2_flat, double* b1, double* b2,
    double* input, double* hidden, double* output, double* target
) {
    __shared__ double s_out_err[OUTPUT_SIZE];
    __shared__ double s_hid_err[HIDDEN_SIZE];

    const int tid = threadIdx.x;
    const bool owns_output = tid < OUTPUT_SIZE;
    const bool owns_hidden = tid < HIDDEN_SIZE;

    // Output error: prediction minus target
    if (owns_output) {
        s_out_err[tid] = output[tid] - target[tid];
    }
    __syncthreads();

    // Hidden error: column `tid` of W2 against the output error, masked by
    // whether the hidden activation is positive.
    if (owns_hidden) {
        s_hid_err[tid] = 0.0;
        for (int o = 0; o < OUTPUT_SIZE; o++) {
            s_hid_err[tid] += W2_flat[o * HIDDEN_SIZE + tid] * s_out_err[o];
        }
        s_hid_err[tid] *= (hidden[tid] > 0);
    }
    // Every read of the old W2 must be complete before any thread updates it.
    __syncthreads();

    // Output layer: thread `tid` owns row `tid` of W2 and b2[tid]
    if (owns_output) {
        double* w2_row = W2_flat + tid * HIDDEN_SIZE;
        for (int h = 0; h < HIDDEN_SIZE; h++) {
            w2_row[h] -= LEARNING_RATE * s_out_err[tid] * hidden[h];
        }
        b2[tid] -= LEARNING_RATE * s_out_err[tid];
    }

    // Hidden layer: thread `tid` owns row `tid` of W1 and b1[tid]
    if (owns_hidden) {
        double* w1_row = W1_flat + tid * INPUT_SIZE;
        for (int p = 0; p < INPUT_SIZE; p++) {
            w1_row[p] -= LEARNING_RATE * s_hid_err[tid] * input[p];
        }
        b1[tid] -= LEARNING_RATE * s_hid_err[tid];
    }
}
// Host launcher for backwardKernel: one block sized to the larger of the two
// layer widths, followed by a device-wide sync and an error check.
//
// device  struct read on the host to obtain the parameter buffers.
// input_d, hidden_d, output_d, target_d  buffers forwarded to the kernel.
void backwardCUDA(
    NeuralNetworkDevice* device,
    double* input_d, double* hidden_d, double* output_d, double* target_d
) {
    const int block_threads = (HIDDEN_SIZE > OUTPUT_SIZE) ? HIDDEN_SIZE : OUTPUT_SIZE;

    double* w1 = device->W1_flat;
    double* w2 = device->W2_flat;
    double* bias1 = device->b1;
    double* bias2 = device->b2;

    backwardKernel<<<1, block_threads>>>(w1, w2, bias1, bias2,
                                         input_d, hidden_d, output_d, target_d);
    cudaDeviceSynchronize();
    cudaCheckError();
}


// Train `net` with single-sample gradient descent for EPOCHS passes over the
// data. Forward pass, softmax and the weight update all run on the GPU.
//
// net        host network; its parameters are uploaded once, updated on the
//            device, and written back into `net` when training finishes.
// images     numImages rows of INPUT_SIZE values.
// labels     numImages rows of OUTPUT_SIZE target values.
// numImages  number of samples; nothing happens when it is not positive.
//
// Per epoch this prints the mean cross-entropy loss, the training accuracy in
// percent, and the elapsed time reported by get_time().
void train(NeuralNetwork* net, double** images, double** labels, int numImages) {
    if (numImages <= 0) {
        printf("No training images supplied; skipping training.\n");
        return;
    }

    clock_t total_start = clock();

    // Kernels cannot dereference the malloc'd host network, so work on a
    // device copy. `dev` stays on the host because backwardCUDA reads its
    // fields there; `d_net` is a device copy of the same struct because
    // forward_kernel dereferences its argument inside the kernel.
    NeuralNetworkDevice dev;
    copyHostToDevice(net, &dev);

    NeuralNetworkDevice* d_net = NULL;
    cudaMalloc((void**)&d_net, sizeof(NeuralNetworkDevice));
    cudaCheckError();
    cudaMemcpy(d_net, &dev, sizeof(NeuralNetworkDevice), cudaMemcpyHostToDevice);
    cudaCheckError();

    // Per-sample scratch buffers: allocated once and reused for every sample
    // instead of a cudaMalloc/cudaFree round trip per image.
    double *d_input = NULL, *d_hidden = NULL, *d_output = NULL, *d_sum = NULL, *d_target = NULL;
    cudaMalloc((void**)&d_input, INPUT_SIZE * sizeof(double));
    cudaCheckError();
    cudaMalloc((void**)&d_hidden, HIDDEN_SIZE * sizeof(double));
    cudaCheckError();
    cudaMalloc((void**)&d_output, OUTPUT_SIZE * sizeof(double));
    cudaCheckError();
    cudaMalloc((void**)&d_sum, sizeof(double));
    cudaCheckError();
    cudaMalloc((void**)&d_target, OUTPUT_SIZE * sizeof(double));
    cudaCheckError();

    // Floor applied before log() so a probability that underflows to zero
    // contributes a large finite loss instead of inf/NaN.
    const double min_prob = 1e-12;

    for (int epoch = 0; epoch < EPOCHS; epoch++) {
        clock_t epoch_start = clock();
        double loss = 0.0;
        int correct = 0;

        for (int i = 0; i < numImages; i++) {
            double output[OUTPUT_SIZE];

            cudaMemcpy(d_input, images[i], INPUT_SIZE * sizeof(double), cudaMemcpyHostToDevice);
            cudaCheckError();
            cudaMemcpy(d_target, labels[i], OUTPUT_SIZE * sizeof(double), cudaMemcpyHostToDevice);
            cudaCheckError();

            // Both kernels go to the default stream and therefore run in order.
            forward_kernel<<<1, HIDDEN_SIZE>>>(d_net, d_input, d_hidden, d_output);
            cudaCheckError();
            softmax_kernel<<<1, OUTPUT_SIZE>>>(d_output, OUTPUT_SIZE, d_sum);
            cudaCheckError();

            // Blocking copy: returns only after the kernels above finished,
            // so `output` holds this sample's probabilities.
            cudaMemcpy(output, d_output, OUTPUT_SIZE * sizeof(double), cudaMemcpyDeviceToHost);
            cudaCheckError();

            // Weight update on the device (synchronizes and checks errors).
            backwardCUDA(&dev, d_input, d_hidden, d_output, d_target);

            // Cross-entropy: only entries with a positive target contribute,
            // which also avoids evaluating 0 * log(0).
            for (int k = 0; k < OUTPUT_SIZE; k++) {
                if (labels[i][k] > 0.0) {
                    double p = (output[k] > min_prob) ? output[k] : min_prob;
                    loss -= labels[i][k] * log(p);
                }
            }

            int pred = 0, actual = 0;
            for (int j = 0; j < OUTPUT_SIZE; j++) {
                if (output[j] > output[pred]) pred = j;
                if (labels[i][j] > labels[i][actual]) actual = j;
            }
            if (pred == actual) correct++;
        }

        printf("Epoch %d - Loss: %.4f - Train Accuracy: %.2f%% - Time: %.3fs\n",
               epoch + 1, loss / (double)numImages, (correct / (double)numImages) * 100, get_time(epoch_start));
    }
    printf("Total training time: %.3fs\n", get_time(total_start));

    // Write the trained parameters back, one row per copy, using the same
    // row-major layout copyHostToDevice uploaded (row r starts at r * width).
    for (int r = 0; r < HIDDEN_SIZE; r++) {
        cudaMemcpy(net->W1[r], dev.W1_flat + (size_t)r * INPUT_SIZE,
                   INPUT_SIZE * sizeof(double), cudaMemcpyDeviceToHost);
        cudaCheckError();
    }
    for (int r = 0; r < OUTPUT_SIZE; r++) {
        cudaMemcpy(net->W2[r], dev.W2_flat + (size_t)r * HIDDEN_SIZE,
                   HIDDEN_SIZE * sizeof(double), cudaMemcpyDeviceToHost);
        cudaCheckError();
    }
    cudaMemcpy(net->b1, dev.b1, HIDDEN_SIZE * sizeof(double), cudaMemcpyDeviceToHost);
    cudaCheckError();
    cudaMemcpy(net->b2, dev.b2, OUTPUT_SIZE * sizeof(double), cudaMemcpyDeviceToHost);
    cudaCheckError();

    cudaFree(d_input);
    cudaFree(d_hidden);
    cudaFree(d_output);
    cudaFree(d_sum);
    cudaFree(d_target);
    cudaFree(d_net);
    NeuralNetworkDevice* dev_ptr = &dev;
    freeDeviceNeuralNetwork(&dev_ptr);
}


// Evaluate classification accuracy of `net` on a labelled data set, running
// the forward pass and softmax on the GPU.
//
// net        host network; uploaded to the device for the duration of the
//            call and left unmodified.
// images     numImages rows of INPUT_SIZE values.
// labels     numImages rows of OUTPUT_SIZE target values; the largest entry
//            of a row marks the expected class.
// numImages  number of samples; nothing happens when it is not positive.
//
// Prints the accuracy as a percentage.
void evaluate(NeuralNetwork* net, double** images, double** labels, int numImages) {
    if (numImages <= 0) {
        printf("No test images supplied; skipping evaluation.\n");
        return;
    }

    // Kernels cannot dereference the malloc'd host network, so upload it.
    // `d_net` is a device copy of the struct itself because forward_kernel
    // dereferences its argument inside the kernel.
    NeuralNetworkDevice dev;
    copyHostToDevice(net, &dev);

    NeuralNetworkDevice* d_net = NULL;
    cudaMalloc((void**)&d_net, sizeof(NeuralNetworkDevice));
    cudaCheckError();
    cudaMemcpy(d_net, &dev, sizeof(NeuralNetworkDevice), cudaMemcpyHostToDevice);
    cudaCheckError();

    // Scratch buffers are allocated once and reused for every sample.
    double *d_input = NULL, *d_hidden = NULL, *d_output = NULL, *d_sum = NULL;
    cudaMalloc((void**)&d_input, INPUT_SIZE * sizeof(double));
    cudaCheckError();
    cudaMalloc((void**)&d_hidden, HIDDEN_SIZE * sizeof(double));
    cudaCheckError();
    cudaMalloc((void**)&d_output, OUTPUT_SIZE * sizeof(double));
    cudaCheckError();
    cudaMalloc((void**)&d_sum, sizeof(double));
    cudaCheckError();

    int correct = 0;
    for (int i = 0; i < numImages; i++) {
        double output[OUTPUT_SIZE];

        cudaMemcpy(d_input, images[i], INPUT_SIZE * sizeof(double), cudaMemcpyHostToDevice);
        cudaCheckError();

        // Both kernels go to the default stream and therefore run in order.
        forward_kernel<<<1, HIDDEN_SIZE>>>(d_net, d_input, d_hidden, d_output);
        cudaCheckError();
        softmax_kernel<<<1, OUTPUT_SIZE>>>(d_output, OUTPUT_SIZE, d_sum);
        cudaCheckError();

        // Blocking copy: returns only after the kernels above finished.
        cudaMemcpy(output, d_output, OUTPUT_SIZE * sizeof(double), cudaMemcpyDeviceToHost);
        cudaCheckError();

        int pred = 0, actual = 0;
        for (int j = 0; j < OUTPUT_SIZE; j++) {
            if (output[j] > output[pred]) pred = j;
            if (labels[i][j] > labels[i][actual]) actual = j;
        }
        if (pred == actual) correct++;
    }
    printf("Test Accuracy: %.2f%%\n", (correct / (double)numImages) * 100);

    cudaFree(d_input);
    cudaFree(d_hidden);
    cudaFree(d_output);
    cudaFree(d_sum);
    cudaFree(d_net);
    NeuralNetworkDevice* dev_ptr = &dev;
    freeDeviceNeuralNetwork(&dev_ptr);
}


// Load `numImages` images from an MNIST-style image file.
//
// The first 16 bytes of the file are skipped; after that each image is read
// as INPUT_SIZE consecutive bytes and scaled to [0, 1] by dividing by 255.
// One fread per image replaces one call per pixel.
//
// Returns a numImages x INPUT_SIZE matrix from allocateMatrix (release it
// with freeMatrix). Exits the program if the file cannot be opened,
// positioned, or holds fewer bytes than requested.
double** loadMNISTImages(const char* filename, int numImages) {
    FILE* file = fopen(filename, "rb");
    if (!file) {
        printf("Error opening %s\n", filename);
        exit(1);
    }
    if (fseek(file, 16, SEEK_SET) != 0) {
        fprintf(stderr, "Error: Failed to skip header of %s\n", filename);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    double** images = allocateMatrix(numImages, INPUT_SIZE);
    unsigned char raw[INPUT_SIZE];
    for (int i = 0; i < numImages; i++) {
        if (fread(raw, sizeof(unsigned char), INPUT_SIZE, file) != (size_t)INPUT_SIZE) {
            fprintf(stderr, "Error: Failed to read pixel\n");
            fclose(file);
            exit(EXIT_FAILURE);
        }
        for (int j = 0; j < INPUT_SIZE; j++) {
            images[i][j] = raw[j] / 255.0;
        }
    }
    fclose(file);
    return images;
}


// Load `numLabels` labels from an MNIST-style label file and expand each one
// into a one-hot row of OUTPUT_SIZE doubles.
//
// The first 8 bytes of the file are skipped; every following byte is one
// label. Returns a numLabels x OUTPUT_SIZE matrix from allocateMatrix.
// Exits the program if the file cannot be opened or runs out of data.
double** loadMNISTLabels(const char* filename, int numLabels) {
    FILE* fp = fopen(filename, "rb");
    if (fp == NULL) {
        printf("Error opening %s\n", filename);
        exit(1);
    }
    fseek(fp, 8, SEEK_SET);

    double** one_hot = allocateMatrix(numLabels, OUTPUT_SIZE);
    int row = 0;
    while (row < numLabels) {
        unsigned char digit;
        size_t got = fread(&digit, sizeof(unsigned char), 1, fp);
        if (got != 1) {
            fprintf(stderr, "Error: Failed to read label\n");
            fclose(fp);
            exit(EXIT_FAILURE);
        }

        // Set the slot matching the label to 1.0 and all others to 0.0
        double* target = one_hot[row];
        for (int c = 0; c < OUTPUT_SIZE; c++) {
            if (c == digit) {
                target[c] = 1.0;
            } else {
                target[c] = 0.0;
            }
        }
        row++;
    }

    fclose(fp);
    return one_hot;
}

// Release everything owned by a host network: both bias vectors, both weight
// matrices (row counts as allocated by createNetwork), then the struct.
void freeNetwork(NeuralNetwork* net) {
    free(net->b2);
    free(net->b1);
    freeMatrix(net->W2, OUTPUT_SIZE);
    freeMatrix(net->W1, HIDDEN_SIZE);
    free(net);
}


// Entry point: load the MNIST training and test sets, train a fresh network,
// evaluate that same network on the test set, then release all host memory.
int main() {
    const int train_count = 60000;
    const int test_count = 10000;

    printf("MNIST Neural Network\n\n");

    double** train_images = loadMNISTImages("/content/train-images.idx3-ubyte", train_count);
    double** train_labels = loadMNISTLabels("/content/train-labels.idx1-ubyte", train_count);
    double** test_images = loadMNISTImages("/content/t10k-images.idx3-ubyte", test_count);
    double** test_labels = loadMNISTLabels("/content/t10k-labels.idx1-ubyte", test_count);

    NeuralNetwork* net = createNetwork();
    train(net, train_images, train_labels, train_count);
    // Evaluate the network that was just trained. Creating a new one here
    // would measure untrained random weights and leak the trained network.
    evaluate(net, test_images, test_labels, test_count);

    freeNetwork(net);
    freeMatrix(train_images, train_count);
    freeMatrix(train_labels, train_count);
    freeMatrix(test_images, test_count);
    freeMatrix(test_labels, test_count);
    return 0;
}


In [None]:
!nvcc -arch=sm_75 v2.cu -o v2
!./v2