In [32]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
%cd /content/drive/MyDrive/

/content/drive/MyDrive


In [None]:
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update

In [None]:
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2

In [36]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Wed_Apr_11_23:16:29_CDT_2018
Cuda compilation tools, release 9.2, V9.2.88


In [37]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-ojtc_pe7
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-ojtc_pe7
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=0743e11067589b0bed10c087ffd9106663e6c09f3b9682aeb7a1d0522aa649f2
  Stored in directory: /tmp/pip-ephem-wheel-cache-cy2vr6uz/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin


In [38]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [141]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include "chrono"

void matDist (float * a, float * b, int n, int BLOCK_SIZE, float * c) {
    float sum;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            sum = 0.0f;
            for (int k = 0; k < n; k++) {
                sum += abs(a[i * n + j] - b[j * n + i]);
            }
            c[i * n + j] = sum;
        }
    }
}

__global__ void matDistGlobal ( float * a, float * b, int n, int BLOCK_SIZE, float * c ) {
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    float sum = 0.0f;
    int ia = n * BLOCK_SIZE * by + n * ty;
    int ib = BLOCK_SIZE * bx + tx;
    int ic = ia + ib;
    for (int k = 0; k < n; k++) {
        sum += abs(a[ia + k] - b[ib + k*n]);
    }
    c[ic] = sum;
}

__global__ void matDistShared (float * a, float * b, int n, int BLOCK_SIZE, float * c) {
    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    int aBegin = n * 16 * by;
    int aEnd = aBegin + n - 1;
    int bBegin = 16 * bx;
    int aStep = 16, bStep = 16 * n;
    float sum = 0.0f;
    __shared__ float as [16][17];
    __shared__ float bs [16][17];
    for ( int ia = aBegin, ib = bBegin; ia <= aEnd; ia += aStep, ib += bStep) {
        as [ty][tx] = a [ia + n * ty + tx];
        bs [ty][tx] = b [ib + n * ty + tx];
        __syncthreads ();
        for ( int k = 0; k < BLOCK_SIZE; k++) {
            sum += abs(as[k][ty] - bs [tx][k]);
        }
        __syncthreads ();
    }
    c[aBegin + bBegin + n * ty + tx] = sum;
}

int main() {
    int BLOCK_SIZE = 16; int N = 1024;

    int numBytes = N * N * sizeof(float);
    float * h_A = (float*)malloc(numBytes);;
    float * h_B = (float*)malloc(numBytes);;
    float * h_C = (float*)malloc(numBytes);;
    
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            h_A[i *N + j] = rand();
            h_B[j * N + i] = h_A[i * N + j];
        }
    }

    float * d_A; float * d_B; float * d_C;
    
    cudaMalloc((void**)&d_A, numBytes);
    cudaMalloc((void**)&d_B, numBytes);
    cudaMalloc((void**)&d_C, numBytes);
    
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 blocks(N / BLOCK_SIZE, N / BLOCK_SIZE);
    
    cudaMemcpy(d_A, h_A, numBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, numBytes, cudaMemcpyHostToDevice);

    auto start = std::chrono::high_resolution_clock::now();

    matDist(h_A, h_B, N, BLOCK_SIZE, h_C);

    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);

    printf("Serial: %i\n\n", duration.count());

    auto startG = std::chrono::high_resolution_clock::now();

    matDistGlobal<<<blocks,threads>>> (d_A, d_B, N, BLOCK_SIZE, d_C);

    auto stopG = std::chrono::high_resolution_clock::now();
    auto durationG = std::chrono::duration_cast<std::chrono::microseconds>(stopG - startG);

    cudaMemcpy(h_C, d_C, numBytes, cudaMemcpyDeviceToHost);

    printf("Global: %i\n\n", durationG.count());

    auto startS = std::chrono::high_resolution_clock::now();

    matDistShared<<<blocks,threads>>> (d_A, d_B, N, BLOCK_SIZE, d_C);

    auto stopS = std::chrono::high_resolution_clock::now();
    auto durationS = std::chrono::duration_cast<std::chrono::microseconds>(stopS - startS);

    cudaMemcpy(h_C, d_C, numBytes, cudaMemcpyDeviceToHost);

    printf("Shared: %i\n\n", durationS.count());

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
return 0;
}

Serial: 4883807

Global: 156

Shared: 26




In [142]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include "chrono"

void matDist (float * a, float * b, int n, int BLOCK_SIZE, float * c) {
    float sum;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            sum = 0.0f;
            for (int k = 0; k < n; k++) {
                sum += abs(a[i * n + j] - b[j * n + i]);
            }
            c[i * n + j] = sum;
        }
    }
}

__global__ void matDistGlobal ( float * a, float * b, int n, int BLOCK_SIZE, float * c ) {
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    float sum = 0.0f;
    int ia = n * BLOCK_SIZE * by + n * ty;
    int ib = BLOCK_SIZE * bx + tx;
    int ic = ia + ib;
    for (int k = 0; k < n; k++) {
        sum += abs(a[ia + k] - b[ib + k*n]);
    }
    c[ic] = sum;
}

__global__ void matDistShared (float * a, float * b, int n, int BLOCK_SIZE, float * c) {
    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    int aBegin = n * 16 * by;
    int aEnd = aBegin + n - 1;
    int bBegin = 16 * bx;
    int aStep = 16, bStep = 16 * n;
    float sum = 0.0f;
    __shared__ float as [16][17];
    __shared__ float bs [16][17];
    for ( int ia = aBegin, ib = bBegin; ia <= aEnd; ia += aStep, ib += bStep) {
        as [ty][tx] = a [ia + n * ty + tx];
        bs [ty][tx] = b [ib + n * ty + tx];
        __syncthreads ();
        for ( int k = 0; k < BLOCK_SIZE; k++) {
            sum += abs(as[k][ty] - bs [tx][k]);
        }
        __syncthreads ();
    }
    c[aBegin + bBegin + n * ty + tx] = sum;
}

int main() {
    int BLOCK_SIZE = 16; int N = 2048;

    int numBytes = N * N * sizeof(float);
    float * h_A = (float*)malloc(numBytes);;
    float * h_B = (float*)malloc(numBytes);;
    float * h_C = (float*)malloc(numBytes);;
    
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            h_A[i *N + j] = rand();
            h_B[j * N + i] = h_A[i * N + j];
        }
    }

    float * d_A; float * d_B; float * d_C;
    
    cudaMalloc((void**)&d_A, numBytes);
    cudaMalloc((void**)&d_B, numBytes);
    cudaMalloc((void**)&d_C, numBytes);
    
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 blocks(N / BLOCK_SIZE, N / BLOCK_SIZE);
    
    cudaMemcpy(d_A, h_A, numBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, numBytes, cudaMemcpyHostToDevice);

    auto start = std::chrono::high_resolution_clock::now();

    matDist(h_A, h_B, N, BLOCK_SIZE, h_C);

    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);

    printf("Serial: %i\n\n", duration.count());

    auto startG = std::chrono::high_resolution_clock::now();

    matDistGlobal<<<blocks,threads>>> (d_A, d_B, N, BLOCK_SIZE, d_C);

    auto stopG = std::chrono::high_resolution_clock::now();
    auto durationG = std::chrono::duration_cast<std::chrono::microseconds>(stopG - startG);

    cudaMemcpy(h_C, d_C, numBytes, cudaMemcpyDeviceToHost);

    printf("Global: %i\n\n", durationG.count());

    auto startS = std::chrono::high_resolution_clock::now();

    matDistShared<<<blocks,threads>>> (d_A, d_B, N, BLOCK_SIZE, d_C);

    auto stopS = std::chrono::high_resolution_clock::now();
    auto durationS = std::chrono::duration_cast<std::chrono::microseconds>(stopS - startS);

    cudaMemcpy(h_C, d_C, numBytes, cudaMemcpyDeviceToHost);

    printf("Shared: %i\n\n", durationS.count());

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
return 0;
}

Serial: 38978654

Global: 235

Shared: 29




In [140]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include "chrono"

void matDist (float * a, float * b, int n, int BLOCK_SIZE, float * c) {
    float sum;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            sum = 0.0f;
            for (int k = 0; k < n; k++) {
                sum += abs(a[i * n + j] - b[j * n + i]);
            }
            c[i * n + j] = sum;
        }
    }
}

__global__ void matDistGlobal ( float * a, float * b, int n, int BLOCK_SIZE, float * c ) {
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    float sum = 0.0f;
    int ia = n * BLOCK_SIZE * by + n * ty;
    int ib = BLOCK_SIZE * bx + tx;
    int ic = ia + ib;
    for (int k = 0; k < n; k++) {
        sum += abs(a[ia + k] - b[ib + k*n]);
    }
    c[ic] = sum;
}

__global__ void matDistShared (float * a, float * b, int n, int BLOCK_SIZE, float * c) {
    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    int aBegin = n * 16 * by;
    int aEnd = aBegin + n - 1;
    int bBegin = 16 * bx;
    int aStep = 16, bStep = 16 * n;
    float sum = 0.0f;
    __shared__ float as [16][17];
    __shared__ float bs [16][17];
    for ( int ia = aBegin, ib = bBegin; ia <= aEnd; ia += aStep, ib += bStep) {
        as [ty][tx] = a [ia + n * ty + tx];
        bs [ty][tx] = b [ib + n * ty + tx];
        __syncthreads ();
        for ( int k = 0; k < BLOCK_SIZE; k++) {
            sum += abs(as[k][ty] - bs [tx][k]);
        }
        __syncthreads ();
    }
    c[aBegin + bBegin + n * ty + tx] = sum;
}

int main() {
    int BLOCK_SIZE = 16; int N = 4096;

    int numBytes = N * N * sizeof(float);
    float * h_A = (float*)malloc(numBytes);;
    float * h_B = (float*)malloc(numBytes);;
    float * h_C = (float*)malloc(numBytes);;
    
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            h_A[i *N + j] = rand();
            h_B[j * N + i] = h_A[i * N + j];
        }
    }

    float * d_A; float * d_B; float * d_C;
    
    cudaMalloc((void**)&d_A, numBytes);
    cudaMalloc((void**)&d_B, numBytes);
    cudaMalloc((void**)&d_C, numBytes);
    
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 blocks(N / BLOCK_SIZE, N / BLOCK_SIZE);
    
    cudaMemcpy(d_A, h_A, numBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, numBytes, cudaMemcpyHostToDevice);

    auto start = std::chrono::high_resolution_clock::now();

    matDist(h_A, h_B, N, BLOCK_SIZE, h_C);

    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);

    printf("Serial: %i\n\n", duration.count());

    auto startG = std::chrono::high_resolution_clock::now();

    matDistGlobal<<<blocks,threads>>> (d_A, d_B, N, BLOCK_SIZE, d_C);

    auto stopG = std::chrono::high_resolution_clock::now();
    auto durationG = std::chrono::duration_cast<std::chrono::microseconds>(stopG - startG);

    cudaMemcpy(h_C, d_C, numBytes, cudaMemcpyDeviceToHost);

    printf("Global: %i\n\n", durationG.count());

    auto startS = std::chrono::high_resolution_clock::now();

    matDistShared<<<blocks,threads>>> (d_A, d_B, N, BLOCK_SIZE, d_C);

    auto stopS = std::chrono::high_resolution_clock::now();
    auto durationS = std::chrono::duration_cast<std::chrono::microseconds>(stopS - startS);

    cudaMemcpy(h_C, d_C, numBytes, cudaMemcpyDeviceToHost);

    printf("Shared: %i\n\n", durationS.count());

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
return 0;
}

Serial: 314069477

Global: 933

Shared: 23


