-
Notifications
You must be signed in to change notification settings - Fork 20
/
matrix_transpose.cu
110 lines (83 loc) · 2.99 KB
/
matrix_transpose.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include <iostream>
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <random>
#include <vector>
#include <chrono>
// Edge length of the square tile handled by one thread block; also used as
// the block's x dimension at launch.
#define TILE_DIM 32
// Block's y dimension at launch. The kernels step through a tile in
// increments of BLOCK_ROWS, so each thread handles TILE_DIM / BLOCK_ROWS rows.
#define BLOCK_ROWS 8
// Straightforward transpose with no shared-memory staging.
//
// idata is read as an n-row by m-column row-major matrix; odata receives the
// m-row by n-column transpose, also row-major.
//
// The indexing assumes a block of TILE_DIM x BLOCK_ROWS threads, with one
// block per TILE_DIM x TILE_DIM tile of the input. Each thread copies up to
// TILE_DIM / BLOCK_ROWS elements from a single input column.
__global__ void transposeNaive(int *odata, const int *idata, const int n, const int m) {
    const int col = blockIdx.x * TILE_DIM + threadIdx.x;
    const int firstRow = blockIdx.y * TILE_DIM + threadIdx.y;

    // Threads whose column lies past the matrix edge have nothing to copy.
    // The kernel has no barrier, so leaving early is safe.
    if (col >= m) {
        return;
    }

    for (int offset = 0; offset < TILE_DIM; offset += BLOCK_ROWS) {
        const int row = firstRow + offset;
        // Rows only increase from here, so once one is past the bottom edge
        // all later ones are too.
        if (row >= n) {
            break;
        }
        odata[col*n + row] = idata[row*m + col];
    }
}
// Tiled transpose that stages each tile through shared memory.
//
// idata is read as an n-row by m-column row-major matrix; odata receives the
// m-row by n-column transpose, also row-major.
//
// Launch layout assumed by the indexing:
//   block = (TILE_DIM, BLOCK_ROWS)
//   grid  = (ceil(m / TILE_DIM), ceil(n / TILE_DIM))
// Each block handles one TILE_DIM x TILE_DIM tile and every thread moves
// TILE_DIM / BLOCK_ROWS elements. Static shared memory used:
// TILE_DIM * (TILE_DIM + 1) ints.
__global__ void transposeSharedMem(int *odata, const int *idata, const int n, const int m) {
    // Each tile row is padded by one element (row stride TILE_DIM + 1). The
    // transposed read below walks the tile column-wise, and the padding is
    // the usual way to keep those accesses from landing in the same bank.
    __shared__ int tile[TILE_DIM * (TILE_DIM+1)];

    // Coordinates in the input matrix: x is the column, y the row.
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        // The grid is rounded up, so edge tiles extend past the matrix when
        // n or m is not a multiple of TILE_DIM. Without this guard the load
        // reads beyond the end of idata.
        if (x < m && (y+j) < n) {
            tile[(threadIdx.y+j)*(TILE_DIM+1) + threadIdx.x] = idata[(y+j)*m + x];
        }
    }

    // Kept outside any branch so every thread of the block reaches it.
    __syncthreads();

    // Coordinates in the output matrix: block indices are swapped so that
    // threadIdx.x still runs along a contiguous output row.
    x = blockIdx.y * TILE_DIM + threadIdx.x;
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS) {
        // Any tile slot read here satisfies the load guard above (input row
        // < n, input column < m), so it has always been written.
        if (x < n && (y+j) < m) {
            odata[(y+j)*n + x] = tile[threadIdx.x*(TILE_DIM+1) + threadIdx.y+j];
        }
    }
}
// Builds a num_rows x num_cols matrix of uniformly distributed random ints.
//
// Parameters:
//   num_rows, num_cols - matrix shape; a non-positive value yields no rows
//                        or empty rows respectively.
//   min_val, max_val   - inclusive value range. If given in the wrong order
//                        they are swapped, because constructing
//                        std::uniform_int_distribution with a > b is
//                        undefined behaviour.
// Returns: a vector of num_rows rows, each holding num_cols values.
//
// The generator is a function-local static seeded once from
// std::random_device, so successive calls continue the same random stream.
std::vector<std::vector<int>> random_matrix(const int num_rows, const int num_cols, const int min_val=0, const int max_val=1000) {
    static std::random_device rd;
    static std::mt19937 mte(rd());

    const int lo = (min_val <= max_val) ? min_val : max_val;
    const int hi = (min_val <= max_val) ? max_val : min_val;
    std::uniform_int_distribution<int> dist(lo, hi);

    std::vector<std::vector<int>> my_arr;
    // Reserve only for positive counts: a negative int converted to size_t
    // would request an enormous capacity and throw.
    if (num_rows > 0) {
        my_arr.reserve(static_cast<size_t>(num_rows));
    }
    for (int i = 0; i < num_rows; i++) {
        // Fill the row in place so the finished row is not copied again.
        my_arr.emplace_back();
        std::vector<int> &row = my_arr.back();
        if (num_cols > 0) {
            row.reserve(static_cast<size_t>(num_cols));
        }
        for (int j = 0; j < num_cols; j++) {
            row.push_back(dist(mte));
        }
    }
    return my_arr;
}
// Host-side check that odata is the transpose of idata.
//
// Parameters:
//   odata - candidate transpose, m rows by n columns, row-major.
//   idata - source matrix, n rows by m columns, row-major.
//   n, m  - row and column count of idata.
// Returns: true when odata[c*n + r] == idata[r*m + c] for every element,
//          false at the first mismatch. Non-positive dimensions describe an
//          empty matrix and are reported as correct without touching either
//          buffer.
bool check_correctness(int *odata, const int *idata, const int n, const int m) {
    if (n <= 0 || m <= 0) {
        return true;
    }

    // Index in size_t so that n*m beyond INT_MAX does not overflow.
    const size_t rows = static_cast<size_t>(n);
    const size_t cols = static_cast<size_t>(m);
    for (size_t r = 0; r < rows; ++r) {
        for (size_t c = 0; c < cols; ++c) {
            if (odata[c*rows + r] != idata[r*cols + c]) {
                return false;
            }
        }
    }
    return true;
}
// Prints a diagnostic and terminates the program when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing step in the message.
static void checkCuda(const cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Fills an n x m matrix with random values, transposes it on the GPU with
// transposeSharedMem, then prints the elapsed wall-clock milliseconds
// followed by the verification result (1 = correct, 0 = mismatch).
// Exits with a non-zero status if a CUDA call fails or verification fails.
int main(void) {
    const int n = 2000;  // input rows
    const int m = 5000;  // input columns
    // Widen before multiplying so the byte count is not computed in int.
    const size_t bytes = static_cast<size_t>(n) * static_cast<size_t>(m) * sizeof(int);

    // One block per TILE_DIM x TILE_DIM tile, rounded up to cover the edges.
    dim3 dimGrid((m + TILE_DIM - 1)/TILE_DIM, (n + TILE_DIM - 1)/TILE_DIM, 1);
    dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);

    int *idata = nullptr;
    int *odata = nullptr;
    checkCuda(cudaMallocManaged(&idata, bytes), "cudaMallocManaged(idata)");
    checkCuda(cudaMallocManaged(&odata, bytes), "cudaMallocManaged(odata)");

    std::vector<std::vector<int>> my_arr = random_matrix(n, m, 0, 100);
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++) {
            idata[m*i + j] = my_arr[i][j];
        }
    }

    // NOTE(review): the buffers are managed memory last written by the host,
    // so this interval presumably also includes moving the data to the
    // device, not just the kernel run time. Confirm before using the number
    // as a kernel benchmark.
    auto t1 = std::chrono::high_resolution_clock::now();
    transposeSharedMem<<<dimGrid, dimBlock>>>(odata, idata, n, m);
    // A launch does not return a status; configuration errors are reported
    // by cudaGetLastError and execution faults by the next synchronisation.
    const cudaError_t launchStatus = cudaGetLastError();
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    auto t2 = std::chrono::high_resolution_clock::now();
    checkCuda(launchStatus, "transposeSharedMem launch");
    checkCuda(syncStatus, "cudaDeviceSynchronize");

    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count();
    std::cout << duration << std::endl;

    const bool ok = check_correctness(odata, idata, n, m);
    std::cout << ok << std::endl;

    checkCuda(cudaFree(idata), "cudaFree(idata)");
    checkCuda(cudaFree(odata), "cudaFree(odata)");

    // Report a failed verification through the exit status as well.
    return ok ? 0 : EXIT_FAILURE;
}