-
Notifications
You must be signed in to change notification settings - Fork 0
/
mandel.cu
111 lines (100 loc) · 3.25 KB
/
mandel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
//http://nothings.org/stb/stb_image_write.h
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
#include "stdio.h"
#include <vector>
int __host__ __device__ mandel(double p, double q, double maxv, int maxk)
{
double x0 = p;
double y0 = q;
double r;
int k = 0;
do
{
double x = x0 * x0 - y0 * y0 + p;
double y = 2 * x0 * y0 + q;
x0 = x;
y0 = y;
r = x * x + y * y;
k++;
}
while (r <= maxv && k < maxk);
return k;
}
int iceil(int a, int b)
{
return (a+b-1)/b;
}
void __global__ kernel(unsigned char * out, int w,int h, double pmin, double pmax, double qmin, double qmax, double maxv, int maxk)
{
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
int index = j + i * w;
if(i < h && j < w)
{
int k = mandel(i*((pmax-pmin)/h)+pmin,j*((qmax-qmin)/w)+qmin, maxv, maxk);
//i*w+j
out[index] = k >= maxk ? 0 : k;
}
}
int main(int argc, char const *argv[])
{
double pmin = -2.25, pmax = 0.75, qmin = -1.5, qmax = 1.5;
double maxv = 100;
int maxk = 128;
int w = 640;
int h = 480;
cudaSetDevice(0);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
deviceProp.maxThreadsDim[0],
deviceProp.maxThreadsDim[1],
deviceProp.maxThreadsDim[2]);
printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
deviceProp.maxGridSize[0],
deviceProp.maxGridSize[1],
deviceProp.maxGridSize[2]);
//double tp = omp_get_wtime();
std::vector<unsigned char> image(w*h);
unsigned char * dev_i;
//std::vector<unsigned char> image2(w*h);
//double t0 = omp_get_wtime();
for(int i = 0; i < h; i++)
for(int j = 0; j < w; j++)
{
int k = mandel(i*((pmax-pmin)/h)+pmin,j*((qmax-qmin)/w)+qmin, maxv, maxk);
image[i*w+j] = k >= maxk ? 0 : k;
//image2[i*w+j] = omp_get_thread_num()*256/n;
}
stbi_write_png("out.png",w,h,1,&image[0],w);
// GPU
// transfer
cudaMalloc(&dev_i,w*h);
cudaMemset(dev_i,128,w*h);
// malloc
unsigned int kernelTime;
//cutCreateTimer(&kernelTime);
//cutResetTimer(kernelTime);
//cutStartTim,1);
dim3 dimBlock(32, 32,1);
dim3 dimGrid(iceil(w,dimBlock.x),iceil(h,dimBlock.y));
printf("Blocks %d %d %d Threads %d %d %d\n",dimGrid.x,dimGrid.y,dimGrid.z,dimBlock.x,dimBlock.y,dimBlock.z);
kernel<<<dimGrid,dimBlock>>>(dev_i,w,h,pmin,pmax,qmin,qmax,maxv,maxk);
{
cudaError_t cudaerr = cudaPeekAtLastError();
if (cudaerr != 0)
printf("kernel launch failed with error \"%s\".\n",
cudaGetErrorString(cudaerr));
}
cudaMemcpy(&image[0],dev_i,w*h,cudaMemcpyDeviceToHost);
//cutStopTimer(kernelTime);
cudaFree(dev_i);
//printf ("Time for the kernel: %f ms\n", cutGetTimerValue(kernelTime));
stbi_write_png("outGPU.png",w,h,1,&image[0],w);
/// double t1 = omp_get_wtime();
//printf("Output is %f and warmup %f\n",t1-t0,t0-tp);
return 0;
}