-
Notifications
You must be signed in to change notification settings - Fork 2
/
solution.py
213 lines (174 loc) · 5.99 KB
/
solution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# SPDX-License-Identifier: BSD-3-Clause
from math import ceil
from numba import cuda, types
import numpy as np
# The idea here was to read in the input as a sequence of bytes and do all
# processing on the GPU. Unfortunately this didn't work out in the time I had
# available, but I leave my incomplete solution here in case I return to it, or
# in case it is of interest.
with open('example_input') as f:
encoded = f.read().encode('utf-8')
np_batch = np.frombuffer(encoded, dtype=np.uint8)
batch = cuda.to_device(np_batch)
print(batch)
n_bytes = len(batch)
newline_char = ord('\n')
cb = ord('b')
cy = ord('y')
cr = ord('r')
ci = ord('i')
ce = ord('e')
ch = ord('h')
cl = ord('l')
cp = ord('p')
ccolon = ord(':')
@cuda.jit
def validate_passports(batch, newline_counter, newline_locs, answer):
# Thread ID, as usual
i = cuda.grid(1)
# Grid group - used for synchronizing all threads within the grid
grid = cuda.cg.this_grid()
# Find the locations of the blank lines. One thread per byte of input
# (looking at the adjacent byte as well), each checks if it is on a blank
# line. Since we don't know which threads will find blank lines, we
# atomically increment a count of position in the array storing the
# positions of blank lines.
if i < (n_bytes - 1):
if batch[i] == newline_char and batch[i + 1] == newline_char:
nli = cuda.atomic.inc(newline_counter, 0, n_bytes)
newline_locs[nli] = i
# Now we know where all the blank lines are, but not in order.
# Do a grid sync so that every thread can see all the blank line locations.
grid.sync()
# Debugging - was using this to see where the kernel got to
if i == 0:
print(9901)
# Sort the list of blank lines. Commented code is a whole-grid odd-even
# parallel sort I started writing (which works to some extent), but I
# switched to using a single-thread version to make life easier whilst
# developing. See e.g. Chapter 46 from GPU Gems2:
# https://developer.nvidia.com/gpugems/gpugems2/part-vi-simulation-and-numerical-algorithms/chapter-46-improved-gpu-sorting
# my_index = i * 2
# even = False
# newline_count = newline_counter[0]
#
# if i == 0:
# print(9901)
#
# if my_index < (newline_count - 2):
# while True:
# if i == 0:
# print(9902)
# # Initialise the count of changes
# if i == 0:
# answer[0] = 0
# # Wait until every thread can see that the count is 0
# grid.sync()
#
# if i == 0:
# print(9903)
#
# # Determine indices into the array being sorted based on whether
# # we're in an odd or even pass
# ri1 = my_index
# ri2 = my_index + 1
# if even:
# ri1 += 1
# ri2 += 1
#
# # Read in the values for comparison
# p1, p2 = newline_locs[ri1], newline_locs[ri2]
#
# # Swap the values if necessary
# if p1 < p2:
# newline_locs[ri1] = p1
# newline_locs[ri2] = p2
# else:
# newline_locs[ri1] = p2
# newline_locs[ri2] = p1
# # If we swapped something, make a note of that
# cuda.atomic.inc(answer, 0, n_bytes)
#
# if i == 0:
# print(9904)
#
# # Wait until every thread has finished this pass
# grid.sync()
#
# if i == 0:
# print(9905)
#
# if i == 0:
# print(answer[0])
# # If no thread swapped its pair, the list is now in order
# if answer[0] == 0:
# break
#
# # Swap between odd and even passes for the next iteration
# even = not even
# Quick implementation of single-threaded sort. Parallelise later.
if i == 0:
changed = True
while changed:
changed = False
for i in range(newline_counter[0] - 1):
p1, p2 = newline_locs[i], newline_locs[i + 1]
if p2 < p1:
newline_locs[i] = p2
newline_locs[i + 1] = p1
changed = True
print(99021)
#print(990000 + i)
if i == 0:
print(9989)
# Now we've sorted the list, sync the whole grid again to make sure all
# threads can see the sorted list.
grid.sync()
if i == 0:
print(9902)
# Very incomplete for the rest of the kernel - the idea was for one thread
# per line to check each line for the required fields (then parallelise
# further by one thread per byte if I got this working).
if i < newline_counter[0] + 1:
if i == 0:
lb = 0
else:
lb = newline_locs[i - 1]
ub = newline_locs[i]
# Assume no line greater than 128 chars
my_line = cuda.local.array(128, dtype=types.uint8)
have_byr = False
have_iyr = False
have_eyr = False
have_hgt = False
have_hcl = False
have_ecl = False
have_pid = False
for p in range((ub - lb) - 4):
chars = my_line[p:p+4]
if i == 3:
print(chars)
if chars[3] != ccolon:
continue
if chars[0] == cb and chars[1] == cy and chars[2] == cr:
have_byr = True
valid = have_byr
if i == 0:
cuda.atomic.exch(answer, 0, 0)
if i == 0:
print(9904)
grid.sync()
if i == 0:
print(9905)
#if valid:
# cuda.atomic.inc(answer, 0, newline_counter[0])
n_threads = 256
n_blocks = ceil(n_bytes / n_threads)
answer = np.zeros(1, dtype=np.uint32)
newline_counter = np.zeros(1, dtype=np.uint32)
newline_locs = cuda.device_array(n_bytes, dtype=np.uint32)
validate_passports[n_blocks, n_threads](batch, newline_counter, newline_locs,
answer)
cuda.synchronize()
print(newline_locs.copy_to_host())
print(answer)