-
Notifications
You must be signed in to change notification settings - Fork 2
/
solution.py
134 lines (102 loc) · 4.13 KB
/
solution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# SPDX-License-Identifier: BSD-3-Clause
# The pure Python version rewritten to use CUDA with Numba. Various
# modifications are needed to implement Python functions / features not present
# in the CUDA - these are noted in the comments below.
from numba import cuda
from math import ceil
import numpy as np
# The example and real inputs have different preamble lengths
USE_EXAMPLE = False
if USE_EXAMPLE:
filename = 'example_input'
preamble_length = 5
else:
filename = 'input'
preamble_length = 25
# Read in input
with open(filename) as f:
# lists have very little support in the CUDA target - use a NumPy array for
# the input instead
seq = np.asarray([int(x.strip()) for x in f.readlines()], dtype=np.uint32)
# Part 1
# The kernel for part 1 parallelises the search for the impostor by assigning
# each thread its own window to check.
@cuda.jit
def find_impostor(seq, answer):
# Thread ID
i = cuda.grid(1)
# Compute the window for this thread
lower = i
upper = lower + preamble_length
# If the window is outside the input, then don't do anything. This "early
# return" replaces the while loop in the sequential version
if upper >= len(seq):
return
# The potential impostor's value
current = seq[upper]
found = False
# Itertools is not supported in CUDA, so we implement the product operation
# with nested loops and index into the sequence using the indices we
# generate.
for xi in range(lower, upper):
for yi in range(lower, upper):
x = seq[xi]
y = seq[yi]
# Similar checks as found in the sequential version
if x == y:
continue
if x + y == current:
# No need to increment any indices here, because other threads
# handle other indices
found = True
if not found:
# We're looking for the first number that's not a sum of any pair in its
# preamble - as there may be multiple numbers like this, we use atomic
# min to ensure we give back the lowest index of an impostor
cuda.atomic.min(answer, 0, upper)
# Arbitrarily-chosen block size, and a grid large enough to have one thread per
# window.
n_threads = 256
n_blocks = ceil(len(seq) / n_threads)
# Instead of returning the first impostor value, we instead return its index -
# this is because we can't use list.index (or np.where) later on in part 2 to
# get the index.
impostor_idx = np.asarray([len(seq) + 1], dtype=np.uint32)
# Launch the kernel
find_impostor[n_blocks, n_threads](seq, impostor_idx)
# Look up the actual value of the solution based on the index found by the
# kernel.
impostor = seq[impostor_idx[0]]
print(f"The answer to part 1 is {impostor}")
# Part 2
# The kernel for part 2 parallelises using one thread per starting index.
@cuda.jit
def find_contiguous(impostor_idx, seq, answer):
i = cuda.grid(1)
# Ensure we're not working on elements beyond the index of the impostor
if i >= impostor_idx:
return
for j in range(impostor_idx - i):
# We can't use sum, min, and max on arrays in the CUDA target, so we
# implement them ourselves using a loop and an accumulator for each
candidate_sum = 0
# Starting minimum value chosen to be large enough that the initial
# minimum value we find will be smaller than this
candidate_min = 999999999999999
candidate_max = 0
for k in range(i, i+j):
current = seq[k]
candidate_sum += current
candidate_min = min(candidate_min, current)
candidate_max = max(candidate_max, current)
if candidate_sum == impostor:
# No atomic operation here as there's only one solution to be found
answer[0] = candidate_min + candidate_max
# Allocate space for result
answer = np.zeros(1, dtype=np.uint32)
# Make the grid large enough for one thread per starting point, up to the
# location of the impostor
n_blocks = ceil(impostor_idx / n_threads)
# Kernel launch
find_contiguous[n_blocks, n_threads](impostor_idx[0], seq, answer)
print(f"The answer to part 2 is {answer[0]}")