-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Second tuning for thomaswue #263
Changes from 4 commits
afc80e1
1067e20
636c3e9
e97937f
b3b8851
7d12c49
5598cfb
9fd6367
bc5db32
45dda82
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,28 +30,28 @@ | |
|
||
/** | ||
* Simple solution that memory maps the input file, then splits it into one segment per available core and uses | ||
* sun.misc.Unsafe to directly access the mapped memory. | ||
* | ||
* Runs in 0.92s on my Intel i9-13900K | ||
* sun.misc.Unsafe to directly access the mapped memory. Uses a long at a time when checking for collision. | ||
* <p> | ||
* Runs in 0.80s on my Intel i9-13900K | ||
* Perf stats: | ||
* 65,004,666,383 cpu_core/cycles/ | ||
* 71,141,249,972 cpu_atom/cycles/ | ||
* 53,344,113,388 cpu_core/cycles/ | ||
* 66,193,198,848 cpu_atom/cycles/ | ||
*/ | ||
public class CalculateAverage_thomaswue { | ||
private static final String FILE = "./measurements.txt"; | ||
|
||
// Holding the current result for a single city. | ||
private static class Result { | ||
short min; | ||
short max; | ||
int min; | ||
int max; | ||
long sum; | ||
int count; | ||
final long nameAddress; | ||
|
||
private Result(long nameAddress, int value) { | ||
this.nameAddress = nameAddress; | ||
this.min = (short) value; | ||
this.max = (short) value; | ||
this.min = value; | ||
this.max = value; | ||
this.sum = value; | ||
this.count = 1; | ||
} | ||
|
@@ -66,8 +66,8 @@ private static double round(double value) { | |
|
||
// Accumulate another result into this one. | ||
private void add(Result other) { | ||
min = (short) Math.min(min, other.min); | ||
max = (short) Math.max(max, other.max); | ||
min = Math.min(min, other.min); | ||
max = Math.max(max, other.max); | ||
sum += other.sum; | ||
count += other.count; | ||
} | ||
|
@@ -81,8 +81,7 @@ public static void main(String[] args) throws IOException { | |
// Parallel processing of segments. | ||
List<HashMap<String, Result>> allResults = IntStream.range(0, chunks.length - 1).mapToObj(chunkIndex -> { | ||
HashMap<String, Result> cities = HashMap.newHashMap(1 << 10); | ||
Result[] results = new Result[1 << 18]; | ||
parseLoop(chunks[chunkIndex], chunks[chunkIndex + 1], results, cities); | ||
parseLoop(chunks[chunkIndex], chunks[chunkIndex + 1], cities); | ||
return cities; | ||
}).parallel().toList(); | ||
|
||
|
@@ -114,44 +113,44 @@ private static Unsafe initUnsafe() { | |
} | ||
} | ||
|
||
private static void parseLoop(long chunkStart, long chunkEnd, Result[] results, HashMap<String, Result> cities) { | ||
private static void parseLoop(long chunkStart, long chunkEnd, HashMap<String, Result> cities) { | ||
Result[] results = new Result[1 << 18]; | ||
long scanPtr = chunkStart; | ||
byte b; | ||
while (scanPtr < chunkEnd) { | ||
long nameAddress = scanPtr; | ||
int hash = 0; | ||
|
||
// Skip first letter. | ||
scanPtr++; | ||
|
||
// Scan for ';' delimiter, always 4 bytes at a time. | ||
while (true) { | ||
int nextVal = UNSAFE.getInt(scanPtr); | ||
if ((nextVal & 0x3B) == 0x3B) { | ||
scanPtr++; | ||
break; | ||
} | ||
else if ((nextVal & 0x3B00) == 0x3B00) { | ||
scanPtr += 2; | ||
hash = hash ^ (nextVal & 0xFF); | ||
break; | ||
} | ||
else if ((nextVal & 0x3B0000) == 0x3B0000) { | ||
scanPtr += 3; | ||
hash = hash ^ (nextVal & 0xFFFF); | ||
break; | ||
} | ||
else if (((nextVal & 0x3B000000) == 0x3B000000)) { | ||
scanPtr += 4; | ||
hash = hash ^ (nextVal & 0xFFFFFF); | ||
break; | ||
long hash = 0; | ||
|
||
// Search for ';', one long at a time. | ||
long word = UNSAFE.getLong(scanPtr); | ||
int pos = findDelimiter(word); | ||
if (pos != 8) { | ||
scanPtr += pos; | ||
word = word & (-1L >>> ((8 - pos - 1) << 3)); | ||
hash ^= word; | ||
} | ||
else { | ||
scanPtr += 8; | ||
hash ^= word; | ||
while (true) { | ||
word = UNSAFE.getLong(scanPtr); | ||
pos = findDelimiter(word); | ||
if (pos != 8) { | ||
scanPtr += pos; | ||
word = word & (-1L >>> ((8 - pos - 1) << 3)); | ||
hash ^= word; | ||
break; | ||
} | ||
else { | ||
scanPtr += 8; | ||
hash ^= word; | ||
} | ||
} | ||
scanPtr += 4; | ||
hash = hash ^ nextVal; | ||
} | ||
|
||
// Save length of name for later. | ||
int nameLength = (int) (scanPtr - nameAddress - 1); | ||
int nameLength = (int) (scanPtr - nameAddress); | ||
scanPtr++; | ||
|
||
// Parse number. | ||
int number; | ||
|
@@ -175,42 +174,25 @@ else if (((nextVal & 0x3B000000) == 0x3B000000)) { | |
} | ||
|
||
// Final calculation for index into hash table. | ||
int tableIndex = (((hash ^ (hash >>> 18)) & (results.length - 1))); | ||
while (true) { | ||
int hashAsInt = (int) (hash ^ (hash >>> 32)); | ||
int finalHash = (hashAsInt ^ (hashAsInt >>> 18)); | ||
int tableIndex = (finalHash & (results.length - 1)); | ||
outer: while (true) { | ||
Result existingResult = results[tableIndex]; | ||
if (existingResult == null) { | ||
newEntry(results, cities, nameAddress, number, tableIndex, nameLength); | ||
break; | ||
} | ||
else { | ||
// Check for collision. | ||
boolean result = true; | ||
int i = 0; | ||
if ((long) nameLength >= 8) { | ||
if (UNSAFE.getLong(existingResult.nameAddress) != UNSAFE.getLong(nameAddress)) { | ||
result = false; | ||
} | ||
else { | ||
i += 8; | ||
for (; i < nameLength - 8; i += 8) { | ||
if (UNSAFE.getLong(existingResult.nameAddress + i) != UNSAFE.getLong(nameAddress + i)) { | ||
tableIndex = (tableIndex + 1) & (results.length - 1); | ||
continue outer; | ||
} | ||
} | ||
else if ((long) nameLength >= 4) { | ||
if (UNSAFE.getInt(existingResult.nameAddress) != UNSAFE.getInt(nameAddress)) { | ||
result = false; | ||
} | ||
else { | ||
i += 4; | ||
} | ||
} | ||
if (result) { | ||
for (; i < (long) nameLength; ++i) { | ||
if (UNSAFE.getByte(existingResult.nameAddress + i) != UNSAFE.getByte(nameAddress + i)) { | ||
result = false; | ||
break; | ||
} | ||
} | ||
} | ||
if (result) { | ||
if (((UNSAFE.getLong(existingResult.nameAddress + i) ^ UNSAFE.getLong(nameAddress + i)) << (64 - (nameLength - i) << 3)) == 0) { | ||
existingResult.min = (short) Math.min(existingResult.min, number); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like these cast would now be unnecessary like in lines 69-70 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, agreed, thanks for spotting. Left over from last version and now this might actually add an additional instruction! |
||
existingResult.max = (short) Math.max(existingResult.max, number); | ||
existingResult.sum += number; | ||
|
@@ -229,12 +211,20 @@ else if ((long) nameLength >= 4) { | |
} | ||
} | ||
|
||
private static int findDelimiter(long word) { | ||
long input = word ^ 0x3B3B3B3B3B3B3B3BL; | ||
long tmp = (input & 0x7F7F7F7F7F7F7F7FL) + 0x7F7F7F7F7F7F7F7FL; | ||
tmp = ~(tmp | input | 0x7F7F7F7F7F7F7F7FL); | ||
return Long.numberOfTrailingZeros(tmp) >>> 3; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can be micro-optimized as follows: private static int findDelimiter(long word) {
long input = word ^ 0x3B3B3B3B3B3B3B3BL;
long tmp = (input - 0x0101010101010101L) & ~input & 0x8080808080808080L;
return Long.numberOfTrailingZeros(tmp) >>> 3;
} Which saves 1 instruction. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, indeed, thanks. This got another almost 2%. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @gunnarmorling This is already the second contribution from @mukel, I would like to ask if you could add him as a co-author to the submission as well in case you will re-evaluate. |
||
|
||
private static void newEntry(Result[] results, HashMap<String, Result> cities, long nameAddress, int number, int hash, int nameLength) { | ||
Result r = new Result(nameAddress, number); | ||
results[hash] = r; | ||
byte[] bytes = new byte[nameLength]; | ||
UNSAFE.copyMemory(null, nameAddress, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, nameLength); | ||
cities.put(new String(bytes, StandardCharsets.UTF_8), r); | ||
String nameAsString = new String(bytes, StandardCharsets.UTF_8); | ||
cities.put(nameAsString, r); | ||
} | ||
|
||
private static long[] getSegments(int numberOfChunks) throws IOException { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This read can go over the end of the file. For example, a file contains the last entry "X;0.0\n". The read goes over 2 bytes at the end of the file. It will touch the next page, if the file has size equal to page size. If the next page is protected, that is seg fault. But I can't reproduce it locally. I always have ~136K memory after the end of the mapped region which I can access witout getting a seg fault. Maybe you have an explaination?
A workaround is to simply copy the tail with padding (equal has the same flaw and we want it read words).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, didn't think about that. It is very unlikely to exactly have a page boundary there, but will have to add the workaround for that last row.