Skip to content

Commit

Permalink
deduplicate "eclipse" #1743
Browse files Browse the repository at this point in the history
CharDeduplication was not designed to deduplicate tokens of length 7 or
more, which could lead to high memory consumption. With this change,
tokens of all sizes can be deduplicated.

#1743

A benchmark implemented in CharDeduplicationTest.main(String[]) shows
that the new deduplication runs at a similar speed (0.21s instead of
0.16s) but deduplicates many more tokens (99% instead of 36%).
  • Loading branch information
EcljpseB0T authored and jukzi committed Dec 14, 2023
1 parent dcafabb commit 86aa2dd
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 198 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,17 @@ public class CharDeduplication {

// ----- immutable static part (thread safe): ----

private final static char[] CHAR_ARRAY0 = new char[0];
static final char[] ASCII_CHARS[] = new char[128][];
static {
for (int i = 0; i < ASCII_CHARS.length; i++) {
ASCII_CHARS[i] = new char[] { (char) i };
}
}
public static final int TABLE_SIZE = 30; // XXX thats not a prime -> bad for hashing, nor a power of 2 -> expensive
// modulo computation
public static final int INTERNAL_TABLE_SIZE = 6; // 30*6 =180 entries

public static final int OPTIMIZED_LENGTH = 6;

private final static char[] CHAR_ARRAY0 = new char[0];
/**
 * Size of the hash table. Does not affect performance (lookups are hashed) but does affect memory.
 * Must be a power of 2 because slot indices are computed with the bit mask {@code TABLE_SIZE - 1}.
 */
public static final int TABLE_SIZE = 8192; // a power of 2 to fast compute modulo
/**
 * Number of consecutive table entries that are searched linearly per lookup. A larger value costs
 * lookup time but decreases collisions; it does not affect memory.
 */
public static final int SEARCH_SIZE = 8; // a power of 2, has to be smaller than TABLE_SIZE

/** avoid OOME by additional CharDeduplication memory **/
static final class CacheReference<T> {
Expand All @@ -59,56 +57,32 @@ T get() {

/**
 * Per-thread holder of the mutable cache: the initial value for each thread is a new
 * CacheReference constructed with the CharDeduplication constructor reference.
 */
private final static ThreadLocal<CacheReference<CharDeduplication>> mutableCache = ThreadLocal.withInitial(()->new CacheReference<>(CharDeduplication::new));

/**
 * Returns a one-element array holding {@code source[startPosition]}.
 * Chars below {@code ASCII_CHARS.length} are served from the preallocated table
 * (optimization at no speed cost of 99.5 % of the singleCharIdentifier);
 * any other char gets a freshly allocated array.
 */
private static final char[] optimizedCurrentTokenSource1(char[] source, int startPosition) {
    final char single = source[startPosition];
    return single < ASCII_CHARS.length ? ASCII_CHARS[single] : new char[] { single };
}

/**
 * Looks up the calling thread's cache holder and returns the instance it provides.
 *
 * @return an instance that is *not* thread safe. To be used in a single thread only.
 **/
public static CharDeduplication getThreadLocalInstance() {
    final CacheReference<CharDeduplication> holder = mutableCache.get();
    return holder.get();
}

// ----- mutable non-static part (not thread safe!): ----

/** single threaded only **/
public final char[][][][] charArray_length = new char[OPTIMIZED_LENGTH - 1][TABLE_SIZE][INTERNAL_TABLE_SIZE][];

int newEntry2 = 0;
int newEntry3 = 0;
int newEntry4 = 0;
int newEntry5 = 0;
int newEntry6 = 0;
/** single threaded only, hashtable with restricted linear probing **/
private final char[][] hashTable = new char[TABLE_SIZE][];
/**
 * One counter per table start position. When a lookup finds neither a match nor an empty slot,
 * the counter's current value (masked with SEARCH_SIZE - 1) selects the offset of the entry
 * to overwrite, and the counter is then incremented.
 */
private final int circularBufferPointer[] = new int[TABLE_SIZE];

private CharDeduplication() {
init();
}

private void init() {
for (int i = 0; i < OPTIMIZED_LENGTH - 1; i++) {
final char[] initCharArray = new char[i + 2];
for (int j = 0; j < TABLE_SIZE; j++) {
for (int k = 0; k < INTERNAL_TABLE_SIZE; k++) {
this.charArray_length[i][j][k] = initCharArray;
}
}
}
// private
}

/** public for test purpose only **/
@Deprecated
public void reset() {
init();
Arrays.fill(this.hashTable, null);
Arrays.fill(this.circularBufferPointer, 0);
}

/**
* like Arrays.copyOfRange(source, from, to) but returns a cached instance of the former result if
* available
*
*
* @param from
* start index (inclusive)
* @param to
Expand All @@ -118,167 +92,60 @@ public void reset() {
**/
public char[] sharedCopyOfRange(char[] source, int from, int to) {
int length = to - from;
switch (length) { // see OptimizedLength
switch (length) {
case 1:
return optimizedCurrentTokenSource1(source, from);
case 2:
return optimizedCurrentTokenSource2(source, from);
case 3:
return optimizedCurrentTokenSource3(source, from);
case 4:
return optimizedCurrentTokenSource4(source, from);
case 5:
return optimizedCurrentTokenSource5(source, from);
case 6:
return optimizedCurrentTokenSource6(source, from);
char charOne = source[from];
if (charOne < ASCII_CHARS.length) {
return ASCII_CHARS[charOne];
}
break;
case 0:
return CHAR_ARRAY0;
}
return Arrays.copyOfRange(source, from, to);
}

/**
 * Returns a cached two-char array equal to {@code source[startPosition..startPosition + 1]},
 * or caches and returns a new copy if the selected bucket holds no equal entry.
 *
 * @param source
 *            the chars to read from
 * @param startPosition
 *            index of the first of the two chars
 * @return a two-element array with the same content as the requested range
 */
private final char[] optimizedCurrentTokenSource2(char[] source, int startPosition) {
    final char first = source[startPosition];
    final char second = source[startPosition + 1];
    final int bucketIndex = ((first << 6) + second) % TABLE_SIZE;
    final char[][] bucket = this.charArray_length[0][bucketIndex];
    final int newest = this.newEntry2;

    // search the slots behind the most recently written index first ...
    for (int slot = newest + 1; slot < INTERNAL_TABLE_SIZE; slot++) {
        final char[] candidate = bucket[slot];
        if (first == candidate[0] && second == candidate[1]) {
            return candidate;
        }
    }
    // ... then the slots from 0 up to and including that index
    for (int slot = 0; slot <= newest; slot++) {
        final char[] candidate = bucket[slot];
        if (first == candidate[0] && second == candidate[1]) {
            return candidate;
        }
    }

    // no match: advance the write index (wrapping to 0) and store a fresh copy there
    final int target = (newest + 1 >= INTERNAL_TABLE_SIZE) ? 0 : newest + 1;
    final char[] copy = new char[2];
    System.arraycopy(source, startPosition, copy, 0, 2);
    this.newEntry2 = target;
    bucket[target] = copy;
    return copy;
}

/**
 * Returns a cached three-char array equal to {@code source[startPosition..startPosition + 2]},
 * or caches and returns a new copy if the selected bucket holds no equal entry.
 * Only the first and third char contribute to the bucket hash.
 *
 * @param source
 *            the chars to read from
 * @param startPosition
 *            index of the first of the three chars
 * @return a three-element array with the same content as the requested range
 */
private final char[] optimizedCurrentTokenSource3(char[] source, int startPosition) {
    // read order (middle, first, last) is kept as it determines which index a failing access reports
    final char middle = source[startPosition + 1];
    final char first = source[startPosition];
    final char last = source[startPosition + 2];
    final int bucketIndex = ((first << 6) + last) % TABLE_SIZE;
    final char[][] bucket = this.charArray_length[1][bucketIndex];
    final int newest = this.newEntry3;

    // search the slots behind the most recently written index first ...
    for (int slot = newest + 1; slot < INTERNAL_TABLE_SIZE; slot++) {
        final char[] candidate = bucket[slot];
        if (first == candidate[0] && middle == candidate[1] && last == candidate[2]) {
            return candidate;
        }
    }
    // ... then the slots from 0 up to and including that index
    for (int slot = 0; slot <= newest; slot++) {
        final char[] candidate = bucket[slot];
        if (first == candidate[0] && middle == candidate[1] && last == candidate[2]) {
            return candidate;
        }
    }

    // no match: advance the write index (wrapping to 0) and store a fresh copy there
    final int target = (newest + 1 >= INTERNAL_TABLE_SIZE) ? 0 : newest + 1;
    final char[] copy = new char[3];
    System.arraycopy(source, startPosition, copy, 0, 3);
    this.newEntry3 = target;
    bucket[target] = copy;
    return copy;
}

private final char[] optimizedCurrentTokenSource4(char[] source, int startPosition) {
char[] src = source;
int start = startPosition;
char c0, c1 = src[start + 1], c2, c3 = src[start + 3];
int hash = (((c0 = src[start]) << 6) + (c2 = src[start + 2])) % TABLE_SIZE;
char[][] table = this.charArray_length[2][hash];
int i = this.newEntry4;
while (++i < INTERNAL_TABLE_SIZE) {
char[] charArray = table[i];
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3]))
int hash = hashCode(source, from, to);
int circularBufferStart = hash & (TABLE_SIZE - 1);
int positionToReplace = -1;
// linear probing within circular buffer:
for (int i = 0; i < SEARCH_SIZE; i++) {
int position = (circularBufferStart + i) & (TABLE_SIZE - 1);
char[] charArray = this.hashTable[position];
if (charArray == null) {
// this case only happens when the table is filling up,
// but helps to get good deduplication fast
positionToReplace = position;
} else if (equals(source, from, to, charArray)) {
// Successfully deduplicated:
return charArray;
}
}
// ---------other side---------
i = -1;
int max = this.newEntry4;
while (++i <= max) {
char[] charArray = table[i];
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3]))
return charArray;
char[] r = Arrays.copyOfRange(source, from, to);
// not found -> overwrite existing entries in a circular buffer:
if (positionToReplace == -1) {
// no empty entry found - normal case:
int j = this.circularBufferPointer[circularBufferStart]++;
positionToReplace = (circularBufferStart + (j & (SEARCH_SIZE-1))) & (TABLE_SIZE - 1);
}
// --------add the entry-------
if (++max >= INTERNAL_TABLE_SIZE)
max = 0;
char[] r;
System.arraycopy(src, start, r = new char[4], 0, 4);
return table[this.newEntry4 = max] = r;
this.hashTable[positionToReplace] = r;
return r;
}

private final char[] optimizedCurrentTokenSource5(char[] source, int startPosition) {
char[] src = source;
int start = startPosition;
char c0, c1 = src[start + 1], c2, c3 = src[start + 3], c4;
int hash = (((c0 = src[start]) << 12) + ((c2 = src[start + 2]) << 6) + (c4 = src[start + 4])) % TABLE_SIZE;
char[][] table = this.charArray_length[3][hash];
int i = this.newEntry5;
while (++i < INTERNAL_TABLE_SIZE) {
char[] charArray = table[i];
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3])
&& (c4 == charArray[4]))
return charArray;
private int hashCode(char[] source, int from, int to) {
int result = source[from];
for (int i = from + 1; i < to; i++) {
result = 31 * result + source[i];
}
// ---------other side---------
i = -1;
int max = this.newEntry5;
while (++i <= max) {
char[] charArray = table[i];
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3])
&& (c4 == charArray[4]))
return charArray;
}
// --------add the entry-------
if (++max >= INTERNAL_TABLE_SIZE)
max = 0;
char[] r;
System.arraycopy(src, start, r = new char[5], 0, 5);
return table[this.newEntry5 = max] = r;
return result;
}

private final char[] optimizedCurrentTokenSource6(char[] source, int startPosition) {
char[] src = source;
int start = startPosition;
char c0, c1 = src[start + 1], c2, c3 = src[start + 3], c4, c5 = src[start + 5];
int hash = (((c0 = src[start]) << 12) + ((c2 = src[start + 2]) << 6) + (c4 = src[start + 4])) % TABLE_SIZE;
char[][] table = this.charArray_length[4][hash];
int i = this.newEntry6;
while (++i < INTERNAL_TABLE_SIZE) {
char[] charArray = table[i];
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3])
&& (c4 == charArray[4]) && (c5 == charArray[5]))
return charArray;
private boolean equals(char[] source, int from, int to, char[] charArray) {
if (charArray.length != to - from) {
return false;
}
// ---------other side---------
i = -1;
int max = this.newEntry6;
while (++i <= max) {
char[] charArray = table[i];
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3])
&& (c4 == charArray[4]) && (c5 == charArray[5]))
return charArray;
for (int i = from; i < to; i++) {
if (source[i] != charArray[i - from]) {
return false;
}
}
// --------add the entry-------
if (++max >= INTERNAL_TABLE_SIZE)
max = 0;
char[] r;
System.arraycopy(src, start, r = new char[6], 0, 6);
return table[this.newEntry6 = max] = r;
return true;
}
}

0 comments on commit 86aa2dd

Please sign in to comment.