This repository has been archived by the owner on Apr 17, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
token.ts
72 lines (64 loc) · 2.02 KB
/
token.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import { get_encoding, TiktokenEncoding } from '@dqbd/tiktoken'
import { SplitTextChunk } from './types';
// Maximum tokens per chunk. NOTE(review): 8191 presumably matches the
// OpenAI embedding-model input limit (text-embedding-ada-002) — confirm.
const MAX_CHUNK_LENGTH = 8191
// Default tiktoken encoding used for token counting ('cl100k_base').
const EMBEDDING_ENCODING: TiktokenEncoding = 'cl100k_base'
// Default number of tokens shared between consecutive chunks.
const CHUNK_OVERLAP = 0
// Tuning options for splitText(); every field falls back to the
// module-level defaults above.
interface SplitTextOptions {
  // Upper bound on tokens per emitted chunk (default MAX_CHUNK_LENGTH).
  maxTokens?: number
  // Tokens of overlap between adjacent chunks; must be < maxTokens.
  chunkOverlap?: number
  // tiktoken encoding name (default EMBEDDING_ENCODING).
  encodingName?: TiktokenEncoding
}
/**
 * Splits `text` into chunks of at most `maxTokens` tokens, stepping the
 * window forward by `maxTokens - chunkOverlap` tokens each iteration so
 * consecutive chunks share `chunkOverlap` tokens.
 *
 * @param text - The input text to split.
 * @param options - Chunking options; all fields optional (see SplitTextOptions).
 *   The whole object is now optional too (`splitText(text)` works) — backward
 *   compatible, since every property already had a default.
 * @param callback - Invoked once per chunk as it is produced.
 * @returns Array of chunks with their token start/end offsets
 *   (`end` is exclusive; offsets are token indices, not character indices).
 * @throws Error if `chunkOverlap >= maxTokens` (the window would not advance).
 */
export function splitText(
  text: string,
  {
    maxTokens = MAX_CHUNK_LENGTH,
    chunkOverlap = CHUNK_OVERLAP,
    encodingName = EMBEDDING_ENCODING,
  }: SplitTextOptions = {},
  callback?: (chunk: SplitTextChunk) => void
): SplitTextChunk[] {
  if (chunkOverlap >= maxTokens) {
    throw new Error('Cannot have chunkOverlap >= chunkSize')
  }
  const tokenizer = get_encoding(encodingName)
  try {
    const inputIds = tokenizer.encode(text)
    const decoder = new TextDecoder()
    const chunks: SplitTextChunk[] = []
    // Guaranteed positive by the guard above, so the loop always terminates.
    const step = maxTokens - chunkOverlap
    let startIdx = 0
    while (startIdx < inputIds.length) {
      const endIdx = Math.min(startIdx + maxTokens, inputIds.length)
      const chunkIds = inputIds.slice(startIdx, endIdx)
      // decode() yields UTF-8 bytes; TextDecoder turns them back into a string.
      const chunk = decoder.decode(tokenizer.decode(chunkIds))
      const chunkItem = { chunk, start: startIdx, end: endIdx }
      chunks.push(chunkItem)
      callback?.(chunkItem)
      startIdx += step
    }
    return chunks
  } finally {
    // Fix: always release the tokenizer's native/WASM memory, even when
    // encode/decode throws (the original leaked it on any exception).
    tokenizer.free()
  }
}
// Options for merge().
interface MergeOptions {
  // Token budget for the merged result (default 1800 in merge()).
  maxLen?: number
  // tiktoken encoding used to count tokens (default EMBEDDING_ENCODING).
  encodingName?: TiktokenEncoding
  // String placed between chunks (default '\n\n###\n\n' in merge()).
  separator?: string
}
/**
 * Greedily concatenates `chunks` in order until the running token count
 * would exceed the budget, then joins the kept chunks with the separator.
 *
 * @param chunks - Candidate text chunks, consumed in array order.
 * @param options - Optional budget/encoding/separator overrides.
 * @returns The joined prefix of `chunks` that fits within the token budget
 *   (empty string if even the first chunk does not fit).
 */
export const merge = async (chunks: string[], options?: MergeOptions): Promise<string> => {
  const tokenizer = get_encoding(options?.encodingName ?? EMBEDDING_ENCODING)
  try {
    // Fix: `??` instead of `||` so an explicit maxLen of 0 is honored
    // rather than silently replaced by 1800.
    const maxLen = options?.maxLen ?? 1800
    let curLen = 0
    const context: string[] = []
    for (const chunk of chunks) {
      // NOTE(review): the +4 presumably budgets for separator/formatting
      // tokens added per chunk — confirm against the consumer of merge().
      curLen += tokenizer.encode(chunk).length + 4
      if (curLen > maxLen) {
        break
      }
      context.push(chunk)
    }
    return context.join(options?.separator ?? '\n\n###\n\n')
  } finally {
    // Fix: the original never freed the tokenizer, leaking its native/WASM
    // memory on every call; splitText() already frees, so be consistent.
    tokenizer.free()
  }
}