In [5]:
!pip install zstandard
!pip install jsonlines
import os
import zstandard
import json
import jsonlines
import io
import datetime

def json_serial(obj):
    """JSON serializer for objects not serializable by default json code"""

    if isinstance(obj, (datetime.datetime,)):
        return obj.isoformat()
    raise TypeError ("Type %s not serializable" % type(obj))

# Modified version of lm_dataformat Archive for single file.
class Archive:
    def __init__(self, file_path, compression_level=3):
        self.file_path = file_path
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)    
        self.fh = open(self.file_path, 'wb')
        self.cctx = zstandard.ZstdCompressor(level=compression_level)
        self.compressor = self.cctx.stream_writer(self.fh)        
    
    def add_data(self, data, meta={}):
        self.compressor.write(json.dumps({'text': data, 'meta': meta}, default=json_serial).encode('UTF-8') + b'\n')
    
    def commit(self):
        self.compressor.flush(zstandard.FLUSH_FRAME)        
        self.fh.flush()
        self.fh.close()

# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
class Reader:
    def __init__(self):
        pass

    def read_jsonl(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner='\n\n'):
        with open(file, 'rb') as fh:
            self.fh = fh
            cctx = zstandard.ZstdDecompressor()
            reader = io.BufferedReader(cctx.stream_reader(fh))
            rdr = jsonlines.Reader(reader)
            for ob in rdr:
                # naive jsonl where each object is just the string itself, with no meta. For legacy compatibility.
                if isinstance(ob, str):
                    assert not get_meta
                    yield ob
                    continue

                text = ob['text']

                if autojoin_paragraphs and isinstance(text, list):
                    text = para_joiner.join(text)

                if get_meta:
                    yield text, (ob['meta'] if 'meta' in ob else {})
                else:
                    yield text

Collecting jsonlines
  Downloading jsonlines-2.0.0-py3-none-any.whl (6.3 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-2.0.0


In [14]:
import glob
import os
import math

import tqdm

document_count = 0
total_text_size = 0
dataset_directory = "/datadrive/openwebtext2"
files = glob.glob(os.path.join(dataset_directory, "*jsonl.zst"))
for file_path in tqdm.tqdm(files, dynamic_ncols=True):
    reader = Reader()
    for document, metadata in reader.read_jsonl(file_path, get_meta=True):
        
        document_count += 1
        total_text_size += len(document)
        break

billion = math.pow(10, 9)
print(f"Total Document Count: {document_count:,}")
print(f"Total Uncompressed Text Size: {(total_text_size / billion):.2f} GB")

100%|██████████| 179/179 [00:00<00:00, 3336.43it/s]

Total Document Count: 179
Total Uncompressed Text Size: 0.00 GB





In [15]:
import glob
import os
import math

import tqdm

document_count = 0
total_text_size = 0
dataset_directory = "/datadrive/openwebtext2"
files = glob.glob(os.path.join(dataset_directory, "*jsonl.zst"))
for file_path in tqdm.tqdm(files, dynamic_ncols=True):
    reader = Reader()
    for document, metadata in reader.read_jsonl(file_path, get_meta=True):
        print(document)
        document_count += 1
        total_text_size += len(document)
        break
    break

billion = math.pow(10, 9)
print(f"Total Document Count: {document_count:,}")
print(f"Total Uncompressed Text Size: {(total_text_size / billion):.2f} GB")

  0%|          | 0/179 [00:00<?, ?it/s]

Judge arrested in Aruba case Fifth suspect in custody after U.S. teen's disappearance Paul Van Der Sloot was arrested after being questioned by police as a witness over the weekend. RELATED Gallery: Missing Aruba teen  Texas team to aid search  Publicity a concern for tourism  Was race a factor in arrests?  Interactive: Safety tips for travelers YOUR E-MAIL ALERTS Aruba Alabama Crime, Law and Justice or or Create Your Own ORANJESTAD, Aruba (CNN) -- An Aruban judge, the father of a 17-year-old suspect in the disappearance of an Alabama teenager, also has been arrested in the case, the island's police commissioner said Thursday. Prosecutors decided to keep Paul Van Der Sloot, 53, in custody for questioning for 48 hours, Aruba Police Commissioner Jan Van Der Straten said. Under Aruban law, if there is reasonable suspicion, the prosecutor's office can order a suspect held as long as another 48 hours. After that, a judge's decision is required to keep a suspect in jail. The legal system




In [35]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
dataset_directory = "/datadrive/openwebtext2"

paths = [str(x) for x in Path("/datadrive/openwebtext2/").glob("**/*.zst")]

# Initialize a tokenizer

tokenizer = ByteLevelBPETokenizer(unk_token="[UNK]").from_file(vocab_filename = dataset_directory + '/gpt2-vocab.json', merges_filename=dataset_directory + "/gpt2-merges.txt")


TypeError: __init__() got an unexpected keyword argument 'unk_token'

In [27]:
tokenizer

Tokenizer(vocabulary_size=50257, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [33]:
[x for x in tokenizer.get_vocab() if "" in x]


['<',
 '</',
 'Ġ<<',
 '<<',
 '"></',
 'Ġ<+',
 'Ġ<=',
 '><',
 '.</',
 'Ġ"<',
 'Ġ<[',
 'Ġ><',
 'Ġ<!--',
 'Ġ<@',
 'Ġ(<',
 'Ġ<',
 '></',
 '\\<',
 '<?',
 '.<',
 'Ġ</',
 ')</',
 '"><',
 'Ġ<-',
 '<|endoftext|>']