In [None]:
import datetime
import io
import json
import os
import random
from itertools import chain
from itertools import cycle

import jsonlines
import torch
import zstandard
from torch.utils.data.dataset import IterableDataset
from transformers import GPT2Tokenizer


# from openwebtext2 https://github.com/EleutherAI/openwebtext2/blob/master/utils/archiver.py
def json_serial(obj):
    """JSON serializer for objects not serializable by default json code"""

    if isinstance(obj, (datetime.datetime,)):
        return obj.isoformat()
    raise TypeError("Type %s not serializable" % type(obj))


class Archive:
    def __init__(self, file_path, compression_level=3):
        self.file_path = file_path
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        self.fh = open(self.file_path, "wb")
        self.cctx = zstandard.ZstdCompressor(level=compression_level)
        self.compressor = self.cctx.stream_writer(self.fh)

    def add_data(self, data, meta={}):
        self.compressor.write(
            json.dumps({"text": data, "meta": meta}, default=json_serial).encode(
                "UTF-8"
            )
            + b"\n"
        )

    def commit(self):
        self.compressor.flush(zstandard.FLUSH_FRAME)
        self.fh.flush()
        self.fh.close()


class Reader:
    def __init__(self):
        pass

    def read_jsonl(
        self, file, get_meta=False, autojoin_paragraphs=True, para_joiner="\n\n"
    ):
        with open(file, "rb") as fh:
            self.fh = fh
            cctx = zstandard.ZstdDecompressor()
            reader = io.BufferedReader(cctx.stream_reader(fh))
            rdr = jsonlines.Reader(reader)
            for ob in rdr:
                # naive jsonl where each object is just the string itself, with no meta. For legacy compatibility.
                if isinstance(ob, str):
                    assert not get_meta
                    yield ob
                    continue

                text = ob["text"]

                if autojoin_paragraphs and isinstance(text, list):
                    text = para_joiner.join(text)

                if get_meta:
                    yield file, text, (ob["meta"] if "meta" in ob else {})
                else:
                    yield file, text


class WebTextDocumentIterator:
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path

    def __iter__(self):
        reader = Reader()
        doc_chunk_size = 20000
        documents = []
        for i, x in enumerate(reader.read_jsonl(self.dataset_path)):
            documents.append(x)
            if len(documents) == doc_chunk_size:
                yield documents
                documents = []
        yield documents


class FileIterator:
    def __init__(self, dataset_paths):
        self.dataset_paths = dataset_paths

    def get_file(self, path):
        with open(path, "r", encoding="utf-8") as f:
            yield from f.readlines()

    def __iter__(self):
        for path in self.dataset_paths:
            yield from self.get_file(path)


class TokenizerIterator:
    def __init__(self, seq_len, tokenizer, seed, dataset_path):
        self.seq_len = seq_len
        self.tokenizer = tokenizer
        if "webtext" in dataset_path:
            self.document_iter = WebTextDocumentIterator(dataset_path)
        else:
            self.document_iter = FileIterator(dataset_path)
        self.seed = seed

    def __iter__(self):
        block = []
        for documents in self.document_iter:
            random.Random(self.seed).shuffle(documents)

            for doc_i, x in enumerate(documents):
                tokenized = self.tokenizer(text=x[1],).input_ids
                tokenized.append(self.tokenizer.eos_token_id)

                tokenized.insert(0, self.tokenizer.eos_token_id)
                tokenized_length = len(tokenized)
                for token in tokenized:
                    if len(block) == self.seq_len + 1:
                        yield block[:-1], block[1:], (x[0], doc_i)
                        block = []
                    block.append(token)

                # if len(tokenized) >= self.seq_len:
                #     i = 0
                #     while i <= len(tokenized) - self.seq_len:
                #         yield tokenized[i : i + self.seq_len]
                #         i += self.seq_len


class BatchIterator:
    def __init__(self, seq_len, batch_size, drop_last, tokenizer, dataset_paths):

        self.dataset_paths = dataset_paths
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.seq_len = seq_len
        self.tokenizer = tokenizer

    def process_data(self, seed_dataset):
        seed, dataset = seed_dataset
        self.tokenizer_iter = TokenizerIterator(
            self.seq_len, self.tokenizer, seed, dataset
        )
        for x in self.tokenizer_iter:
            yield x
            # batch.append(x)
            # if len(batch) == self.batch_size:
            #     yield batch
            #     batch = []

    def shuffled_data_list(self, i):
        # split = len(self.dataset_paths) // self.batch_size
        # dataset_paths = self.dataset_paths[(i*split):((i+1)*split)]
        shuffled = self.dataset_paths
        # does not impact global seed
        random.Random(i).shuffle(shuffled)
        return [(i, x) for x in shuffled]

    def get_stream(self, data_list):
        return chain.from_iterable(map(self.process_data, cycle(data_list)))

    def get_streams(self):
        return zip(
            *[
                self.get_stream(self.shuffled_data_list(i))
                for i in range(len(self.dataset_paths))
            ]
        )

    def __iter__(self):
        return self.get_streams()

        # batches = []
        # batch = []
        # for x in self.tokenizer_iter:
        #     batch.append(x)
        #     if len(batch) == self.batch_size:
        #         yield batch
        #         batch = []


class WebTextIter(IterableDataset):
    def __init__(
        self, batch_size, dataset_paths, seq_len, tokenizer=None, drop_last=True
    ):
        if tokenizer is None:
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        self.seq_len = seq_len
        self.dataset_paths = dataset_paths
        self.batch_size = batch_size
        self.batch_iter = BatchIterator(
            seq_len=seq_len,
            batch_size=batch_size,
            drop_last=drop_last,
            tokenizer=tokenizer,
            dataset_paths=dataset_paths,
        )

    def __iter__(self):
        try:
            batch = []
            for streams in self.batch_iter:
                for sample in streams:
                    if len(batch) == self.batch_size:
                        yield self.collate_fn(batch)
                        batch = []
                    batch.append(sample)

        except StopIteration:
            return

    def collate_fn(self, batch):
        data_list, label_list, seq_len_list = [], [], []
        for _data, _label, _seq in batch:
            data_list.append(_data)
            label_list.append(_label)
            seq_len_list.append(_seq)
        return (
            torch.LongTensor(data_list),
            torch.LongTensor(label_list),
            seq_len_list,
        )
    
    
WebTextIter

In [78]:
import glob
import os
import math

import tqdm

dataset_directory = "/datadrive/openwebtext2/shards"
files = glob.glob(os.path.join(dataset_directory, "*")) 

In [82]:
import timeit

starttime = timeit.default_timer()
print("The start time is :",starttime)

path = files[0]
print(path)
reader = Reader()
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
documents = tokenizer.eos_token
doc_counts = 0
for i, x in enumerate(reader.read_jsonl(path)):
    documents = documents + x[1] + tokenizer.eos_token
    doc_counts +=1
    if doc_counts == 50000:
        break
print("The time difference is :", timeit.default_timer() - starttime)
print(doc_counts)


The start time is : 859961.188051127
/datadrive/openwebtext2/shards/shard_0
The time difference is : 4.927313721040264
50000


TypeError: 'generator' object is not subscriptable

In [7]:
import glob
import os
import math

import tqdm

document_count = 0
total_text_size = 0
dataset_directory = "/datadrive/openwebtext2"
files = glob.glob(os.path.join(dataset_directory, "*jsonl.zst"))

archives = [Archive("{}/test/shard_{}".format(dataset_directory,i)) for i in range(4)]
archives_dict = {i:a for i, a in enumerate(archives)}
file_count = 0
for file_path in tqdm.tqdm(files, dynamic_ncols=True):
    reader = Reader()
    file_count += 1
    for document, metadata in reader.read_jsonl(file_path, get_meta=True):
        document_count += 1
        archive_index = document_count % 4
        archives_dict[archive_index].add_data(document)
        total_text_size += len(document)
        if document_count == 10:
            break
    if file_count == 4:
        break

for a in archives:
    a.commit()

billion = math.pow(10, 9)
print(f"Total Document Count: {document_count:,}")
print(f"Total Uncompressed Text Size: {(total_text_size / billion):.2f} GB")

  2%|▏         | 3/179 [00:00<00:32,  5.38it/s]

Total Document Count: 2,502
Total Uncompressed Text Size: 0.02 GB





In [88]:
import glob
import os
import math

import tqdm

dataset_directory = "/datadrive/openwebtext2/test/"
files = glob.glob(os.path.join(dataset_directory, "*"))
train = files[0:3]
evals = files[3:]
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.eos_token_id
print(evals)

['/datadrive/openwebtext2/test/shard_3']


In [95]:
import os
from pathlib import Path

import torch
from tokenizers import ByteLevelBPETokenizer
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import IterableDataset
from transformers import GPT2Tokenizer

import random
from itertools import chain, cycle, islice
import torch.utils.data as data

import numpy as np


class WebTextDocumentIterator:
    def __init__(self, dataset_paths):
        self.dataset_paths = dataset_paths

    def get_document(self, reader, path):
        text = reader.read_jsonl(path)
        return text

    def __iter__(self):
        reader = Reader()
        for path in self.dataset_paths:
            yield from self.get_document(reader, path)


class FileIterator:
    def __init__(self, dataset_paths):
        self.dataset_paths = dataset_paths

    def get_file(self, path):
        with open(path, "r", encoding="utf-8") as f:
            yield from f.readlines()

    def __iter__(self):
        for path in self.dataset_paths:
            yield from self.get_file(path)


class TokenizerIterator:
    def __init__(self, seq_len, tokenizer, dataset_paths):
        self.seq_len = seq_len
        self.tokenizer = tokenizer
        self.document_iter = WebTextDocumentIterator(dataset_paths)

    def tokenize_doc(self, x):
        tokenized = self.tokenizer(text=x, truncation=True).input_ids
        tokenized.append(self.tokenizer.eos_token_id)

        tokenized.insert(0, self.tokenizer.eos_token_id)
        if len(tokenized) >= self.seq_len:
            i = 0
            while i <= len(tokenized) - self.seq_len:
                yield tokenized[i : i + self.seq_len]
                i += self.seq_len
        else:
            pass

    def __iter__(self):
        for x in self.document_iter:
            print(x)
            yield from self.tokenize_doc(x)


class BatchIterator:
    def __init__(self, seq_len, batch_size, drop_last, tokenizer, dataset_paths):

        self.dataset_paths = dataset_paths
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.seq_len = seq_len

    def collate_fn(self, batch):
        data_list, label_list, seq_len_list = [], [], []
        for _data, _label, _seq in batch:
            data_list.append(_data)
            label_list.append(_label)
            seq_len_list.append(_seq)
        return (
            torch.LongTensor(data_list).permute(1,0),
            torch.LongTensor(label_list).permute(1,0),
            torch.LongTensor(seq_len_list),
        )
    def process_data(self, dataset):
        self.tokenizer_iter = TokenizerIterator(self.seq_len, tokenizer, [dataset])
        for x in self.tokenizer_iter:
            yield x
            
    def shuffled_data_list(self, i):
        split = len(self.dataset_paths) // self.batch_size
        dataset_paths = self.dataset_paths[(i*split):((i+1)*split)]
        return random.sample(dataset_paths, len(dataset_paths))
        
    
    def get_stream(self, data_list):
        return chain.from_iterable(map(self.process_data, cycle(data_list)))
    
    def get_streams(self):
        return zip(*[self.get_stream(self.shuffled_data_list(i)) for i in range(self.batch_size)])
    
    def __iter__(self):
        return self.get_streams()



class WebTextIter(IterableDataset):
    def __init__(self, batch_size, drop_last, dataset_paths, seq_len, tokenizer=None):
        if tokenizer is None:
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        self.seq_len = seq_len
        self.dataset_paths = dataset_paths
        self.batch_iter = BatchIterator(
            seq_len=seq_len,
            batch_size=batch_size,
            drop_last=drop_last,
            tokenizer=tokenizer,
            dataset_paths=dataset_paths,
        )

    def __iter__(self):
        for x in self.batch_iter:
            print(x)
            yield x

            
    def collate_fn(self, batch):
        data_list, label_list, seq_len_list = [], [], []
        for _data, _label, _seq in batch:
            data_list.append(_data)
            label_list.append(_label)
            seq_len_list.append(_seq)
        print(data_list)
        print("---")
        print(label_list)
        print("++")
        print(torch.LongTensor(data_list))
        print("++")
        print(torch.LongTensor(label_list).size())
        return (
            torch.LongTensor(data_list),
            torch.LongTensor(label_list),
            torch.LongTensor(seq_len_list),
        )


In [97]:
from torch.utils.data import DataLoader
print(train)
wt = WebTextIter(dataset_paths=train, batch_size=2, drop_last=True, seq_len=10)

dl = DataLoader(wt, batch_size=None, sampler=None)

for i, n in enumerate(dl):
    if i == 10:
        break

['/datadrive/openwebtext2/test/shard_0', '/datadrive/openwebtext2/test/shard_1', '/datadrive/openwebtext2/test/shard_2']
News and press releases

Mozilla's browsers global usage share is still growing according to OneStat.com

Amsterdam - November 2 2005 - OneStat.com ( www.onestat.com ), the number one provider of real-time web analytics, today reported that Mozilla's browsers have a total global usage share of 11,51 percent. The total usage share of Mozilla increased 2.82 percent since April 2005. Microsoft's Internet Explorer still dominates the global browser market with a global usage share of 85,45 percent which is 1.18 percent less as at the end of April.

"The global usage share of Mozilla's browsers is still growing and it seems that Netscape users and some Internet Explorer users are switching to the Firefox version. It also looks like that browser users of Internet Explorer for Apple's Mac are switching to Safari because the global usage share is still growing. It is also in

In [42]:
import glob
import io
import itertools
import json
import os
import random
from pathlib import Path
from typing import Optional

import numpy as np
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import IterableDataset
from torchtext.data.utils import get_tokenizer
from torchtext.utils import download_from_url
from torchtext.utils import extract_archive
from torchtext.vocab import build_vocab_from_iterator


def build_vocab_from_file(vocab_file):
    symbols = []
    with open(vocab_file, "r", encoding="utf-8") as f:
        for line in f:
            symb = line.strip().split()[0]
            symbols.append(symb)
    return {s: i for i, s in enumerate(symbols)}


def build_vocab_from_json(vocab_file):
    with open(vocab_file) as json_file:
        data = json.load(json_file)
    return dict(data)

class OpenWebText2DataModule(pl.LightningDataModule):
    def __init__(
        self,
        sequence_length: int,
        batch_size: int,
        eval_batch_size: int = None,
        data_dir="/datadrive/openwebtext2",
    ):
        super().__init__()
        self.batch_size = batch_size
        self.eval_batch_size = eval_batch_size if eval_batch_size else 5
        self.sequence_length = sequence_length
        self.data_dir = data_dir

    def setup(self, stage: Optional[str] = None):
        files = glob.glob(os.path.join(self.data_dir + "/test", "*"))
        #files = glob.glob(os.path.join(self.data_dir, "*.jsonl.zst"))
        self.train_paths = files[:-1]
        self.val_paths = files[-1:]
        self.test_paths = files[-1:]

        vocab = build_vocab_from_json(self.data_dir + "/gpt2-vocab.json")
        self.vocab = vocab
        # self.train_paths = [
        #     path
        #     for idx, path in enumerate(all_paths)
        #     if idx % 10 in (0, 2, 4, 6, 8,)
        # ]
        # self.valid_paths = [
        #     path for idx, path in enumerate(all_paths) if idx % 10 in (1, 9)
        # ]
        # self.test_paths = [
        #     path for idx, path in enumerate(all_paths) if idx % 10 in (3, 7)
        # ]

    def train_dataloader(self):
        train_dataset = WebTextIter(
            dataset_paths=self.train_paths,
            seq_len=self.sequence_length,
            batch_size=self.batch_size,
            drop_last=True,
        )
        data_loader = DataLoader(
            train_dataset, batch_size=None, sampler=None
        )
        return data_loader

    def val_dataloader(self):
        val_dataset = WebTextIter(
            dataset_paths=self.val_paths,
            seq_len=self.sequence_length,
            batch_size=self.eval_batch_size,
        )

        data_loader = DataLoader(val_dataset, batch_size=None, sampler=None,)
        return data_loader

    def test_dataloader(self):
        test_dataset = WebTextIter(
            dataset_paths=self.test_paths,
            seq_len=self.sequence_length,
            batch_size=self.eval_batch_size,
        )
        return DataLoader(test_dataset, batch_size=None, sampler=None)


data_module = OpenWebText2DataModule(
            sequence_length=4, batch_size=2, eval_batch_size=2, data_dir="/datadrive/openwebtext2"
        )
data_module.prepare_data()
data_module.setup("fit")

In [43]:
td = data_module.train_dataloader()


In [44]:
for i, n in enumerate(td):
    print(n)
    if i == 2:
        break

News and press releases

Mozilla's browsers global usage share is still growing according to OneStat.com

Amsterdam - November 2 2005 - OneStat.com ( www.onestat.com ), the number one provider of real-time web analytics, today reported that Mozilla's browsers have a total global usage share of 11,51 percent. The total usage share of Mozilla increased 2.82 percent since April 2005. Microsoft's Internet Explorer still dominates the global browser market with a global usage share of 85,45 percent which is 1.18 percent less as at the end of April.

"The global usage share of Mozilla's browsers is still growing and it seems that Netscape users and some Internet Explorer users are switching to the Firefox version. It also looks like that browser users of Internet Explorer for Apple's Mac are switching to Safari because the global usage share is still growing. It is also interesting to see that Microsoft's Internet Explorer has less global usage share in the USA as in the UK. Mozilla's browse

In [129]:
count

3015

In [66]:
import os
import pickle
import os
from pathlib import Path

import torch
from tokenizers import ByteLevelBPETokenizer
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import IterableDataset
from transformers import GPT2Tokenizer
from collections import defaultdict 

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

seen = []
dupe = []
files = defaultdict(list)
docs = defaultdict(list)

def undo_transformations(batches, tokenizer=tokenizer):
    x_all, y_all, meta = batches
    for batch_idx in range(len(batches)-1):
        x = x_all[batch_idx]
        doc, doc_i, i = meta[batch_idx]
        x_tokens = tokenizer.convert_ids_to_tokens(x.tolist())
        x_str = tokenizer.convert_tokens_to_string(x_tokens)
        if x_str in seen:
            dupe.append(x_str)
        else:
            seen.append(x_str)
        files[str(doc) + '_' + str(doc_i)].append(i)
        #print("---SRC: {} \n\n".format(x_str))



PICKLE_FILE = '/datadrive/batches_9.pkl'

def read_from_pickle(path=PICKLE_FILE):
    with open(path, 'rb') as file:
        try:
            while True:
                yield pickle.load(file)
        except EOFError:
            pass
i = 0
for item in read_from_pickle():
    i +=1
    undo_transformations(item)

In [67]:
import collections
files
od = collections.OrderedDict(sorted(files.items()))
od

OrderedDict([('/datadrive/openwebtext2/shards/shard_23_10',
              [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
               47,
               48,
               49,


In [68]:
len(files)

22

In [69]:
print(files.keys())

dict_keys(['/datadrive/openwebtext2/shards/shard_31_2', '/datadrive/openwebtext2/shards/shard_23_4', '/datadrive/openwebtext2/shards/shard_23_5', '/datadrive/openwebtext2/shards/shard_31_3', '/datadrive/openwebtext2/shards/shard_31_7', '/datadrive/openwebtext2/shards/shard_31_13', '/datadrive/openwebtext2/shards/shard_23_8', '/datadrive/openwebtext2/shards/shard_23_9', '/datadrive/openwebtext2/shards/shard_23_10', '/datadrive/openwebtext2/shards/shard_31_14', '/datadrive/openwebtext2/shards/shard_31_16', '/datadrive/openwebtext2/shards/shard_31_20', '/datadrive/openwebtext2/shards/shard_23_11', '/datadrive/openwebtext2/shards/shard_23_13', '/datadrive/openwebtext2/shards/shard_31_22', '/datadrive/openwebtext2/shards/shard_31_25', '/datadrive/openwebtext2/shards/shard_23_16', '/datadrive/openwebtext2/shards/shard_23_18', '/datadrive/openwebtext2/shards/shard_23_19', '/datadrive/openwebtext2/shards/shard_31_26', '/datadrive/openwebtext2/shards/shard_31_31', '/datadrive/openwebtext2/shard

In [1]:
import random
x = [1,2, 3, 4]
random.Random(4).sample(10)
x

TypeError: sample() missing 1 required positional argument: 'k'

In [75]:
tuple((1,2))

(1, 2)