<a href="https://colab.research.google.com/github/datacraft-paris/2311-Cerisara-LLM/blob/main/Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. [Introduction](https://www.google.com/)
2. [A Brief Overview of LLMs](https://colab.research.google.com/github/datacraft-paris/2311-Cerisara-LLM/blob/main/LLMs.ipynb)
    1. [Background: _Decoder-only_ Language Model](#language_models_are_conditional_probabilities)
    2. [Transformers](#transformers)
    3. [Large Language Model](#llms)
    4. [Exercises](#exercices)
3. Preparing the data (This notebook)
    1. [Setup](#setup)
    2. [Downloading the data](#download)
    3. [Preprocess the data](#preprocess)
      1. [Tokenization](#tokenize)
      2. [Group the texts](#group)
    4. [Run the preparation](#run)

# Setup <a name="setup"></a>

In [None]:
!pip install datasets

In [None]:
import os
from typing import Iterator, Dict, List, Tuple
from itertools import chain
from tqdm.notebook import tqdm
from pathlib import Path
from datasets import load_dataset, load_from_disk, concatenate_datasets, Features, Dataset, DatasetDict, features

# Download the data <a name="download"></a>

In [None]:
def stream_data(corpus: str) -> Iterator[str]:
    """Stream the texts of one corpus of the RedPajama dataset.

    Args:
        corpus: Name of the dataset configuration to stream (for example
            "c4" or "github").

    Yields:
        The "text" field of every kept document. For every corpus except
        "github", only documents whose metadata language is "en" are kept.

    Raises:
        ValueError: If the metadata of a document has no "language" entry.
    """
    # Imported here so that this cell does not rely on another import cell.
    import ast

    dataset = load_dataset("togethercomputer/RedPajama-Data-1T",
                            corpus,
                            streaming=True)
    for item in dataset["train"]:
        # The "meta" field is a string that is assumed to hold a dict literal.
        # `ast.literal_eval` parses literals only, so text coming from the
        # downloaded data is never executed as code (unlike `eval`).
        metadata = ast.literal_eval(item["meta"])
        if "language" not in metadata:
            raise ValueError(f"The data '{corpus}' does'nt contain any information about languages.")
        # The language filter is skipped for "github": only the other corpora
        # are restricted to English documents.
        if corpus != "github" and metadata["language"] != "en":
            continue
        yield item["text"]

def subset_dataset(subset: int,
                   corpus: str
                   ) -> Iterator[Dict[str, str]]:
    """Yield the first `subset` documents of a corpus as records.

    Each record is a dict with the document under "text" and the corpus name
    under "corpus". A progress bar labelled with the corpus name tracks the
    extraction. The stream is read until exactly `subset` documents have been
    yielded or until it is exhausted.
    """
    numbered_texts = enumerate(stream_data(corpus), 1)
    progress = tqdm(numbered_texts, total=subset, desc=corpus)
    for count, text in progress:
        record = {"text": text, "corpus": corpus}
        yield record
        # Stop right after the last requested document, before pulling
        # another item from the stream.
        if count == subset:
            return

def dataset_from_generator(output_folder: str,
                           subset: int=32_000,
                           corpora: Tuple[str, ...]=("c4", "arxiv", "stackexchange", "github"),
                           ) -> None:
    """Build one dataset per corpus and save them all to the disk.

    Args:
        output_folder: Folder in which the dataset is saved. It is created,
            together with its parents, when it does not exist.
        subset: Number of documents requested from each corpus; it is
            forwarded to `subset_dataset`.
        corpora: Names of the corpora to extract. Each one becomes a split of
            the saved `DatasetDict`, in the given order.
    """
    output_path = Path(output_folder)
    output_path.mkdir(exist_ok=True, parents=True)
    data_features = Features({
        "text": features.Value("string"),
        "corpus": features.Value("string")
    })
    splits = {}
    # `dict.fromkeys` drops duplicated names while keeping the given order, so
    # the splits are built in the same order on every run (iterating over a
    # set of strings is not stable between interpreter runs).
    for corpus in dict.fromkeys(corpora):
        splits[corpus] = Dataset.from_generator(
            subset_dataset,
            features=data_features,
            gen_kwargs={"subset": subset, "corpus": corpus}
        )

    DatasetDict(splits).save_to_disk(output_path)

In [None]:
# Extract the default subset (32,000 documents requested per corpus) and save
# the resulting dataset in the "data" folder.
dataset_from_generator("data")

# Preprocess the data <a name="preprocess"></a>

## Tokenization <a name="tokenize"></a>

In [None]:
def tokenize_dataset(tokenizer,
                     dataset: Dataset,
                     remove_columns: List[str],
                     target_colum: str="text",
                     return_attention_mask: bool=False,
                     batched: bool=True,
                     batch_size: int=64
                     ) -> Dataset:
    """Apply the tokenizer to one column of the dataset.

    Args:
        tokenizer: Callable applied to the values of `target_colum`; it
            receives `return_attention_mask` as a keyword argument.
        dataset: Dataset to tokenize.
        remove_columns: Columns dropped from the mapped dataset.
        target_colum: Name of the column holding the texts to tokenize.
        return_attention_mask: Forwarded as is to the tokenizer.
        batched: Whether the mapping is done by batches.
        batch_size: Number of examples per batch.

    Returns:
        The result of mapping the tokenizer over the dataset, using one
        process per CPU reported by `os.cpu_count()`.
    """
    def encode(examples):
        texts = examples[target_colum]
        return tokenizer(texts, return_attention_mask=return_attention_mask)

    map_options = {
        "num_proc": os.cpu_count(),
        "batched": batched,
        "batch_size": batch_size,
        "remove_columns": remove_columns,
        "desc": "Running tokenizer on dataset",
    }
    return dataset.map(encode, **map_options)

## Group the texts <a name="group"></a>

In [None]:
def group_texts(dataset: Dataset,
                max_length: int=1024,
                batch_size: int=128,
                return_labels: bool=False
                ) -> Dataset:
    """Concatenate the tokenized texts and cut them in chunks of `max_length`.

    Args:
        dataset: Dataset with an "input_ids" column to regroup.
        max_length: Length of every produced chunk.
        batch_size: Number of examples concatenated together before chunking.
        return_labels: When True, a "labels" column copying "input_ids" is
            added.

    Returns:
        The mapped dataset. Within each batch, the tokens left after the last
        full chunk are dropped.
    """
    def chunk_batch(examples):
        # Flatten every column of the batch into one long list; the "corpus"
        # column is the exception and keeps its per-example values.
        flattened = {}
        for column in examples.keys():
            values = examples[column]
            if column == "corpus":
                flattened[column] = values
            else:
                flattened[column] = list(chain.from_iterable(values))
        # Largest multiple of max_length that fits in the concatenated tokens.
        n_full_chunks = len(flattened["input_ids"]) // max_length
        usable_length = n_full_chunks * max_length
        starts = range(0, usable_length, max_length)
        # Every column (including "corpus") is sliced with the same offsets.
        chunks = {}
        for column, values in flattened.items():
            chunks[column] = [values[start : start + max_length] for start in starts]
        if return_labels:
            chunks["labels"] = chunks["input_ids"].copy()
        return chunks

    return dataset.map(
        chunk_batch,
        batched=True,
        batch_size=batch_size,
        num_proc=os.cpu_count(),
        desc=f"Grouping texts in chunks of {max_length}",
    )

# Run the preparation <a name="run"></a>

In [None]:
def prepare_data(dataset_path: str,
                 tokenizer,
                 subset: int=8_000,
                 max_length: int=512,
                 return_labels: bool=False,
                 ) -> Dataset:
    """Load a saved dataset and turn it into shuffled fixed-length sequences.

    The dataset is loaded from the disk, tokenized, grouped in chunks of
    `max_length` tokens for each corpus, optionally truncated, then merged in
    a single dataset.

    Args:
        dataset_path: Folder of the dataset saved on the disk. It is expected
            to hold one split per corpus, each with a "text" and a "corpus"
            column.
        tokenizer: Tokenizer forwarded to `tokenize_dataset`.
        subset: Maximum number of sequences kept for each corpus; None keeps
            all of them.
        max_length: Number of tokens of every sequence.
        return_labels: Forwarded to `group_texts`.

    Returns:
        The shuffled concatenation of the sequences of all the corpora.
    """
    dataset = load_from_disk(dataset_path)
    dataset = tokenize_dataset(tokenizer, dataset, remove_columns=["text"])
    dataset = DatasetDict({corpus: group_texts(dataset=dataset[corpus],
                                               max_length=max_length,
                                               return_labels=return_labels)
                                for corpus in dataset.keys()})
    for corpus in dataset:
        # Replace the "corpus" column by one label per grouped sequence.
        grouped = dataset[corpus].remove_columns("corpus")
        grouped = grouped.add_column("corpus", [corpus] * len(grouped))
        # Keep at most `subset` sequences of each corpus.
        if subset is not None and len(grouped) > subset:
            grouped = grouped.select(range(subset))
        dataset[corpus] = grouped
    print(dataset)
    # `concatenate_datasets` is documented to take a list of datasets, so the
    # dict view is materialized first.
    dataset = concatenate_datasets(list(dataset.values()))
    n_sequences = len(dataset)
    # Every sequence holds exactly `max_length` tokens after the grouping.
    n_tokens = n_sequences * max_length
    print(f"Total number of sequences: {n_sequences:,}. Total number of tokens: {n_tokens:,}")
    return dataset.shuffle()

Now that we have defined all the required functions, we can run the data preparation.

In your opinion, why do we group the sequences?

In [None]:
# Placeholders to fill in before running the preparation. `dataset_path` is
# presumably the folder written by `dataset_from_generator` (e.g. "data") and
# `tokenizer` the tokenizer object given to `prepare_data` — TODO confirm.
dataset_path = "TODO"
tokenizer = "TODO"