In [None]:
!pip install -q -U datasets peft accelerate peft transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import gc
import os
import sys
import threading
import warnings
import numpy as np
import psutil
import torch
from accelerate import Accelerator
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
    set_seed,
)

from peft import LoraConfig, TaskType, get_peft_model
from typing import Union,List, Dict, Optional
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
import multiprocessing
def advanced_data_loader(input: Union[str, Dict[str, str]], format: Optional[str] = None, split_ratios: Optional[Dict[str, float]] = None) -> "Optional[DatasetDict]":
    """
    Load a dataset and re-split its 'train' split into train/test/eval.

    :param input: A dataset name or directory (str), or a dict mapping split names to file paths.
    :param format: 'csv' or 'json' when ``input`` is a dict of files, 'text' when ``input`` is a
        directory, or None when ``input`` is a dataset name.
    :param split_ratios: Dict with 'test' and 'eval' fractions of the original 'train' split
        (defaults to 0.1 each). The 'train' entry is informational: the training share is
        whatever remains after the two hold-out fractions.
    :return: A DatasetDict with 'train', 'test' and 'eval' splits, or None (after a warning) when
        the arguments are invalid, the files are missing, or there is no 'train' split to divide.
    """
    if split_ratios is None:
        split_ratios = {'train': 0.8, 'test': 0.1, 'eval': 0.1}

    # Validate the ratios before loading anything, so a bad configuration fails fast
    # instead of surfacing as a KeyError/ValueError after a long download.
    try:
        test_ratio = float(split_ratios['test'])
        eval_ratio = float(split_ratios['eval'])
    except (KeyError, TypeError, ValueError):
        warnings.warn("split_ratios must provide numeric 'test' and 'eval' entries.")
        return None
    holdout_ratio = test_ratio + eval_ratio
    if test_ratio <= 0 or eval_ratio <= 0 or holdout_ratio >= 1:
        warnings.warn("split_ratios 'test' and 'eval' must be positive and sum to less than 1.")
        return None

    try:
        # Load the dataset
        if isinstance(input, dict) and format in ['csv', 'json']:
            dataset = load_dataset(format, data_files=input)
        elif isinstance(input, str) and format == 'text':
            dataset = load_dataset(format, data_dir=input)
        elif isinstance(input, str) and format is None:
            dataset = load_dataset(input)
        else:
            warnings.warn("Invalid input or format. Please provide a valid dataset name, directory, or file paths.")
            return None
    except FileNotFoundError as e:
        warnings.warn(str(e))
        return None

    # Only the 'train' split is divided; without it there is nothing to split.
    if 'train' not in dataset:
        warnings.warn("The loaded dataset has no 'train' split to divide.")
        return None

    # First carve off the combined hold-out share, then divide it into test and eval.
    split_dataset = dataset['train'].train_test_split(test_size=holdout_ratio)
    test_eval_dataset = split_dataset['test'].train_test_split(test_size=eval_ratio / holdout_ratio)
    dataset = DatasetDict({
        'train': split_dataset['train'],
        'test': test_eval_dataset['train'],
        'eval': test_eval_dataset['test']
    })

    print("Splits: ", dataset.keys())
    print("Columns: ", {split: dataset[split].column_names for split in dataset.keys()})
    return dataset

def levenshtein_distance(str1, str2):
    """Return the Levenshtein edit distance between ``str1`` and ``str2`` as an int.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.

    Time is O(len(str1) * len(str2)); memory is O(min(len(str1), len(str2)))
    because only the previous DP row is kept.
    """
    if str1 == str2:
        return 0

    # The distance is symmetric, so keep the shorter string on the column axis
    # to make the rolling row as small as possible.
    if len(str1) < len(str2):
        str1, str2 = str2, str1

    # previous_row[j] is the distance between the processed prefix of str1 and str2[:j].
    previous_row = list(range(len(str2) + 1))
    for i, char1 in enumerate(str1, start=1):
        current_row = [i]
        for j, char2 in enumerate(str2, start=1):
            if char1 == char2:
                # Matching characters cost nothing: carry the diagonal value over.
                current_row.append(previous_row[j - 1])
            else:
                current_row.append(
                    1 + min(previous_row[j - 1], previous_row[j], current_row[j - 1])
                )
        previous_row = current_row

    return previous_row[-1]


def get_closest_label(eval_pred, classes):
    """Return the entry of ``classes`` with the smallest edit distance to ``eval_pred``.

    ``eval_pred`` is stripped of surrounding whitespace before comparison.
    When several labels are equally close, the first one in ``classes`` wins.

    Raises:
        ValueError: if ``classes`` is empty (there is no label to return).
    """
    if len(classes) == 0:
        raise ValueError("classes must contain at least one label")

    # Strip once instead of once per candidate label.
    cleaned_pred = eval_pred.strip()
    # min() keeps the first minimal element, matching a strict '<' scan.
    return min(classes, key=lambda class_label: levenshtein_distance(cleaned_pred, class_label))


def b2mb(x):
    """Convert a byte count to whole mebibytes, truncating toward zero."""
    bytes_per_mb = 1 << 20
    megabytes = x / bytes_per_mb
    return int(megabytes)


class TorchTracemalloc:
    """Context manager that tracks memory usage of the enclosed block.

    On exit the instance exposes, in megabytes (via ``b2mb``):
      * ``used`` / ``peaked``: CUDA memory delta at exit and peak delta, relative to ``begin``
        (all zero when CUDA is not available);
      * ``cpu_used`` / ``cpu_peaked``: resident-set-size delta at exit and peak delta,
        relative to ``cpu_begin``.
    ``begin`` and ``cpu_begin`` are kept in bytes.
    """

    def __enter__(self):
        gc.collect()
        # Guard the CUDA calls so the tracker also works on CPU-only machines.
        self.cuda_available = torch.cuda.is_available()
        if self.cuda_available:
            torch.cuda.empty_cache()
            # reset_max_memory_allocated() is deprecated; this resets the same peak gauge.
            torch.cuda.reset_peak_memory_stats()
            self.begin = torch.cuda.memory_allocated()
        else:
            self.begin = 0
        self.process = psutil.Process()

        self.cpu_begin = self.cpu_mem_used()
        # Seed the peak here so __exit__ can never observe a missing attribute,
        # even if the monitor thread has not been scheduled yet.
        self.cpu_peak = self.cpu_begin
        self.peak_monitoring = True
        self._peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
        self._peak_monitor_thread.daemon = True
        self._peak_monitor_thread.start()
        return self

    def cpu_mem_used(self):
        """get resident set size memory for the current process"""
        return self.process.memory_info().rss

    def peak_monitor_func(self):
        """Poll the process RSS in a tight loop and keep the maximum in ``cpu_peak``."""
        if not hasattr(self, "cpu_peak"):
            self.cpu_peak = -1

        while True:
            self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak)

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            # time.sleep(0.001) # 1msec

            if not self.peak_monitoring:
                break

    def __exit__(self, *exc):
        # Stop the monitor and wait for it, so cpu_peak is final before it is read.
        self.peak_monitoring = False
        monitor_thread = getattr(self, "_peak_monitor_thread", None)
        if monitor_thread is not None:
            monitor_thread.join()

        gc.collect()
        if self.cuda_available:
            torch.cuda.empty_cache()
            self.end = torch.cuda.memory_allocated()
            self.peak = torch.cuda.max_memory_allocated()
        else:
            self.end = 0
            self.peak = 0
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)

        self.cpu_end = self.cpu_mem_used()
        self.cpu_used = b2mb(self.cpu_end - self.cpu_begin)
        self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin)
        # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")
        # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")


def _print_memory_report(accelerator, tracemalloc, phase):
    """Print the GPU and CPU memory figures collected by a TorchTracemalloc block."""
    for device_name, begin, used, peaked in (
        ("GPU", tracemalloc.begin, tracemalloc.used, tracemalloc.peaked),
        ("CPU", tracemalloc.cpu_begin, tracemalloc.cpu_used, tracemalloc.cpu_peaked),
    ):
        accelerator.print(f"{device_name} Memory before entering the {phase} : {b2mb(begin)}")
        accelerator.print(f"{device_name} Memory consumed at the end of the {phase} (end-begin): {used}")
        accelerator.print(f"{device_name} Peak Memory consumed during the {phase} (max-begin): {peaked}")
        accelerator.print(
            f"{device_name} Total Peak Memory consumed during the {phase} (max): {peaked + b2mb(begin)}"
        )


def main():
    """Fine-tune a causal LM with LoRA on a text -> label task and save/push the adapter.

    Pipeline: load and split the dataset, tokenize it into fixed-length left-padded
    sequences, train with Accelerate while reporting memory usage, generate
    predictions on the 'eval' split after every epoch, optionally predict on the
    'test' split, then save the adapter locally and push it to the Hugging Face Hub.

    The Hub token is read from the ``HF_TOKEN`` environment variable. It must never be
    written into the source; any token that was previously committed here should be
    revoked.

    Raises:
        RuntimeError: if the dataset cannot be loaded or the number of generated
            predictions does not match the number of reference labels.
    """
    accelerator = Accelerator()
    model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    dataset_name = "math-ai/AutoMathText"
    peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
    text_column = "text"
    label_column = "meta"
    lr = 3e-3
    num_epochs = 2
    batch_size = 4
    seed = 42
    max_length = 512
    max_new_tokens = 10
    do_test = False
    set_seed(seed)

    # Keep the full DatasetDict: the 'eval' and 'test' splits are needed further down.
    dataset = advanced_data_loader(dataset_name)
    if dataset is None:
        raise RuntimeError(f"Could not load dataset {dataset_name!r}")
    raw_columns = dataset["train"].column_names

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    if tokenizer.pad_token_id is None:
        # Left-padding below needs a concrete pad id.
        tokenizer.pad_token_id = tokenizer.eos_token_id

    def left_pad_and_truncate(values, pad_value):
        """Left-pad ``values`` to ``max_length``; if too long keep the LAST ``max_length`` items.

        Keeping the tail preserves the label tokens and the trailing "Label : " cue,
        which sit at the end of every sequence.
        """
        padded = [pad_value] * (max_length - len(values)) + values
        return padded[-max_length:]

    def build_prompts(examples):
        return [f"{text_column} : {x} Label : " for x in examples[text_column]]

    def preprocess_function(examples):
        """Tokenize prompt + label; the loss is only computed on the label tokens."""
        prompts = tokenizer(build_prompts(examples))
        # don't add bos token because we concatenate with inputs
        targets = tokenizer([str(x) for x in examples[label_column]], add_special_tokens=False)
        input_ids, attention_mask, labels = [], [], []
        for prompt_ids, target_ids in zip(prompts["input_ids"], targets["input_ids"]):
            target_ids = target_ids + [tokenizer.eos_token_id]
            sequence = prompt_ids + target_ids
            input_ids.append(left_pad_and_truncate(sequence, tokenizer.pad_token_id))
            attention_mask.append(left_pad_and_truncate([1] * len(sequence), 0))
            # -100 marks prompt and padding positions as ignored by the loss.
            labels.append(left_pad_and_truncate([-100] * len(prompt_ids) + target_ids, -100))
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

    def test_preprocess_function(examples):
        """Tokenize the prompt only (no labels) for generation."""
        prompts = tokenizer(build_prompts(examples))
        return {
            "input_ids": [left_pad_and_truncate(ids, tokenizer.pad_token_id) for ids in prompts["input_ids"]],
            "attention_mask": [left_pad_and_truncate(mask, 0) for mask in prompts["attention_mask"]],
        }

    with accelerator.main_process_first():
        # The raw columns are removed so the collator only sees tensor-friendly fields.
        train_dataset = dataset["train"].map(
            preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=raw_columns,
            load_from_cache_file=True,
            desc="Running tokenizer on dataset",
        )
    accelerator.wait_for_everyone()

    with accelerator.main_process_first():
        eval_dataset = dataset["eval"].map(
            test_preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=raw_columns,
            load_from_cache_file=False,
            desc="Running tokenizer on dataset",
        )
        test_dataset = dataset["test"].map(
            test_preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=raw_columns,
            load_from_cache_file=False,
            desc="Running tokenizer on dataset",
        )

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )
    eval_dataloader = DataLoader(
        eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )
    test_dataloader = DataLoader(
        test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )

    # creating model
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # lr scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

    model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler
    )
    accelerator.print(model)

    is_ds_zero_3 = False
    if getattr(accelerator.state, "deepspeed_plugin", None):
        is_ds_zero_3 = accelerator.state.deepspeed_plugin.zero_stage == 3

    def generate_predictions(dataloader):
        """Generate up to ``max_new_tokens`` tokens per example and decode only the new tokens."""
        decoded = []
        for batch in tqdm(dataloader):
            batch = {k: v for k, v in batch.items() if k != "labels"}
            with torch.no_grad():
                outputs = accelerator.unwrap_model(model).generate(
                    **batch, synced_gpus=is_ds_zero_3, max_new_tokens=max_new_tokens
                )  # synced_gpus=True for DS-stage 3
            outputs = accelerator.pad_across_processes(outputs, dim=1, pad_index=tokenizer.pad_token_id)
            preds = accelerator.gather_for_metrics(outputs)
            # Every prompt is exactly max_length tokens, so the rest is the generation.
            preds = preds[:, max_length:].detach().cpu().numpy()
            decoded.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))
        return decoded

    # Reference labels, stringified the same way as the training targets.
    eval_labels = [str(x) for x in dataset["eval"][label_column]]

    for epoch in range(num_epochs):
        with TorchTracemalloc() as tracemalloc:
            model.train()
            total_loss = 0
            for batch in tqdm(train_dataloader):
                outputs = model(**batch)
                loss = outputs.loss
                total_loss += loss.detach().float()
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
        # Printing the GPU/CPU memory usage details such as allocated memory, peak memory, and total memory usage
        _print_memory_report(accelerator, tracemalloc, "train")
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        accelerator.print(f"{epoch=}: {train_ppl=} {train_epoch_loss=}")

        model.eval()
        with TorchTracemalloc() as tracemalloc:
            eval_preds = generate_predictions(eval_dataloader)
        _print_memory_report(accelerator, tracemalloc, "eval")

        if len(eval_preds) != len(eval_labels):
            raise RuntimeError(f"{len(eval_preds)} != {len(eval_labels)}")
        correct = sum(pred.strip() == true.strip() for pred, true in zip(eval_preds, eval_labels))
        accuracy = correct / len(eval_labels) * 100 if eval_labels else 0.0
        accelerator.print(f"{accuracy=}")
        accelerator.print(f"{eval_preds[:10]=}")
        accelerator.print(f"{eval_labels[:10]=}")

    if do_test:
        model.eval()
        test_preds = generate_predictions(test_dataloader)

        test_df = dataset["test"].to_pandas()
        if len(test_preds) != len(test_df):
            raise RuntimeError(f"{len(test_preds)} != {len(test_df)}")
        # No closed label set is defined for this task, so the cleaned prediction is
        # simply the stripped generation; the raw text is kept alongside it.
        test_df["text_labels_orig"] = test_preds
        test_df[label_column] = [pred.strip() for pred in test_preds]
        accelerator.print(test_df[[text_column, label_column]].sample(min(20, len(test_df))))

        # Use the dataset's "ID" column when present, otherwise the row index.
        ids = test_df["ID"] if "ID" in test_df.columns else test_df.index
        pred_df = test_df[[label_column]].rename(columns={label_column: "Label"})
        pred_df.insert(0, "ID", ids)

        os.makedirs(f"data/{dataset_name}", exist_ok=True)
        pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False)

    accelerator.wait_for_everyone()
    peft_model_id = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace(
        "/", "_"
    )
    # accelerator.prepare() may wrap the model; save/push the underlying PEFT model.
    unwrapped_model = accelerator.unwrap_model(model)
    if accelerator.is_main_process:
        # Option2: Saving the model locally
        unwrapped_model.save_pretrained(peft_model_id)
        # Option1: Pushing the model to Hugging Face Hub
        # SECURITY: the token comes from the environment, never from the source file.
        # When HF_TOKEN is unset, token=None is passed and authentication is left to
        # the library (e.g. credentials stored by `huggingface-cli login`).
        # A token can be created at https://huggingface.co/settings/token
        unwrapped_model.push_to_hub(peft_model_id, token=os.environ.get("HF_TOKEN"))

    accelerator.wait_for_everyone()


# Run the fine-tuning pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Splits:  dict_keys(['train', 'test', 'eval'])
Columns:  {'train': ['url', 'text', 'date', 'meta'], 'test': ['url', 'text', 'date', 'meta'], 'eval': ['url', 'text', 'date', 'meta']}


Running tokenizer on dataset:   0%|          | 0/2155654 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (5969 > 2048). Running this sequence through the model will result in indexing errors


In [None]:
# Prompt: no column names are chosen up front, so every column of the dataset is
# used for training; discover the columns with the `datasets` API.

dataset = load_dataset("my_dataset", split="train")
# column_names is read from the dataset schema, so it also works when the split
# is empty (indexing dataset[0] would raise) and does not decode a row.
columns = dataset.column_names
# Every column is kept, so the loaded split already is the training set; an
# identity map over all rows would only copy the data.
train_dataset = dataset


In [None]:
from typing import Union,List, Dict, Optional
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
import multiprocessing
def advanced_data_loader(input: Union[str, Dict[str, str]], format: Optional[str] = None, split_ratios: Optional[Dict[str, float]] = None) -> "Optional[DatasetDict]":
    """
    Load a dataset and re-split its 'train' split into train/test/eval.

    :param input: A dataset name or directory (str), or a dict mapping split names to file paths.
    :param format: 'csv' or 'json' when ``input`` is a dict of files, 'text' when ``input`` is a
        directory, or None when ``input`` is a dataset name.
    :param split_ratios: Dict with 'test' and 'eval' fractions of the original 'train' split
        (defaults to 0.1 each). The 'train' entry is informational: the training share is
        whatever remains after the two hold-out fractions.
    :return: A DatasetDict with 'train', 'test' and 'eval' splits, or None (after a warning) when
        the arguments are invalid, the files are missing, or there is no 'train' split to divide.
    """
    if split_ratios is None:
        split_ratios = {'train': 0.8, 'test': 0.1, 'eval': 0.1}

    # Validate the ratios before loading anything, so a bad configuration fails fast
    # instead of surfacing as a KeyError/ValueError after a long load.
    try:
        test_ratio = float(split_ratios['test'])
        eval_ratio = float(split_ratios['eval'])
    except (KeyError, TypeError, ValueError):
        warnings.warn("split_ratios must provide numeric 'test' and 'eval' entries.")
        return None
    holdout_ratio = test_ratio + eval_ratio
    if test_ratio <= 0 or eval_ratio <= 0 or holdout_ratio >= 1:
        warnings.warn("split_ratios 'test' and 'eval' must be positive and sum to less than 1.")
        return None

    try:
        # Load the dataset
        if isinstance(input, dict) and format in ['csv', 'json']:
            dataset = load_dataset(format, data_files=input)
        elif isinstance(input, str) and format == 'text':
            dataset = load_dataset(format, data_dir=input)
        elif isinstance(input, str) and format is None:
            dataset = load_dataset(input)
        else:
            warnings.warn("Invalid input or format. Please provide a valid dataset name, directory, or file paths.")
            return None
    except FileNotFoundError as e:
        warnings.warn(str(e))
        return None

    # Only the 'train' split is divided; without it there is nothing to split.
    if 'train' not in dataset:
        warnings.warn("The loaded dataset has no 'train' split to divide.")
        return None

    # First carve off the combined hold-out share, then divide it into test and eval.
    split_dataset = dataset['train'].train_test_split(test_size=holdout_ratio)
    test_eval_dataset = split_dataset['test'].train_test_split(test_size=eval_ratio / holdout_ratio)
    dataset = DatasetDict({
        'train': split_dataset['train'],
        'test': test_eval_dataset['train'],
        'eval': test_eval_dataset['test']
    })

    print("Splits: ", dataset.keys())
    print("Columns: ", {split: dataset[split].column_names for split in dataset.keys()})
    return dataset


def create_tokenizer(model_name_or_path: str = 'gpt2') -> AutoTokenizer:
    """Load a tokenizer and fill in any special-token ids that are unset.

    The pad id falls back to the EOS id; every other unset special-token id
    (bos, eos, unk, sep, cls, mask, checked in that order) falls back to the
    pad id.
    """
    tok = AutoTokenizer.from_pretrained(model_name_or_path)

    # Padding first: the remaining fallbacks all copy from it.
    if tok.pad_token_id is None:
        tok.pad_token_id = tok.eos_token_id

    fallback_to_pad = (
        'bos_token_id',
        'eos_token_id',
        'unk_token_id',
        'sep_token_id',
        'cls_token_id',
        'mask_token_id',
    )
    for attr_name in fallback_to_pad:
        if getattr(tok, attr_name) is not None:
            continue
        setattr(tok, attr_name, tok.pad_token_id)
    return tok
def preprocess_function(examples, tokenizer, max_length, columns):
    """
    Tokenize the selected columns of a batch into fixed-length model inputs.

    Each value of each listed column becomes one output row, so a batch of N
    examples with C columns yields N * C rows, grouped column by column.

    Args:
        examples: A mapping from column name to the list of values in the batch.
        tokenizer: A callable tokenizer accepting ``padding``, ``truncation`` and
            ``max_length`` and returning 'input_ids' and 'attention_mask'.
        max_length: The length every tokenized row is padded/truncated to.
        columns: The column names of ``examples`` to tokenize.

    Returns:
        A dictionary with 'input_ids' and 'attention_mask' as tensors.
    """
    input_ids = []
    attention_mask = []

    for column in columns:
        # One string per example. (Stringifying the whole column and splitting it
        # with list() would tokenize the individual characters of its repr.)
        texts = [str(value) for value in examples[column]]
        tokenized_outputs = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length)

        # The tokenizer already pads/truncates to max_length, and its own mask is
        # used because comparing against pad_token_id would also hide genuine
        # tokens whenever the pad id is shared with another token (e.g. EOS).
        input_ids.extend(tokenized_outputs['input_ids'])
        attention_mask.extend(tokenized_outputs['attention_mask'])

    model_inputs = {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask)}
    return model_inputs
# Only a 'train' file is needed: advanced_data_loader derives the test/eval splits from it.
dataset = advanced_data_loader({'train': '/content/drive/MyDrive/MetaMathQA-395K.csv'}, 'csv')
# print(dataset)
if dataset:
    # Define everything the map call needs in this cell instead of relying on
    # variables left over from other cells.
    tokenizer = create_tokenizer()
    columns = dataset['train'].column_names
    processed_datasets = dataset['train'].map(
        # Forward the batch that map() supplies, not the whole training split.
        lambda examples: preprocess_function(examples=examples, tokenizer=tokenizer, max_length=128, columns=columns),
        batched=True,
        num_proc=multiprocessing.cpu_count(),
        remove_columns=columns,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
    )

In [None]:
from typing import Union, Dict, Optional
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
import multiprocessing
import warnings
import torch

# Function to load and optionally split a dataset
def advanced_data_loader(input: Union[str, Dict[str, str]], format: Optional[str] = None, split_ratios: Optional[Dict[str, float]] = None) -> "Optional[DatasetDict]":
    """
    Load a dataset and re-split its 'train' split into train/test/eval.

    :param input: A dataset name or directory (str), or a dict mapping split names to file paths.
    :param format: 'csv' or 'json' when ``input`` is a dict of files, 'text' when ``input`` is a
        directory, or None when ``input`` is a dataset name.
    :param split_ratios: Dict with 'test' and 'eval' fractions of the original 'train' split
        (defaults to 0.1 each). The 'train' entry is informational: the training share is
        whatever remains after the two hold-out fractions.
    :return: A DatasetDict with 'train', 'test' and 'eval' splits, or None (after a warning) when
        the arguments are invalid, the files are missing, or there is no 'train' split to divide.
    """
    if split_ratios is None:
        split_ratios = {'train': 0.8, 'test': 0.1, 'eval': 0.1}

    # Validate the ratios before loading anything, so a bad configuration fails fast
    # instead of surfacing as a KeyError/ValueError after a long load.
    try:
        test_ratio = float(split_ratios['test'])
        eval_ratio = float(split_ratios['eval'])
    except (KeyError, TypeError, ValueError):
        warnings.warn("split_ratios must provide numeric 'test' and 'eval' entries.")
        return None
    holdout_ratio = test_ratio + eval_ratio
    if test_ratio <= 0 or eval_ratio <= 0 or holdout_ratio >= 1:
        warnings.warn("split_ratios 'test' and 'eval' must be positive and sum to less than 1.")
        return None

    try:
        if isinstance(input, dict) and format in ['csv', 'json']:
            dataset = load_dataset(format, data_files=input)
        elif isinstance(input, str) and format == 'text':
            dataset = load_dataset(format, data_dir=input)
        elif isinstance(input, str) and format is None:
            dataset = load_dataset(input)
        else:
            warnings.warn("Invalid input or format. Please provide a valid dataset name, directory, or file paths.")
            return None
    except FileNotFoundError as e:
        warnings.warn(str(e))
        return None

    # Only the 'train' split is divided; without it there is nothing to split.
    if 'train' not in dataset:
        warnings.warn("The loaded dataset has no 'train' split to divide.")
        return None

    # First carve off the combined hold-out share, then divide it into test and eval.
    split_dataset = dataset['train'].train_test_split(test_size=holdout_ratio)
    test_eval_dataset = split_dataset['test'].train_test_split(test_size=eval_ratio / holdout_ratio)
    dataset = DatasetDict({
        'train': split_dataset['train'],
        'test': test_eval_dataset['train'],
        'eval': test_eval_dataset['test']
    })
    return dataset
# def preprocess_function(batch: Dict[str, Any]) -> Dict[str, Any]:
#     """
#     Tokenizes the input and instruction pairs in a batch using the T5 tokenizer
#     from the Google/flan-t5-base model, and returns a dictionary containing the
#     encoded inputs and labels.

#     Args:
#         batch: A dictionary containing at least two keys, "instruction" and
#         "input", whose values are lists of strings.

#     Returns:
#         A dictionary containing the encoded inputs and labels, as returned by
#         the T5 tokenizer.
#     """
#     model_name = "google/flan-t5-base"
#     tokenizer = T5Tokenizer.from_pretrained(model_name)

#     encoded_inputs = tokenizer(
#         list(batch["instruction"]),
#         list(batch["input"]),
#         padding="max_length",
#         truncation=True,
#         return_tensors="np",
#     )

#     encoded_inputs["labels"] = encoded_inputs["input_ids"].copy()

#     return dict(encoded_inputs)
# Function to create a tokenizer with proper token configurations
def create_tokenizer(model_name_or_path: str = 'gpt2') -> AutoTokenizer:
    """Load a tokenizer and point every unset special-token id at the EOS id.

    The ids are checked in the order pad, bos, eos, unk, sep, cls, mask; an id
    that is already set is left untouched.
    """
    tok = AutoTokenizer.from_pretrained(model_name_or_path)
    id_attributes = (
        'pad_token_id',
        'bos_token_id',
        'eos_token_id',
        'unk_token_id',
        'sep_token_id',
        'cls_token_id',
        'mask_token_id',
    )
    for attr_name in id_attributes:
        if getattr(tok, attr_name) is not None:
            continue
        # The EOS id is read at assignment time, once per missing attribute.
        setattr(tok, attr_name, tok.eos_token_id)
    return tok

def preprocess_function(examples, tokenizer, max_length, columns):
    """Join the selected columns of each example into one text and tokenize it.

    Args:
        examples: A mapping from column name to the list of values in the batch.
        tokenizer: A callable tokenizer accepting ``padding``, ``truncation`` and
            ``max_length`` and returning 'input_ids' and 'attention_mask'.
        max_length: The length every tokenized row is padded/truncated to.
        columns: The column names whose values are concatenated (space-separated,
            in this order) for each example.

    Returns:
        A dict with the tokenizer outputs plus 'labels': a copy of 'input_ids' in
        which padding positions are replaced by -100 so they are ignored by the
        language-modeling loss.

    Raises:
        ValueError: if any of ``columns`` is missing from ``examples``.
    """
    # Check for missing columns
    missing_columns = [col for col in columns if col not in examples]
    if missing_columns:
        raise ValueError(f"Missing columns: {missing_columns} in the examples")

    # zip(*columns) walks the batch row by row; each row becomes one text.
    concatenated_texts = [
        ' '.join(str(value) for value in row)
        for row in zip(*(examples[col] for col in columns))
    ]

    # Tokenize the concatenated texts
    tokenized_outputs = tokenizer(concatenated_texts, padding='max_length', truncation=True, max_length=max_length)
    # Mask with the attention mask rather than the pad id: when the pad id is shared
    # with EOS, comparing ids would also hide genuine EOS tokens.
    tokenized_outputs["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized_outputs["input_ids"], tokenized_outputs["attention_mask"])
    ]
    return dict(tokenized_outputs)

# Example of loading a dataset and preprocessing it
# Only a 'train' file is registered: advanced_data_loader derives the test/eval
# splits from 'train', so also listing the same CSV as 'test' would just parse it twice.
dataset = advanced_data_loader({'train': '/content/drive/MyDrive/MetaMathQA-395K.csv'}, 'csv')
# dataset=advanced_data_loader("fka/awesome-chatgpt-prompts")
if dataset:
    tokenizer = create_tokenizer()
    columns = dataset['train'].column_names
    # Tokenize the training split; the raw columns are dropped so only the
    # fields returned by preprocess_function remain.
    processed_datasets = dataset['train'].map(
        lambda examples: preprocess_function(examples=examples, tokenizer=tokenizer, max_length=128, columns=columns),
        batched=True,
        num_proc=1,
        remove_columns=columns,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
    )

Running tokenizer on dataset:   0%|          | 0/316000 [00:00<?, ? examples/s]

In [None]:
# Index the row first, then the column: this reads one example instead of
# materialising the whole 'labels' column just to print its first entry.
print(processed_datasets[0]['labels'])

[1532, 262, 3650, 13339, 257, 10147, 329, 720, 1238, 290, 6673, 257, 1542, 4, 7630, 10330, 11, 475, 262, 10147, 318, 3058, 319, 5466, 329, 2026, 4, 572, 262, 6301, 2756, 11, 644, 318, 262, 1459, 2756, 286, 262, 10147, 30, 402, 12310, 62, 6207, 11840, 839, 383, 3650, 8155, 262, 10147, 329, 720, 1238, 290, 2087, 257, 1542, 4, 7630, 10330, 11, 523, 262, 6301, 2756, 878, 262, 9780, 318, 720, 1238, 1343, 7198, 1238, 1635, 657, 13, 1270, 8, 796, 720, 1238, 1343, 720, 21, 796, 720, 2075, 13, 198, 464, 10147, 318, 3058, 319, 5466, 329, 2026, 4, 572, 11, 523, 262, 9780, 2033, 318, 720, 2075, 1635, 657, 13, 1120, 796, 720, 1485, 13, 198, 464, 1459, 2756, 286, 262, 10147, 318, 262, 6301]


In [None]:

from typing import Union, Dict, Optional
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
    set_seed,
)
import multiprocessing
import warnings
import torch
from torch.utils.data import DataLoader
# Number of examples per training batch.
batch_size = 16

# Shuffled loader over the tokenized training data; batches are assembled by
# default_data_collator and placed in pinned memory.
train_dataloader = DataLoader(
    processed_datasets,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)

In [None]:
import os
# transformers' own AdamW is deprecated in favour of the PyTorch implementation;
# the name AdamW stays bound so the rest of the notebook is unaffected.
from torch.optim import AdamW
from tqdm.auto import tqdm

# Set the seed for reproducibility
set_seed(42)

# Check if a GPU is available and use CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model
model_name_or_path = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)

# Prepare optimizer and schedule (linear warm-up and decay)
# Parameters are split by name into a "decay" and a "no decay" group. Both groups
# pass weight_decay explicitly (currently 0.0 for each), so the optimizer's own
# default weight decay is never applied.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5)

# One scheduler step per optimizer step, over every batch of every epoch.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

{'input_ids': tensor([[ 1532, 23386,  7317,  ..., 50256, 50256, 50256],
         [35475,  2227,   284,  ...,  1321,  1813,    25],
         [  464,  2811,   286,  ...,  2160,  2229,   477],
         ...,
         [ 1532, 18496,   460,  ...,   198,  4242,  2310],
         [ 9771,  3129,   378,  ..., 50256, 50256, 50256],
         [ 2514,  4911,  1160,  ...,  7029,    12, 11645]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'labels': tensor([[ 1532, 23386,  7317,  ..., 50256, 50256, 50256],
         [35475,  2227,   284,  ...,  1321,  1813,    25],
         [  464,  2811,   286,  ...,  2160,  2229,   477],
         ...,
         [ 1532, 18496,   460,  ...,   198,  4242,  2310],
         [ 9771,  3129,   378,  ..., 50256, 50256, 50256],
         [ 2514,  4911,  1160,  ...,  7029,    12, 116

In [None]:
def _tokenize_split(split_name):
    """Tokenize the named split of `dataset` with the settings used for training."""
    return dataset[split_name].map(
        lambda examples: preprocess_function(examples=examples, tokenizer=tokenizer, max_length=128, columns=columns),
        batched=True,
        num_proc=1,
        remove_columns=columns,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
    )


# Evaluation split: tokenized, then served in order (no shuffling).
processed_datasets_eval = _tokenize_split('eval')
eval_dataloader = DataLoader(
    processed_datasets_eval,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    pin_memory=True,
)

# Test split: same treatment as the evaluation split.
processed_datasets_test = _tokenize_split('test')
test_dataloader = DataLoader(
    processed_datasets_test,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    pin_memory=True,
)


Running tokenizer on dataset:   0%|          | 0/39500 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/39500 [00:00<?, ? examples/s]

In [None]:
# Usage examples for advanced_data_loader; only the CSV call below is active.
#
# For a dataset from Hugging Face
# advanced_data_loader('name_of_dataset')

# For a local CSV file. Note that the 'train' and 'test' keys point at the
# same file here.
dataset=advanced_data_loader({'train': '/content/drive/MyDrive/MetaMathQA-395K.csv', 'test': '/content/drive/MyDrive/MetaMathQA-395K.csv'}, 'csv')
# Print the returned object to inspect its splits and row counts.
print(dataset)
# For a local JSON file
# advanced_data_loader({'train': 'path_to_train.json', 'test': 'path_to_test.json'}, 'json')

# For a local folder
# advanced_data_loader('path_to_your_folder', 'text')


Splits:  dict_keys(['train', 'test', 'eval'])
Columns:  {'train': ['query', 'type', 'response'], 'test': ['query', 'type', 'response'], 'eval': ['query', 'type', 'response']}
DatasetDict({
    train: Dataset({
        features: ['query', 'type', 'response'],
        num_rows: 316000
    })
    test: Dataset({
        features: ['query', 'type', 'response'],
        num_rows: 39500
    })
    eval: Dataset({
        features: ['query', 'type', 'response'],
        num_rows: 39500
    })
})


In [None]:
dataset=advanced_data_loader({'train': '/content/drive/MyDrive/MetaMathQA-395K.csv', 'test': '/content/drive/MyDrive/MetaMathQA-395K.csv'}, 'csv')
# print(dataset)

import multiprocessing

# Create the tokenizer first: the map call below needs it.
tokenizer = create_tokenizer('gpt2')
columns = dataset['train'].column_names
print(columns)

# Tokenize the training split batch by batch. The lambda must *call*
# preprocess_function; the earlier keyword-argument list was not a valid
# lambda body and failed with a SyntaxError.
processed_datasets = dataset['train'].map(
    lambda examples: preprocess_function(examples=examples, tokenizer=tokenizer, max_length=128, columns=columns),
    batched=True,
    num_proc=multiprocessing.cpu_count(),
    remove_columns=columns,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

# # Preprocess the dataset
# NOTE(review): this runs preprocess_function over the whole training split in
# one call (max_length=512) in addition to the batched map above - confirm it
# is still wanted.
preprocessed_dataset = preprocess_function(examples=dataset['train'],tokenizer=tokenizer,max_length=512,columns=columns)

# # Create the data loaders
# data_loaders = create_data_loaders(preprocessed_dataset, batch_size=32)

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (<ipython-input-3-6d86c5cb460e>, line 6)

In [None]:
import multiprocessing

# Tokenize the training split to 512 tokens using every available core.
# `dataset` and `tokenizer` come from earlier cells. The lambda has to call
# preprocess_function; a bare keyword-argument list is a SyntaxError.
columns = dataset['train'].column_names
processed_datasets = dataset['train'].map(
    lambda examples: preprocess_function(examples=examples, tokenizer=tokenizer, max_length=512, columns=columns),
    batched=True,
    num_proc=multiprocessing.cpu_count(),
    remove_columns=columns,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)


In [None]:
import warnings
from typing import Union, Dict, Optional
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from torch.utils.data import DataLoader

def advanced_data_loader(input: Union[str, Dict[str, str]], format: Optional[str] = None, split_ratios: Optional[Dict[str, float]] = None) -> Optional[DatasetDict]:
    """Load a dataset and re-split its 'train' split into train/test/eval.

    :param input: Dict of data files (with format 'csv' or 'json'), a folder
        path (with format 'text'), or a dataset name (with format None).
    :param format: 'csv', 'json', 'text' or None; selects how `input` is read.
    :param split_ratios: Mapping with 'test' and 'eval' fractions. Defaults to
        80/10/10; the 'train' entry itself is not read.
    :return: DatasetDict with 'train', 'test' and 'eval' splits, or None when
        the input/format combination is unsupported or a file is missing.
        Only the loaded 'train' split is used; any other loaded split is
        dropped.
    """
    ratios = split_ratios
    if ratios is None:
        ratios = {'train': 0.8, 'test': 0.1, 'eval': 0.1}

    from_files = isinstance(input, dict) and format in ['csv', 'json']
    from_folder = isinstance(input, str) and format == 'text'
    from_name = isinstance(input, str) and format is None

    try:
        if from_files:
            loaded = load_dataset(format, data_files=input)
        elif from_folder:
            loaded = load_dataset(format, data_dir=input)
        elif from_name:
            loaded = load_dataset(input)
        else:
            warnings.warn("Invalid input or format. Please provide a valid dataset name, directory, or file paths.")
            return None
    except FileNotFoundError as e:
        warnings.warn(str(e))
        return None

    # An empty result is handed back untouched.
    if not loaded:
        return loaded

    # First cut: hold out test+eval together; second cut: divide that
    # held-out part between test and eval.
    first_cut = loaded['train'].train_test_split(test_size=ratios['test'] + ratios['eval'])
    second_cut = first_cut['test'].train_test_split(test_size=ratios['eval'] / (ratios['test'] + ratios['eval']))
    return DatasetDict({
        'train': first_cut['train'],
        'test': second_cut['train'],
        'eval': second_cut['test']
    })

def create_tokenizer(model_name_or_path: str = 'gpt2') -> AutoTokenizer:
    """Load the tokenizer for `model_name_or_path` and make sure it can pad.

    When the loaded tokenizer has no pad token id, the EOS token id is used
    for padding.
    """
    loaded = AutoTokenizer.from_pretrained(model_name_or_path)
    missing_pad = loaded.pad_token_id is None
    if missing_pad:
        loaded.pad_token_id = loaded.eos_token_id
    return loaded

def preprocess_datasets(datasets: DatasetDict, tokenizer: PreTrainedTokenizerBase, text_column: str, label_column: str, max_length: int, padding: str = 'max_length', truncation: bool = True) -> DatasetDict:
    """Tokenize a text column and a label column and join them per example.

    :param datasets: Object exposing ``map(function, batched=True)``.
    :param tokenizer: Callable tokenizer returning 'input_ids' and
        'attention_mask' lists for a batch of strings.
    :param text_column: Column holding the input text.
    :param label_column: Column holding the target text.
    :param max_length: Maximum token count applied to each column separately.
    :param padding: Padding strategy forwarded to the tokenizer.
    :param truncation: Truncation flag forwarded to the tokenizer.
    :return: Whatever ``datasets.map`` returns. Each row's 'input_ids' /
        'attention_mask' is the text encoding followed by the label encoding,
        so a row can be up to ``2 * max_length`` tokens long.
    """
    def tokenize_function(examples):
        tokenized_inputs = tokenizer(examples[text_column], padding=padding, truncation=truncation, max_length=max_length)
        tokenized_labels = tokenizer(examples[label_column], padding=padding, truncation=truncation, max_length=max_length)
        # Join row by row. `+=` on the batch lists would append the label rows
        # after the input rows and double the batch length instead.
        for key in ('input_ids', 'attention_mask'):
            tokenized_inputs[key] = [
                list(text_part) + list(label_part)
                for text_part, label_part in zip(tokenized_inputs[key], tokenized_labels[key])
            ]
        return tokenized_inputs

    return datasets.map(tokenize_function, batched=True)

def create_data_loaders(datasets: DatasetDict, batch_size: int) -> Dict[str, DataLoader]:
    """Build one DataLoader per split, keyed by split name.

    Each loader uses the given `batch_size` and DataLoader's defaults for
    everything else.
    """
    loaders = {}
    for split in datasets.keys():
        loaders[split] = DataLoader(datasets[split], batch_size=batch_size)
    return loaders

# Run the pipeline step by step. Errors are allowed to propagate: inside a
# notebook `exit(1)` did not stop the cell, so a failed step was followed by
# NameErrors from the steps that depended on it.

# Load the dataset. advanced_data_loader reports problems by returning None,
# so that case is turned into an explicit error here.
dataset = advanced_data_loader({'train': '/content/drive/MyDrive/MetaMathQA-395K.csv', 'test': '/content/drive/MyDrive/MetaMathQA-395K.csv'}, 'csv')
if dataset is None:
    raise RuntimeError("Error loading dataset: advanced_data_loader returned None")

# Create the tokenizer
tokenizer = create_tokenizer('gpt2')

# Preprocess the dataset. The CSV has the columns 'query', 'type' and
# 'response'; there is no 'text' column, so 'query' is used as the input text.
preprocessed_dataset = preprocess_datasets(dataset, tokenizer, text_column='query', label_column='response', max_length=128)

# Create the data loaders
data_loaders = create_data_loaders(preprocessed_dataset, batch_size=32)


Map:   0%|          | 0/316000 [00:00<?, ? examples/s]

Error preprocessing dataset: 'text'
Error creating data loaders: name 'preprocessed_dataset' is not defined


In [None]:

# Create the tokenizer
tokenizer = create_tokenizer(model_name_or_path='gpt2')

# Preprocess the data
# NOTE(review): `preprocess_data` is not defined in the cells visible around
# this one - confirm it exists earlier in the notebook, or whether
# `preprocess_datasets` / `preprocess_function` was meant.
preprocessed_dataset = preprocess_data(dataset, tokenizer, max_length=512)

# Print the preprocessed data
print(preprocessed_dataset)


In [None]:
from transformers import AutoTokenizer

# Define the tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Padding falls back to the EOS token when the tokenizer has no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Every other special token that is still unset is pointed at the pad token.
# They are checked in this order: bos (beginning of sentence), eos (end of
# sentence), unk (unknown), sep (separator), cls (classification), mask.
for _special in ('bos', 'eos', 'unk', 'sep', 'cls', 'mask'):
    _id_attr = f'{_special}_token_id'
    if getattr(tokenizer, _id_attr) is None:
        setattr(tokenizer, _id_attr, tokenizer.pad_token_id)


In [None]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("fka/awesome-chatgpt-prompts")

# Report which splits were loaded
splits = [*dataset.keys()]
print(f"Splits: {splits}")

# Report the column names of every split
for split in splits:
    columns = dataset[split].column_names
    print(f"Columns for {split}: {columns}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/274 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.6k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Splits: ['train']
Columns for train: ['act', 'prompt']


In [None]:
import os
import warnings
import multiprocessing
import torch
import torch.nn as nn

from typing import Union, List, Dict
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from transformers import AutoModelForCausalLM, AutoTokenizer

def create_tokenizer(model_name_or_path: str = 'gpt2') -> AutoTokenizer:
    """Return the pretrained tokenizer for `model_name_or_path`.

    A tokenizer without a pad token id gets its EOS token id as pad token id.
    """
    tok = AutoTokenizer.from_pretrained(model_name_or_path)
    if tok.pad_token_id is not None:
        return tok
    tok.pad_token_id = tok.eos_token_id
    return tok

def advanced_data_loader(input: Union[str, dict], format: str = None):
    """Load a dataset from data files, a text folder or a dataset name.

    :param input: Dict of data files (format 'csv' or 'json'), or a string
        that is a folder path (format 'text') or a dataset name (format None).
    :param format: 'csv', 'json', 'text' or None.
    :return: The loaded dataset after printing its splits and column names,
        or None (with a warning) for unsupported arguments or missing files.
    """
    if not isinstance(input, (dict, str)):
        warnings.warn("Invalid input type. Please enter a string (for dataset name or folder) or a dictionary (for CSV or JSON files).")
        return None

    if isinstance(input, dict):
        # Data files: only CSV and JSON are accepted.
        if format not in ['csv', 'json']:
            warnings.warn("Invalid format. Please choose 'csv' or 'json'.")
            return None
        try:
            dataset = load_dataset(format, data_files=input)
        except FileNotFoundError:
            warnings.warn("File not found. Please check your file path.")
            return None
    elif format == 'text':
        # Folder of text files: the folder has to exist.
        if not os.path.isdir(input):
            warnings.warn("Invalid directory. Please check your folder path.")
            return None
        try:
            dataset = load_dataset(format, data_dir=input)
        except FileNotFoundError:
            warnings.warn("Directory not found. Please check your folder path.")
            return None
    elif format is None:
        # Plain string without a format: treated as a dataset name.
        try:
            dataset = load_dataset(input)
        except FileNotFoundError:
            warnings.warn("Dataset not found. Please check your dataset name.")
            return None
    else:
        warnings.warn("Invalid input. Please enter a valid dataset name or folder path.")
        return None

    print("Splits: ", dataset.keys())
    print("Columns: ", dataset.column_names)
    return dataset

def preprocess_function(examples, tokenizer, max_length, columns):
    """Tokenize every value of every listed column into fixed-length rows.

    :param examples: Batch mapping each column name to a list of values.
    :param tokenizer: Callable tokenizer exposing ``pad_token_id`` and
        returning 'input_ids' for a list of strings.
    :param max_length: Length every output row is truncated/padded to.
    :param columns: Column names to tokenize, in output order.
    :return: Dict with 'input_ids' and 'attention_mask' tensors. The rows are
        all values of the first column, then all values of the second, and so
        on, so the row count is ``len(columns) * batch size``.
    """
    input_ids = []
    attention_mask = []
    pad_id = tokenizer.pad_token_id

    for column in columns:
        # Convert value by value. `list(str(batch))` would split the printed
        # representation of the whole batch into single characters.
        texts = ["" if value is None else str(value) for value in examples[column]]
        tokenized_outputs = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length)

        for row_tokens in tokenized_outputs['input_ids']:
            # Enforce the fixed length even if the tokenizer did not.
            row_tokens = list(row_tokens[:max_length])
            row_tokens += [pad_id] * (max_length - len(row_tokens))
            input_ids.append(row_tokens)
            # Positions holding the pad id are masked out.
            attention_mask.append([int(token != pad_id) for token in row_tokens])

    model_inputs = {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask)}
    return model_inputs

max_length = 512
model_name_or_path = 'gpt2'
batch_size = 16

tokenizer = create_tokenizer(model_name_or_path)
dataset=advanced_data_loader('fka/awesome-chatgpt-prompts')

# dataset = advanced_data_loader({'train': '/home/khemanth/LLMs_Models/Adversarial_data/data/dataset1.csv', 'test': '/home/khemanth/LLMs_Models/Adversarial_data/data/dataset1.csv'}, 'csv')

# Tokenize every split, using the column names of the first split.
split = list(dataset.keys())
columns = dataset[str(split[0])].column_names
processed_datasets = dataset.map(
    lambda examples: preprocess_function(examples, tokenizer, max_length, columns),
    batched=True,
    num_proc=multiprocessing.cpu_count(),
    remove_columns=columns,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets[str(split[0])]
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)

# NOTE(review): the test loader reads the same (first) split as the training
# loader - confirm whether a separate split was intended.
test_dataset = processed_datasets[str(split[0])]
test_dataloader = DataLoader(
    test_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

lr = 0.001
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# No longer used by the loop below: the model computes the loss itself.
loss_fn = nn.CrossEntropyLoss()

num_epochs = 2
model.train()  # make sure dropout is active while training
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Next-token targets: hand the inputs to the model as labels so it
        # shifts them by one position itself. Comparing logits with the
        # unshifted input_ids would train the model to copy its input.
        # Padded positions get -100 so they are ignored by the loss.
        labels = batch['input_ids'].clone()
        labels[batch['attention_mask'] == 0] = -100
        outputs = model(**batch, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


In [None]:
!pip install -q -U transformers accelerate evaluate deepspeed tqdm datasets peft

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15

In [None]:
import os
import warnings
import multiprocessing
import torch
from typing import Union
from datasets import load_dataset
from transformers import AutoTokenizer
# Usage example - a dataset from Hugging Face:
# advanced_data_loader('name_of_dataset')

# Token length later used by preprocess_function for truncation and padding.
max_length=512
# Define the tokenizer (GPT-2)
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Usage example - a local folder of text files:
# advanced_data_loader('path_to_your_folder', 'text')
def advanced_data_loader(input: Union[str, dict], format: str = None):
    """Load a dataset from data files, a text folder or a dataset name.

    :param input: Dict of data files (format 'csv' or 'json'), or a string
        that is a folder path (format 'text') or a dataset name (format None).
    :param format: 'csv', 'json', 'text' or None.
    :return: The loaded dataset after printing its splits and column names,
        or None (with a warning) for unsupported arguments or missing files.
    """
    # Check if input is a dictionary (for CSV and JSON files).
    # `isinstance(input, dict or str)` only ever tested for dict, because
    # `dict or str` evaluates to `dict`; the check now says what it does.
    if isinstance(input, dict):
        if format in ['csv', 'json']:
            try:
                dataset = load_dataset(format, data_files=input)
            except FileNotFoundError:
                warnings.warn("File not found. Please check your file path.")
                return None
        else:
            warnings.warn("Invalid format. Please choose 'csv' or 'json'.")
            return None
    # Check if input is a string (for dataset name or folder)
    elif isinstance(input, str):
        if format == 'text':
            # A text dataset is read from an existing folder.
            if os.path.isdir(input):
                try:
                    dataset = load_dataset(format, data_dir=input)
                except FileNotFoundError:
                    warnings.warn("Directory not found. Please check your folder path.")
                    return None
            else:
                warnings.warn("Invalid directory. Please check your folder path.")
                return None
        elif format is None:
            # No format given: the string is treated as a dataset name.
            try:
                dataset = load_dataset(input)
            except FileNotFoundError:
                warnings.warn("Dataset not found. Please check your dataset name.")
                return None
        else:
            warnings.warn("Invalid input. Please enter a valid dataset name or folder path.")
            return None
    else:
        warnings.warn("Invalid input type. Please enter a string (for dataset name or folder) or a dictionary (for CSV or JSON files).")
        return None

    print("Splits: ", dataset.keys())
    print("Columns: ", dataset.column_names)
    return dataset
# Use the EOS token for padding when the tokenizer has no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Alternative sources, kept for reference:
# For a local JSON file
# dataset=advanced_data_loader({'train': '/content/drive/MyDrive/MetaMathQA-395K.json', 'test': '/content/drive/MyDrive/MetaMathQA-395K.json'}, 'json')
# advanced_data_loader({'train': 'path_to_train.json', 'test': 'path_to_test.json'}, 'json')
dataset = advanced_data_loader('fka/awesome-chatgpt-prompts')

# Column names of the first split; preprocess_function reads this global.
_first_split = str(list(dataset.keys())[0])
columns = dataset[_first_split].column_names
def preprocess_function(examples):
    """Tokenize each column of a batch into rows of exactly `max_length` ids.

    Reads the module-level `columns`, `tokenizer` and `max_length`. Each
    column name is printed as it is processed. The output rows are all values
    of the first column, then all values of the next column, and so on.

    :param examples: Batch mapping column names to lists of strings.
    :return: Dict with 'input_ids' and 'attention_mask' tensors.
    """
    pad_id = tokenizer.pad_token_id
    all_ids, all_masks = [], []

    for column in columns:
        print(column)
        encoded = tokenizer(examples[column], padding='max_length', truncation=True, max_length=max_length)

        for ids in encoded['input_ids']:
            # Cut to max_length, then fill up with the pad id.
            ids = ids[:max_length]
            ids += [pad_id] * (max_length - len(ids))
            all_ids.append(ids)
            # 1 for every position that does not hold the pad id, else 0.
            all_masks.append([int(token != pad_id) for token in ids])

    return {'input_ids': torch.tensor(all_ids), 'attention_mask': torch.tensor(all_masks)}


# Run preprocess_function over every split, one worker per CPU core,
# dropping the raw columns and bypassing the cache.
_map_options = dict(
    batched=True,
    num_proc=multiprocessing.cpu_count(),
    remove_columns=columns,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
processed_datasets = dataset.map(preprocess_function, **_map_options)

from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup


# The first split of the dataset is the one trained on.
_train_split = str(list(dataset.keys())[0])
train_dataset = processed_datasets[_train_split]

batch_size = 16
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)

Splits:  dict_keys(['train'])
Columns:  {'train': ['act', 'prompt']}


Running tokenizer on dataset (num_proc=2):   0%|          | 0/153 [00:00<?, ? examples/s]

act
act
prompt
prompt


In [None]:
# The string literal was missing its closing quote (a SyntaxError). A CSV file
# is loaded through the data-files dictionary together with the 'csv' format.
dataset=advanced_data_loader({'train': '/content/drive/MyDrive/MetaMathQA-395K.csv'}, 'csv')



In [None]:
import os
import pandas as pd
import json
import torch
from transformers import AutoTokenizer
from typing import Dict, List, Union
import warnings

def load_dataset(local_path: str) -> Dict[str, pd.DataFrame]:
    """Read the CSV/JSON file(s) at `local_path` into DataFrames.

    :param local_path: A directory whose files are loaded (in sorted order),
        or the path of a single CSV/JSON file.
    :return: Dict mapping each loaded file name (without directory) to its
        DataFrame. Unsupported or unreadable files are skipped with a warning.
    :raises OSError: From ``os.listdir`` when `local_path` is neither an
        existing file nor a readable directory.
    """
    if os.path.isfile(local_path):
        # A single file: listing it as a directory would raise
        # NotADirectoryError.
        directory = os.path.dirname(local_path)
        file_names = [os.path.basename(local_path)]
    else:
        directory = local_path
        file_names = sorted(os.listdir(local_path))

    dataset = {}
    for file in file_names:
        file_path = os.path.join(directory, file)
        try:
            if file.endswith('.csv'):
                # on_bad_lines='warn' replaces the removed
                # error_bad_lines=False / warn_bad_lines=True pair.
                dataset[file] = pd.read_csv(file_path, on_bad_lines='warn')
            elif file.endswith('.json'):
                with open(file_path, 'r') as f:
                    dataset[file] = pd.read_json(f)
            else:
                warnings.warn(f"Unsupported file type: {file}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the others.
            warnings.warn(f"Error loading {file}: {str(e)}")
    return dataset

def preprocess_function(examples: Dict[str, pd.DataFrame], columns: List[str], tokenizer, max_length: int) -> Dict[str, torch.Tensor]:
    """Tokenize the given columns and join them into one row per example.

    :param examples: Dict of DataFrames (e.g. one per loaded file).
    :param columns: Columns to tokenize; their tokens are joined in this order.
    :param tokenizer: Callable tokenizer exposing ``pad_token_id`` and
        returning 'input_ids' for a list of strings.
    :param max_length: Length of every output row.
    :return: Dict with 'input_ids' and 'attention_mask' tensors. Each row is
        the concatenated column tokens, truncated to `max_length` and padded
        at the end with the pad token id.
    """
    tokenized_outputs = {column: [] for column in columns}
    for column in columns:
        for example in examples.values():
            try:
                if column in example.columns:
                    # No padding at this stage: padding each column to
                    # max_length before joining would let the truncation below
                    # discard every column except the first. Values are cast
                    # to str so non-text columns can be tokenized as well.
                    column_tokens = tokenizer(
                        example[column].astype(str).tolist(),
                        padding=False,
                        truncation=True,
                        max_length=max_length,
                    )
                    tokenized_outputs[column].extend(column_tokens['input_ids'])
                else:
                    warnings.warn(f"Column {column} not found in dataset.")
            except Exception as e:
                warnings.warn(f"Error in tokenizing column {column}: {str(e)}")
                continue

    pad_id = tokenizer.pad_token_id
    input_ids, attention_mask = [], []
    for i in range(len(tokenized_outputs[columns[0]])):  # Number of examples
        row_tokens = []
        for column in columns:
            row_tokens.extend(tokenized_outputs[column][i])
        row_tokens = row_tokens[:max_length]  # Truncate to max_length if necessary
        n_real = len(row_tokens)
        n_pad = max_length - n_real
        input_ids.append(row_tokens + [pad_id] * n_pad)  # Pad if necessary
        # Mask by position, so real tokens that share the pad id (pad == EOS)
        # stay visible.
        attention_mask.append([1] * n_real + [0] * n_pad)

    model_inputs = {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask)}
    return model_inputs

def main():
    """Load the training data, tokenize it and return the model inputs.

    :return: The dict of 'input_ids' / 'attention_mask' tensors produced by
        preprocess_function (previously computed and then discarded).
    :raises ValueError: If load_dataset returns no data.
    """
    # NOTE(review): the recorded run failed with NotADirectoryError because
    # this path is a file - confirm that load_dataset accepts a file path.
    local_path = '/content/drive/MyDrive/training.csv'
    dataset = load_dataset(local_path)

    if not dataset:
        raise ValueError("No valid dataset found at the specified path.")

    # Column names are taken from the first loaded DataFrame.
    columns = list(next(iter(dataset.values())).columns)
    max_length = 512
    model_name_or_path = "gpt2"

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = 'left'

    # Assuming the dataset is a dictionary with keys as filenames and values as DataFrames
    model_inputs = preprocess_function(dataset, columns, tokenizer, max_length)
    return model_inputs

if __name__ == "__main__":
    main()

NotADirectoryError: [Errno 20] Not a directory: '/content/drive/MyDrive/training.csv'

In [None]:
import gc
import os
import sys
import threading
import multiprocessing
import numpy as np
import psutil
import torch
from accelerate import Accelerator
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
    set_seed,
)

from peft import LoraConfig, TaskType, get_peft_model


def levenshtein_distance(str1, str2):
    """Return the edit distance between two sequences as an int.

    Counts the minimum number of single-element insertions, deletions and
    substitutions that turn `str1` into `str2`.

    :param str1: First string (or other indexable sequence).
    :param str2: Second string (or other indexable sequence).
    :return: The distance as a Python int (always, not only for equal inputs).
    """
    # TC: O(N * M)
    # SC: O(M) - only the previous row of the DP table is kept
    if str1 == str2:
        return 0

    # previous_row[j] is the distance between the processed prefix of str1
    # and the first j elements of str2.
    previous_row = list(range(len(str2) + 1))
    for i, item1 in enumerate(str1, start=1):
        current_row = [i]
        for j, item2 in enumerate(str2, start=1):
            if item1 == item2:
                current_row.append(previous_row[j - 1])
            else:
                current_row.append(
                    1 + min(previous_row[j - 1], previous_row[j], current_row[j - 1])
                )
        previous_row = current_row

    return previous_row[-1]


def get_closest_label(eval_pred, classes):
    """Return the entry of `classes` with the smallest edit distance to `eval_pred`.

    The prediction is stripped of surrounding whitespace before comparison.
    On ties, the earliest entry in `classes` wins.
    """
    best_index, best_distance = sys.maxsize, sys.maxsize
    for index, candidate in enumerate(classes):
        distance = levenshtein_distance(eval_pred.strip(), candidate)
        if not distance < best_distance:
            continue
        best_index, best_distance = index, distance
    return classes[best_index]


# Bytes -> megabytes (1 MB = 2**20 bytes here)
def b2mb(x):
    """Convert a byte count to whole megabytes, dropping the fraction."""
    bytes_per_mb = 2**20
    megabytes = x / bytes_per_mb
    return int(megabytes)


# This context manager is used to track the peak memory usage of the process
class TorchTracemalloc:
    """Measure CUDA and CPU memory used by the code inside a ``with`` block.

    After the block exits, these attributes are available (MB values are
    produced by ``b2mb``):

    - ``begin`` / ``end`` / ``peak``: CUDA bytes allocated at entry, at exit,
      and the maximum in between.
    - ``used`` / ``peaked``: MB difference of ``end`` / ``peak`` to ``begin``.
    - ``cpu_begin`` / ``cpu_end`` / ``cpu_peak``: resident set size in bytes at
      entry, at exit, and the maximum seen by the monitor thread.
    - ``cpu_used`` / ``cpu_peaked``: MB difference of ``cpu_end`` /
      ``cpu_peak`` to ``cpu_begin``.
    """

    def __enter__(self):
        gc.collect()
        torch.cuda.empty_cache()
        # reset the peak gauge to zero; reset_max_memory_allocated() is
        # deprecated and only forwards to this call
        torch.cuda.reset_peak_memory_stats()
        self.begin = torch.cuda.memory_allocated()
        self.process = psutil.Process()

        self.cpu_begin = self.cpu_mem_used()
        # Set before the thread starts so __exit__ can always read it, even if
        # the monitor thread has not run yet.
        self.cpu_peak = -1
        self.peak_monitoring = True
        peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
        peak_monitor_thread.daemon = True
        peak_monitor_thread.start()
        return self

    def cpu_mem_used(self):
        """get resident set size memory for the current process"""
        return self.process.memory_info().rss

    def peak_monitor_func(self):
        """Poll the resident set size until monitoring is switched off."""
        while True:
            self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak)

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            # time.sleep(0.001) # 1msec

            if not self.peak_monitoring:
                break

    def __exit__(self, *exc):
        # Tell the monitor thread to stop after its current iteration.
        self.peak_monitoring = False

        gc.collect()
        torch.cuda.empty_cache()
        self.end = torch.cuda.memory_allocated()
        self.peak = torch.cuda.max_memory_allocated()
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)

        self.cpu_end = self.cpu_mem_used()
        self.cpu_used = b2mb(self.cpu_end - self.cpu_begin)
        self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin)
        # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")




def main():
    """Fine-tune GPT-2 with LoRA on Open-Orca/OpenOrca and evaluate by generation.

    The first split of the dataset is used for training, evaluation and
    (optionally) testing. Each row is built by concatenating the token ids of
    every column, truncating to ``max_length`` and left-padding. After training,
    the model generates 10 new tokens per evaluation row while GPU/CPU memory
    is tracked, an exact-match accuracy against ``columns[1]`` is printed, and
    the adapter weights are saved to ``model`` and ``hemanth_weights``.
    """
    accelerator = Accelerator()
    dataset_name = 'Open-Orca/OpenOrca'
    dataset = load_dataset(dataset_name)
    # Every stage below works on the first split the dataset exposes.
    split_name = str(list(dataset.keys())[0])
    columns = dataset[split_name].column_names
    model_name_or_path = "gpt2"
    peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
    lr = 3e-3
    num_epochs = 20
    batch_size = 8
    seed = 42
    max_length = 64
    do_test = False
    set_seed(seed)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    # Set the padding_side to 'left'
    tokenizer.padding_side = 'left'

    def preprocess_function(examples):
        """Concatenate the token ids of all columns per row, then truncate and left-pad."""
        # Tokenize WITHOUT padding: padding every column to max_length first would
        # make the truncation below discard every column except the first.
        per_column_ids = [
            tokenizer(examples[column], truncation=True, max_length=max_length)['input_ids']
            for column in columns
        ]
        input_ids = []
        attention_mask = []
        for column_rows in zip(*per_column_ids):  # one tuple of token lists per example
            row_tokens = [token for ids in column_rows for token in ids][:max_length]
            pad_len = max_length - len(row_tokens)
            input_ids.append([tokenizer.pad_token_id] * pad_len + row_tokens)
            # Build the mask from the pad length, not from token != pad id,
            # because the pad id may be the EOS id.
            attention_mask.append([0] * pad_len + [1] * len(row_tokens))
        return {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask)}

    with accelerator.main_process_first():
        processed_datasets = dataset.map(
            preprocess_function,
            batched=True,
            num_proc=multiprocessing.cpu_count(),
            remove_columns=columns,
            load_from_cache_file=False,
            desc="Running tokenizer on dataset",
        )

    accelerator.wait_for_everyone()

    # NOTE(review): train, eval and test all point at the same (first) split.
    train_dataset = processed_datasets[split_name]
    eval_dataset = processed_datasets[split_name]
    test_dataset = processed_datasets[split_name]

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )
    eval_dataloader = DataLoader(
        eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )
    test_dataloader = DataLoader(
        test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )

    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    model.config.eos_token_id = tokenizer.pad_token_id
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # lr scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

    model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler
    )

    is_ds_zero_3 = False
    if getattr(accelerator.state, "deepspeed_plugin", None):
        is_ds_zero_3 = accelerator.state.deepspeed_plugin.zero_stage == 3

    for epoch in range(num_epochs):
        model.train()
        for batch in train_dataloader:
            # The causal-LM head shifts labels internally, so labels are simply a
            # copy of input_ids; shifting them here as well would train the model
            # to predict the token two positions ahead.
            labels = batch['input_ids'].clone()
            # -100 is ignored by the loss, so padded positions do not contribute.
            labels[batch['attention_mask'] == 0] = -100
            batch['labels'] = labels

            outputs = model(**batch)

            # outputs.loss should not be None as long as 'labels' are provided
            loss = outputs.loss
            if loss is None:
                raise ValueError("Loss is None - check if the model's forward pass outputs a loss")

            # Perform backward pass and optimization step
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

    # save the model (unwrap first so this also works when accelerate wrapped it)
    accelerator.unwrap_model(model).save_pretrained("model")

    model.eval()
    eval_preds = []
    with TorchTracemalloc() as tracemalloc:
        for _, batch in enumerate(tqdm(eval_dataloader)):
            batch = {k: v for k, v in batch.items() if k != "labels"}
            with torch.no_grad():
                outputs = accelerator.unwrap_model(model).generate(
                    **batch, synced_gpus=is_ds_zero_3, max_new_tokens=10
                )  # synced_gpus=True for DS-stage 3
            outputs = accelerator.pad_across_processes(outputs, dim=1, pad_index=tokenizer.pad_token_id)
            preds = accelerator.gather_for_metrics(outputs)
            # Inputs are exactly max_length long, so everything after is newly generated.
            preds = preds[:, max_length:].detach().cpu().numpy()
            eval_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))

    # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
    accelerator.print("GPU Memory before entering the eval : {}".format(b2mb(tracemalloc.begin)))
    accelerator.print("GPU Memory consumed at the end of the eval (end-begin): {}".format(tracemalloc.used))
    accelerator.print("GPU Peak Memory consumed during the eval (max-begin): {}".format(tracemalloc.peaked))
    accelerator.print(
        "GPU Total Peak Memory consumed during the eval (max): {}".format(
            tracemalloc.peaked + b2mb(tracemalloc.begin)
        )
    )

    accelerator.print("CPU Memory before entering the eval : {}".format(b2mb(tracemalloc.cpu_begin)))
    accelerator.print("CPU Memory consumed at the end of the eval (end-begin): {}".format(tracemalloc.cpu_used))
    accelerator.print("CPU Peak Memory consumed during the eval (max-begin): {}".format(tracemalloc.cpu_peaked))
    accelerator.print(
        "CPU Total Peak Memory consumed during the eval (max): {}".format(
            tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)
        )
    )

    correct = 0
    total = 0
    eval_dataset = dataset[split_name]
    label_column = columns[1]
    # Read the reference column once; every access materialises the whole column.
    true_labels = eval_dataset[label_column]
    assert len(eval_preds) == len(true_labels), f"{len(eval_preds)} != {len(true_labels)}"
    for pred, true in zip(eval_preds, true_labels):
        if pred.strip() == true.strip():
            correct += 1
        total += 1
    accuracy = correct / total * 100
    accelerator.print(f"{accuracy=}")
    accelerator.print(f"{eval_preds[:10]=}")
    accelerator.print(f"eval_dataset[label_column][:10]={true_labels[:10]!r}")

    if do_test:
        # NOTE(review): this branch relies on `classes` and `text_column`, which are
        # not defined inside main(), and on an "ID" column - confirm before enabling.
        model.eval()
        test_preds = []
        for _, batch in enumerate(tqdm(test_dataloader)):
            batch = {k: v for k, v in batch.items() if k != "labels"}
            with torch.no_grad():
                outputs = accelerator.unwrap_model(model).generate(
                    **batch, synced_gpus=is_ds_zero_3, max_new_tokens=10
                )  # synced_gpus=True for DS-stage 3
            outputs = accelerator.pad_across_processes(outputs, dim=1, pad_index=tokenizer.pad_token_id)
            preds = accelerator.gather(outputs)
            preds = preds[:, max_length:].detach().cpu().numpy()
            test_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))

        test_preds_cleaned = []
        for _, pred in enumerate(test_preds):
            test_preds_cleaned.append(get_closest_label(pred, classes))

        test_df = test_dataset.to_pandas()
        assert len(test_preds_cleaned) == len(test_df), f"{len(test_preds_cleaned)} != {len(test_df)}"
        test_df[label_column] = test_preds_cleaned
        test_df["text_labels_orig"] = test_preds
        accelerator.print(test_df[[text_column, label_column]].sample(20))

        pred_df = test_df[["ID", label_column]]
        pred_df.columns = ["ID", "Label"]

        os.makedirs(f"data/{dataset_name}", exist_ok=True)
        pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False)

    accelerator.wait_for_everyone()
    # Option1: Pushing the model to Hugging Face Hub
    # model.push_to_hub(
    #     f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"),
    #     token = "hf_..."
    # )
    # token (`bool` or `str`,
    peft_model_id = 'hemanth_weights'
    accelerator.unwrap_model(model).save_pretrained(peft_model_id)
    accelerator.wait_for_everyone()


# Run the training/evaluation pipeline only when executed as a script (or notebook cell),
# not when this module is imported.
if __name__ == "__main__":
    main()


Downloading readme:   0%|          | 0.00/12.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Running tokenizer on dataset (num_proc=2):   0%|          | 0/4233923 [00:00<?, ? examples/s]

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
import multiprocessing
import torch
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup

# Dataset to fetch from the Hugging Face Hub
dataset_name = 'fka/awesome-chatgpt-prompts'
dataset = load_dataset(dataset_name)
print(dataset.keys())

# Tokenizer matching the model that will be trained
model_name_or_path = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token_id is None:
    # Reuse the EOS id for padding when the tokenizer defines no pad token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# First split of the dataset and the columns it contains
split_name = str(list(dataset.keys())[0])
print(split_name)
columns = dataset[split_name].column_names
print(columns)

# Upper bound on the number of tokens per example
max_length = 512

def preprocess_function(examples):
    """Build fixed-length model inputs by concatenating the tokens of every column.

    Relies on the module-level ``tokenizer``, ``columns`` and ``max_length``.

    :param examples: A batch from ``Dataset.map(batched=True)``: a mapping from
        column name to a list of text values.
    :return: Dict with ``input_ids`` and ``attention_mask`` tensors, each with one
        row of ``max_length`` entries per example.
    """
    # Tokenize WITHOUT padding: padding each column to max_length before
    # concatenating would make the truncation below drop every column but the first.
    per_column_ids = [
        tokenizer(examples[column], truncation=True, max_length=max_length)['input_ids']
        for column in columns
    ]

    pad_id = tokenizer.pad_token_id
    pad_left = tokenizer.padding_side == 'left'

    input_ids = []
    attention_mask = []
    for column_rows in zip(*per_column_ids):  # one tuple of token lists per example
        row_tokens = [token for ids in column_rows for token in ids][:max_length]
        padding = [pad_id] * (max_length - len(row_tokens))
        # Derive the mask from the known pad length rather than from
        # token != pad_id, because pad_id may be the EOS id.
        real = [1] * len(row_tokens)
        masked = [0] * len(padding)
        if pad_left:
            input_ids.append(padding + row_tokens)
            attention_mask.append(masked + real)
        else:
            input_ids.append(row_tokens + padding)
            attention_mask.append(real + masked)

    # Convert input_ids and attention_mask to tensors
    model_inputs = {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask)}

    return model_inputs

# Map the preprocessing function across the entire dataset
# Tokenize every split in batches; the raw text columns are dropped from the result
processed_datasets = dataset.map(
    preprocess_function,
    desc="Running tokenizer on dataset",
    remove_columns=columns,
    load_from_cache_file=False,
    num_proc=multiprocessing.cpu_count(),
    batched=True,
)

from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup

# Shuffled training batches over the first split
batch_size = 16
train_dataset = processed_datasets[split_name]
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)
# print("length of processed_datasets['train'],",processed_datasets['train'])
# print("len of all inputs ids",len(processed_datasets['train']['input_ids']))
# print("first input_is, length_of_token",processed_datasets['train']['input_ids'][0],len(processed_datasets['train']['input_ids'][0]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/274 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.6k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

dict_keys(['train'])


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

train
['act', 'prompt']


Running tokenizer on dataset (num_proc=2):   0%|          | 0/153 [00:00<?, ? examples/s]

In [None]:
# A DataLoader is iterable but not an iterator, so wrap it in iter() to fetch one batch
next(iter(train_dataloader))

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
# Load the dataset
dataset = load_dataset("fka/awesome-chatgpt-prompts")

# Define the tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Name of the first split. Index into the key list: str() of the whole list
# would produce "['train']", which is not a valid split key.
split_name = str(list(dataset.keys())[0])

# Get the list of columns for the first split
columns = dataset[split_name].column_names

# Define the maximum sequence length
max_length = 512

def preprocess_dataset(dataset_name):
    """Load a dataset and apply type-based preprocessing to every example.

    String features are lower-cased. Lists of integer labels are converted to
    their label names when the feature exposes ``names``. All other values are
    left unchanged.

    :param dataset_name: Name or path passed to ``load_dataset``.
    :return: The dataset returned by ``load_dataset`` after mapping the
        preprocessing function over all of its splits.
    """
    # Load a dataset
    dataset = load_dataset(dataset_name)

    # Use the feature schema of this dataset's own first split instead of a
    # module-level split name that may belong to a different dataset.
    features = next(iter(dataset.values())).features

    # Function to preprocess a single example based on detected feature types
    def preprocess_function(example):
        new_example = {}
        # Loop over all features in the example
        for feature_name, feature_value in example.items():
            if isinstance(feature_value, str):  # For text features
                new_example[feature_name] = feature_value.lower()
            elif isinstance(feature_value, list):  # For categorical features
                # Guard against empty lists before inspecting the first element
                if feature_value and isinstance(feature_value[0], int):  # Label encoded list
                    # Convert numeric labels to strings if names are provided in the feature
                    label_feature = features[feature_name]
                    if hasattr(label_feature, 'names'):
                        new_example[feature_name] = [label_feature.names[label] for label in feature_value]
                    else:
                        new_example[feature_name] = feature_value  # Unchanged
                # Other list types are omitted here, so map() keeps the original column
            else:  # For numerical or other types of features
                new_example[feature_name] = feature_value  # Unchanged

        return new_example

    # The function inspects individual values, so it must run per example:
    # with batched=True every value is a list and the string branch never fires.
    processed_dataset = dataset.map(preprocess_function, batched=False, num_proc=1)

    return processed_dataset

# Hub dataset to run through preprocess_dataset
dataset_name = "LDJnr/Capybara"
processed_dataset = preprocess_dataset(dataset_name)

# Show the processed dataset, the size of its train split and that split's first example
print(processed_dataset)
train_split = processed_dataset["train"]
print(f"length of dataset: {len(train_split)}")
print(f"Sample_data: {train_split[0]}")

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup


# Shuffled training batches over the tokenized "train" split
batch_size = 16
train_dataset = processed_datasets["train"]
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)

In [None]:
# Fetch one batch from the training dataloader to inspect its contents
next(iter(train_dataloader))

{'input_ids': tensor([[ 3109,  5276, 21616,  ..., 50256, 50256, 50256],
         [31923,  7451, 50256,  ..., 50256, 50256, 50256],
         [13908, 34270, 50256,  ..., 50256, 50256, 50256],
         ...,
         [17931,    56, 25516,  ..., 50256, 50256, 50256],
         [25896,  5886, 11125,  ..., 50256, 50256, 50256],
         [   50, 43490, 11915,  ..., 50256, 50256, 50256]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
pip install accelerate

In [None]:
# Load the pretrained GPT-2 causal language model with default settings and print its module tree
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('gpt2')
print(model)



| Module | Description | Parameters |
| --- | --- | --- |
| GPT2LMHeadModel | The GPT2 model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings) | config |
| transformer | The main GPT2Model that contains the core transformer layers | wte, wpe, drop, h, ln_f |
| wte | The word token embedding matrix that maps the token ids to their corresponding vector representations | vocab_size, n_embd |
| wpe | The position embedding matrix that maps the position ids to their corresponding vector representations | n_positions, n_embd |
| drop | The dropout layer that is applied to the summed token and position embeddings before the first block | p |
| h | The module list that contains the 12 GPT2Block modules | n_layer |
| GPT2Block | A single transformer block that consists of a layer normalization, a self-attention layer, another layer normalization, and a feed-forward layer | ln_1, attn, ln_2, mlp |
| ln_1 | The first layer normalization layer that is applied before the self-attention layer | normalized_shape, eps, elementwise_affine |
| attn | The self-attention layer that computes the attention scores and values for the input tokens | c_attn, c_proj, attn_dropout, resid_dropout |
| c_attn | The Conv1D layer (a linear projection with transposed weights) that projects the input tokens into query, key, and value vectors | nf, nx |
| c_proj | The Conv1D layer (a linear projection with transposed weights) that projects the attention output into a vector of the same size as the input | nf, nx |
| attn_dropout | The dropout layer that is applied to the attention scores | p |
| resid_dropout | The dropout layer that is applied to the residual connection | p |
| ln_2 | The second layer normalization layer that is applied before the feed-forward layer | normalized_shape, eps, elementwise_affine |
| mlp | The feed-forward layer that consists of two 1D convolution layers and a GELU activation function | c_fc, c_proj, act, dropout |
| c_fc | The first Conv1D layer (a linear projection with transposed weights) that projects the input into a hidden size | nf, nx |
| c_proj | The second Conv1D layer (a linear projection with transposed weights) that projects the hidden output back to the input size | nf, nx |
| act | The GELU activation function that is applied between the two convolution layers | - |
| dropout | The dropout layer that is applied to the feed-forward output | p |
| ln_f | The final layer normalization layer that is applied to the transformer output | normalized_shape, eps, elementwise_affine |
| lm_head | The linear layer that is tied to the word token embedding matrix and used for language modeling | in_features, out_features, bias |



In [None]:
import torch

from transformers import default_data_collator, get_linear_schedule_with_warmup
# Training hyperparameters
num_epochs = 5
lr = 0.001

# AdamW over all model parameters, with a linear decay (no warmup) across every training step
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=num_epochs * len(train_dataloader),
    num_warmup_steps=0,
)

In [None]:
import os
import pandas as pd
import torch
from datasets import load_dataset as hf_load_dataset
from transformers import AutoTokenizer
from typing import Dict, List, Union
import warnings

def load_local_dataset(local_path: str) -> Dict[str, pd.DataFrame]:
    """Read CSV/JSON data from a file or from every file in a directory.

    :param local_path: Path to a single ``.csv``/``.json`` file or to a directory
        containing such files.
    :return: Mapping from file name (without directories) to the loaded DataFrame.
        Unsupported files, unreadable files and missing paths only emit a warning,
        so the mapping may be empty.
    """
    loaded: Dict[str, pd.DataFrame] = {}

    def read_into(path: str, key: str, label: str) -> None:
        # Load one file under `key`; `label` is how the file is named in warnings.
        try:
            if path.endswith('.csv'):
                frame = pd.read_csv(path)
            elif path.endswith('.json'):
                frame = pd.read_json(path)
            else:
                warnings.warn(f"Unsupported file type: {label}")
                return
            loaded[key] = frame
        except Exception as e:
            warnings.warn(f"Error loading {label}: {str(e)}")

    if os.path.isfile(local_path):
        # Single file: keyed by its base name, reported by its full path
        read_into(local_path, os.path.basename(local_path), local_path)
    elif os.path.isdir(local_path):
        # Directory: every entry is keyed and reported by its file name
        for entry in os.listdir(local_path):
            read_into(os.path.join(local_path, entry), entry, entry)
    else:
        warnings.warn(f"Path does not exist: {local_path}")
    return loaded

def preprocess_function(examples: Union[Dict[str, pd.DataFrame], pd.DataFrame], columns: List[str], tokenizer, max_length: int) -> Dict[str, torch.Tensor]:
    """Tokenize the given columns and merge them into fixed-length model inputs.

    For every row, the token ids of all ``columns`` are concatenated in order,
    truncated to ``max_length`` and padded with ``tokenizer.pad_token_id`` on the
    side given by ``tokenizer.padding_side`` (right when the attribute is absent).

    :param examples: One DataFrame, or a mapping of names to DataFrames whose rows
        are processed one frame after another.
    :param columns: Column names to tokenize; values are converted with ``astype(str)``.
    :param tokenizer: Callable tokenizer returning a mapping with ``input_ids``.
    :param max_length: Length of every output row.
    :return: Dict with ``input_ids`` and ``attention_mask`` tensors.
    :raises ValueError: If ``columns`` is empty or the columns did not yield the
        same number of rows (for example a column missing from one DataFrame).
    """
    if not columns:
        raise ValueError("At least one column is required for preprocessing.")

    if isinstance(examples, pd.DataFrame):
        examples = {'data': examples}  # Wrap single DataFrame for consistency

    tokenized_outputs = {column: [] for column in columns}
    for column in columns:
        for example in examples.values():
            if column in example.columns:
                texts = example[column].astype(str).tolist()
                # No padding here: padding each column to max_length before
                # concatenating would make the truncation below keep only the
                # first column.
                column_tokens = tokenizer(texts, truncation=True, max_length=max_length)
                tokenized_outputs[column].extend(column_tokens['input_ids'])
            else:
                warnings.warn(f"Column {column} not found in dataset.")

    # Rows are combined by position, so every column must contribute equally many.
    if len({len(ids) for ids in tokenized_outputs.values()}) > 1:
        raise ValueError("All columns must be present in every DataFrame and have the same length.")

    pad_id = tokenizer.pad_token_id
    pad_left = getattr(tokenizer, 'padding_side', 'right') == 'left'

    input_ids, attention_mask = [], []
    for column_rows in zip(*(tokenized_outputs[column] for column in columns)):
        row_tokens = [token for ids in column_rows for token in ids][:max_length]
        padding = [pad_id] * (max_length - len(row_tokens))
        # Derive the mask from the pad length rather than from token != pad_id,
        # because pad_id may be the EOS id.
        real = [1] * len(row_tokens)
        masked = [0] * len(padding)
        if pad_left:
            input_ids.append(padding + row_tokens)
            attention_mask.append(masked + real)
        else:
            input_ids.append(row_tokens + padding)
            attention_mask.append(real + masked)

    model_inputs = {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask)}
    return model_inputs

def main():
    """Load a dataset from the Hub or from disk and tokenize its first table.

    :return: The ``input_ids`` / ``attention_mask`` dict produced by
        ``preprocess_function`` for the first loaded split or file.
    :raises ValueError: If nothing could be loaded from the configured location.
    """
    dataset_name_or_local_path = '/content/drive/MyDrive/new_output.csv' # Example Hugging Face dataset name or local path to your dataset
    max_length = 512
    model_name_or_path = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = 'left'

    try:
        # Try to load from Hugging Face datasets
        dataset = hf_load_dataset(dataset_name_or_local_path)
    except FileNotFoundError:
        # If not found, try to load from local path
        dataset = load_local_dataset(dataset_name_or_local_path)

    if not dataset:
        raise ValueError("No valid dataset found at the specified path or with the specified name.")

    # Assuming the first dataset loaded will be used for processing
    first_dataset_key = next(iter(dataset.keys()))
    examples = dataset[first_dataset_key]
    if not isinstance(examples, pd.DataFrame):
        # Hub splits are not DataFrames; preprocess_function works on DataFrames,
        # so convert instead of passing an empty column list.
        examples = examples.to_pandas()
    columns = examples.columns.tolist()

    # Preprocess the dataset and hand the tensors back to the caller
    model_inputs = preprocess_function(examples, columns, tokenizer, max_length)
    return model_inputs

# Run the loading/tokenization pipeline only when executed directly, not on import
if __name__ == "__main__":
    main()

AI models can be susceptible to data poisoning attacks, and assessing their vulnerability is crucial. To determine susceptibility, a thorough analysis of the specific model and dataset is required. Utilizing techniques such as detecting potential vulnerabilities and calculating a susceptibility score can help evaluate the risk. In cases where vulnerabilities are identified, implementing appropriate mitigation strategies is recommended to enhance the model's robustness against data poisoning attacks.


In [None]:
# training and evaluation
from tqdm import tqdm
device='cpu'
model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        if "labels" not in batch:
            # The dataloader only yields input_ids/attention_mask; without labels
            # the model returns loss=None. The causal-LM head shifts labels itself,
            # so a copy of input_ids is enough; -100 excludes padding from the loss.
            labels = batch["input_ids"].clone()
            labels[batch["attention_mask"] == 0] = -100
            batch["labels"] = labels
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Report the mean training loss of the epoch
    train_epoch_loss = total_loss / len(train_dataloader)
    print(f"{epoch=}: {train_epoch_loss=}")


In [None]:
!pip install -q -U  transformers accelerate evaluate deepspeed tqdm datasets

In [None]:
!pip install -q -U peft

In [None]:
from transformers import AutoModelForCausalLM,AutoTokenizer
from peft import PeftModel, PeftConfig
import torch
from datasets import load_dataset
import os
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from datasets import load_dataset

# Device name for the run
device = "cuda"
# Hub identifiers of the model and of its tokenizer
model_name_or_path = "bigscience/bloomz-7b1"
tokenizer_name_or_path = "bigscience/bloomz-7b1"
# Passed as the second argument to load_dataset("ought/raft", ...) in a later cell
dataset_name = "twitter_complaints"
# Dataset column holding the input text
text_column = "Tweet text"
# Column holding the textual label (added by the dataset.map call in a later cell)
label_column = "text_label"
# Sequence length that examples are padded to during preprocessing
max_length = 64
# Optimisation hyperparameters
lr = 1e-3
num_epochs = 50
batch_size = 8

In [None]:
import logging
import os
import sys
import traceback
from datasets import load_dataset
from transformers import AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader
import multiprocessing

class DatasetProcessor:
    """Load a Hugging Face dataset and tokenize it into fixed-length model inputs.

    Typical use: ``load_dataset()``, ``initialize_tokenizer()``, then
    ``process_dataset()``. Each returns ``True`` on success and ``False`` (after
    logging the error) on failure. The tokenized result replaces ``self.dataset``.
    """

    def __init__(self, dataset_name, model_name_or_path, max_length):
        self.dataset_name = dataset_name
        self.model_name_or_path = model_name_or_path
        self.max_length = max_length
        self.columns = None
        # Set by load_dataset() / initialize_tokenizer()
        self.dataset = None
        self.tokenizer = None

    def load_dataset(self):
        """Load ``self.dataset_name`` into ``self.dataset``; return success as a bool."""
        try:
            logging.info(f"Loading dataset: {self.dataset_name}")
            self.dataset = load_dataset(self.dataset_name)
            logging.info("Dataset loaded successfully.")
            return True
        except Exception as e:
            # logging.exception records the message together with the traceback
            logging.exception(f"Error loading dataset: {e}")
            return False

    def initialize_tokenizer(self):
        """Create the tokenizer, using the EOS id as pad id when none is defined."""
        try:
            logging.info(f"Initializing tokenizer: {self.model_name_or_path}")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
            if self.tokenizer.pad_token_id is None:
                self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
            logging.info("Tokenizer initialized successfully.")
            return True
        except Exception as e:
            logging.exception(f"Error initializing tokenizer: {e}")
            return False

    def preprocess_example(self, examples):
        """Turn a batch of raw columns into ``input_ids`` and ``attention_mask`` tensors.

        For every row, the token ids of all ``self.columns`` are concatenated,
        truncated to ``self.max_length`` and padded on the tokenizer's padding
        side (right when the tokenizer has no ``padding_side``).

        Errors are logged and re-raised: returning ``None`` to ``Dataset.map``
        would hide the failure from ``process_dataset``.
        """
        try:
            # Tokenize without padding; padding each column to max_length first
            # would make the truncation below keep only the first column.
            per_column_ids = [
                self.tokenizer(examples[column], truncation=True, max_length=self.max_length)['input_ids']
                for column in self.columns
            ]

            pad_id = self.tokenizer.pad_token_id
            pad_left = getattr(self.tokenizer, 'padding_side', 'right') == 'left'

            input_ids = []
            attention_mask = []
            for column_rows in zip(*per_column_ids):  # one tuple of token lists per example
                row_tokens = [token for ids in column_rows for token in ids][:self.max_length]
                padding = [pad_id] * (self.max_length - len(row_tokens))
                # Mask from the pad length, since pad_id may be the EOS id
                real = [1] * len(row_tokens)
                masked = [0] * len(padding)
                if pad_left:
                    input_ids.append(padding + row_tokens)
                    attention_mask.append(masked + real)
                else:
                    input_ids.append(row_tokens + padding)
                    attention_mask.append(real + masked)

            model_inputs = {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask)}

            return model_inputs
        except Exception as e:
            logging.exception(f"Error preprocessing example: {e}")
            raise

    def process_dataset(self):
        """Tokenize every split of ``self.dataset`` in place; return success as a bool."""
        try:
            logging.info("Processing dataset...")
            self.get_columns()
            self.dataset = self.dataset.map(
                self.preprocess_example,
                batched=True,
                num_proc=multiprocessing.cpu_count(),
                remove_columns=self.columns,
                load_from_cache_file=False,
                desc="Running tokenizer on dataset",
            )
            logging.info("Dataset processed successfully.")
            return True
        except Exception as e:
            logging.exception(f"Error processing dataset: {e}")
            return False

    def get_columns(self):
        """Cache the column names of the first split in ``self.columns`` if not yet set."""
        if self.columns is None:
            split_name = list(self.dataset.keys())[0]
            self.columns = self.dataset[split_name].column_names




# Timestamped INFO-level logging to the console
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
    level=logging.INFO,
)

# Run configuration
dataset_name = 'fka/awesome-chatgpt-prompts'
model_name_or_path = 'gpt2'
max_length = 512
batch_size = 16

# Run the pipeline stages in order; `and` short-circuits, so the first failing
# stage stops the chain and the process exits with status 1.
dataset_processor_c = DatasetProcessor(dataset_name, model_name_or_path, max_length)
if not (
    dataset_processor_c.load_dataset()
    and dataset_processor_c.initialize_tokenizer()
    and dataset_processor_c.process_dataset()
):
    sys.exit(1)

# Shuffled batches over the tokenized "train" split; fetch one to inspect it
train_dataset = dataset_processor_c.dataset["train"]
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)
next(iter(train_dataloader))


In [None]:
from datasets import load_dataset

# NOTE(review): `dataset_name` is passed as the second argument of
# load_dataset for "ought/raft". The nearest visible assignment (the
# DatasetProcessor cell) sets it to a Hub dataset id rather than a RAFT
# subset name - confirm it is rebound before this cell runs.
dataset = load_dataset("ought/raft", dataset_name)

# Readable class names: underscores in the "Label" feature names become spaces.
classes = [
    label_name.replace("_", " ")
    for label_name in dataset["train"].features["Label"].names
]
print(classes)

# Add a "text_label" column holding the class name for each integer label.
dataset = dataset.map(
    lambda batch: {"text_label": [classes[label] for label in batch["Label"]]},
    batched=True,
    num_proc=1,
)
print(dataset)
dataset["train"][0]

In [None]:
# data preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Fall back to the EOS id for padding when the tokenizer has no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Token count of the longest class label.
target_max_length = max(
    len(tokenizer(class_label)["input_ids"]) for class_label in classes
)
print(target_max_length)


def preprocess_function(examples):
    """Build left-padded causal-LM training features for a batch.

    Each row becomes ``"<text_column> : <text> Label : "`` followed by the
    tokenized label and an EOS token. Prompt and padding positions are set to
    -100 in ``labels``. Every sequence is left-padded to the module-level
    ``max_length`` and then cut to its first ``max_length`` entries.

    Reads the module-level ``tokenizer``, ``text_column``, ``label_column``
    and ``max_length``.

    Returns:
        The tokenizer output with per-row tensors for ``input_ids`` and
        ``attention_mask`` plus a ``labels`` entry.
    """
    texts = examples[text_column]
    prompts = [f"{text_column} : {x} Label : " for x in texts]
    answers = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(prompts)
    # No special tokens on the answer: it is appended directly to the prompt.
    labels = tokenizer(answers, add_special_tokens=False)

    for idx in range(len(texts)):
        prompt_ids = model_inputs["input_ids"][idx]
        answer_ids = labels["input_ids"][idx] + [tokenizer.eos_token_id]
        full_ids = prompt_ids + answer_ids
        # Only the answer tokens are kept as targets.
        full_labels = [-100] * len(prompt_ids) + answer_ids

        # A non-positive pad count yields an empty prefix.
        pad = max_length - len(full_ids)
        padded_ids = [tokenizer.pad_token_id] * pad + full_ids
        padded_mask = [0] * pad + [1] * len(full_ids)
        padded_labels = [-100] * pad + full_labels

        model_inputs["input_ids"][idx] = torch.tensor(padded_ids[:max_length])
        model_inputs["attention_mask"][idx] = torch.tensor(padded_mask[:max_length])
        labels["input_ids"][idx] = torch.tensor(padded_labels[:max_length])

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Tokenize every split, replacing the original columns with model features.
processed_datasets = dataset.map(
    preprocess_function,
    remove_columns=dataset["train"].column_names,
    batched=True,
    num_proc=1,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]

# Shuffled training batches.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)

In [None]:
def test_preprocess_function(examples):
    """Build left-padded prompt-only features (no labels) for a batch.

    Each row becomes ``"<text_column> : <text> Label : "``, is left-padded to
    the module-level ``max_length`` and then cut to its first ``max_length``
    entries. Reads the module-level ``tokenizer``, ``text_column`` and
    ``max_length``.

    Returns:
        The tokenizer output with per-row tensors for ``input_ids`` and
        ``attention_mask``.
    """
    texts = examples[text_column]
    model_inputs = tokenizer([f"{text_column} : {x} Label : " for x in texts])

    for idx in range(len(texts)):
        ids = model_inputs["input_ids"][idx]
        # A non-positive pad count yields an empty prefix.
        pad = max_length - len(ids)
        padded_ids = [tokenizer.pad_token_id] * pad + ids
        padded_mask = [0] * pad + model_inputs["attention_mask"][idx]
        model_inputs["input_ids"][idx] = torch.tensor(padded_ids[:max_length])
        model_inputs["attention_mask"][idx] = torch.tensor(padded_mask[:max_length])

    return model_inputs


# Re-tokenize every split with the prompt-only preprocessing.
processed_datasets = dataset.map(
    test_preprocess_function,
    remove_columns=dataset["train"].column_names,
    batched=True,
    num_proc=1,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

# Evaluation reuses the "train" split; "test" is kept separate.
eval_dataset = processed_datasets["train"]
test_dataset = processed_datasets["test"]

eval_dataloader = DataLoader(
    eval_dataset,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)
test_dataloader = DataLoader(
    test_dataset,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)
# Show one batch from each loader.
print(next(iter(eval_dataloader)))
print(next(iter(test_dataloader)))

In [None]:
!pip install --upgrade peft

In [None]:
# PeftConfig is imported here so this cell runs on its own: the visible
# imports before this point bring in only LoraConfig, TaskType and
# get_peft_model from peft.
from peft import PeftConfig

# Checkpoint directory of the adapter to inspect.
peft_model_id = "llama2_all0.1_llama2-7B-hf_clip_lr0.0001_bs240_ip1536_op1664_ep3/checkpoint-39006"

# Read the adapter configuration stored with that checkpoint.
peft_config = PeftConfig.from_pretrained(peft_model_id)

# language_model = LlamaForCausalLM.from_pretrained(
#             peft_config.base_model_name_or_path,
#             load_in_8bit=True,
#             device_map="auto",
#             torch_dtype=torch.float16
#         )

# language_model = PeftModel(language_model, peft_config)

In [None]:
from peft import PeftModel, PeftConfig

# Memory budget per placement target, passed with device_map="auto".
# Integer keys are presumably GPU indices - confirm against the host.
max_memory = {
    0: "1GIB",
    1: "1GIB",
    2: "2GIB",
    3: "10GIB",
    "cpu": "30GB",
}
peft_model_id = "hemanthkandimalla/twitter_complaints_bigscience_bloomz-7b1_LORA_CAUSAL_LM"

# The adapter config names the base model to load.
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
    max_memory=max_memory,
)
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(
    model,
    peft_model_id,
    device_map="auto",
    max_memory=max_memory,
)

In [None]:
%%time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import plotly.graph_objects as go
from tqdm import tqdm
import imageio
import os

# Load pre-trained model
model_name = 'gpt2'
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set model to evaluation mode
model.eval()

# Define our input
input_text = "I have a dream"
inputs = tokenizer.encode_plus(input_text, return_tensors="pt")

# Compute the original loss (inference only, so autograd is switched off)
with torch.no_grad():
    outputs = model(**inputs, labels=inputs["input_ids"])
original_loss = outputs.loss.item()

# Define two random directions, one tensor per model parameter
direction1 = [torch.randn_like(p) for p in model.parameters()]
direction2 = [torch.randn_like(p) for p in model.parameters()]

# Normalize each per-parameter direction tensor to unit L2 norm
for d1, d2 in zip(direction1, direction2):
    d1.div_(torch.linalg.norm(d1.flatten()))
    d2.div_(torch.linalg.norm(d2.flatten()))

# Define the range to explore
x = np.linspace(-1, 1, 20)
y = np.linspace(-1, 1, 20)
# "ij" indexing makes X[i, j] == x[i] and Y[i, j] == y[j], which is how Z is
# filled below; the default "xy" indexing would transpose the surface.
X, Y = np.meshgrid(x, y, indexing="ij")

# Prepare to collect the losses
Z = np.zeros_like(X)

# Snapshot the weights so every grid point is computed from the exact
# originals instead of an add/subtract round trip that accumulates drift.
params = list(model.parameters())
base_weights = [p.detach().clone() for p in params]

# Compute the loss at every (x[i], y[j]) offset
with torch.no_grad():
    for i in tqdm(range(x.size), desc="x progress"):
        for j in tqdm(range(y.size), desc="y progress", leave=False):
            step_x, step_y = float(x[i]), float(y[j])

            # Perturb the model parameters
            for p, w, d1, d2 in zip(params, base_weights, direction1, direction2):
                p.copy_(w + step_x * d1 + step_y * d2)

            # Compute the loss
            outputs = model(**inputs, labels=inputs["input_ids"])
            Z[i, j] = outputs.loss.item()

    # Restore the original model parameters
    for p, w in zip(params, base_weights):
        p.copy_(w)

# Plot the loss landscape as a 3D surface (go.Scatter has no z coordinate)
fig = go.Figure(data=[go.Surface(x=X, y=Y, z=Z)])
fig.update_layout(scene=dict(xaxis=dict(range=[-1, 1]), yaxis=dict(range=[-1, 1]), zaxis=dict(range=[-4, 10])))
fig.show()

In [None]:
%%capture

# Install transformers and graphviz
!sudo apt-get install graphviz graphviz-dev
!pip install transformers pygraphviz

# Make sure we're using UTF-8 as encoding
# (replaces locale.getpreferredencoding with a function that always
# returns "UTF-8")
import locale
locale.getpreferredencoding = lambda: "UTF-8"

# Set seed
# Seed the CPU generator and the CUDA generators with the same value.
import torch
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)
# Request deterministic cuDNN kernels and turn off cuDNN autotuning.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
from transformers import AutoModelForCausalLM,AutoTokenizer
import torch
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import time

# Prefer the GPU when one is available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Fall back to the EOS id for padding when the tokenizer has no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model.eval()

text = "I have a dream"
input_ids = tokenizer.encode(text, return_tensors='pt').to(device)

# Generate five tokens beyond the prompt length.
prompt_length = input_ids.size(-1)
outputs = model.generate(input_ids, max_length=prompt_length + 5)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated text: {generated_text}")



def get_log_prob(logits, token_id):
    """Return the log-probability of ``token_id`` under ``logits``.

    Uses ``log_softmax`` instead of ``log(softmax(...))``: the two-step form
    underflows to ``-inf`` for tokens whose probability rounds to zero.

    Args:
        logits: Tensor of unnormalised scores; normalised over its last
            dimension.
        token_id: Index of the token to score (an int or a single-element
            tensor).

    Returns:
        float: The log-probability of the selected token.
    """
    log_probabilities = torch.nn.functional.log_softmax(logits, dim=-1)
    return log_probabilities[token_id].item()

def greedy_search(input_ids, node, length=5):
    """Extend ``input_ids`` by ``length`` tokens, taking the top token each step.

    At every step the first successor of ``node`` in the module-level
    ``graph`` is annotated with the chosen token and its probability (in
    percent), then the search recurses from that successor. Reads the
    module-level ``model``, ``tokenizer`` and ``graph``.

    Args:
        input_ids: Token ids generated so far; the code indexes the model
            logits as ``[0, -1, :]`` and concatenates along the last axis.
        node: Graph node whose first successor receives the next token.
        length: Number of tokens still to generate.

    Returns:
        ``input_ids`` with the generated token ids appended.
    """
    if length == 0:
        return input_ids

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        outputs = model(input_ids)
    predictions = outputs.logits

    # Greedy choice: take the single highest-scoring next token
    logits = predictions[0, -1, :]
    token_id = torch.argmax(logits).unsqueeze(0)

    # Compute the score of the predicted token
    token_score = get_log_prob(logits, token_id)

    # Add the predicted token to the list of input ids
    new_input_ids = torch.cat([input_ids, token_id.unsqueeze(0)], dim=-1)

    # Record the token and its probability on the next node of the chain
    next_token = tokenizer.decode(token_id, skip_special_tokens=True)
    current_node = list(graph.successors(node))[0]
    graph.nodes[current_node]['tokenscore'] = np.exp(token_score) * 100
    # The "_<length>" suffix keeps labels distinct when a token repeats.
    graph.nodes[current_node]['token'] = next_token + f"_{length}"

    # Recursive call
    input_ids = greedy_search(new_input_ids, current_node, length-1)

    return input_ids

# Parameters
length = 5
beams = 1

# A branching factor of 1 gives a single chain of `length` edges.
graph = nx.balanced_tree(1, length, create_using=nx.DiGraph())

# Give every node default 'tokenscore' and 'token' attributes.
for node in graph.nodes:
    graph.nodes[node].update(tokenscore=100, token=text)

# Start generating text
output_ids = greedy_search(input_ids, 0, length=length)
flat_ids = output_ids.squeeze().tolist()
output = tokenizer.decode(flat_ids, skip_special_tokens=True)
print(f"Generated text: {output}")

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import matplotlib.colors as mcolors
from matplotlib.colors import LinearSegmentedColormap

def plot_graph(graph, length, beams, score):
    """Draw the generation tree with nodes coloured by score.

    Args:
        graph: Graph whose nodes carry a 'token' attribute plus 'tokenscore'
            (for ``score='token'``) or 'sequencescore' (for
            ``score='sequence'``).
        length: Tree depth; used only to size the figure.
        beams: Branching factor; used only to size the figure.
        score: ``'token'`` or ``'sequence'`` - which attribute drives the
            colours, labels and colorbar title.

    Raises:
        ValueError: If ``score`` is neither ``'token'`` nor ``'sequence'``.
    """
    # Per-mode settings: (node attribute, label suffix, colorbar title).
    score_modes = {
        'token': ('tokenscore', '%', 'Token probability (%)'),
        'sequence': ('sequencescore', '', 'Sequence score'),
    }
    # Validate before any drawing so a bad mode fails with a clear message.
    if score not in score_modes:
        raise ValueError(f"score must be 'token' or 'sequence', got {score!r}")
    score_key, label_suffix, colorbar_label = score_modes[score]

    fig, ax = plt.subplots(figsize=(3+1.2*beams**length, max(5, 2+length)), dpi=300, facecolor='white')

    # Create positions for each node
    pos = nx.nx_agraph.graphviz_layout(graph, prog="dot")

    # Only nodes that carry a token are coloured and labelled.
    visible = [(node, data) for node, data in graph.nodes(data=True) if data['token'] is not None]
    scores = [data[score_key] for _, data in visible]

    # Normalize the colors along the range of scores
    norm = mcolors.Normalize(vmin=min(scores), vmax=max(scores))
    cmap = LinearSegmentedColormap.from_list('rg', ["r", "y", "g"], N=256)

    # Draw the nodes; nodelist keeps node_color aligned with the drawn nodes
    nx.draw_networkx_nodes(graph, pos, nodelist=[node for node, _ in visible], node_size=2000,
                          node_shape='o', alpha=1, linewidths=4, node_color=scores, cmap=cmap)

    # Draw the edges
    nx.draw_networkx_edges(graph, pos)

    # Draw the labels: token text (suffix after '_' dropped) above its score
    labels = {
        node: data['token'].split('_')[0] + f"\n{data[score_key]:.2f}{label_suffix}"
        for node, data in visible
    }
    nx.draw_networkx_labels(graph, pos, labels=labels, font_size=10)
    plt.box(False)

    # Add a colorbar
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    fig.colorbar(sm, ax=ax, orientation='vertical', pad=0, label=colorbar_label)
    plt.show()

# Plot graph
# NOTE(review): beams=1.5 looks like a figure-width tweak (plot_graph uses
# `beams` only in figsize) - confirm.
plot_graph(graph, length, beams=1.5, score='token')

In [None]:
from tqdm.notebook import tqdm

def greedy_sampling(logits, beams):
    """Return the indices of the ``beams`` largest entries of ``logits``."""
    _top_values, top_indices = torch.topk(logits, beams)
    return top_indices

def beam_search(input_ids, node, bar, length, beams, sampling, temperature=0.1):
    """Expand the generation tree below ``node`` by ``length`` levels.

    At each level the candidate tokens chosen by ``sampling`` are written to
    the successors of ``node`` in the module-level ``graph`` (token, token
    probability in percent, cumulative log-probability and length-normalised
    sequence score), and the search recurses into every successor. Reads the
    module-level ``model``, ``tokenizer`` and ``graph``.

    Args:
        input_ids: Token ids generated so far; the code indexes the model
            logits as ``[0, -1, :]`` and concatenates along the last axis.
        node: Graph node whose successors receive the candidates.
        bar: Progress bar; ``update(1)`` is called once per candidate.
        length: Number of levels still to expand.
        beams: Number of candidates per level.
        sampling: ``'greedy'``, ``'top_k'`` or ``'nucleus'``.
        temperature: Passed to the ``top_k`` / ``nucleus`` samplers and
            propagated unchanged to every recursive level.

    Returns:
        None. Results are stored on the graph nodes.

    Raises:
        ValueError: If ``sampling`` is not one of the supported strategies.
    """
    if length == 0:
        return None

    # Reject unknown strategies before running the model; otherwise the
    # candidate list below would never be assigned.
    if sampling not in ('greedy', 'top_k', 'nucleus'):
        raise ValueError(
            f"sampling must be 'greedy', 'top_k' or 'nucleus', got {sampling!r}"
        )

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        outputs = model(input_ids)
    predictions = outputs.logits

    # Scores for the token that follows the current sequence
    logits = predictions[0, -1, :]

    if sampling == 'greedy':
        top_token_ids = greedy_sampling(logits, beams)
    elif sampling == 'top_k':
        top_token_ids = top_k_sampling(logits, temperature, 20, beams)
    else:
        top_token_ids = nucleus_sampling(logits, temperature, 0.5, beams)

    for j, token_id in enumerate(top_token_ids):
        bar.update(1)

        # Compute the score of the predicted token
        token_score = get_log_prob(logits, token_id)
        cumulative_score = graph.nodes[node]['cumscore'] + token_score

        # Add the predicted token to the list of input ids
        new_input_ids = torch.cat([input_ids, token_id.unsqueeze(0).unsqueeze(0)], dim=-1)

        # Record the candidate on the j-th successor of the current node
        token = tokenizer.decode(token_id, skip_special_tokens=True)
        current_node = list(graph.successors(node))[j]
        graph.nodes[current_node]['tokenscore'] = np.exp(token_score) * 100
        graph.nodes[current_node]['cumscore'] = cumulative_score
        graph.nodes[current_node]['sequencescore'] = 1/(len(new_input_ids.squeeze())) * cumulative_score
        # The "_<length>_<j>" suffix keeps labels distinct when a token repeats.
        graph.nodes[current_node]['token'] = token + f"_{length}_{j}"

        # Recursive call; forward the caller's temperature instead of a
        # hard-coded 1 so every level samples with the same setting.
        beam_search(new_input_ids, current_node, bar, length-1, beams, sampling, temperature)

# Parameters
length = 5
beams = 2

# Balanced tree with branching factor `beams` and height `length`.
graph = nx.balanced_tree(beams, length, create_using=nx.DiGraph())
bar = tqdm(total=graph.number_of_nodes())

# Give every node default score attributes and the prompt as its token.
for node in graph.nodes:
    graph.nodes[node].update(
        tokenscore=100,
        cumscore=0,
        sequencescore=0,
        token=text,
    )

# Start generating text
beam_search(input_ids, 0, bar, length, beams, 'greedy', 1)

In [None]:
def get_log_prob(logits, token_id):
    """Return the log-probability of ``token_id`` under ``logits`` (debug copy).

    Prints the full log-probability tensor and the selected value. Uses
    ``log_softmax`` instead of ``log(softmax(...))``: the two-step form
    underflows to ``-inf`` for tokens whose probability rounds to zero.

    Args:
        logits: Tensor of unnormalised scores; normalised over its last
            dimension.
        token_id: Index of the token to score (an int or a single-element
            tensor).

    Returns:
        float: The log-probability of the selected token.
    """
    log_probabilities = torch.nn.functional.log_softmax(logits, dim=-1)
    print("log_probabilities",log_probabilities)
    # Get the log probability of the token
    token_log_probability = log_probabilities[token_id].item()
    print("token_log_probability ",token_log_probability )
    return token_log_probability

def greedy_search(input_ids, node, length=5):
    """Extend ``input_ids`` by ``length`` tokens, taking the top token each step.

    At every step the first successor of ``node`` in the module-level
    ``graph`` is annotated with the chosen token and its probability (in
    percent), then the search recurses from that successor. Reads the
    module-level ``model``, ``tokenizer`` and ``graph``.

    Args:
        input_ids: Token ids generated so far; the code indexes the model
            logits as ``[0, -1, :]`` and concatenates along the last axis.
        node: Graph node whose first successor receives the next token.
        length: Number of tokens still to generate.

    Returns:
        ``input_ids`` with the generated token ids appended.
    """
    if length == 0:
        return input_ids

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        outputs = model(input_ids)
    predictions = outputs.logits

    # Greedy choice: take the single highest-scoring next token
    logits = predictions[0, -1, :]
    token_id = torch.argmax(logits).unsqueeze(0)

    # Compute the score of the predicted token
    token_score = get_log_prob(logits, token_id)

    # Add the predicted token to the list of input ids
    new_input_ids = torch.cat([input_ids, token_id.unsqueeze(0)], dim=-1)

    # Record the token and its probability on the next node of the chain
    next_token = tokenizer.decode(token_id, skip_special_tokens=True)
    current_node = list(graph.successors(node))[0]
    graph.nodes[current_node]['tokenscore'] = np.exp(token_score) * 100
    # The "_<length>" suffix keeps labels distinct when a token repeats.
    graph.nodes[current_node]['token'] = next_token + f"_{length}"

    # Recursive call
    input_ids = greedy_search(new_input_ids, current_node, length-1)

    return input_ids

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load the GPT-2 model and tokenizer
# (both come from the 'gpt2' checkpoint; this rebinds the module-level
# `model` and `tokenizer` names used by earlier cells)
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Helper function for generating text
def generate_text(model, tokenizer, context, max_length, method, **kwargs):
    """Generate a continuation of ``context`` with the named decoding strategy.

    ``model.generate`` has no ``method`` argument, so the strategy name is
    translated into generation settings here instead of being forwarded.

    Args:
        model: Object exposing ``generate(input_ids, **settings)``.
        tokenizer: Object exposing ``encode(text, return_tensors=...)`` and
            ``decode(ids)``.
        context: Prompt text.
        max_length: Forwarded to ``generate`` as ``max_length``.
        method: ``'greedy'``, ``'beam_search'``, ``'top_p_sample'`` or
            ``'top_k_sample'``.
        **kwargs: Extra generation settings (e.g. ``num_beams``, ``top_p``,
            ``top_k``); they override the per-method defaults.

    Returns:
        str: The decoded first generated sequence.

    Raises:
        ValueError: If ``method`` is not a supported strategy name.
    """
    # Only the sampling strategies draw tokens at random; greedy and beam
    # search must run with sampling off. top_k=0 disables the top-k filter so
    # 'top_p_sample' is limited by top_p alone unless the caller sets top_k.
    method_defaults = {
        'greedy': {'do_sample': False},
        'beam_search': {'do_sample': False},
        'top_p_sample': {'do_sample': True, 'top_k': 0},
        'top_k_sample': {'do_sample': True},
    }
    if method not in method_defaults:
        raise ValueError(
            f"method must be one of {sorted(method_defaults)}, got {method!r}"
        )

    # Encode the context
    input_ids = tokenizer.encode(context, return_tensors='pt')

    # Generate the text; caller-supplied settings win over the defaults
    generation_settings = {**method_defaults[method], **kwargs}
    output = model.generate(
        input_ids,
        max_length=max_length,
        **generation_settings
    )

    # Decode the text
    return tokenizer.decode(output[0])

# NOTE(review): the assignments below bind `greedy_search`, `beam_search`,
# `nucleus_sampling` and `top_k_sampling` to generated strings. Earlier cells
# define functions named `greedy_search` and `beam_search`, and `beam_search`
# calls `top_k_sampling` / `nucleus_sampling`, so re-running those cells after
# this one will fail until the functions are redefined.

# Greedy Search
greedy_search = generate_text(model, tokenizer, 'Hello,', 50, 'greedy')
print(greedy_search)

# Beam Search
beam_search = generate_text(model, tokenizer, 'Hello,', 50, 'beam_search', num_beams=5)
print(beam_search)

# Nucleus Sampling
nucleus_sampling = generate_text(model, tokenizer, 'Hello,', 50, 'top_p_sample', top_p=0.9)
print(nucleus_sampling)

# Top-k Sampling
top_k_sampling = generate_text(model, tokenizer, 'Hello,', 50, 'top_k_sample', top_k=50)
print(top_k_sampling)