In [None]:
!git clone https://github.com/google/jax.git

In [None]:
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
!locale.getpreferredencoding = getpreferredencoding

In [5]:
!pip install -q -U transformers accelerate evaluate deepspeed tqdm datasets peft langchain  unstructured  unstructured[pdf]
!%reload_ext dotenv
!%dotenv

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m816.1/816.1 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m65.1 M

Based on your requirements, I've adjusted the `LoraConfig` arguments for fine-tuning the Mistral model. Please note that the exact best settings can vary depending on the specific characteristics of your dataset and task. Here's a possible configuration:

```python
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=64,
    lora_alpha=256,
    lora_dropout=0.2,
    fan_in_fan_out=True,
    bias="all",
    modules_to_save=["classifier/score", "pooler"],
    init_lora_weights="gaussian",
    target_modules=["q_proj", "k_proj"],
    layers_to_transform=list(range(12)),  # Assuming a 12-layer model, adjust as needed
    layers_pattern="custom_pattern",
    rank_pattern={
        "model.decoder.layers.0.encoder_attn.k_proj": 16,
        "model.decoder.layers.2.encoder_attn.k_proj": 32
    },
    alpha_pattern={
        "model.decoder.layers.0.encoder_attn.k_proj": 64,
        "model.decoder.layers.4.encoder_attn.k_proj": 128
    },
)
```

Here's a brief explanation of the key parameters:

- `task_type`: The type of task you're performing. In this case, it's a causal language modeling task.
- `inference_mode`: Whether the model is in inference mode or not. Here, it's set to `False` for training.
- `r`: The rank of the low-rank approximation in LoRA.
- `lora_alpha`: The alpha parameter for the LoRA transformation.
- `lora_dropout`: The dropout rate for the LoRA transformation.
- `fan_in_fan_out`: Whether to use the fan-in/fan-out initialization for LoRA.
- `bias`: The type of bias to use in the LoRA transformation. Here, it's set to "all".
- `modules_to_save`: The modules to save after the LoRA transformation.
- `init_lora_weights`: The method to initialize LoRA weights. Here, it's set to "gaussian".
- `target_modules`: The modules to apply the LoRA transformation to.
- `layers_to_transform`: The layers to apply the LoRA transformation to. Here, it's set to all layers in a 12-layer model, but adjust as needed.
- `layers_pattern`: The pattern to apply for transforming layers. Here, it's set to "custom_pattern".
- `rank_pattern`: A dictionary specifying the rank for each module.
- `alpha_pattern`: A dictionary specifying the alpha for each module.



In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [3]:
#________________________________*FINE-TUNING ON LARGE LANGUAGE MODELS*____________________________________
import warnings
import csv
import matplotlib.pyplot as plt
import torch
import os
import pandas as pd
from tqdm import tqdm
from accelerate import Accelerator
from torch.utils.data import DataLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from torch.utils.data import DataLoader
from typing import (Union,
                    List,
                    Dict,
                    Optional,
                    Dict,
                    Any
)
from datasets import (load_dataset,
                      DatasetDict
)
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          default_data_collator,
                          get_linear_schedule_with_warmup
)
from transformers import (set_seed,
                          Trainer,
                          TrainingArguments
)
from peft import (
                  get_peft_config,
                  get_peft_model,
                  PromptTuningInit,
                  PromptTuningConfig,
                  TaskType,
                  PeftType,
                  LoraConfig
)

warnings.filterwarnings('ignore')

DEVICE = "cuda"
MODEL_NAME_OR_PATH = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
TOKENIZER_NAME_OR_PATH = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MAX_LENGTH = 512
LR = 3e-2
NUM_EPOCHS = 3
BATCH_SIZE = 1

#=========*DATASET ARGUMENTS* =====================
FOLDER_PATH='/content/jax'
OUTPUTFILE_PATH=f"{FOLDER_PATH}/output.csv"
DATASET_NAME = "INPUT_DATASET_FROM_HUGGING_FACE"
TEXT_COLUMN = "CONTENT_COLUMN"
LABEL_COLUMN = "LABEL_COLUMN"

NEW_COLUMN_NAMES=["ID",TEXT_COLUMN,LABEL_COLUMN]
#=========*PEFT MODEL ARGUMENTS* =====================
peft_config =LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=64,
    lora_alpha=256,
    lora_dropout=0.2,
    fan_in_fan_out=True,
    bias="all",
    modules_to_save=["classifier/score", "pooler"],
    init_lora_weights="gaussian",
    # target_modules=["q_proj", "k_proj"],
    # modules_to_save=["lm_head"],

    # layers_to_transform=[2, 4, 6],
    # layers_pattern="custom_pattern",
    # rank_pattern={
    #     "model.decoder.layers.0.encoder_attn.k_proj": 16,
    #     "model.decoder.layers.2.encoder_attn.k_proj": 32
    # },
    # alpha_pattern={
    #     "model.decoder.layers.0.encoder_attn.k_proj": 64,
    #     "model.decoder.layers.4.encoder_attn.k_proj": 128
    # },
    # megatron_config={
    #     "hidden_size": 4096,
    #     "num_attention_heads": 32,
    #     "num_layers": 24
    # },
    # megatron_core="custom_megatron_core",
    # loftq_config=LoraConfig(
    #     quantization_bits=8,
    #     quantization_range=128
    # )
)

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME_OR_PATH)
#=========*creating model * =====================
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME_OR_PATH )
model = get_peft_model(model, peft_config)

print(f"Number of trainable parameters: {model.print_trainable_parameters()}")
print(f"Architecture:",model)

# accelerator = Accelerator()
model.to(DEVICE)
# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
# scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=NUM_EPOCHS)

accelerator = Accelerator()

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 9,011,200 || all params: 1,109,059,584 || trainable%: 0.8125081943298008
Number of trainable parameters: None
Architecture: PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.2, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (

In [None]:
import csv
import pandas as pd
from typing import List, Dict
from langchain_community.document_loaders import DirectoryLoader

def save_to_csv(file_path: str, data: List[Dict[str, str]], mode: str = 'a') -> None:
    """
    Save or append data to a CSV file.

    :param file_path: Path to the CSV file to save or append data.
    :param data: List of dictionaries containing the data to be saved.
    :param mode: File opening mode ('a' for append, 'w' for write).
    """
    with open(file_path, mode, newline='', encoding='utf-8') as csvfile:
        fieldnames = data[0].keys() if data else []
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore', escapechar='\\', quoting=csv.QUOTE_NONE)

        if mode == 'w':
            writer.writeheader()

        for row in data:
            writer.writerow(row)

def rename_columns(csv_file: str, new_columns: List[str]) -> None:
    """
    This function renames the columns of a CSV file.

    Parameters:
    csv_file (str): The path to the CSV file.
    new_columns (List[str]): The new column names.

    Returns:
    None
    """
    # Load the CSV file
    df = pd.read_csv(csv_file, error_bad_lines=False)

    # Check if the number of new column names matches the number of columns in the CSV file
    if len(df.columns) != len(new_columns):
        raise ValueError("The number of new column names must match the number of columns in the CSV file.")

    # Rename the columns
    df.columns = new_columns

    # Save the dataframe to the same CSV file
    df.to_csv(csv_file, index=False)

def load_docs_from_folder(folder_path: str, file_glob: str) -> List[Dict[str, str]]:
    """
    Load documents from a folder with a specific file extension and return their content and metadata.

    :param folder_path: The path to the folder containing the files.
    :param file_glob: The file extension or pattern to filter files.
    :return: A list of dictionaries with file content and metadata.
    """
    directory_loader = DirectoryLoader(folder_path, glob=file_glob, show_progress=True,
                                       use_multithreading=True, silent_errors=True)
    return directory_loader.load()

def main(folder_path: str, csv_output_path: str, new_column_names: List[str] = None) -> None:
    """
    Main function to load documents from a directory and save them into a CSV file.

    :param folder_path: The path to the folder containing the files.
    :param csv_output_path: The path to the output CSV file.
    :param new_column_names: The new column names for the CSV file.
    """
    file_types = ['.md', '.pdf', '.py', '.csv','.txt']
    first_run = True
    doc_id = 1  # Initialize document ID

    for file_type in file_types:
        print(f"============================ *{file_type[1:]} files* ==============================")
        docs = load_docs_from_folder(folder_path, f"**/*{file_type}")

        if not docs:
            print(f"No {file_type[1:]} files found.")
            continue

        # Assuming that the Document object has 'page_content' and 'source' properties.
        data_to_save = [{'id': str(doc_id + i),  # Generate an incremental ID for each document.
                         'content': doc.page_content,
                         'source': doc.metadata['source']} for i, doc in enumerate(docs) if hasattr(doc, 'metadata')]

        # Update the document ID for the next batch of files
        doc_id += len(docs)

        # If it's the first run, write headers to the CSV, otherwise append without headers.
        save_to_csv(csv_output_path, data_to_save, 'w' if first_run else 'a')
        first_run = False
        if new_column_names is not None:
            rename_columns(csv_output_path, new_column_names)

folder_path = FOLDER_PATH
csv_output_path = OUTPUTFILE_PATH
new_column_names = NEW_COLUMN_NAMES  # Define NEW_COLUMN_NAMES before using it
main(folder_path, csv_output_path, new_column_names)



# if __name__ == "__main__":
#     # NEW_COLUMN_NAMES=["ID", "Content", "Source"]
#     # FOLDER_PATH="/content/research_papers-"
#     # OUTPUTFILE_PATH="/content/drive/MyDrive/hemanth.csv"
#     folder_path = FOLDER_PATH
#     csv_output_path = OUTPUTFILE_PATH
#     main(folder_path, csv_output_path)

    # rename_columns(csv_output_path, ["id", "hemanth", "path"])
    # folder_path = 'pytorch-stable-diffusion'
    # csv_output_path = 'output_data_structure.csv'
    # main(folder_path, csv_output_path)

In [None]:
!nivida-smi

In [None]:
import csv
import os
from typing import List
from langchain_community.document_loaders import PyPDFLoader
from torch.utils.data import DataLoader
def write_to_csv(file_path: str, data: dict, write_header: bool) -> None:
    """
    Function to append data into a CSV file.

    Args:
    file_path (str): The path to the CSV file.
    data (dict): The data to be appended into the CSV file.
    write_header (bool): Whether to write the header.
    """
    mode = 'a' if os.path.exists(file_path) else 'w'
    with open(file_path, mode, newline='', encoding='UTF-8', errors='ignore') as file:
        writer = csv.DictWriter(file, fieldnames=["content", "documents", "metasource"], quoting=csv.QUOTE_ALL, escapechar='\\')
        if write_header and mode == 'w':
            writer.writeheader()
        try:
            writer.writerow({k: data[k] for k in ["content", "documents", "metasource"]})
        except UnicodeEncodeError:
            print(f"Warning: UnicodeEncodeError encountered for file {data['documents']}. Skipping this file.")

def read_pdfs_from_folder(folder_path: str, csv_file_path: str) -> None:
    """
    Function to recursively read PDF files from a folder and its subfolders and extract their content.

    Args:
    folder_path (str): The path to the folder containing the PDF files.
    csv_file_path (str): The path to the CSV file.
    """
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            if file_name.endswith(".pdf"):
                full_file_path = os.path.join(root, file_name)
                loader = PyPDFLoader(full_file_path)
                pages = loader.load_and_split()
                for page in pages:
                    data = {
                        "content": page.page_content,
                        "documents": file_name,
                        "metasource": page.metadata['source']
                    }
                    write_to_csv(csv_file_path, data, True)

# Usage

folder_path =FOLDER_PATH
csv_file_path =
read_pdfs_from_folder(folder_path, csv_file_path)

In [None]:
import csv
import pandas as pd
from typing import List, Dict
from langchain_community.document_loaders import DirectoryLoader

def save_to_csv(file_path: str, data: List[Dict[str, str]], mode: str = 'a') -> None:
    """
    Save or append data to a CSV file.

    :param file_path: Path to the CSV file to save or append data.
    :param data: List of dictionaries containing the data to be saved.
    :param mode: File opening mode ('a' for append, 'w' for write).
    """
    with open(file_path, mode, newline='', encoding='utf-8') as csvfile:
        fieldnames = data[0].keys() if data else []
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        if mode == 'w':
            writer.writeheader()

        for row in data:
            writer.writerow(row)
def rename_columns(csv_file: str, new_columns: List[str]) -> None:
    """
    This function renames the columns of a CSV file.

    Parameters:
    csv_file (str): The path to the CSV file.
    new_columns (List[str]): The new column names.

    Returns:
    None
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)

    # Check if the number of new column names matches the number of columns in the CSV file
    if len(df.columns) != len(new_columns):
        raise ValueError("The number of new column names must match the number of columns in the CSV file.")

    # Rename the columns
    df.columns = new_columns

    # Save the dataframe to the same CSV file
    df.to_csv(csv_file, index=False)
def load_docs_from_folder(folder_path: str, file_glob: str) -> List[Dict[str, str]]:
    """
    Load documents from a folder with a specific file extension and return their content and metadata.

    :param folder_path: The path to the folder containing the files.
    :param file_glob: The file extension or pattern to filter files.
    :return: A list of dictionaries with file content and metadata.
    """
    directory_loader = DirectoryLoader(folder_path, glob=file_glob, show_progress=True,
                                       use_multithreading=True, silent_errors=True)
    return directory_loader.load()
def main(folder_path: str, csv_output_path: str) -> None:

    """
    Main function to load documents from a directory and save them into a CSV file.

    :param folder_path: The path to the folder containing the files.
    :param csv_output_path: The path to the output CSV file.
    """
    file_types = ['.md', '.pdf', '.py', '.csv','.txt']
    first_run = True
    doc_id = 1  # Initialize document ID

    for file_type in file_types:
        print(f"============================*{file_type[1:]} files*==============================")
        docs = load_docs_from_folder(folder_path, f"**/*{file_type}")

        if not docs:
            print(f"No {file_type[1:]} files found.")
            continue

        # Assuming that the Document object has 'page_content' and 'source' properties.
        data_to_save = [{'id': str(doc_id + i),  # Generate an incremental ID for each document.
                         'content': doc.page_content,
                         'source': doc.metadata['source']} for i, doc in enumerate(docs) if hasattr(doc, 'metadata')]

        # Update the document ID for the next batch of files
        doc_id += len(docs)

        # If it's the first run, write headers to the CSV, otherwise append without headers.
        save_to_csv(csv_output_path, data_to_save, 'w' if first_run else 'a')
        first_run = False
        if NEW_COLUMN_NAMES is not None:
            rename_columns(csv_output_path, NEW_COLUMN_NAMES)
#============================== * Dataset creation's * =============================================
             # TEXT_COLUMN = "content"
             # LABEL_COLUMN = "source"



if __name__ == "__main__":
    NEW_COLUMN_NAMES=["ID", "Content", "Source"]
    FOLDER_PATH="/content/research_papers-"
    folder_path = FOLDER_PATH
    OUTPUTFILE_PATH="/content/research_papers-/output.csv"
    csv_output_path = OUTPUTFILE_PATH
    main(folder_path, csv_output_path)






In [4]:
def advanced_data_loader(input: Union[str, Dict[str, str]], format: Optional[str] = None, split_ratios: Optional[Dict[str, float]] = None) -> Optional[DatasetDict]:
    """
    Loads a dataset from a given input path or dictionary specifying file paths and splits it.

    :param input: A string representing the dataset name or directory, or a dictionary containing file paths.
    :param format: The format of the dataset if loading from a file (e.g., 'csv' or 'json').

    :param split_ratios: A dictionary with keys 'train', 'test', and 'eval' containing split ratios.

    : example

    File_path={'train': 'path_file.csv', 'test': 'path_file.csv'}, 'csv'
    :return: A loaded and split dataset or None in case of failure.
    """
    if split_ratios is None:
        split_ratios = {'train': 0.8, 'test': 0.1, 'eval': 0.1}

    try:
        # Load the dataset
        if isinstance(input, dict) and format in ['csv', 'json']:
            dataset = load_dataset(format, data_files=input)
        elif isinstance(input, str) and format == 'text':
            dataset = load_dataset(format, data_dir=input)
        elif isinstance(input, str) and format is None:
            dataset = load_dataset(input)
        else:
            warnings.warn("Invalid input or format. Please provide a valid dataset name, directory, or file paths.")
            return None
    except FileNotFoundError as e:
        warnings.warn(str(e))
        return None

    # Split the dataset
    if dataset:
        split_dataset = dataset['train'].train_test_split(test_size=split_ratios['test'] + split_ratios['eval'])
        test_eval_dataset = split_dataset['test'].train_test_split(test_size=split_ratios['eval'] / (split_ratios['test'] + split_ratios['eval']))
        dataset = DatasetDict({
            'train': split_dataset['train'],
            'test': test_eval_dataset['train'],
            'eval': test_eval_dataset['test']
        })

    print("Splits: ", dataset.keys())
    print("Columns: ", {split: dataset[split].column_names for split in dataset.keys()})
    return dataset


In [5]:
dataset=advanced_data_loader('fka/awesome-chatgpt-prompts')

Downloading readme:   0%|          | 0.00/274 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.6k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Splits:  dict_keys(['train', 'test', 'eval'])
Columns:  {'train': ['act', 'prompt'], 'test': ['act', 'prompt'], 'eval': ['act', 'prompt']}


In [6]:
train_dataset=dataset['train']
print(train_dataset.column_names)

['act', 'prompt']


**Optional FIM transformations**


Autoregressive language models typically generate sequences from left to right. By applying the FIM transformations, the model can also learn to infill text.  Check out ["Efficient Training of Language Models to Fill in the Middle" paper](https://arxiv.org/pdf/2207.14255.pdf) to learn more about the technique.
We'll define the FIM transformations here and will use them when creating the Iterable Dataset. However, if you want to omit transformations, feel free to set `fim_rate` to 0.

In [None]:
MODEL="bigcode/starcoderbase-1b" # Model checkpoint on the Hugging Face Hub
DATASET="smangrul/hf-stack-v1"   # Dataset on the Hugging Face Hub
DATA_COLUMN="content"            # Column name containing the code content

SEQ_LENGTH=2048                  # Sequence length

# Training arguments
MAX_STEPS=2000                   # max_steps
BATCH_SIZE=16                    # batch_size
GR_ACC_STEPS=1                   # gradient_accumulation_steps
LR=5e-4                          # learning_rate
LR_SCHEDULER_TYPE="cosine"       # lr_scheduler_type
WEIGHT_DECAY=0.01                # weight_decay
NUM_WARMUP_STEPS=30              # num_warmup_steps
EVAL_FREQ=100                    # eval_freq
SAVE_FREQ=100                    # save_freq
LOG_FREQ=25                      # log_freq
OUTPUT_DIR="peft-starcoder-lora-a100" # output_dir
BF16=True                        # bf16
FP16=False                       # no_fp16

# FIM trasformations arguments
FIM_RATE=0.5                     # fim_rate
FIM_SPM_RATE=0.5                 # fim_spm_rate

# LORA
LORA_R=8                         # lora_r
LORA_ALPHA=32                    # lora_alpha
LORA_DROPOUT=0.0                 # lora_dropout
LORA_TARGET_MODULES="c_proj,c_attn,q_attn,c_fc,c_proj"    # lora_target_modules

# bitsandbytes config
USE_NESTED_QUANT=True            # use_nested_quant
BNB_4BIT_COMPUTE_DTYPE="bfloat16"# bnb_4bit_compute_dtype

SEED=0

In [None]:
eval_dataset.dataset

Dataset({
    features: ['chosen', 'rejected'],
    num_rows: 16080
})

In [19]:
import torch
from transformers import AutoTokenizer


def preprocess_function(examples, text_column, label_column):
    """
    Preprocess the dataset.

    Args:
        examples (dict): A dictionary where keys are column names and values are lists of data.
        text_column (str): The name of the column containing text data.
        label_column (str): The name of the column containing label data.

    Returns:
        dict: A dictionary containing tokenized inputs and labels suitable for model training.
    """
    inputs = [f"{text}: {examples[text_column][i]} Label: " for i, text in enumerate(examples[text_column])]
    targets = [str(examples[label_column][i]) for i in range(len(examples[label_column]))]

    # Tokenize inputs and labels
    model_inputs = tokenizer(inputs, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors='pt')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors='pt')

    # Replace padding token id's in labels with -100 so they are ignored in loss calculation
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_input]
        for label_input in labels["input_ids"]
    ]

    # Ensure labels are the same size as model_inputs
    if 'attention_mask' in model_inputs:
        labels["attention_mask"] = model_inputs['attention_mask']

    model_inputs["labels"] = torch.tensor(labels["input_ids"])

    return model_inputs

def preprocess_wrapper(examples):
    return preprocess_function(examples, text_column='prompt', label_column='act')

processed_datasets = dataset.map(
    preprocess_wrapper,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=True,
    desc="Running tokenizer on dataset",
)

In [21]:
train_dataset=processed_datasets['train']



In [22]:
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=BATCH_SIZE, pin_memory=True
)

In [17]:
processed_datasets = dataset.map(
    preprocess_wrapper,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=True,
    desc="Running tokenizer on dataset",
)


Running tokenizer on dataset:   0%|          | 0/122 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/15 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/16 [00:00<?, ? examples/s]

In [None]:
  for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        print(batch)
        # outputs = model(**batch)

In [13]:
preprocessed_data['input_ids']

tensor([[   1,  306,  864,  ...,    2,    2,    2],
        [   1,  306,  864,  ...,    2,    2,    2],
        [   1, 1094,  263,  ...,    2,    2,    2],
        ...,
        [   1,  306,  864,  ...,    2,    2,    2],
        [   1,  306,  864,  ...,    2,    2,    2],
        [   1,  306,  864,  ...,    2,    2,    2]])

In [15]:
processed_datasets = dataset.map(
            preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=dataset["train"].column_names,
            load_from_cache_file=True,
            desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/122 [00:00<?, ? examples/s]

TypeError: preprocess_function() missing 2 required positional arguments: 'text_column' and 'label_column'

In [None]:
def preprocess_function(examples):
  """
  Preprocess the dataset.
  """
  batch_size = len(examples[TEXT_COLUMN])
  inputs = [f"{TEXT_COLUMN } : {x} Label : " for x in examples[TEXT_COLUMN ]]
  targets = [str(x) for x in examples[LABEL_COLUMN]]
  model_inputs = tokenizer(inputs)
  labels = tokenizer(targets, add_special_tokens=False)  # don't add bos token because we concatenate with inputs
  for i in range(batch_size):
      sample_input_ids = model_inputs["input_ids"][i]
      label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
      model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
      labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
      model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
  for i in range(batch_size):
      sample_input_ids = model_inputs["input_ids"][i]
      label_input_ids = labels["input_ids"][i]
      model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
          MAX_LENGTH - len(sample_input_ids)
      ) + sample_input_ids
      model_inputs["attention_mask"][i] = [0] * (MAX_LENGTH - len(sample_input_ids)) + model_inputs[
          "attention_mask"
      ][i]
      labels["input_ids"][i] = [-100] * (MAX_LENGTH - len(sample_input_ids)) + label_input_ids
      model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:MAX_LENGTH])
      model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:MAX_LENGTH])
      labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:MAX_LENGTH])
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

def test_preprocess_function(examples):
  batch_size = len(examples[TEXT_COLUMN])
  inputs = [f"{TEXT_COLUMN} : {x} Label : " for x in examples[TEXT_COLUMN]]
  model_inputs = tokenizer(inputs)
  # print(model_inputs)
  for i in range(batch_size):
      sample_input_ids = model_inputs["input_ids"][i]
      model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
          MAX_LENGTH - len(sample_input_ids)
      ) + sample_input_ids
      model_inputs["attention_mask"][i] = [0] * (MAX_LENGTH - len(sample_input_ids)) + model_inputs[
          "attention_mask"
      ][i]
      model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:MAX_LENGTH])
      model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:MAX_LENGTH])
  return model_inputs

In [None]:
dataset =advanced_data_loader(input={'train':OUTPUTFILE_PATH,"test":OUTPUTFILE_PATH}, format='csv', split_ratios={'train': 0.8, 'test': 0.1, 'eval': 0.1})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Splits:  dict_keys(['train', 'test', 'eval'])
Columns:  {'train': ['ID', 'Content', 'Source'], 'test': ['ID', 'Content', 'Source'], 'eval': ['ID', 'Content', 'Source']}


In [None]:
from torch.utils.data import DataLoader

In [None]:

with accelerator.main_process_first():
  processed_datasets = dataset.map(
            preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=dataset["train"].column_names,
            load_from_cache_file=True,
            desc="Running tokenizer on dataset",
)
train_dataset = processed_datasets["train"]
with accelerator.main_process_first():
  processed_datasets = dataset.map(
            test_preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=dataset["test"].column_names,
            load_from_cache_file=False,
            desc="Running tokenizer on dataset",
  )
test_dataset = processed_datasets["test"]
with accelerator.main_process_first():
  processed_datasets = dataset.map(
            test_preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=dataset["test"].column_names,
            load_from_cache_file=False,
            desc="Running tokenizer on dataset",
  )
eval_dataset = processed_datasets["eval"]
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=BATCH_SIZE, pin_memory=True
)
eval_dataloader = DataLoader(
    eval_dataset, shuffle=True,collate_fn=default_data_collator, batch_size=BATCH_SIZE, pin_memory=True
)
test_dataloader = DataLoader(
    test_dataset, collate_fn=default_data_collator, batch_size=BATCH_SIZE, pin_memory=True
)


In [None]:
# ================== * TRAINING AND EVALUATION * ============================

accelerator.wait_for_everyone()
model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler = accelerator.prepare(
model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler
    )
model.to(DEVICE)
accelerator.print(model)
train_losses = []
eval_losses = []
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    train_epoch_loss = total_loss / len(train_dataloader)
    train_losses.append(train_epoch_loss)  # Store train loss for this epoch

    # # model.eval()
    # eval_loss = 0
    # eval_preds = []
    # for step, batch in enumerate(tqdm(eval_dataloader)):
    #     batch = {k: v.to(DEVICE) for k, v in batch.items()}
    #     # with torch.no_grad():

    #     outputs = model(**batch)
    #     loss = outputs.loss
    #     eval_loss += loss.detach().float()
    #     # eval_preds.extend(
    #         # tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(),
    #                               #  skip_special_tokens=True)
    #     # )
    # eval_epoch_loss = eval_loss / len(eval_dataloader)
    # eval_losses.append(eval_epoch_loss)  # Store eval loss for this epoch

    # eval_ppl = torch.exp(eval_epoch_loss)
    # train_ppl = torch.exp(train_epoch_loss)
    # print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

plt.figure(figsize=(10, 5))
plt.plot(train_losses.cpu().numpy(), label='Training Loss')  # Move tensor to CPU and convert to numpy before plotting
# plt.plot(eval_losses.cpu().numpy(), label='Evaluation Loss')  # Do the same for eval_losses if needed
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Fine-tuning on Large Language Models - Training and Evaluation Loss')
plt.legend()
plt.show()  # Corrected typo here
plt.savefig('fine_tuning_loss_plot.png')




In [2]:
import os
import warnings
import torch
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_int8_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    default_data_collator,
    get_linear_schedule_with_warmup,
    BitsAndBytesConfig
)
from tqdm import tqdm
from accelerate import Accelerator
from dataclasses import dataclass, field
from torch.utils.data import Dataset, DataLoader
from enum import Enum
from datasets import load_dataset,DatasetDict
from typing import Union , Dict,Optional,Any,List


MAX_LENGTH = 512
LR = 3e-2
NUM_EPOCHS = 3
BATCH_SIZE = 1
DEVICE='cuda'
#========= * DATASET ARGUMENTS * =====================

OUTPUT_DIR_PATH = " "
DATASET_LOCAL_PATH = " "
DATASET_NAME = " "
TEXT_COLUMN = " "
LABEL_COLUMN = " "

TEXT_COLUMN = " "
LABEL_COLUMN = " "
def advanced_data_loader(input: Union[str, Dict[str, str]], format: Optional[str] = None, split_ratios: Optional[Dict[str, float]] = None) -> Optional[DatasetDict]:
    """
    Loads a dataset from a given input path or dictionary specifying file paths and splits it.

    :param input: A string representing the dataset name or directory, or a dictionary containing file paths.
    :param format: The format of the dataset if loading from a file (e.g., 'csv' or 'json').
    :param split_ratios: A dictionary with keys 'train', 'test', and 'eval' containing split ratios.
    :return: A loaded and split dataset or None in case of failure.
    """
    if split_ratios is None:
        split_ratios = {'train': 0.8, 'test': 0.1, 'eval': 0.1}

    try:
        # Load the dataset
        if isinstance(input, dict) and format in ['csv', 'json']:
            dataset = load_dataset(format, data_files=input)
        elif isinstance(input, str) and format == 'text':
            dataset = load_dataset(format, data_dir=input)
        elif isinstance(input, str) and format is None:
            dataset = load_dataset(input)
        else:
            warnings.warn("Invalid input or format. Please provide a valid dataset name, directory, or file paths.")
            return None
    except FileNotFoundError as e:
        warnings.warn(str(e))
        return None

    # Split the dataset
    if dataset:
        split_dataset = dataset['train'].train_test_split(test_size=split_ratios['test'] + split_ratios['eval'])
        test_eval_dataset = split_dataset['test'].train_test_split(test_size=split_ratios['eval'] / (split_ratios['test'] + split_ratios['eval']))
        dataset = DatasetDict({
            'train': split_dataset['train'],
            'test': test_eval_dataset['train'],
            'eval': test_eval_dataset['test']
        })

    print("Splits: ", dataset.keys())
    print("Columns: ", {split: dataset[split].column_names for split in dataset.keys()})
    return dataset


def create_tokenizer(tokenizer_name_or_path: str = 'gpt2') -> AutoTokenizer:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    if tokenizer.bos_token_id is None:
        tokenizer.bos_token_id = tokenizer.pad_token_id
    if tokenizer.eos_token_id is None:
        tokenizer.eos_token_id = tokenizer.pad_token_id
    if tokenizer.unk_token_id is None:
        tokenizer.unk_token_id = tokenizer.pad_token_id
    if tokenizer.sep_token_id is None:
        tokenizer.sep_token_id = tokenizer.pad_token_id
    if tokenizer.cls_token_id is None:
        tokenizer.cls_token_id = tokenizer.pad_token_id
    if tokenizer.mask_token_id is None:
        tokenizer.mask_token_id = tokenizer.pad_token_id
    return tokenizer


def Create_model_loader(model_name_or_path: str = 'gpt2') -> AutoModelForCausalLM:
    model=AutoModelForCausalLM.from_pretrained(model_name_or_path)
    return model

tokenizer=create_tokenizer("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

def preprocess_function(examples, text_column, label_column):
    """
    Preprocess the dataset.

    Args:
        examples (dict): A dictionary where keys are column names and values are lists of data.
        text_column (str): The name of the column containing text data.
        label_column (str): The name of the column containing label data.

    Returns:
        dict: A dictionary containing tokenized inputs and labels suitable for model training.
    """
    inputs = [f"{text}: {examples[text_column][i]} Label: " for i, text in enumerate(examples[text_column])]
    targets = [str(examples[label_column][i]) for i in range(len(examples[label_column]))]

    # Tokenize inputs and labels
    model_inputs = tokenizer(inputs, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors='pt')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors='pt')

    # Replace padding token id's in labels with -100 so they are ignored in loss calculation
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_input]
        for label_input in labels["input_ids"]
    ]

    # Ensure labels are the same size as model_inputs
    if 'attention_mask' in model_inputs:
        labels["attention_mask"] = model_inputs['attention_mask']

    model_inputs["labels"] = torch.tensor(labels["input_ids"])

    return model_inputs



def main():
    dataset=advanced_data_loader('fka/awesome-chatgpt-prompts')

    tokenizer=create_tokenizer("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    model=Create_model_loader(
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    )

    def preprocess_wrapper(examples):

        return preprocess_function(examples, text_column='prompt', label_column='act')
    processed_datasets = dataset.map(
    preprocess_wrapper,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=True,
    desc=f"""
    This is kandimalla hemanth fine-tuning of any given model for given dataset

    """,
     )

    train_dataset=processed_datasets['train']
    test_dataset=processed_datasets['test']
    eval_dataset=processed_datasets['eval']
    train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=BATCH_SIZE, pin_memory=True
    )
    eval_dataloader = DataLoader(
        eval_dataset, shuffle=True,collate_fn=default_data_collator, batch_size=BATCH_SIZE, pin_memory=True
    )
    test_dataloader = DataLoader(
        test_dataset, collate_fn=default_data_collator, batch_size=BATCH_SIZE, pin_memory=True
    )
    # optimizer
    model.to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    # scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=NUM_EPOCHS)
    for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        train_epoch_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1} | Train Loss: {train_epoch_loss}")

    model.save_pretrained(f"{OUTPUT_DIR_PATH}")


main()

Splits:  dict_keys(['train', 'test', 'eval'])
Columns:  {'train': ['act', 'prompt'], 'test': ['act', 'prompt'], 'eval': ['act', 'prompt']}



    This is kandimalla hemanth fine-tuning of any given model for given dataset
    
    :   0%|          | 0…




    This is kandimalla hemanth fine-tuning of any given model for given dataset
    
    :   0%|          | 0…


    This is kandimalla hemanth fine-tuning of any given model for given dataset
    
    :   0%|          | 0…

  0%|          | 0/122 [00:02<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 44.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 19.06 MiB is free. Process 182008 has 14.73 GiB memory in use. Of the allocated memory 14.25 GiB is allocated by PyTorch, and 350.02 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
num_epochs=3
lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

In [None]:
!python  /content/sample_data/python.py --config "/content/sample_data/config.yml"

In [None]:
from tqdm import tqdm

In [None]:
def append_to_csv(file_name: str, row: Dict[str, Any]) -> None:
    with open(file_name, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=row.keys())
        writer.writerow(row)

def generate_response(model, tokenizer,
                      dataset: Dict[str, Any], device: str, text_column: str, i: int) -> None:
    model.eval()
    inputs = tokenizer(f'{text_column} : {dataset["test"][i][text_column]} Label : ', return_tensors="pt")
    with torch.no_grad():
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model.generate(
            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3
        )
    response = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
    # Append to CSV
    row = {'id': i, 'prompt': dataset["test"][i][text_column], 'response': response}
    append_to_csv('output.csv', row)

# Call the function
generate_response(model, tokenizer, dataset, DEVICE, TEXT_COLUMN, len(test_dataloader))


In [None]:
model.push_to_hub(
        f"TRAINING_ON LLMS_ADVERICAL_PAPERS_{MODEL_NAME_OR_PATH}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"),
        token = "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"
    )

# news papers finding's

In [None]:

def chars_token_ratio(dataset, tokenizer, data_columns, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset for each column.
    """

    results = {}
    for data_column in data_columns:
        total_characters, total_tokens = 0, 0
        for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
            total_characters += len(example[data_column])
            total_tokens += len(tokenizer(example[data_column]).tokens())

        results[data_column] = total_characters / total_tokens

    return results

train_data=dataset['train']
data_columns =dataset['train'].column_names  # replace with your column names
chars_per_token = chars_token_ratio(train_data, tokenizer, data_columns)
for column, ratio in chars_per_token.items():
    print(f"The character to token ratio of {column} is: {ratio:.2f}")


In [None]:
import functools
import numpy as np


# Helper function to get token ids of the special tokens for prefix, suffix and middle for FIM transformations.
@functools.lru_cache(maxsize=None)
def get_fim_token_ids(tokenizer):
    try:
        FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map["additional_special_tokens"][1:5]
        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
            tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]
        )
    except KeyError:
        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = None, None, None, None
    return suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id


## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py
def permute(
    sample,
    np_rng,
    suffix_tok_id,
    prefix_tok_id,
    middle_tok_id,
    pad_tok_id,
    fim_rate=0.5,
    fim_spm_rate=0.5,
    truncate_or_pad=False,
):
    """
    Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:
    PSM and SPM (with a probability of fim_spm_rate).
    """

    # The if condition will trigger with the probability of fim_rate
    # This means FIM transformations will apply to samples with a probability of fim_rate
    if np_rng.binomial(1, fim_rate):

        # Split the sample into prefix, middle, and suffix, based on randomly generated indices stored in the boundaries list.
        boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
        boundaries.sort()

        prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
        middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
        suffix = np.array(sample[boundaries[1] :], dtype=np.int64)

        if truncate_or_pad:
            # calculate the new total length of the sample, taking into account tokens indicating prefix, middle, and suffix
            new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3
            diff = new_length - len(sample)

            # trancate or pad if there's a difference in length between the new length and the original
            if diff > 0:
                if suffix.shape[0] <= diff:
                    return sample, np_rng
                suffix = suffix[: suffix.shape[0] - diff]
            elif diff < 0:
                suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])

        # With the probability of fim_spm_rateapply SPM variant of FIM transformations
        # SPM: suffix, prefix, middle
        if np_rng.binomial(1, fim_spm_rate):
            new_sample = np.concatenate(
                [
                    [prefix_tok_id, suffix_tok_id],
                    suffix,
                    [middle_tok_id],
                    prefix,
                    middle,
                ]
            )
        # Otherwise, apply the PSM variant of FIM transformations
        # PSM: prefix, suffix, middle
        else:

            new_sample = np.concatenate(
                [
                    [prefix_tok_id],
                    prefix,
                    [suffix_tok_id],
                    suffix,
                    [middle_tok_id],
                    middle,
                ]
            )
    else:
        # don't apply FIM transformations
        new_sample = sample

    return list(new_sample), np_rng


In [None]:
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
import random

# Create an Iterable dataset that returns constant-length chunks of tokens from a stream of text files.

class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for proccessing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
            fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.
            fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permuations that will use SPM.
            seed (int): Seed for random number generator.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
        content_field=dataset['train'].column_names,
        fim_rate=0.5,
        fim_spm_rate=0.5,
        seed=0,
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token[content_field[0]] * num_of_sequences
        self.content_field = content_field
        self.fim_rate = fim_rate
        self.fim_spm_rate = fim_spm_rate
        self.seed = seed

        (
            self.suffix_tok_id,
            self.prefix_tok_id,
            self.middle_tok_id,
            self.pad_tok_id,
        ) = get_fim_token_ids(self.tokenizer)
        if not self.suffix_tok_id and self.fim_rate > 0:
            print("FIM is not supported by tokenizer, disabling FIM")
            self.fim_rate = 0

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        np_rng = np.random.RandomState(seed=self.seed)
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(next(iterator)[self.content_field])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []

            for tokenized_input in tokenized_inputs:
                # optionally do FIM permutations
                if self.fim_rate > 0:
                    tokenized_input, np_rng = permute(
                        tokenized_input,
                        np_rng,
                        self.suffix_tok_id,
                        self.prefix_tok_id,
                        self.middle_tok_id,
                        self.pad_tok_id,
                        fim_rate=self.fim_rate,
                        fim_spm_rate=self.fim_spm_rate,
                        truncate_or_pad=False,
                    )

                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            examples = []
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    examples.append(input_ids)
            random.shuffle(examples)
            for example in examples:
                self.current_size += 1
                yield {
                    "input_ids": torch.LongTensor(example),
                    "labels": torch.LongTensor(example),
                }


train_dataset = ConstantLengthDataset(
        tokenizer,
        dataset['train'],
        infinite=True,
        seq_length=SEQ_LENGTH,
        chars_per_token=chars_per_token,
        content_field=dataset['train'].column_names,
        fim_rate=FIM_RATE,
        fim_spm_rate=FIM_SPM_RATE,
        seed=SEED,
)
eval_dataset = ConstantLengthDataset(
        tokenizer,
        dataset['eval'],
        infinite=False,
        seq_length=SEQ_LENGTH,
        chars_per_token=chars_per_token,
        content_field=dataset['train'].column_names ,
        fim_rate=FIM_RATE,
        fim_spm_rate=FIM_SPM_RATE,
        seed=SEED,
)

FIM is not supported by tokenizer, disabling FIM
FIM is not supported by tokenizer, disabling FIM


In [None]:
peft_model_id = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace(
    "/", "_"
)
model.save_pretrained(peft_model_id)
from peft import PeftModel, PeftConfig

peft_model_id = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace(
    "/", "_"
)

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)
model.to(DEVICE)
model.eval()
i = 4
inputs = tokenizer(f'{TEXT_COLUMN} : {dataset["test"][i][TEXT_COLUMN]} Label : ', return_tensors="pt")
print(dataset["test"][i][TEXT_COLUMN])
print(inputs)

with torch.no_grad():
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3
    )
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

In [None]:
def preprocess_function(examples):
    """
    Preprocess the dataset.

    Args:
        examples (dict): A dictionary where keys are column names and values are lists of column values.

    Returns:
        model_inputs (dict): A dictionary containing tokenized inputs, attention masks, and labels.
    """
    batch_size = len(examples['column1'])
    inputs = []
    for i in range(batch_size):
        input_str = ""
        for column in examples.keys():
            if column != 'label_column':
                input_str += f"{column}: {examples[column][i]} "
        input_str += "Label : "
        inputs.append(input_str)
    targets = [str(x) for x in examples['label_column']]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets, add_special_tokens=False)
    for i in range(batch_size):
      sample_input_ids = model_inputs["input_ids"][i]
      label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
      model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
      labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
      model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
    for i in range(batch_size):
      sample_input_ids = model_inputs["input_ids"][i]
      label_input_ids = labels["input_ids"][i]
      model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
          MAX_LENGTH - len(sample_input_ids)
      ) + sample_input_ids
      model_inputs["attention_mask"][i] = [0] * (MAX_LENGTH - len(sample_input_ids)) + model_inputs[
          "attention_mask"
      ][i]
      labels["input_ids"][i] = [-100] * (MAX_LENGTH - len(sample_input_ids)) + label_input_ids
      model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:MAX_LENGTH])
      model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:MAX_LENGTH])
      labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:MAX_LENGTH])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



def test_preprocess_function(examples):
    """
    Preprocess the test dataset.

    Args:
        examples (dict): A dictionary where keys are column names and values are lists of column values.

    Returns:
        model_inputs (dict): A dictionary containing tokenized inputs and attention masks.
    """
    batch_size = len(examples['column1'])
    inputs = []
    for i in range(batch_size):
        input_str = ""
        for column in examples.keys():
            if column != 'label_column':
                input_str += f"{column}: {examples[column][i]} "
        input_str += "Label : "
        inputs.append(input_str)

        for i in range(batch_size):
      sample_input_ids = model_inputs["input_ids"][i]
      label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
      model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
      labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
      model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
    for i in range(batch_size):
      sample_input_ids = model_inputs["input_ids"][i]
      label_input_ids = labels["input_ids"][i]
      model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
          MAX_LENGTH - len(sample_input_ids)
      ) + sample_input_ids
      model_inputs["attention_mask"][i] = [0] * (MAX_LENGTH - len(sample_input_ids)) + model_inputs[
          "attention_mask"
      ][i]
      labels["input_ids"][i] = [-100] * (MAX_LENGTH - len(sample_input_ids)) + label_input_ids
      model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:MAX_LENGTH])
      model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:MAX_LENGTH])
      labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:MAX_LENGTH])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
