In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 PyPDF2 datasets

In [None]:
import os
import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

import argparse
from argparse import ArgumentTypeError
import json
import csv
import os
import pandas as pd
import random
import string
import PyPDF2
import io
import sys
import traceback

import logging

# Configure the root logger at INFO level and create this module's logger.
# NOTE: `logging` here is the standard-library module; the plain `import logging`
# above rebinds the name that was first imported from `transformers`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def extract_text_from_pdf(filepath, length_of_chunk):
    """
    Extracts the text of a PDF file and splits it into fixed-size chunks.

    Args:
      filepath (str): Path to the PDF file.
      length_of_chunk (int): The maximum number of characters per chunk.

    Returns:
      list of str: The text chunks in page order, or None if the file does
      not exist or could not be read/chunked (the reason is printed).
    """
    try:
        # Open the PDF file in read-binary mode
        with open(filepath, 'rb') as file:
            reader = PyPDF2.PdfReader(file)

            # Join once instead of repeated `+=`; `or ''` guards against a page
            # yielding None instead of a string.
            text = ''.join(page.extract_text() or '' for page in reader.pages)

        # Split the text into chunks of the specified length
        return [text[i:i + length_of_chunk] for i in range(0, len(text), length_of_chunk)]

    except FileNotFoundError:
        print(f"The file {filepath} does not exist.")
        return None
    except Exception as e:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, and the message now says what actually went wrong.
        print(f"An unexpected error occurred while reading {filepath}: {e}")
        return None

def get_module_structure(directory_path):
    """ Walks a directory tree and groups the files found by directory name.

    Args:
      directory_path (str): Path to the directory you want to explore.

    Returns:
      A dictionary keyed by directory base name. Each value is the list of
      file paths (walk root joined with file name, using forward slashes)
      found in directories with that base name; directories that share a
      base name share one entry.
    """
    structure = {}

    for current_dir, _subdirs, filenames in os.walk(directory_path):
        # Use forward slashes regardless of platform
        folder = os.path.normpath(current_dir).replace('\\', '/')

        # One bucket per directory base name, created on first sight
        bucket = structure.setdefault(os.path.basename(folder), [])
        for filename in filenames:
            bucket.append(os.path.join(folder, filename).replace('\\', '/'))

    return structure

def get_module_csv(directory_path):
    """ Walks a directory tree and groups the .csv files found by directory name.

    Args:
      directory_path (str): Path to the directory you want to explore.

    Returns:
      A dictionary keyed by directory base name. Each value is the list of
      .csv file paths (extension matched case-insensitively, forward slashes)
      found there; directories without .csv files map to an empty list.
    """
    csv_by_module = {}

    for current_dir, _subdirs, filenames in os.walk(directory_path):
        # Use forward slashes regardless of platform
        folder = os.path.normpath(current_dir).replace('\\', '/')

        # One bucket per directory base name, created on first sight
        bucket = csv_by_module.setdefault(os.path.basename(folder), [])
        for filename in filenames:
            if not filename.lower().endswith('.csv'):
                continue
            bucket.append(os.path.join(folder, filename).replace('\\', '/'))

    return csv_by_module


def get_module_json(directory_path):
    """ Walks a directory tree and groups the .json files found by directory name.

    Args:
      directory_path (str): Path to the directory you want to explore.

    Returns:
      A dictionary keyed by directory base name. Each value is the list of
      .json file paths (extension matched case-insensitively, forward slashes)
      found there; directories without .json files map to an empty list.
    """
    json_by_module = {}

    for current_dir, _subdirs, filenames in os.walk(directory_path):
        # Use forward slashes regardless of platform
        folder = os.path.normpath(current_dir).replace('\\', '/')

        # One bucket per directory base name, created on first sight
        bucket = json_by_module.setdefault(os.path.basename(folder), [])
        for filename in filenames:
            if not filename.lower().endswith('.json'):
                continue
            bucket.append(os.path.join(folder, filename).replace('\\', '/'))

    return json_by_module

def normalize_json_data(data):
    """
    Coerces parsed JSON into a list of dictionaries (one per CSV row).

    A single dictionary becomes a one-element list. A list containing no
    dictionaries is converted item by item: inner lists become dictionaries
    keyed by stringified position, anything else becomes {'value': item}.
    A list made only of dictionaries is returned unchanged.

    Raises:
      ValueError: For a list mixing dictionaries with other items, or for
      any value that is neither a dictionary nor a list.
    """
    if isinstance(data, dict):
        return [data]

    if not isinstance(data, list):
        raise ValueError('JSON data structure is not supported.')

    dict_flags = [isinstance(entry, dict) for entry in data]

    if not any(dict_flags):
        # No dictionaries at all (this includes the empty list): build rows
        rows = []
        for entry in data:
            if isinstance(entry, list):
                rows.append({str(position): element for position, element in enumerate(entry)})
            else:
                rows.append({'value': entry})
        return rows

    if all(dict_flags):
        return data

    raise ValueError('JSON data structure is not supported.')
def json_to_csv(json_file_path, csv_file_path):
    """
    Converts a JSON file into a CSV file.

    The JSON content is normalized with normalize_json_data; the CSV header is
    the union of all row keys in first-seen order. This is best-effort: any
    failure is printed with a traceback instead of being raised, and nothing
    is returned.

    Args:
      json_file_path (str): Path to the input JSON file.
      csv_file_path (str): Path to the output CSV file (overwritten).
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        normalized_data = normalize_json_data(data)

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # column order is stable between runs (iterating a set of strings is not).
        headers = list(dict.fromkeys(key for row in normalized_data for key in row))

        with open(csv_file_path, 'w', encoding='utf-8', newline='') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=headers)
            writer.writeheader()
            writer.writerows(normalized_data)
        print(f"Converted JSON to CSV: {csv_file_path}")
    except Exception as e:
        print(f"Error converting JSON to CSV: {e}")
        traceback.print_exc()

# def json_to_csv(json_file_path, csv_file_path):
#     """
#     Converts a JSON file into a CSV file.

#     Args:
#       json_file_path (str): Path to the input JSON file.
#       csv_file_path (str): Path to the output CSV file.
#     """
#     try:
#         # Check if JSON file exists
#         if not os.path.isfile(json_file_path):
#             raise FileNotFoundError(f"JSON file does not exist: {json_file_path}")

#         # Read JSON data
#         with open(json_file_path, 'r', encoding='utf-8') as json_file:
#             data = json.load(json_file)

#         # Normalize the JSON data to a list of dictionaries
#         normalized_data = normalize_json_data(data)

#         # Aggregate headers from all data entries
#         headers = {key for item in normalized_data for key in item.keys()}

#         # Write CSV data
#         with open(csv_file_path, 'w', encoding='utf-8', newline='') as csv_file:
#             csv_writer = csv.DictWriter(csv_file, fieldnames=headers)
#             csv_writer.writeheader()  # Write the headers to the CSV file
#             csv_writer.writerows(normalized_data)  # Write data rows

#         print(f"Successfully converted {json_file_path} to {csv_file_path}.")

#     except json.JSONDecodeError:
#         print(f"Error: Invalid JSON data in file {json_file_path}.")
#     except FileNotFoundError as e:
#         print(str(e))
#     except ValueError as e:
#         print(str(e))
#     except Exception as e:
#         print(f"An unexpected error occurred: {e}")



def print_random_row(dataframe):
    """
    Formats one randomly chosen row of a Pandas DataFrame as a prompt string.

    Despite the name, nothing is printed; the text is returned.

    Args:
      dataframe (pd.DataFrame): A Pandas DataFrame object.

    Returns:
      str: "Following the instructions:" followed by one "column: value"
      line per column, or a fixed message when the DataFrame is empty.
    """
    if dataframe.empty:
        return "The DataFrame is empty."

    # Pick the row by position so any index labelling works
    position = random.randint(0, len(dataframe) - 1)

    lines = []
    for column, value in dataframe.iloc[position].items():
        lines.append(f"{column}: {value}")

    return "Following the instructions:\n" + "\n".join(lines)
def chunkify(text, length_of_chunk):
    """
    Cuts text into consecutive slices of at most `length_of_chunk` characters.

    Args:
        text (str): The text to be split.
        length_of_chunk (int): The maximum length of each chunk.

    Returns:
        list of str: The slices in order; the last one may be shorter.
        Empty text gives an empty list.
    """
    pieces = []
    for start in range(0, len(text), length_of_chunk):
        pieces.append(text[start:start + length_of_chunk])
    return pieces
def get_all_rows(dataframe, length_of_chunk=2048):
    """
    Renders every row of a Pandas DataFrame as text and chunks it by length.

    Each row becomes "Row <index label>:" followed by one "column: value"
    line per column and a trailing newline; that text is then split with
    chunkify.

    Args:
        dataframe (pd.DataFrame): A Pandas DataFrame object.
        length_of_chunk (int): The maximum length of each chunk.

    Returns:
        List[str]: All chunks in row order. An empty DataFrame yields a
        single placeholder string rather than an empty list.
    """
    if dataframe.empty:
        return ["The DataFrame is empty."]

    chunks = []
    for label, record in dataframe.iterrows():
        fields = [f"{column}: {value}" for column, value in record.items()]
        body = "\n".join(fields)
        # A long row may span several chunks
        chunks += chunkify(f"Row {label}:\n{body}\n", length_of_chunk)

    return chunks

def process_json_file(json_file_path, length_of_chunk=2048):
    """
    Converts a JSON file to a sibling CSV file and returns its rows as text chunks.

    Args:
      json_file_path (str): Path to the JSON file.
      length_of_chunk (int): The maximum length of each chunk.

    Returns:
      List[str]: The formatted, chunked rows produced by get_all_rows.
    """
    # Swap only the final extension. A plain str.replace('.json', '.csv') also
    # rewrites '.json' inside directory names and does nothing for '.JSON',
    # in which case the CSV would be written over the source file.
    base_path, _extension = os.path.splitext(json_file_path)
    csv_file_path = base_path + '.csv'
    json_to_csv(json_file_path, csv_file_path)

    # Load the CSV into a pandas DataFrame
    dataframe = pd.read_csv(csv_file_path)

    # Get all rows from the DataFrame, formatted and chunked
    return get_all_rows(dataframe, length_of_chunk)

def preprocess_text(text):
    """
    Strips ASCII punctuation from the text and lowercases it.

    Args:
      text (str): The text to preprocess.

    Returns:
      str: The text without any character from string.punctuation, in lowercase.
    """
    # Translation table that deletes every punctuation character
    strip_punctuation = str.maketrans('', '', string.punctuation)
    without_punctuation = text.translate(strip_punctuation)
    return without_punctuation.lower()

def process_files_text(file_path, length_of_chunk=2048):
    """
    Reads a UTF-8 text file in chunks and preprocesses each chunk.

    Args:
      file_path (str): The path to the file to process.
      length_of_chunk (int): The size passed to each read call.

    Returns:
      List[Tuple[int, str]]: One (position, preprocessed chunk) tuple per
      chunk, where position is the value file.tell() reported just before
      the chunk was read.

    Raises:
      FileNotFoundError: If file_path is not an existing regular file.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"The file at path {file_path} does not exist.")

    results = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        reached_end = False
        while not reached_end:
            position = handle.tell()  # Position before this chunk is read
            raw = handle.read(length_of_chunk)
            if raw:
                results.append((position, preprocess_text(raw)))
            # Stop on an empty read or on a short (final) chunk
            reached_end = (not raw) or len(raw) < length_of_chunk

    return results
# def process_files(file_path, length_of_chunk=200):
#     try:
#         file_extension = get_file_extension(file_path)
#         if not file_extension:
#             # Optionally handle files with no extension differently here
#             raise ValueError(f'Error processing file {file_path}: File has no extension')
#         if file_extension == '.pdf':
#             return extract_text_from_pdf(file_path, length_of_chunk)
#         elif file_extension in ['.json', '.csv']:
#             return process_json_file(file_path, length_of_chunk)
#         elif file_extension in ['.txt', '.md', '.py', '.ipynb']:
#             return process_files_text(file_path, length_of_chunk)
#         else:
#             raise NotImplementedError(f'File type {file_extension} is not supported.')
#     except (ValueError, NotImplementedError) as e:
#         # Log error and continue with the next file
#         print(f"Skipping file {file_path} due to error: {e}")
#         return None

def get_file_extension(file_path):
    """Returns the lowercased extension of file_path (with the dot), or '' if it has none."""
    return os.path.splitext(file_path)[1].lower()

def process_files(file_path, length_of_chunk=2048):
    """
    Dispatches a file to the matching reader based on its extension.

    Handles .pdf, .json, .csv and .txt/.md/.py files. Errors are not raised
    to the caller: a message is printed (with a traceback for unexpected
    errors) and None is returned.

    Args:
      file_path (str): The path to the file to process.
      length_of_chunk (int): The chunk size forwarded to the reader.

    Returns:
      Whatever the selected reader returns, or None when the file has no
      extension, has an unsupported extension, or processing failed.
    """
    try:
        extension = get_file_extension(file_path)
        if not extension:
            raise ValueError(f'File {file_path} has no extension.')

        if extension == '.pdf':
            return extract_text_from_pdf(file_path, length_of_chunk)
        if extension == '.json':
            return process_json_file(file_path, length_of_chunk)
        if extension == '.csv':
            return get_all_rows(pd.read_csv(file_path), length_of_chunk)
        if extension in ['.txt', '.md', '.py']:
            return process_files_text(file_path, length_of_chunk)

        raise NotImplementedError(f'File type {extension} is not supported.')

    except FileNotFoundError:
        print(f'File not found: {file_path}')
    except ValueError as ve:
        print(f'ValueError: {ve}')
    except NotImplementedError as nie:
        print(f'NotImplementedError: {nie}')
    except Exception as e:
        print(f'An unexpected error occurred while processing {file_path}: {e}')
        traceback.print_exc()

    # Every handled error path ends here
    return None

# directory_path = "/content/drive/MyDrive"
# try:
#     module_structure = get_module_structure(directory_path)
#     for module, files in module_structure.items():
#         print(f'Module: {module}')
#         for file in files:
#             try:
#                 result = process_files(file, length_of_chunk=1000)
#                 # Only print the result if it's not None
#                 if result is not None:
#                     print("\n", result[:1][0])
#             except Exception as e:
#                 # Catch any other unexpected exceptions and log them
#                 print(f"An error occurred while processing {file}: {e}")
#         print("\n")
# except Exception as e:
#     print(f"An error occurred while getting the module structure: {e}")


class CustomTextDataset(Dataset):
    """
    Torch dataset of text chunks loaded from files and tokenized on access.

    Args:
      tokenizer: Callable used to tokenize one chunk of text.
      file_paths (list of str): Files to load via process_files.
      length_of_chunk (int): Chunk size forwarded to process_files.
      max_length (int): Length each item is padded/truncated to (default 512).
    """

    def __init__(self, tokenizer, file_paths, length_of_chunk, max_length=512):
        self.tokenizer = tokenizer
        self.texts = []
        self.length_of_chunk = length_of_chunk
        self.max_length = max_length
        self.file_paths = file_paths
        for file_path in file_paths:
            self.texts.extend(self._load_and_process_file(file_path))

    @staticmethod
    def _as_text(item):
        """Returns the text of a chunk, unwrapping (offset, text) tuples."""
        # process_files_text yields (position, text) tuples while the other
        # readers yield plain strings; the tokenizer must only see the text.
        if isinstance(item, tuple):
            return item[-1]
        return item

    def _load_and_process_file(self, file_path):
        """Returns the text chunks of one file, or [] if it could not be processed."""
        processed_data = process_files(file_path, self.length_of_chunk)
        if processed_data is None:
            return []
        return [self._as_text(item) for item in processed_data]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenizes chunk `idx`, padded/truncated to max_length, as PyTorch tensors."""
        return self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )
    # Get file paths from the module structure
# Collect the path of every file under the hard-coded project directory.
# NOTE: this runs at import time; a missing directory simply yields no files.
directory_path = "C:/Users/heman/Desktop/Hemanth/LLM-Finetuning-Hub/"
file_paths = []
try:
    module_structure = get_module_structure(directory_path)
    for files in module_structure.values():
        # Entries already have the form "<walk root>/<file name>"; joining the
        # module name in front again would corrupt relative paths.
        file_paths.extend(files)
except Exception as e:
    print(f"An error occurred while getting the module structure: {e}")

def positive_int(value):
    """argparse type: converts value with int() and requires the result to be > 0."""
    parsed = int(value)
    if parsed > 0:
        return parsed
    raise ArgumentTypeError(f"{value} is an invalid positive int value")

def non_negative_int(value):
    """argparse type: converts value with int() and requires the result to be >= 0."""
    parsed = int(value)
    if parsed >= 0:
        return parsed
    raise ArgumentTypeError(f"{value} is an invalid non-negative int value")

def str_to_bool(value):
    """argparse type: maps yes/true/t/y/1 to True and no/false/f/n/0 to False, ignoring case."""
    lowered = value.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise ArgumentTypeError(f"{value} is not a valid boolean value")

def create_parser():
    """
    Builds the command-line parser for the fine-tuning script.

    Options cover model/dataset selection, QLoRA, bitsandbytes quantization,
    TrainingArguments values, SFT settings and an optional evaluation switch.

    Returns:
      argparse.ArgumentParser: The configured parser.
    """
    # Initialize the argument parser
    parser = argparse.ArgumentParser(description="Fine-tune a transformer model with customizable parameters.")

    # Model and data selection
    parser.add_argument("--model_name", type=str, default="gpt2", help="The model that you want to train from the Hugging Face hub.")
    parser.add_argument("--dataset_name", type=str, default='mlabonne/guanaco-llama2-1k', help="The instruction dataset to use.")
    parser.add_argument("--Dataset_path", type=str, default='/content/drive/MyDrive/Applied-Deep-Learning/06 - Speech & Music', help="Directory where the training data will be stored")
    parser.add_argument("--new_model", type=str, default="hemanthkandimalla_models", help="Fine-tuned model name.")

    # QLoRA parameters
    parser.add_argument("--lora_r", type=positive_int, default=64, help="LoRA attention dimension.")
    parser.add_argument("--lora_alpha", type=positive_int, default=16, help="Alpha parameter for LoRA scaling.")
    parser.add_argument("--lora_dropout", type=float, default=0.1, help="Dropout probability for LoRA layers.")

    # bitsandbytes parameters
    parser.add_argument("--use_4bit", type=str_to_bool, default=True, help="Activate 4-bit precision base model loading.")
    parser.add_argument("--bnb_4bit_compute_dtype", type=str, default="float16", choices=["float16", "float32"], help="Compute dtype for 4-bit base models.")
    parser.add_argument("--bnb_4bit_quant_type", type=str, default="nf4", choices=["fp4", "nf4"], help="Quantization type (fp4 or nf4).")
    parser.add_argument("--use_nested_quant", type=str_to_bool, default=False, help="Activate nested quantization for 4-bit base models (double quantization).")

    # TrainingArguments parameters
    parser.add_argument("--output_dir", type=str, default="./results", help="Output directory for model predictions and checkpoints.")
    parser.add_argument("--num_train_epochs", type=positive_int, default=1, help="Number of training epochs.")
    parser.add_argument("--fp16", type=str_to_bool, default=False, help="Enable fp16 training.")
    parser.add_argument("--bf16", type=str_to_bool, default=False, help="Enable bf16 training (set to True with an A100).")
    parser.add_argument("--per_device_train_batch_size", type=positive_int, default=4, help="Batch size per GPU for training.")
    parser.add_argument("--per_device_eval_batch_size", type=positive_int, default=4, help="Batch size per GPU for evaluation.")
    parser.add_argument("--gradient_accumulation_steps", type=positive_int, default=1, help="Number of update steps to accumulate gradients for.")
    parser.add_argument("--gradient_checkpointing", type=str_to_bool, default=True, help="Enable gradient checkpointing.")
    parser.add_argument("--max_grad_norm", type=float, default=0.3, help="Maximum gradient norm (gradient clipping).")
    parser.add_argument("--learning_rate", type=float, default=2e-4, help="Initial learning rate (AdamW optimizer).")
    parser.add_argument("--weight_decay", type=float, default=0.001, help="Weight decay for all layers except bias/LayerNorm weights.")
    parser.add_argument("--optim", type=str, default="paged_adamw_32bit", choices=["adamw", "paged_adamw_32bit"], help="Optimizer to use.")
    parser.add_argument("--lr_scheduler_type", type=str, default="cosine", choices=["cosine", "linear"], help="Learning rate schedule.")
    # The -1 default bypasses non_negative_int: argparse applies `type` only to string values.
    parser.add_argument("--max_steps", type=non_negative_int, default=-1, help="Number of training steps (overrides num_train_epochs if positive).")
    parser.add_argument("--warmup_ratio", type=float, default=0.03, help="Ratio of steps for a linear warmup.")
    parser.add_argument("--group_by_length", type=str_to_bool, default=True, help="Group sequences into batches with same length.")
    parser.add_argument("--save_steps", type=non_negative_int, default=0, help="Save checkpoint every X update steps.")
    parser.add_argument("--logging_steps", type=positive_int, default=25, help="Log every X update steps.")

    # SFT parameters
    parser.add_argument("--max_seq_length", type=positive_int, default=None, help="Maximum sequence length to use. If None, use model's default.")
    parser.add_argument("--packing", type=str_to_bool, default=False, help="Pack multiple short examples in the same input sequence.")
    parser.add_argument("--device_map", type=str, default='{"": 0}', help="Load the entire model on the specified GPU.")

    # Evaluation: main() reads args.do_eval, so the option has to exist.
    parser.add_argument("--do_eval", type=str_to_bool, default=False, help="Evaluate on the dataset's validation split after training.")

    return parser



def _stack_tokenized_batch(data):
    """Collates CustomTextDataset items into one batch of stacked tensors."""
    # Each item field is indexed with [0] before stacking; items come from a
    # tokenizer call on a single text with return_tensors='pt'.
    return {
        'input_ids': torch.stack([f['input_ids'][0] for f in data]),
        'attention_mask': torch.stack([f['attention_mask'][0] for f in data]),
        'labels': torch.stack([f['input_ids'][0] for f in data]),
    }


def main():
    """
    Fine-tunes a causal language model with QLoRA using the command-line options.

    Steps: parse arguments, load the tokenizer, build the training dataset
    (local files under --Dataset_path, otherwise the Hub dataset named by
    --dataset_name), load the quantized model, train with SFTTrainer, save
    the model and optionally evaluate. Each stage logs its error and returns
    early instead of raising.
    """
    parser = create_parser()
    # Jupyter/Colab kernels put their own arguments (e.g. "-f <connection file>")
    # in sys.argv; parse_args() would abort on them.
    args, unknown_args = parser.parse_known_args()

    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    if unknown_args:
        logger.warning(f"Ignoring unrecognized arguments: {unknown_args}")

    try:
        # The dataset pads to max_length on access, so the pad token must be
        # configured before the dataset is built.
        tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"
    except Exception as e:
        logger.error(f"Error setting up model and tokenizer: {e}")
        return

    # None lets SFTTrainer pick its own collator for datasets it tokenizes itself.
    data_collator = None
    try:
        # Load dataset
        if args.Dataset_path is not None:
            module_structure = get_module_structure(args.Dataset_path)
            # Entries are already "<walk root>/<file name>" paths; do not
            # prefix the module name again.
            file_paths = [path for files in module_structure.values() for path in files]
            dataset = CustomTextDataset(tokenizer, file_paths, length_of_chunk=1000)
            if len(dataset) == 0:
                logger.error(f"Error loading dataset: no usable text found under {args.Dataset_path}")
                return
            # Items are pre-tokenized tensors, so they need the stacking collator.
            data_collator = _stack_tokenized_batch
        else:
            dataset = load_dataset(args.dataset_name, split="train")
    except Exception as e:
        # Abort here; continuing would only fail later with an undefined dataset.
        logger.error(f"Error loading dataset: {e}")
        return

    try:
        # Set compute_dtype based on the argument
        compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)

        # Check GPU compatibility with bfloat16 (only meaningful when CUDA is present)
        if compute_dtype == torch.float16 and args.use_4bit and torch.cuda.is_available():
            major, _ = torch.cuda.get_device_capability()
            if major >= 8:
                logger.info("=" * 80)
                logger.info("Your GPU supports bfloat16: accelerate training with bf16=True")
                logger.info("=" * 80)

        # Load model with QLoRA configuration
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=args.use_4bit,
            bnb_4bit_quant_type=args.bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=args.use_nested_quant,
        )

        model = AutoModelForCausalLM.from_pretrained(
            args.model_name,
            quantization_config=bnb_config,
            low_cpu_mem_usage=True
        )
        model.config.use_cache = False
        model.config.pretraining_tp = 1

        # Load LoRA configuration
        peft_config = LoraConfig(
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout,
            r=args.lora_r,
            bias="none",
            task_type="CAUSAL_LM",
        )

        # Set training parameters
        training_arguments = TrainingArguments(
            output_dir=args.output_dir,
            num_train_epochs=args.num_train_epochs,
            per_device_train_batch_size=args.per_device_train_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            optim=args.optim,
            save_steps=args.save_steps,
            logging_steps=args.logging_steps,
            learning_rate=args.learning_rate,
            weight_decay=args.weight_decay,
            fp16=args.fp16,
            bf16=args.bf16,
            max_grad_norm=args.max_grad_norm,
            # Pass the integer through: -1 already means "no step limit",
            # whereas None breaks the trainer's numeric comparisons.
            max_steps=args.max_steps,
            warmup_ratio=args.warmup_ratio,
            group_by_length=args.group_by_length,
            lr_scheduler_type=args.lr_scheduler_type,
            report_to="tensorboard"
        )

    except Exception as e:
        logger.error(f"Error setting up model and tokenizer: {e}")
        return

    try:
        # Set supervised fine-tuning parameters
        trainer = SFTTrainer(
            model=model,
            train_dataset=dataset,
            peft_config=peft_config,
            dataset_text_field="text",
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer,
            args=training_arguments,
            packing=args.packing,
            data_collator=data_collator,
        )

        # Train model
        trainer.train()

        # Save trained model
        trainer.save_model(args.output_dir)

        # Optionally, evaluate the model after training. getattr keeps this
        # working with a parser that does not define --do_eval.
        if getattr(args, "do_eval", False):
            # Load evaluation dataset
            eval_dataset = load_dataset(args.dataset_name, split="validation")
            # Evaluate model
            results = trainer.evaluate(eval_dataset)
            logger.info(f"Evaluation results: {results}")

    except Exception as e:
        logger.error(f"Error during training or evaluation: {e}")
        return

    logger.info("Training and evaluation completed successfully.")

# Run the fine-tuning pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# NOTE(review): the default --Dataset_path points under /content/drive, so this
# cell presumably has to run before main() — confirm the intended cell order.
from google.colab import drive
drive.mount('/content/drive')