In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import os

In [3]:
os.chdir("../")

In [4]:
os.getcwd()

'/home/abubeker_shamil/Amharic_LLM_Finetuning'

In [5]:
import numpy as np

from datasets import load_dataset, load_metric, Dataset, DatasetDict, concatenate_datasets, load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

In [None]:
summary = pd.read_csv("data/raw/summary_cleaned.csv")

In [None]:
summary.isna().sum()

In [None]:
summary.dropna(inplace=True)

In [None]:
summary.isna().sum()

In [None]:
model_name = "Samuael/llama-2-7b-tebot-amharic"  # Replace with your chosen model name
# "rasyosef/bert-amharic-tokenizer"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
train_texts, val_texts, train_summaries, val_summaries = train_test_split(
    summary['clean_text'].tolist(), summary['clean_summary'].tolist(), test_size=0.1, random_state=42
)

In [None]:
# Tokenize the dataset in chunks to manage memory usage
chunk_size = 1000  # Reduce chunk size for easier debugging
train_chunks = [train_texts[i:i + chunk_size] for i in range(0, len(train_texts), chunk_size)]
train_summary_chunks = [train_summaries[i:i + chunk_size] for i in range(0, len(train_summaries), chunk_size)]
val_chunks = [val_texts[i:i + chunk_size] for i in range(0, len(val_texts), chunk_size)]
val_summary_chunks = [val_summaries[i:i + chunk_size] for i in range(0, len(val_summaries), chunk_size)]

# Function to tokenize and convert to Dataset
def tokenize_and_create_dataset(texts, summaries):
    # Debugging: Print types and samples of the inputs
    print(f"texts type: {type(texts)}, summaries type: {type(summaries)}")
    print(f"texts sample: {texts[:3]}, summaries sample: {summaries[:3]}")
    
    # Ensure the inputs are lists of strings
    if not isinstance(texts, list):
        raise ValueError("texts must be a list of strings")
    if not isinstance(summaries, list):
        raise ValueError("summaries must be a list of strings")
    if not all(isinstance(text, str) for text in texts):
        raise ValueError("All elements in texts must be strings")
    if not all(isinstance(summary, str) for summary in summaries):
        raise ValueError("All elements in summaries must be strings")

    # Tokenize
    encodings = tokenizer(texts, padding=True, truncation=True, max_length=512)
    labels = tokenizer(summaries, padding=True, truncation=True, max_length=512)
    
    # Ensure the labels are properly encoded
    encodings = {
        'input_ids': torch.tensor(encodings['input_ids']),
        'attention_mask': torch.tensor(encodings['attention_mask']),
        'labels': torch.tensor(labels['input_ids'])
    }
    
    return Dataset.from_dict(encodings)

In [None]:
# Tokenize and save train and validation datasets incrementally
train_datasets = []
val_datasets = []

for i in range(len(train_chunks)):
    print(f"Processing train chunk {i}")
    print(f"Type of train_summary_chunks[{i}]: {type(train_summary_chunks[i])}")
    print(f"Content of train_summary_chunks[{i}]: {train_summary_chunks[i][:5]}")
    print(f"Type of train_chunks[{i}]: {type(train_chunks[i])}")
    print(f"Content of train_chunks[{i}]: {train_chunks[i][:5]}")
    
    # Check the content before tokenizing
    if not all(isinstance(summary, str) for summary in train_summary_chunks[i]):
        raise ValueError(f"Chunk {i} in train_summary_chunks contains non-string elements.")
    if not all(isinstance(text, str) for text in train_chunks[i]):
        raise ValueError(f"Chunk {i} in train_chunks contains non-string elements.")
    
    train_dataset_chunk = tokenize_and_create_dataset(train_chunks[i], train_summary_chunks[i])
    train_datasets.append(train_dataset_chunk)

for i in range(len(val_chunks)):
    print(f"Processing val chunk {i}")
    print(f"Type of val_summary_chunks[{i}]: {type(val_summary_chunks[i])}")
    print(f"Content of val_summary_chunks[{i}]: {val_summary_chunks[i][:5]}")
    print(f"Type of val_chunks[{i}]: {type(val_chunks[i])}")
    print(f"Content of val_chunks[{i}]: {val_chunks[i][:5]}")
    
    # Check the content before tokenizing
    if not all(isinstance(summary, str) for summary in val_summary_chunks[i]):
        raise ValueError(f"Chunk {i} in val_summary_chunks contains non-string elements.")
    if not all(isinstance(text, str) for text in val_chunks[i]):
        raise ValueError(f"Chunk {i} in val_chunks contains non-string elements.")
    
    val_dataset_chunk = tokenize_and_create_dataset(val_chunks[i], val_summary_chunks[i])
    val_datasets.append(val_dataset_chunk)


In [None]:
# Concatenate the datasets
train_dataset = concatenate_datasets(train_datasets)
val_dataset = concatenate_datasets(val_datasets)

# Combine into a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

# Save the dataset to disk
dataset.save_to_disk('tokenized_dataset')

In [6]:
dataset = load_from_disk('tokenized_dataset')

In [7]:
# Print a sample from the dataset
print(dataset['train'][0])
print(dataset['validation'][0])

{'input_ids': [1, 29871, 228, 141, 165, 228, 141, 152, 228, 140, 179, 228, 139, 176, 228, 141, 151, 228, 140, 184, 29871, 228, 142, 171, 228, 143, 139, 228, 141, 150, 29871, 228, 139, 160, 228, 141, 161, 228, 140, 180, 228, 141, 152, 29871, 228, 142, 139, 228, 142, 179, 29871, 228, 141, 163, 228, 143, 144, 228, 141, 160, 228, 140, 186, 228, 139, 144, 29871, 228, 140, 167, 228, 139, 142, 29871, 228, 141, 163, 228, 139, 132, 228, 141, 152, 29871, 228, 140, 163, 228, 142, 172, 228, 140, 182, 29871, 228, 142, 171, 228, 141, 168, 228, 141, 152, 228, 139, 184, 228, 139, 182, 228, 140, 184, 29871, 228, 139, 158, 228, 140, 137, 228, 142, 174, 29871, 228, 142, 144, 228, 139, 184, 228, 143, 168, 29871, 228, 139, 139, 228, 139, 155, 228, 141, 153, 228, 139, 176, 29871, 228, 141, 168, 228, 142, 171, 228, 140, 179, 228, 139, 142, 228, 141, 171, 29871, 228, 141, 147, 228, 142, 144, 29871, 228, 142, 171, 228, 141, 165, 228, 141, 152, 228, 142, 181, 228, 142, 174, 228, 141, 150, 29871, 228, 143, 131, 

In [8]:
cache_dir= './cache'

In [9]:
from transformers import LlamaTokenizer

checkpoint = "iocuydi/llama-2-amharic-3784m"
commit_hash = "04fcac974701f1dab0b8e39af9d3ecfce07b3773"
# The commit hash is needed, because the model repo was rearranged after this commit (files -> finetuned/files),
# and I couldn't load the model from the new structure

tokenizer = LlamaTokenizer.from_pretrained(checkpoint, revision=commit_hash, cache_dir=cache_dir)

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [10]:
from peft import PeftModel
from transformers import LlamaForCausalLM, GenerationConfig

llama_model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_8bit=True,
    device_map="auto",
    cache_dir=cache_dir, # optional
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
llama_model.resize_token_embeddings(len(tokenizer)) # needed because the fine-tuned model extended the tokenizer

Embedding(51008, 4096)

In [12]:
# this is the model we want:
garii_model = PeftModel.from_pretrained(llama_model, "iocuydi/llama-2-amharic-3784m",revision =commit_hash, cache_dir= cache_dir)

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# Prepare the model for int8 training (optional, but helps reduce memory usage)
model = prepare_model_for_kbit_training(garii_model)

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA to the model
lora_model = get_peft_model(model, lora_config)

# Tokenize example input
input_text = "ሰላም እንዴት ነህ?"
inputs = tokenizer(input_text, return_tensors="pt")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2, 
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Define trainer
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=dataset['train'],  # Assuming `dataset` is your prepared dataset
    eval_dataset=dataset['validation']
)

In [14]:
trainer.model.print_trainable_parameters()

trainable params: 19,988,480 || all params: 7,331,975,168 || trainable%: 0.2726


In [None]:
# Start training
trainer.train()

In [None]:
# Save the model
model.save_pretrained("./finetuned_model")

In [None]:
# Test the fine-tuned model
output = model.generate(**inputs, max_length=50)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
#print the generated text to a file
with open('generated_text.txt', 'w') as f:
    f.write(generated_text)