In [None]:
# Install Unsloth and its companion libraries.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
%pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
# Pin torch to 2.3.1; %pip targets the environment of the running kernel.
%pip install torch==2.3.1

In [None]:
# Progress bars for the token-counting loop; %pip targets the running kernel.
%pip install tqdm

In [None]:
import json

# Unsloth recommends importing it before transformers so its patches are applied.
from unsloth import is_bfloat16_supported, UnslothTrainer, UnslothTrainingArguments, FastLanguageModel
from transformers import TrainingArguments
# load_dataset and Dataset are used by the data-preparation cells below.
from datasets import load_dataset, DatasetDict, Dataset
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Lift pandas' display limits so frames are shown with every row and column.
# ('display.max_colwidth' is left at its default.)
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [None]:
# Load the pre-quantized Llama-3-8B-Instruct checkpoint and its tokenizer through Unsloth.
# `max_seq_length` is reused later when the trainer is built.
max_seq_length = 2048 # Choose any. Unsloth supports RoPE scaling internally
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)


In [None]:
# Display the loaded model's repr to inspect its structure.
model

In [None]:
# Wrap the base model with LoRA adapters. Besides the attention and MLP
# projections, the embedding matrix and the LM head are targeted as well,
# which is what the continual-pretraining setup needs.
model = FastLanguageModel.get_peft_model(
    model,
    r = 128,  # LoRA rank
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                      "embed_tokens", "lm_head",], # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # enable rank-stabilized LoRA
)

## cleaning further pre-training data - CUAD + ContractNLI

In [None]:
# Download CUAD-QA and turn both of its splits into DataFrames.
dataset_whole = load_dataset('theatticusproject/cuad-qa', trust_remote_code=True)
df_train, df_test = (pd.DataFrame(dataset_whole[split]) for split in ('train', 'test'))

# Stack the splits, then keep one row per distinct contract text ('context').
df_combined = pd.concat([df_train, df_test])
df_cuad_combined = df_combined.drop_duplicates(subset='context')

In [None]:
# Convert the de-duplicated CUAD frame into a `Dataset`.
dataset_cuad =  Dataset.from_pandas(df_cuad_combined)

In [None]:
# Display the CUAD dataset summary.
dataset_cuad

In [None]:
#########################################################

In [None]:
def load_nli_documents(json_path):
    """Read one ContractNLI split file into a DataFrame.

    The JSON file is expected to hold a top-level ``'documents'`` list; for
    every document the ``id``, ``file_name``, ``text`` and ``url`` entries are
    kept, one row per document.

    Args:
        json_path: Path of the split's JSON file.

    Returns:
        A DataFrame with the columns ``id``, ``file_name``, ``text``, ``url``
        (the columns are present even when the split has no documents).

    Raises:
        KeyError: If ``'documents'`` or one of the four fields is missing.
    """
    fields = ('id', 'file_name', 'text', 'url')
    # Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
    with open(json_path, 'r', encoding='utf-8') as file:
        documents = json.load(file)['documents']
    rows = [{field: doc[field] for field in fields} for doc in documents]
    return pd.DataFrame(rows, columns=list(fields))


# NOTE: df_test and df_train replace the CUAD frames of the same name created
# earlier; the CUAD data lives on in df_cuad_combined.
df_dev = load_nli_documents('nli_dataset/dev.json')
df_test = load_nli_documents('nli_dataset/test.json')
df_train = load_nli_documents('nli_dataset/train.json')

In [None]:
# Stack the dev, test and train NLI frames into one frame (row indices are kept as-is).
df_nli_combined = pd.concat([df_dev,df_test,df_train])

In [None]:
# Convert the combined NLI frame into a `Dataset`.
dataset_nli = Dataset.from_pandas(df_nli_combined)

In [None]:
# Display the NLI dataset summary.
dataset_nli

## counting number of tokens for further pre-training

In [None]:
def create_token_list(dataset, field):
    """Count the tokens of every contract in ``dataset``.

    dataset: indexable collection of records (``dataset[i][field]`` is read)
    field: name of the column that contains the contracts

    Returns a list with one token count per record, in dataset order. The
    counts come from the notebook-level ``tokenizer``; progress is reported
    through ``tqdm``.
    """
    return [
        len(tokenizer.encode(dataset[idx][field]))
        for idx in tqdm(range(len(dataset)))
    ]


In [None]:
#print(f"The number of total tokens for CUAD (test + train combined) is {np.sum(create_token_list(dataset_cuad, 'context'))}")

In [None]:
#print(f"The number of total tokens for NLI (test + train + dev combined) is {np.sum(create_token_list(dataset_nli, 'text'))}")

## producing the pie chart for pre-training data

In [None]:
# Load the CUAD CSV; its 'context' and 'contract type' columns are used below.
df_cuad = pd.read_csv('cuad_dataset.csv')

def get_token_number(contract):
    """Return how many tokens the notebook-level ``tokenizer`` produces for ``contract``.

    contract: the text of a single contract
    """
    return len(tokenizer.encode(contract))

# Tokenize every CUAD contract and total the token counts per contract type.
df_cuad['token count'] = df_cuad['context'].apply(get_token_number)

contract_type_sums = df_cuad.groupby('contract type')['token count'].sum().to_dict()

# The 'non-disclosure agreement' slice is set by hand rather than computed from df_cuad.
# NOTE(review): presumably the ContractNLI token total from the commented-out
# counting cell — confirm before reusing the figure.
contract_type_sums['non-disclosure agreement'] = 1302791

import matplotlib.colors as mcolors  # not used in this cell; kept in case other cells rely on the name

labels = list(contract_type_sums.keys())
sizes = list(contract_type_sums.values())

# Draw the pie on an explicit figure/axes pair. The wedges use the Set3 palette;
# the previously computed tab20b palette was never passed to the plot and is gone.
fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(sizes, labels=labels, colors=plt.cm.Set3.colors, autopct='%1.1f%%', startangle=140, textprops={'fontsize': 6.5})
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax.set_title('Token Count by Contract Type')

# Write the PDF before showing the figure.
fig.savefig('pie_chart.pdf', format='pdf')

# Display the pie chart
plt.show()

## merging the data

In [None]:
# Extract and rename columns from df_cuad_combined
# Keep only id + contract text from the de-duplicated CUAD frame and tag the origin.
df_cuad = (
    df_cuad_combined[['id', 'context']]
    .rename(columns={'context': 'contract'})
    .assign(source='CUAD')
)

In [None]:
# Extract and rename columns from df_nli_combined
# Keep only id + contract text from the combined NLI frame and tag the origin.
df_nli = (
    df_nli_combined[['id', 'text']]
    .rename(columns={'text': 'contract'})
    .assign(source='NLI')
)

In [None]:
# Combine the DataFrames
# Stack the CUAD and NLI contracts under a fresh 0..n-1 index and cast every
# id to str so the column has one consistent type.
df_combined = (
    pd.concat([df_cuad, df_nli], ignore_index=True)
    .assign(id=lambda frame: frame['id'].astype(str))
)

In [None]:
# Convert the merged CUAD + NLI frame into the `Dataset` handed to the trainer.
dataset_combined = Dataset.from_pandas(df_combined)

In [None]:
#print(f"The number of total tokens for the combined dataset (CUAD + NLI) is {np.sum(create_token_list(dataset_combined, 'contract'))}")

In [None]:
# Display the combined dataset summary.
dataset_combined

## training

In [None]:
# Build the trainer for further pre-training on the merged contracts and run it.
# The raw contract text is read from the 'contract' column of dataset_combined.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_combined,
    dataset_text_field = 'contract',
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        # 2 sequences per device x 8 accumulation steps = 16 sequences per optimizer step per device.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,
        warmup_ratio = 0.1,
        # One full pass over the data; earlier experiments capped the run with max_steps (2000 / 5) instead.
        num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-6,
        embedding_learning_rate = 1e-6,

        # Use bf16 when the hardware supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        save_steps = 50,
        save_total_limit = 10,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "cosine",
        seed = 3407,
        # The training-stats cell reads actual/checkpoint-*/trainer_state.json from this directory.
        output_dir = 'actual'
    ),
)

# Run training and keep the returned statistics.
trainer_stats = trainer.train()

In [None]:
# saving the LoRA adapters locally
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

In [None]:
# installing ollama
#!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
# Save to 8bit Q8_0
# Export the model and tokenizer to GGUF under "model_1_epoch".
# No quantization method is passed, so the library default applies
# (the original note on this cell says 8-bit Q8_0).
model.save_pretrained_gguf("model_1_epoch", tokenizer)

In [None]:
# Print the Ollama Modelfile text stored on the tokenizer's private `_ollama_modelfile` attribute.
# NOTE(review): presumably populated by the GGUF export above — confirm.
print(tokenizer._ollama_modelfile)

In [None]:
#!ollama create unsloth_model_1_epoch -f ./model_1_epoch/Modelfile

## training stats

In [None]:
# change the file path to the last checkpoint
file_path = 'actual/checkpoint-69/trainer_state.json'
with open(file_path, 'r') as file:
    data = json.load(file)

log_history = data['log_history']
steps = [entry['step'] for entry in log_history]
loss = [entry['loss'] for entry in log_history]

#plt.figure(figsize=(10, 6))
plt.plot(steps, loss, linestyle='-', color='green')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training Loss over Steps for 1 Epoch')
plt.show()