# Overview

The model we'll fine tune is Meta Llama 3.1

Requirements:

- Output file from 3-merge-data.ipynb
- Meta Llama3 Model downloaded

# Install Libraries

In [1]:
pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.


# Required Packages

In [2]:
import pandas as pd
from datasets import Dataset

# Hyper Parameters

In [3]:
# Load the data
SAVE_DIRECTORY = "../artifacts"


# Step 1: Load and Prepare the Dataset
Let's load the CSV and ensure that we handle only the necessary columns (reason and singleMessage). We'll also create the label mapping based on the reason column.

In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Load the CSV file
data = pd.read_csv(f'{SAVE_DIRECTORY}/data/output/1.0.0-3-merge-data.csv')

# Generate label mapping from unique labels in 'reason'
unique_labels = data['reason'].unique()
label_mapping = {i: label for i, label in enumerate(unique_labels)}
inv_label_mapping = {v: k for k, v in label_mapping.items()}

print("Label Mapping:", label_mapping)

# Encode the labels in the DataFrame
data['labels'] = data['reason'].apply(lambda x: inv_label_mapping[x])

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(data)

# Remove unnecessary columns for training
dataset = dataset.remove_columns(["reason"])

# Check the dataset
print(dataset)


Label Mapping: {0: 'Clean', 1: 'Politics not allowed outside of references to the market.', 2: 'Off-topic', 3: 'Caps for tickers only.', 4: 'Third-party links / content not allowed.', 5: 'Inappropriate comment.', 6: 'Personal or sensitive information not allowed in chat.', 7: 'Bullying a member or moderator.', 8: '"Any discussion related in any way to market manipulation is strictly prohibited, as is advising others on whether to buy, sell, or hold."', 9: 'Please provide more information when making comments like these. For example "AFRM is being shorted every candle, so I think it\'s manipulated" ', 10: 'Reviewed by admin internally; not necessary to post to public chat.', 11: 'Bypassing the chat filters is not allowed.', 12: 'False information or no source.', 13: 'Account number visible. Please remove from content before reposting.', 14: 'Perv is an inappropriate term please refrain from these kinds of discussions here', 15: 'False or misleading information, or no source.', 16: 'comm

# Step 2: Tokenize and Prepare for Training
Now we'll tokenize the dataset and prepare it for training:

In [6]:
from transformers import LlamaTokenizer

# Load the tokenizer
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")

# Tokenize the 'singleMessage' text
def tokenize_function(example):
    return tokenizer(example["singleMessage"], padding="max_length", truncation=True, max_length=128)

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Remove the original 'singleMessage' column after tokenization
tokenized_dataset = tokenized_dataset.remove_columns(["singleMessage"])

# Convert to PyTorch tensors
tokenized_dataset.set_format("torch")

# Split into train and validation datasets
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

datasets = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset
})


OSError: Can't load tokenizer for 'meta-llama/Meta-Llama-3.1-8B'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'meta-llama/Meta-Llama-3.1-8B' is the correct path to a directory containing all relevant files for a LlamaTokenizer tokenizer.

# Step 3: Fine-Tune the Llama Model
Now that the dataset is prepared, you can proceed with the fine-tuning process as previously outlined:

In [None]:
from transformers import LlamaForSequenceClassification, TrainingArguments, Trainer

# Load the pre-trained model
model = LlamaForSequenceClassification.from_pretrained("meta-llama/Llama-3.1", num_labels=len(label_mapping))

# Define training arguments
training_args = TrainingArguments(
    output_dir=f"{SAVE_DIRECTORY}/output/llama3-1",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the model and tokenizer
trainer.save_model(f'{SAVE_DIRECTORY}/models/llama3-1')
tokenizer.save_pretrained(f'{SAVE_DIRECTORY}/models/llama3-1')


# Step 4: Perform Inference with the Fine-Tuned Model
You can now use the fine-tuned model to predict the labels for new sentences:

In [None]:
import torch.nn.functional as F

def predict(text, model, tokenizer, label_mapping):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1).squeeze().cpu().numpy()
    result = {label_mapping[idx]: float(probs[idx]) for idx in range(len(label_mapping))}
    return result

# Test the prediction
text = "Why the hell is he not in prison?!?!??"
prediction = predict(text, model, tokenizer, label_mapping)
print(prediction)
