In [None]:
# Install requirements for sagemaker
!pip install -r requirements.txt

In [None]:
import os

import pandas as pd
import torch
from datasets import Dataset
from datasets.utils import tqdm
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoTokenizer, pipeline, TrainingArguments, BitsAndBytesConfig, AutoModelForSequenceClassification, Trainer

In [None]:
# Authenticate against the Hugging Face Hub (the base model is loaded from there).
# The access token is read from the HF_TOKEN environment variable so that it never
# ends up in the saved notebook; if the variable is unset, token=None is passed and
# login() is expected to ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
base_model_name = "meta-llama/Llama-3.1-8B-Instruct"

# 4-bit NF4 quantization (no double quantization), computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# Base model with a freshly initialised 2-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config,
    num_labels=2,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Reuse the end-of-sequence token for padding.
tokenizer.pad_token_id = tokenizer.eos_token_id

# The sequence-classification head scores the last non-padding token of each row and
# finds it through config.pad_token_id. Without this assignment the model cannot tell
# padding from content and reads the final position of every max_length-padded row,
# and it rejects batches larger than one.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Wrap the loaded model and tokenizer in a text-classification pipeline.
# NOTE(review): `pipe` is not referenced by any later cell visible in this notebook.
pipe = pipeline(
    task="text-classification",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
    torch_dtype=torch.float16,
)

In [None]:
# Load the labelled training examples.
df_training = pd.read_csv(filepath_or_buffer='training_llama.csv')

In [None]:
# The tokenization step reads the target from a column called "label".
df_training = df_training.rename(columns={'polarizing': 'label'})

In [None]:
# Load the labelled validation examples.
df_val = pd.read_csv(filepath_or_buffer='validation_llama.csv')

In [None]:
# Same column naming as the training frame: the target lives in "label".
df_val = df_val.rename(columns={'polarizing': 'label'})

In [None]:
def preprocess_function(row, tokenizer, max_length=512):
    """Tokenize one record's text and attach its class label.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing a
            "speech_content" text field and a "label" field.
        tokenizer: Callable tokenizer. It is invoked with the text and the
            ``max_length``, ``padding`` and ``truncation`` keyword arguments.
        max_length: Length every encoded text is truncated and padded to.

    Returns:
        The tokenizer's output with an extra "labels" entry set to ``row["label"]``.
    """
    # Tokenize the text, padding/truncating to a fixed length.
    tokenized = tokenizer(
        row["speech_content"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    # Add the labels
    tokenized["labels"] = row["label"]
    return tokenized

In [None]:
def preprocess_dataframe(df: pd.DataFrame, tokenizer):
    """Tokenize every row of ``df`` with ``preprocess_function``.

    Args:
        df: Frame whose rows are passed one by one to ``preprocess_function``.
        tokenizer: Tokenizer forwarded unchanged to ``preprocess_function``.

    Returns:
        A list with one ``preprocess_function`` result per row, in row order.
        An empty frame yields an empty list.
    """
    # Iterating rows explicitly (instead of df.apply(..., axis=1).tolist()) keeps the
    # result a plain list even when the frame has no rows.
    return [preprocess_function(row, tokenizer) for _, row in df.iterrows()]

In [None]:
# Run both splits (training first, then validation) through the same tokenization step.
train_data, val_data = (
    preprocess_dataframe(split_frame, tokenizer)
    for split_frame in (df_training, df_val)
)

In [None]:
# Turn the lists of tokenized records into Hugging Face Dataset objects.
train_dataset, val_dataset = map(Dataset.from_list, (train_data, val_data))

In [None]:
def find_all_linear_names(model):
    """Collect the leaf names of linear layers inside attention and MLP blocks.

    Walks ``model.named_modules()`` and keeps every ``torch.nn.Linear`` instance
    whose qualified name contains 'self_attn' or 'mlp'. Only the last dotted
    component of each name is kept (e.g. ``q_proj``), de-duplicated.

    Returns:
        A list of the distinct leaf names; the names are also printed.
    """
    lora_module_names = {
        qualified_name.rsplit('.', 1)[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, torch.nn.Linear)
        and ('self_attn' in qualified_name or 'mlp' in qualified_name)
    }

    # Debugging output to ensure the correct layers are identified
    print(f"LoRA target modules: {lora_module_names}")

    return list(lora_module_names)

# Determine which layers LoRA will be attached to and stop early if none match.
modules = find_all_linear_names(model)
if len(modules) == 0:
    raise ValueError("No target modules found. Please verify the model architecture.")

# Echo the final selection that is handed to the LoRA config.
print(f"Selected modules for LoRA: {modules}")

In [None]:
# Directory name for the fine-tuned model: used as the Trainer output_dir and as the
# location the model and tokenizer are saved to and later reloaded from.
new_model = "Llama-3.1-8B-Instruct-Finetuned"

In [None]:
# Prepare the quantized base model for training before adding adapters: this freezes
# the base weights and upcasts the remaining half-precision parameters to float32,
# so the parameters trained under fp16 mixed precision are not themselves stored in
# float16. Gradient checkpointing is left off to keep the original compute profile.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

# LoRA config: rank-64 adapters (alpha 16, no dropout, no bias terms) on the linear
# layers selected in `modules`, for a sequence-classification task.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    task_type="SEQ_CLS",
    target_modules=modules
)
model = get_peft_model(model, peft_config)

In [None]:
# Hyperparameters
training_arguments = TrainingArguments(
    # Checkpoints and logs go to the fine-tuned model directory.
    output_dir=new_model,
    # Mixed precision: float16 on, bfloat16 off.
    fp16=True,
    bf16=False,
    # One sequence per step, accumulated over 8 steps per optimizer update.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    group_by_length=False,
    # Optimizer and schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    warmup_steps=10,
    num_train_epochs=1,
    # Evaluate on a step schedule; eval_steps is a float below 1, which
    # TrainingArguments treats as a fraction of the total training steps.
    eval_strategy="steps",
    eval_steps=0.2,
    # Log every step.
    logging_strategy="steps",
    logging_steps=1,
)

In [None]:
def compute_metrics(eval_pred):
    """Score evaluation predictions against the reference labels.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` along the last axis.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1". Precision, recall and
        F1 use binary averaging and are reported as 0 where they are undefined.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    prf_scores = precision_recall_fscore_support(
        labels, predicted_classes, average='binary', zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "precision": prf_scores[0],
        "recall": prf_scores[1],
        "f1": prf_scores[2],
    }

In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the two tokenized splits,
# the hyperparameters and the metric function defined in the cells above.
trainer = Trainer(
    args=training_arguments,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Persist the trained model and the tokenizer side by side in the same directory.
trainer.save_model(output_dir=new_model)
tokenizer.save_pretrained(save_directory=new_model)

In [None]:
# Reload the model and tokenizer from the directory written by the save step and
# move the model to the GPU for inference.
# NOTE(review): no quantization config or dtype is passed here, unlike the training
# load. Confirm the GPU has enough memory for the resulting model.
# NOTE(review): the directory was written from a PEFT-wrapped model. Verify that this
# call restores both the adapter and the trained classification head.
model = AutoModelForSequenceClassification.from_pretrained(new_model)
model = model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(new_model)

In [None]:
# Load the speeches that are to be classified.
df_speeches = pd.read_csv(filepath_or_buffer='llama20.csv')

In [None]:
# Build a Hugging Face Dataset from the "split_speeches" column only.
# NOTE(review): `ds_speeches` is not used by any later cell in this notebook, and the
# column selected here ("split_speeches") differs from the `text_column`
# ("speech_content") that inference reads from `df_speeches`. Confirm which column
# the CSV actually contains.
ds_speeches = Dataset.from_pandas(df_speeches[["split_speeches"]])

In [None]:
# Define the text column to classify
text_column = "speech_content"


def get_logits(row, model, tokenizer, text_column, max_length=512):
    """Return the classifier logits for the text stored in one record.

    Args:
        row: Mapping-like record; the text is read from ``row[text_column]``.
        model: Model called with the tokenized inputs; must expose ``device`` and
            return an object with a ``logits`` tensor.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        text_column: Key of the text field in ``row``.
        max_length: Longer texts are truncated to this many tokens.

    Returns:
        The logits as a float32 NumPy array, with the shape the model produced.
    """
    # A single text is encoded per call, so no padding is requested. Padding to
    # max_length would only add positions for the model to process and, when the
    # model config has no pad_token_id, would move the position the classification
    # head reads onto a padding token.
    inputs = tokenizer(
        row[text_column],
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )

    # Ensure inputs are on the same device as the model
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Cast to float32 first so the NumPy conversion works for any model dtype.
    return logits.float().cpu().numpy()

In [None]:
# Run the model over every speech, one row at a time, showing a progress bar.
logits_list = [
    get_logits(speech_row, model, tokenizer, text_column)
    for _, speech_row in tqdm(
        df_speeches.iterrows(), total=len(df_speeches), desc="Getting Logits"
    )
]

In [None]:
# Add logits to the dataframe: one entry per row, in the same order the rows
# were iterated when `logits_list` was built.
df_speeches["logits"] = logits_list

In [None]:
# Compute probabilities and predicted labels
# Each stored entry holds the logits for one text; with a two-class head that is two
# values, so a softmax over the last axis gives class probabilities and index 1 is
# the positive class. (A sigmoid followed by .item() fails here because the tensor
# has more than one element.)
df_speeches["positive_score"] = df_speeches["logits"].apply(
    lambda x: torch.softmax(torch.tensor(x).float(), dim=-1)[..., 1].item()
)
# Predicted class (0 or 1): positive when its probability is at least 0.5.
df_speeches["predicted_label"] = (df_speeches["positive_score"] >= 0.5).astype(int)

In [None]:
# Write the frame, including logits, scores and predicted labels, to CSV without the index.
df_speeches.to_csv(path_or_buf="llama20_out.csv", index=False)