This notebook builds model 2 of the ensemble: it takes the Mistral model that was already fine-tuned on the Suicidal dataset and fine-tunes it further on the Mass Shooter Manifestos dataset.

---



In [None]:
# Upload the following files to the runtime before running the notebook:
# train_dataset.csv
# val_dataset.csv
# suicidal_model.zip

In [None]:
%%capture

!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

Import the model

In [None]:
import zipfile
import os

# Location of the uploaded archive that holds the previously fine-tuned model
zip_file_path = './suicidal_model.zip'

# Folder that receives the unpacked model files
extract_dir = './suicidal_model'

# exist_ok=True replaces the separate exists() check: the cell can be re-run
# safely and there is no gap between checking for and creating the folder.
os.makedirs(extract_dir, exist_ok=True)

# Unpack every archive member into extract_dir; the context manager closes
# the archive even if extraction fails part-way.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Extraction complete.")

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings. They stay module-level because max_seq_length is read again
# when the trainer is configured further down.
max_seq_length = 2048
dtype = None
load_in_4bit = True

# Collect the loader arguments in one mapping, then unpack them into the call.
# "suicidal_model" is the folder the uploaded archive was extracted into.
load_kwargs = dict(
    model_name="suicidal_model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

Import the dataset

In [None]:
import pandas as pd
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset, load_dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import shuffle

# Read the training and validation splits from the uploaded CSV files.
# Both files must provide a 'text' and a 'label' column (read below).
# The DataFrames get their own names so that train_dataset / val_dataset only
# ever refer to the Hugging Face Dataset objects built afterwards.
train_df = pd.read_csv('./train_dataset.csv')
val_df = pd.read_csv('./val_dataset.csv')

X_train = train_df['text'].tolist()
y_train = train_df['label'].tolist()

X_val = val_df['text'].tolist()
y_val = val_df['label'].tolist()

# Wrap the plain Python lists in Dataset objects with exactly two columns.
train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
val_dataset = Dataset.from_dict({"text": X_val, "label": y_val})

# Device handle kept for any later cell that wants it.
# NOTE(review): the former `model = model.to(device)` was dropped. The model is
# loaded with load_in_4bit=True, and Transformers rejects `.to()` on
# bitsandbytes-quantised models in some versions; the loader is expected to
# have placed the quantised weights on the GPU already - confirm on your setup.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Define prompt for training

In [None]:
from transformers import PreTrainedTokenizerFast

# Training prompt. The three {} slots are filled, in order, with the
# instruction, the input text and the label by formatting_prompts_func.
prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Output:
{}"""

# End-of-sequence string of the loaded tokenizer; appended to every training prompt.
EOS_TOKEN = tokenizer.eos_token

# Module-level list that formatting_prompts_func appends each formatted prompt to.
texts=[]
def formatting_prompts_func(examples):
    """Build the supervised training prompt for one dataset row.

    Args:
        examples: a row exposing "text" (the passage to classify) and
            "label" (its class); both are read by key below.

    Returns:
        A dict whose "text" entry is the filled-in prompt followed by a space
        and EOS_TOKEN. Returned from `Dataset.map`, it replaces the row's
        "text" column.

    Side effects:
        Appends the formatted prompt to the module-level `texts` list.
    """
    instruction = """You are an expert in identifying texts made by individuals associated with mass shootings. Your task is to analyze the provided text and determine if it exhibits characteristics typical of individuals associated with mass shootings.

Chain of Thought:

1. Begin by reading the provided text thoroughly, paying attention to language, tone, and underlying emotions.
2. Evaluate the text within this context, considering whether it demonstrates signs such as feelings of depression, suicidal thoughts, racist mentality, or a fascination with violence.
3. Based on your analysis, classify the text into one of the following categorical labels:
   - 1: If the text demonstrates patterns or behaviors commonly associated with individuals prone to mass shootings.
   - 0: If the text does not exhibit such patterns or behaviors.
"""

    input_text = examples["text"]
    output_label = examples["label"]

    # Strip the instruction so the training prompt has the same layout as the
    # inference prompt, which also inserts instruction.strip(). Without this
    # the trailing newline of the literal adds an extra blank line before
    # "### Input:" that the model never sees at test time.
    s = prompt_template.format(instruction.strip(), input_text, output_label) + ' ' + EOS_TOKEN

    texts.append(s)
    return { "text" : s }

# Replace the "text" column of the training split with the full prompts.
dataset = train_dataset.map(formatting_prompts_func)
print(dataset)


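Train the model. Note: an N-pair loss class is defined in this cell, but `SFTTrainer` does not apply it; training optimises the trainer's default language-modelling loss.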

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
import torch
import torch.nn as nn
import torch.nn.functional as F

# N-pair style loss over a batch of anchor / positive embeddings
class NpairLoss(nn.Module):
    """Soft-target cross-entropy over anchor-positive similarities plus an L2 penalty.

    Args:
        l2_reg: weight applied to the L2 penalty on the embeddings
            (the penalty is additionally scaled by 0.25 in `forward`).
    """

    def __init__(self, l2_reg=0.02):
        super().__init__()
        self.l2_reg = l2_reg

    def forward(self, anchor, positive, target):
        """Compute the loss for one batch.

        Args:
            anchor: embedding tensor; its first dimension is used as the batch size.
            positive: embedding tensor multiplied against `anchor`
                (presumably the same shape as `anchor` - confirm with the caller).
            target: one label per batch row; rows with equal labels count as matches.

        Returns:
            Scalar tensor: cross-entropy term + l2_reg * L2 term * 0.25.
        """
        n_rows = anchor.size(0)

        # Labels as a column vector, compared against their own transpose to get
        # a pairwise "same label" indicator matrix.
        labels = target.view(target.size(0), 1)
        same_label = (labels == labels.transpose(0, 1)).float()

        # Normalise each row so it sums to one and can serve as a soft target.
        soft_targets = same_label / same_label.sum(dim=1, keepdim=True).float()

        # Similarity of every anchor with every positive.
        similarity = anchor @ positive.transpose(0, 1)
        ce_term = F.cross_entropy(similarity, soft_targets)

        # Squared norm of both embedding sets, averaged over the batch.
        reg_term = (anchor ** 2).sum() / n_rows + (positive ** 2).sum() / n_rows

        return ce_term + self.l2_reg * reg_term * 0.25

# Instance of the N-pair loss, kept under its original name. It is NOT wired
# into training (see the note before trainer.train() below).
npair_loss = NpairLoss()

# Query bf16 support once and derive both mixed-precision flags from it, so
# exactly one of fp16 / bf16 is enabled.
bf16_supported = torch.cuda.is_bf16_supported()

# Supervised fine-tuning on the formatted prompts held in the "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not bf16_supported,
        bf16=bf16_supported,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    )
)

# NOTE: the former `trainer.model.compute_loss = npair_loss` line was removed.
# The trainer calls its own compute_loss method and never looks one up on the
# model, so the assignment did not change the training objective; it only
# attached the loss module to the model as an unused submodule. Training
# therefore optimises the trainer's default loss, exactly as before. Actually
# using NpairLoss would need a trainer subclass overriding compute_loss and
# batches that provide anchor / positive embeddings and labels.
trainer_stats = trainer.train()


Define the prompt for testing

In [None]:
from transformers import PreTrainedTokenizerFast

# Inference prompt. Only two {} slots (instruction, input text); nothing follows
# "### Output:" so the text generated by the model starts right after it.
# This rebinds the prompt_template name used by the training cell.
prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Output:
"""

# End-of-sequence string of the tokenizer; not used when building the inference prompt below.
EOS_TOKEN = tokenizer.eos_token

# Start from an empty list so prompts collected here are not mixed with earlier ones.
texts = []
def formatting_prompts_func(examples):
    """Build the label-free inference prompt for one dataset row.

    Args:
        examples: a row exposing "text", the passage to classify.

    Returns:
        A dict whose "text" entry is the prompt ending right after the
        "### Output:" line. Returned from `Dataset.map`, it replaces the
        row's "text" column.

    Side effects:
        Appends the formatted prompt to the module-level `texts` list.
    """
    instruction = """You are an expert in identifying texts made by individuals associated with mass shootings. Your task is to analyze the provided text and determine if it exhibits characteristics typical of individuals associated with mass shootings.

Chain of Thought:

1. Begin by reading the provided text thoroughly, paying attention to language, tone, and underlying emotions.
2. Evaluate the text within this context, considering whether it demonstrates signs such as feelings of depression, suicidal thoughts, racist mentality, or a fascination with violence.
3. Based on your analysis, classify the text into one of the following categorical labels:
   - 1: If the text demonstrates patterns or behaviors commonly associated with individuals prone to mass shootings.
   - 0: If the text does not exhibit such patterns or behaviors.
"""

    sample_text = examples["text"]

    # The instruction is inserted without its surrounding whitespace; the label
    # slot is deliberately absent because the model is expected to produce it.
    prompt = prompt_template.format(instruction.strip(), sample_text)

    texts.append(prompt)
    return {"text": prompt}

# Turn every validation row into an inference prompt; other columns are kept.
test_dataset = val_dataset.map(formatting_prompts_func)
print(test_dataset)

Test the model

In [None]:
%%capture
import time

generated_outputs = []

# Put the Unsloth model into its inference mode before generating.
FastLanguageModel.for_inference(model)

# Read the prompt column once. Indexing test_dataset['text'] inside the loop
# fetched the whole column again on every iteration (quadratic in dataset size).
prompts = test_dataset['text']

# Generate one completion per prompt, one prompt at a time.
for prompt in prompts:
    # Tokenise the prompt and move the tensors to the GPU the model runs on.
    prompt_input = tokenizer(prompt, return_tensors="pt").to("cuda")

    # generate() already returns its tensor on the model's device, so the
    # result needs no extra .to("cuda").
    outputs = model.generate(**prompt_input, max_new_tokens = 64, use_cache = True)

    # batch_decode returns a list with one string (prompt + completion);
    # each entry of generated_outputs is therefore a one-element list.
    generated_outputs.append(tokenizer.batch_decode(outputs))

In [None]:
# Text that closes the prompt; the model's answer starts right after it.
OUTPUT_MARKER = "### Output:\n"


def _extract_label(decoded_text, special_tokens=()):
    """Return the first word the model wrote after the output marker.

    Args:
        decoded_text: decoded prompt + completion for one example.
        special_tokens: tokenizer marker strings (e.g. the end-of-sequence
            token) to treat as whitespace, so a label written directly
            against one of them is still isolated.

    Returns:
        The first whitespace-delimited word of the completion, or "" when the
        marker is missing or nothing was generated.
    """
    _, marker, completion = decoded_text.partition(OUTPUT_MARKER)
    if not marker:
        # Marker absent: report an empty label instead of raising IndexError.
        return ""
    for token in special_tokens:
        completion = completion.replace(token, " ")
    # split() with no argument also breaks on newlines and tabs, unlike split(" ").
    words = completion.split()
    return words[0] if words else ""


# Each entry of generated_outputs is a one-element list from batch_decode.
generated_labels = [
    _extract_label(item[0], tokenizer.all_special_tokens)
    for item in generated_outputs
]

# Sorted so the distinct predicted labels print in a stable order.
print(sorted(set(generated_labels)))

Save the predicted labels and the trained model

In [None]:
# save generated labels
import pickle

# Destination of the pickled predictions inside the Colab runtime
file_path = '/content/generated_labels2.pkl'

# Serialise the predicted labels; binary mode is required by pickle
with open(file_path, mode='wb') as labels_file:
    pickle.dump(generated_labels, labels_file)

print(f"Labels saved to {file_path}")

In [None]:
# Write the trained model and its tokenizer into the same output folder.
model_2_dir = "model_2"
model.save_pretrained(model_2_dir)
tokenizer.save_pretrained(model_2_dir)

In [None]:
import shutil

# Pack the contents of the saved-model folder into model_2.zip for download
shutil.make_archive(base_name="model_2", format="zip", root_dir="model_2")