In [None]:
%%capture

# Install Unsloth directly from its GitHub repository, with the "colab-new" extra.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install these packages without their own dependencies; xformers is capped below 0.0.26.
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

Unzip the fine-tuned model


In [None]:
import zipfile
import os

# Archive uploaded next to the notebook, and the folder it is unpacked into.
zip_file_path = './supremacist_suicidal_terrorist.zip'
extract_dir = './supremacist_suicidal_terrorist'

# exist_ok=True creates the folder when it is missing and does nothing when it
# is already there, so the cell can be re-run without a separate existence check.
os.makedirs(extract_dir, exist_ok=True)

# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Extraction complete.")

Extraction complete.


Load the current model

In [None]:
from unsloth import FastLanguageModel
import torch

# Settings for loading the checkpoint that was unpacked in the previous step.
max_seq_length = 2048  # also handed to SFTTrainer in the training cell
dtype = None           # NOTE(review): presumably lets Unsloth choose the dtype itself — confirm
load_in_4bit = True    # forwarded unchanged to from_pretrained

# Collect the keyword arguments first, then load model and tokenizer in one call.
load_kwargs = dict(
    model_name = "supremacist_suicidal_terrorist",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Import the manifesto dataset

In [None]:
import pandas as pd
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset, load_dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import shuffle

# One seed for every random step in this cell, so the sampled rows and the
# train/validation split are identical on every run. 3407 is the same value
# passed as `seed` to TrainingArguments in the training cell.
SEED = 3407
SAMPLE_SIZE = 500    # number of rows kept after shuffling
VAL_FRACTION = 0.1   # share of the sample held out for validation

# Load data from CSV file
df = pd.read_csv('./MainDataset.csv')

# Shuffle before head() so the kept rows are a random sample, not the first rows of the file.
df = shuffle(df, random_state=SEED).reset_index(drop=True)
df = df.head(SAMPLE_SIZE)

# Move model to GPU (falls back to CPU when CUDA is unavailable).
# NOTE(review): the model was loaded with load_in_4bit=True — confirm that an
# explicit .to(device) is needed and supported for a quantized model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Split data into train and validation sets; stratify=y keeps the label
# proportions the same in both parts.
X = df['text'].tolist()
y = df['label'].tolist()
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_FRACTION, stratify=y, random_state=SEED
)

# Create train and validation datasets using Hugging Face Datasets library
train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
val_dataset = Dataset.from_dict({"text": X_val, "label": y_val})

In [None]:
# Number of rows held out for validation.
print(len(val_dataset))

50


Define the prompt for training the model

In [None]:
from transformers import PreTrainedTokenizerFast

# Prompt layout used for every training example. The three `{}` slots are
# filled positionally by str.format() with, in order: the instruction, the
# input text, and the output.
prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Output:
{}"""

# End-of-sequence string taken from the loaded tokenizer; the training
# formatter appends it to each rendered example.
EOS_TOKEN = tokenizer.eos_token

# Every rendered training prompt is also collected here as a side effect.
texts = []
def formatting_prompts_func(examples):
    """Render one training row into a complete prompt, label included.

    Args:
        examples: mapping that provides the "text" and "label" fields of a row.

    Returns:
        A dict with a single "text" key holding the filled-in prompt_template
        followed by a space and EOS_TOKEN. The same string is appended to the
        module-level `texts` list.
    """
    instruction = """You are an expert in identifying texts made by individuals associated with mass shootings. Your task is to analyze the provided text and determine if it exhibits characteristics typical of individuals associated with mass shootings.

Chain of Thought:

1. Begin by reading the provided text thoroughly, paying attention to language, tone, and underlying emotions.
2. Evaluate the text within this context, considering whether it demonstrates signs such as feelings of depression, suicidal thoughts, racist mentality, or a fascination with violence.
3. Based on your analysis, classify the text into one of the following categorical labels:
   - 1: If the text demonstrates patterns or behaviors commonly associated with individuals prone to mass shootings.
   - 0: If the text does not exhibit such patterns or behaviors.
"""

    # Fill the three template slots, then terminate the example with the EOS string.
    filled_prompt = prompt_template.format(instruction, examples["text"], examples["label"])
    rendered = ' '.join((filled_prompt, EOS_TOKEN))

    texts.append(rendered)
    return {"text": rendered}

# Run the prompt formatter over the training split and show the resulting dataset summary.
dataset = train_dataset.map(function=formatting_prompts_func)
print(dataset)


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 450
})


Train the model with cross-entropy loss

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Query bf16 support once; fp16 is enabled exactly when bf16 is not available.
use_bf16 = torch.cuda.is_bf16_supported()

# Hyper-parameters for this short fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 60, # Set num_train_epochs = 1 for full training runs
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,  # report the training loss at every step
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
)

# Supervised fine-tuning on the rendered prompts stored in the "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

trainer_stats = trainer.train()

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/450 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.6549
2,2.7721
3,2.4875
4,2.612
5,2.5567
6,2.5423
7,2.5665
8,2.6543
9,2.6542
10,2.4606


Define the prompts for testing the model

In [None]:
from transformers import PreTrainedTokenizerFast

# Prompt layout used when building inference prompts. It contains three `{}`
# slots that str.format() fills positionally, in order: the instruction, the
# input text, and the output.
prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Output:
{}"""

# End-of-sequence string taken from the loaded tokenizer.
EOS_TOKEN = tokenizer.eos_token

# Every rendered inference prompt is also collected here as a side effect.
texts = []
def formatting_prompts_func(examples):
    """Render one evaluation row into a prompt that ends right after the output header.

    Unlike a training prompt, the label is deliberately left out so that the
    model has to generate it.

    Args:
        examples: mapping that provides the "text" field of a row.

    Returns:
        A dict with a single "text" key holding the filled-in prompt_template
        whose output slot is empty. The same string is appended to the
        module-level `texts` list.
    """
    instruction = """You are an expert in identifying texts made by individuals associated with mass shootings. Your task is to analyze the provided text and determine if it exhibits characteristics typical of individuals associated with mass shootings.

Chain of Thought:

1. Begin by reading the provided text thoroughly, paying attention to language, tone, and underlying emotions.
2. Evaluate the text within this context, considering whether it demonstrates signs such as feelings of depression, suicidal thoughts, racist mentality, or a fascination with violence.
3. Based on your analysis, classify the text into one of the following categorical labels:
   - 1: If the text demonstrates patterns or behaviors commonly associated with individuals prone to mass shootings.
   - 0: If the text does not exhibit such patterns or behaviors.
"""

    input_text = examples["text"]

    # prompt_template has three `{}` slots. Supplying only two arguments makes
    # str.format() raise IndexError, so the output slot is filled with an empty
    # string and the prompt ends at "### Output:\n".
    s = prompt_template.format(instruction, input_text, "")

    texts.append(s)
    return { "text" : s }

# Render every validation row into an inference prompt.
test_dataset = val_dataset.map(formatting_prompts_func)
# Show the dataset that was just built (the training `dataset` is a different object).
print(test_dataset)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 450
})



Test the model on the mass-shooter manifesto dataset

In [None]:
%%capture
import time

# One entry per validation prompt; each entry is the list returned by batch_decode.
generated_outputs = []
FastLanguageModel.for_inference(model)

# Read the prompt column once instead of indexing test_dataset['text'] again
# on every loop iteration.
prompts = test_dataset['text']

# Iterate through the test dataset, one prompt per generate() call
for prompt in prompts:
    # Tokenize the prompt and move the tensors to the GPU
    prompt_input = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate output. pad_token_id is passed explicitly: it is the value
    # generate() reports falling back to ("Setting `pad_token_id` to
    # `eos_token_id`"), so results are unchanged and the per-call notice goes away.
    outputs = model.generate(
        **prompt_input,
        max_new_tokens = 64,
        use_cache = True,
        pad_token_id = tokenizer.eos_token_id,
    ).to("cuda")

    # Append the decoded text (prompt plus continuation) to the list
    generated_outputs.append(tokenizer.batch_decode(outputs))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token

In [None]:
# Spot-check: display the decoded text stored for the first validation prompt.
generated_outputs[0]

['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an expert in identifying texts made by individuals associated with mass shootings. Your task is to analyze the provided text and determine if it exhibits characteristics typical of individuals associated with\xa0mass\xa0shootings.\n\n### Input:\nAs I’m sure you all know, a few days ago there was a shooting in Virginia. A man named Vester Flanagan opened fire on two former colleagues on live tv. He also recorded his own footage of the event. While reading about the event, I read some excerpts of his manifesto the media was releasing. And I have to say, anyone who knew him could have seen this coming. People like him have nothing left to live for, and the only thing left to do is lash out at a society that has abandoned them. His family described him as alone, no partner/lover. A victim not only of

In [None]:
# Header that separates the prompt from the model's answer in the decoded text.
OUTPUT_MARKER = "### Output:\n"

def extract_label(decoded_text):
    """Return the first whitespace-delimited token that follows OUTPUT_MARKER.

    Args:
        decoded_text: decoded generation (prompt plus continuation).

    Returns:
        The first token after the first occurrence of OUTPUT_MARKER, or an
        empty string when nothing follows it.

    Raises:
        ValueError: if OUTPUT_MARKER does not occur in decoded_text.
    """
    _, marker, completion = decoded_text.partition(OUTPUT_MARKER)
    if not marker:
        raise ValueError(f"No {OUTPUT_MARKER!r} marker found in generated text")

    # Treat the EOS string as a separator so "1</s>" parses the same as "1 </s>".
    eos = tokenizer.eos_token
    if eos:
        completion = completion.replace(eos, " ")

    # split() with no argument breaks on any whitespace, so a newline after the
    # label does not end up glued to it.
    tokens = completion.split()
    return tokens[0] if tokens else ""

generated_labels = [extract_label(item[0]) for item in generated_outputs]

# Sorted so the distinct labels are printed in a stable order.
print(sorted(set(generated_labels)))

['0', '1']


Check performance of the model on the test data

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Collapse the parsed label strings to integers: '1' becomes 1, anything else becomes 0.
predicted_labels = [int(label == '1') for label in generated_labels]
print("Predicted: \n", predicted_labels)

# Apply the same 0/1 encoding to the held-out ground-truth labels.
actual_labels = [int(label == 1) for label in y_val]
print("Actual: \n",actual_labels)

# Binary classification metrics, ground truth first and predictions second.
accuracy = accuracy_score(actual_labels, predicted_labels)
precision = precision_score(actual_labels, predicted_labels)
recall = recall_score(actual_labels, predicted_labels)
f1 = f1_score(actual_labels, predicted_labels)

# Report the four scores in a fixed order.
for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_name, metric_value)

Predicted: 
 [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0]
Actual: 
 [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


Save the final fine-tuned model

In [None]:
# Write the model and the tokenizer into the same output folder.
final_model_dir = "supremacist_suicidal"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

In [None]:
import shutil

# Pack the saved model/tokenizer folder into a zip archive named after it.
shutil.make_archive(base_name="supremacist_suicidal", format="zip", root_dir="supremacist_suicidal")