Unsloth installation, make sure you are running the code below on a GPU

In [1]:
%%capture

!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

Model and tokenizer intialization

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,

)

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Adding LoRA adapters to the quantized model. We only need to update 1 to 10% of all parameters.  

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",

    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Data Preprocessing for White Suprimacists. We classify each sentence in the dataset as Hate or No Hate

In [None]:
import pandas as pd
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Load data from JSON files
with open('./annotated_data-train.json', 'r') as f:
    train_data = json.load(f)

with open('./annotated_data-test.json', 'r') as f:
    test_data = json.load(f)


train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)


df = pd.concat([train_df, test_df])

data = df[['text', 'label']]


# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Split data into train and validation sets
X = list(data["text"])
y = list(data["label"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y)


from datasets import Dataset, load_dataset
train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
val_dataset = Dataset.from_dict({"text": X_val, "label": y_val})

# Mapping labels to numerical values
label_map = {'hate': 1, 'noHate': 0}

# Apply mapping to train and validation labels
train_dataset = train_dataset.map(lambda example: {'label': label_map[example['label']]})
val_dataset = val_dataset.map(lambda example: {'label': label_map[example['label']]})

Map:   0%|          | 0/2152 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [None]:
print(list(set(train_dataset['label'])))

[0, 1]


Creating Prompts

In [None]:
from transformers import PreTrainedTokenizerFast

prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Output:
{}"""

EOS_TOKEN = tokenizer.eos_token

texts=[]
def formatting_prompts_func(examples):
    instruction = "You are an expert in identifying whether a statement contains sentiments against certain racial groups or ethnic communities. Given a statement, your task is to classify the given text to determine if it holds sentiments against racial groups or ethnic communities."
    input = examples["text"]
    #print("input: ",input)
    output = examples["label"]
    #print("output:",output)

    s=prompt.format(instruction, input, output)+' '+EOS_TOKEN
    #print(s)
    texts.append(s)
    return { "text" : s }


from datasets import load_dataset


dataset=train_dataset
dataset = dataset.map(formatting_prompts_func)
print(dataset)



Map:   0%|          | 0/2152 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 2152
})


Training Model on Cross Entropy loss. This is the default loss Mistral is pretrained on, so we don't have to write the function for this loss

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

trainer_stats = trainer.train()

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/2152 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,152 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.5731
2,2.7161
3,2.3536
4,2.0533
5,1.7233
6,1.5231
7,0.9812
8,1.0129
9,0.9088
10,0.8511


Training the Model on InfoNCELoss

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Define the InfoNCELoss class
class InfoNCELoss(torch.nn.Module):
    def __init__(self):
        super(InfoNCELoss, self).__init__()

    def forward(self, outputs, labels):
        # Retrieve the projected and predicted representations
        projected_representation = outputs.logits
        predicted_representation = outputs.logits

        # Calculate dot product between projected and predicted representations
        dot_product = torch.matmul(projected_representation, predicted_representation.t())

        # Compute softmax along rows
        softmax_output = torch.nn.functional.softmax(dot_product, dim=1)

        # Calculate loss using negative log likelihood
        loss = -torch.mean(torch.log(torch.diag(softmax_output) + 1e-4))

        return loss

# Create an instance of the InfoNCELoss
info_nce_loss = InfoNCELoss()

# Instantiate the trainer without specifying compute_loss argument
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    )
)

# Set the custom loss function to the model
trainer.model.compute_loss = info_nce_loss
trainer_stats = trainer.train()

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/2152 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.5924
2,2.7258
3,2.3773
4,2.0624
5,1.7391
6,1.5349
7,1.0268
8,1.0316
9,0.9108
10,0.8424


Training Model on Triplet loss

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the TripletLoss class
class TripletLoss(nn.Module):
    def __init__(self, margin=0.01):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        # Ensure the shapes of anchor, positive, and negative tensors are compatible
        anchor = anchor.view(anchor.size(0), -1)
        positive = positive.view(positive.size(0), -1)
        negative = negative.view(negative.size(0), -1)

        # Calculate pairwise distances
        distance_positive = torch.norm(anchor - positive, p=2, dim=1)
        distance_negative = torch.norm(anchor - negative, p=2, dim=1)

        # Compute loss
        loss = F.relu(distance_positive - distance_negative + self.margin)
        return loss.mean()

# Create an instance of the TripletLoss
triplet_loss = TripletLoss()

# Instantiate the trainer without specifying compute_loss argument
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    )
)

# Set the custom loss function to the model
trainer.model.compute_loss = triplet_loss
trainer_stats = trainer.train()


  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/2152 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,152 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.5924
2,2.7258
3,2.3774
4,2.0626
5,1.7392
6,1.5344
7,1.0266
8,1.0314
9,0.9113
10,0.8426


Training Model on N-pair loss

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the NpairLoss class
class NpairLoss(nn.Module):
    def __init__(self, l2_reg=0.02):
        super(NpairLoss, self).__init__()
        self.l2_reg = l2_reg

    def forward(self, anchor, positive, target):
        batch_size = anchor.size(0)
        target = target.view(target.size(0), 1)

        target = (target == torch.transpose(target, 0, 1)).float()
        target = target / torch.sum(target, dim=1, keepdim=True).float()

        logit = torch.matmul(anchor, torch.transpose(positive, 0, 1))
        loss_ce = F.cross_entropy(logit, target)
        l2_loss = torch.sum(anchor**2) / batch_size + torch.sum(positive**2) / batch_size

        loss = loss_ce + self.l2_reg*l2_loss*0.25
        return loss

# Create an instance of the NpairLoss
npair_loss = NpairLoss()

# Instantiate the trainer without specifying compute_loss argument
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    )
)

# Set the custom loss function to the model
trainer.model.compute_loss = npair_loss
trainer_stats = trainer.train()


  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/2152 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,0.5308
2,0.5981
3,0.5974
4,0.6333
5,0.6596
6,0.9463
7,0.6036
8,0.8212
9,0.8394
10,0.6488


Inference/Testing. You can this for every loss function and check

In [None]:
from transformers import PreTrainedTokenizerFast

prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Output:
"""

texts=[]
def formatting_prompts_func(examples):
    instruction = "You are an expert in identifying whether a statement contains sentiments against certain racial groups or ethnic communities. Given a statement, your task is to classify the given text to determine if it holds sentiments against racial groups or ethnic communities."
    input = examples["text"]
    #print("input: ",input)
    output = examples["label"]
    #print("output:",output)

    s=prompt.format(instruction, input)
    #print(s)
    texts.append(s)
    return { "text" : s }


from datasets import load_dataset

test_dataset=val_dataset.map(formatting_prompts_func)


Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [None]:
%%capture
import time

generated_outputs = []
FastLanguageModel.for_inference(model)

# Iterate through the test dataset
for idx in range(len(test_dataset['text'])):
    # Generate input prompt
    prompt_input = tokenizer(test_dataset['text'][idx],return_tensors="pt").to("cuda")

    # Generate output
    outputs = model.generate(**prompt_input, max_new_tokens = 64, use_cache = True).to("cuda")

    print(tokenizer.batch_decode(outputs))
    # Append generated output to the list
    generated_outputs.append(tokenizer.batch_decode(outputs))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [None]:
generated_labels = [item[0].split("### Output:\n")[1].strip().split(" ")[0] for item in generated_outputs]

print(list(set(generated_labels)))

['1', '0']


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert generated_outputs to binary labels (0 or 1)
predicted_labels = [1 if label == "hate" else 0 for label in generated_outputs]

# Convert y_val to binary labels (0 or 1)
actual_labels = [1 if label == "hate" else 0 for label in y_val]

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

# Calculate precision
precision = precision_score(actual_labels, predicted_labels)

# Calculate recall
recall = recall_score(actual_labels, predicted_labels)

# Calculate F1 score
f1 = f1_score(actual_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.5
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
model.save_pretrained("supremacist")
tokenizer.save_pretrained("supremacist")



('supremacist/tokenizer_config.json',
 'supremacist/special_tokens_map.json',
 'supremacist/tokenizer.model',
 'supremacist/added_tokens.json',
 'supremacist/tokenizer.json')

In [None]:
import shutil

# Zip the folder containing model and tokenizer files
shutil.make_archive("supremacist_model", "zip", "supremacist")


'/content/supremacist_model.zip'

Loading the Model fine tuned on Supremacist data

In [None]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "supremacist",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )


Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Load the Suicidal Thoughts Dataset


In [None]:
import pandas as pd
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset, load_dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Load data from CSV file
df = pd.read_csv('./SuicidalThoughts.csv')

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Split data into train and validation sets
X = df['text'].tolist()
y = df['label'].tolist()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y)

# Create train and validation datasets using Hugging Face Datasets library
train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
val_dataset = Dataset.from_dict({"text": X_val, "label": y_val})

# Mapping labels to numerical values
label_map = {'suicide': 1, 'non-suicide': 0}

# Apply mapping to train and validation labels
train_dataset = train_dataset.map(lambda example: {'label': label_map[example['label']]})
val_dataset = val_dataset.map(lambda example: {'label': label_map[example['label']]})


Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
print(list(set(val_dataset['label'])))

[0, 1]


Define prompts for the Suicidal Thoughts data fine tuning

In [None]:
from transformers import PreTrainedTokenizerFast

prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Output:
{}"""

EOS_TOKEN = tokenizer.eos_token

texts=[]
def formatting_prompts_func(examples):
    instruction = "You are an expert in identifying whether a given statement reflects suicidal thoughts or not. Your task is to classify the provided text to determine if it contains suicidal ideation."
    input = examples["text"]
    #print("input: ",input)
    output = examples["label"]
    #print("output:",output)

    s=prompt.format(instruction, input, output)+' '+EOS_TOKEN
    #print(s)
    texts.append(s)
    return { "text" : s }


from datasets import load_dataset


dataset=train_dataset
dataset = dataset.map(formatting_prompts_func)
print(dataset)

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 2700
})


Training the model using Cross Entropy loss


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

trainer_stats = trainer.train()

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/2700 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,700 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.7924
2,2.682
3,2.7215
4,2.5354
5,2.6661
6,2.8418
7,2.6153
8,2.2774
9,2.6143
10,2.7605


Testing the model

In [None]:
from transformers import PreTrainedTokenizerFast

prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Output:
"""

texts=[]
def formatting_prompts_func(examples):
    instruction = "You are an expert in identifying whether a given statement reflects suicidal thoughts or not. Your task is to classify the provided text to determine if it contains suicidal ideation."
    input = examples["text"]
    #print("input: ",input)
    output = examples["label"]
    #print("output:",output)

    s=prompt.format(instruction, input)
    #print(s)
    texts.append(s)
    return { "text" : s }


from datasets import load_dataset

test_dataset=val_dataset.map(formatting_prompts_func)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

output: 0
output: 1
output: 1
output: 0
output: 1
output: 1
output: 1
output: 0
output: 1
output: 0
output: 1
output: 0
output: 1
output: 1
output: 1
output: 1
output: 1
output: 0
output: 0
output: 0
output: 1
output: 0
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 0
output: 1
output: 1
output: 0
output: 1
output: 1
output: 1
output: 0
output: 1
output: 1
output: 0
output: 1
output: 1
output: 1
output: 1
output: 0
output: 1
output: 1
output: 1
output: 1
output: 0
output: 1
output: 0
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 0
output: 1
output: 0
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 1
output: 0
output: 1
output: 0
output: 0
output: 1
output: 0
output: 0
output: 1
output: 1
output: 1
output: 1
output: 0
output: 1
output: 0
output: 1
output: 0
output: 1
output: 1
output: 0
output: 1
output: 0
output: 0
output: 1
output: 0
output: 1


In [None]:
%%capture
import time

generated_outputs = []
FastLanguageModel.for_inference(model)

# Iterate through the test dataset
for idx in range(len(test_dataset['text'])):
    # Generate input prompt
    prompt_input = tokenizer(test_dataset['text'][idx],return_tensors="pt").to("cuda")

    # Generate output
    outputs = model.generate(**prompt_input, max_new_tokens = 64, use_cache = True).to("cuda")
    #print(outputs)

    #print(tokenizer.batch_decode(outputs))
    # Append generated output to the list
    generated_outputs.append(tokenizer.batch_decode(outputs))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [None]:
generated_outputs[0]

['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an expert in identifying whether a given statement reflects suicidal thoughts or not. Your task is to classify the provided text to determine if it contains suicidal ideation.\n\n### Input:\nI had a fun day already Cut down a fallen tree with my dad and got paid a hundred USD.... Fun way ti start spring break\n\n### Output:\n0 </s>']

In [None]:
generated_labels = [item[0].split("### Output:\n")[1].strip().split(" ")[0] for item in generated_outputs]

print(list(set(generated_labels)))

['0', '1']


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert generated_outputs to binary labels (0 or 1)
predicted_labels = [1 if label == '1' else 0 for label in generated_labels]

print(predicted_labels)

# Convert y_val to binary labels (0 or 1)
actual_labels = [1 if label == "suicide" else 0 for label in y_val]

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

# Calculate precision
precision = precision_score(actual_labels, predicted_labels)

# Calculate recall
recall = recall_score(actual_labels, predicted_labels)

# Calculate F1 score
f1 = f1_score(actual_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

[0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1]
Accuracy: 0.9233333333333333
Precision: 0.9545454545454546
Recall: 0.9417040358744395
F1 Score: 0.9

Save the model

In [None]:
model.save_pretrained("supremacist_suicidal")
tokenizer.save_pretrained("supremacist_suicidal")



('supremacist_suicidal/tokenizer_config.json',
 'supremacist_suicidal/special_tokens_map.json',
 'supremacist_suicidal/tokenizer.model',
 'supremacist_suicidal/added_tokens.json',
 'supremacist_suicidal/tokenizer.json')

Create a zip file of the fine tuned model

In [None]:
import shutil

# Zip the folder containing model and tokenizer files
shutil.make_archive("supremacist_suicidal", "zip", "supremacist_suicidal")

'/content/supremacist_suicidal.zip'

Run the code below to extract contents of a zip file in a folder on the runtime memory

In [2]:
import zipfile
import os

# Define the path to the uploaded zip file
zip_file_path = './supremacist_suicidal.zip'

# Define the directory where you want to extract the contents
extract_dir = './supremacist_suicidal'

# Create the extraction directory if it doesn't exist
if not os.path.exists(extract_dir):
    os.makedirs(extract_dir)

# Unzip the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Extraction complete.")

Extraction complete.


Load the fine tuned model on Supremacist and Suicidal data

In [3]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "supremacist_suicidal",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )



config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Load the Terrorist dataset

In [5]:
import pandas as pd
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset, load_dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Load data from CSV file
df = pd.read_csv('./TerroristStmtsCSV.csv')

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Split data into train and validation sets
X = df['text'].tolist()
y = df['label'].tolist()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y)

# Create train and validation datasets using Hugging Face Datasets library
train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
val_dataset = Dataset.from_dict({"text": X_val, "label": y_val})


In [8]:
print(list(set(val_dataset['label'])))

[0, 1]


Define Prompts for the Terrorist dataset

In [9]:
from transformers import PreTrainedTokenizerFast

prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Output:
{}"""

EOS_TOKEN = tokenizer.eos_token

texts=[]
def formatting_prompts_func(examples):
    instruction = "You are an expert in identifying whether a given statement reflects terrorist behavior or not. Your task is to classify the provided text to determine if it contains indications of terrorist activities."
    input = examples["text"]
    #print("input: ",input)
    output = examples["label"]
    #print("output:",output)

    s=prompt.format(instruction, input, output)+' '+EOS_TOKEN
    #print(s)
    texts.append(s)
    return { "text" : s }


from datasets import load_dataset


dataset=train_dataset
dataset = dataset.map(formatting_prompts_func)
print(dataset)

Map:   0%|          | 0/1187 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 1187
})


Training the model using Cross Entropy Loss

In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

trainer_stats = trainer.train()

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/1187 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,187 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.6994
2,2.531
3,2.1196
4,2.4702
5,2.1772
6,2.3319
7,2.3857
8,2.3817
9,2.309
10,2.3641


Testing the model

In [11]:
from transformers import PreTrainedTokenizerFast

prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Output:
"""

texts=[]
def formatting_prompts_func(examples):
    instruction = "You are an expert in identifying whether a given statement reflects terrorist behavior or not. Your task is to classify the provided text to determine if it contains indications of terrorist activities."
    input = examples["text"]
    #print("input: ",input)
    output = examples["label"]
    #print("output:",output)

    s=prompt.format(instruction, input)
    #print(s)
    texts.append(s)
    return { "text" : s }


from datasets import load_dataset

test_dataset=val_dataset.map(formatting_prompts_func)

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

In [12]:
%%capture
import time

generated_outputs = []
FastLanguageModel.for_inference(model)

# Iterate through the test dataset
for idx in range(len(test_dataset['text'])):
    # Generate input prompt
    prompt_input = tokenizer(test_dataset['text'][idx],return_tensors="pt").to("cuda")

    # Generate output
    outputs = model.generate(**prompt_input, max_new_tokens = 64, use_cache = True).to("cuda")
    #print(outputs)

    #print(tokenizer.batch_decode(outputs))
    # Append generated output to the list
    generated_outputs.append(tokenizer.batch_decode(outputs))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [13]:
generated_outputs[0]

['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an expert in identifying whether a given statement reflects terrorist behavior or not. Your task is to classify the provided text to determine if it contains indications of terrorist activities.\n\n### Input:\nWith mongrel monstrosities walking the streets, soaring "hate"\ncrime against White people everywhere, a government in the hands of the most bestial gang in history - the Jews, and the White masses largely in stupor, it is actually surprising that more of our White Racial Comrades have not acted as August acted\nThat the Church does notcondone his acts does not affect the reality that when a people is kicked around like a dog, someone might indeed be bitten\nOur Brother August Smith will continue to live on in all of us\n\n### Output:\n1 </s>']

In [14]:
generated_labels = [item[0].split("### Output:\n")[1].strip().split(" ")[0] for item in generated_outputs]

print(list(set(generated_labels)))

['1', '0']


In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert generated_outputs to binary labels (0 or 1)
predicted_labels = [1 if label == '1' else 0 for label in generated_labels]

print("Predicted: \n", predicted_labels)

# Convert y_val to binary labels (0 or 1)
actual_labels = [1 if label == 1 else 0 for label in y_val]

print("Actual: \n"actual_labels)

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

# Calculate precision
precision = precision_score(actual_labels, predicted_labels)

# Calculate recall
recall = recall_score(actual_labels, predicted_labels)

# Calculate F1 score
f1 = f1_score(actual_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1]
Accuracy: 0.8787878787878788
Precision: 0.9811320754716981
Recall: 0.7761194029850746
F1 Score: 0.8666666666666666


In [17]:
model.save_pretrained("supremacist_suicidal_terrorist")
tokenizer.save_pretrained("supremacist_suicidal_terrorist")



('supremacist_suicidal_terrorist/tokenizer_config.json',
 'supremacist_suicidal_terrorist/special_tokens_map.json',
 'supremacist_suicidal_terrorist/tokenizer.model',
 'supremacist_suicidal_terrorist/added_tokens.json',
 'supremacist_suicidal_terrorist/tokenizer.json')

Create a zip file of the fine tuned model

In [18]:
import shutil

# Zip the folder containing model and tokenizer files
shutil.make_archive("supremacist_suicidal_terrorist", "zip", "supremacist_suicidal_terrorist")

'/content/supremacist_suicidal_terrorist.zip'