In [2]:
!pip install accelerate transformers torch -q -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [22]:
import os
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, classification_report

In [23]:
def set_seed(seed=42):
    """Seed the random number generators used in this notebook.

    Calls ``random.seed``, ``np.random.seed`` and ``torch.manual_seed`` with
    the same value; when CUDA is available, ``torch.cuda.manual_seed_all`` is
    called as well.

    Args:
        seed: Seed value handed to every generator (defaults to 42).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Fix the seed once, before any shuffling or model initialisation happens.
set_seed(42)


##  DATA PATHS & HYPERPARAMETERS


In [24]:
# Raw GitHub URLs of the train / test / dev CSV splits that are read with pandas below.
train_path = 'https://raw.githubusercontent.com/deek2689/CERC_AI/refs/heads/main/SafeCity%20Datasets/Ogling/train.csv'
test_path = 'https://raw.githubusercontent.com/deek2689/CERC_AI/refs/heads/main/SafeCity%20Datasets/Ogling/test.csv'
val_path = 'https://raw.githubusercontent.com/deek2689/CERC_AI/refs/heads/main/SafeCity%20Datasets/Ogling/dev.csv'
# Number of examples per batch for all three DataLoaders.
batch_size = 8
# Learning rate handed to the AdamW optimizer.
learning_rate = 5e-5
# Upper bound on training epochs; early stopping may end training sooner.
num_epochs = 10
patience = 2     # number of epochs to wait for improvement before stopping
# Maximum tokenized sequence length; longer inputs are truncated by the tokenizer.
max_length = 512

## READING DATA

In [25]:
# Read the train, dev and test CSV files (in that order) into DataFrames.
train_data, val_data, test_data = [
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
]

In [26]:
# Preview the first rows of the training split (columns: Description, Category).
train_data.head()

Unnamed: 0,Description,Category
0,"Was walking along crowded street, holding mums...",0
1,This incident took place in the evening.I was ...,1
2,I WAS WAITING FOR THE BUS. A MAN CAME ON A BIK...,0
3,Incident happened inside the train,0
4,I witnessed an incident when a chain was bruta...,0


In [27]:
# Count examples per class to see how balanced the training labels are.
train_data['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
0,5675
1,1526


## PREPARING THE DATA

In [28]:
## Defining Instruction text
#instruction = "Classify if the following statement falls under ogling related to sexual harassment. The output must be a single label: 'True' or 'False'."

def format_dataset(row):
    """
    Builds the prompt text and the binary label for a single record.

    The prompt consists of an "### Input:" section holding the record's
    'Description', followed by an empty "### Response:" header. The optional
    "### Instruction:" section is currently disabled.

    Args:
        row: Mapping-like record exposing 'Description' and 'Category'.

    Returns:
        Tuple (formatted_text, label); label is 1 when 'Category' equals 1
        and 0 for any other value.
    """
    prompt_parts = [
        f"### Input:\n{row['Description']}\n\n",
        f"### Response:\n",
    ]
    formatted_text = "".join(prompt_parts)
    # Anything other than category 1 is treated as the negative class.
    if row['Category'] == 1:
        return formatted_text, 1
    return formatted_text, 0

def process_dataset(df):
    """
    Formats every row of a DataFrame with format_dataset.

    Args:
        df: DataFrame whose rows provide the fields format_dataset reads.

    Returns:
        Tuple (formatted_texts, labels) of two lists in row order.
    """
    formatted_pairs = [format_dataset(record) for _, record in df.iterrows()]
    formatted_texts = [text for text, _ in formatted_pairs]
    labels = [label for _, label in formatted_pairs]
    return formatted_texts, labels

# Turn each split into parallel lists of prompt texts and binary labels.
formatted_texts_train, labels_train = process_dataset(df=train_data)
formatted_texts_val, labels_val = process_dataset(df=val_data)
formatted_texts_test, labels_test = process_dataset(df=test_data)


In [29]:
# Sanity check: show the formatted prompt of the first training example.
print(formatted_texts_train[0])

### Input:
Was walking along crowded street, holding mums hand, when an elderly man groped butt, I turned to look at h7m and he looked away, and did it again after a while.I was 12 yrs old then.

### Response:



In [30]:
# Sanity check: label that belongs to the prompt shown above.
labels_train[0]

0

## Tokenization

In [31]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a pad token, so an existing special token is reused for padding.
tokenizer.pad_token = tokenizer.bos_token  # or eos_token
# Pad on the right. GPT-2 uses absolute position embeddings and this notebook passes no
# position_ids, so left padding would shift the positions of the real tokens by the amount
# of padding (which differs per split, since each split is padded to its own longest
# sequence). GPT2ForSequenceClassification locates the last non-pad token of every row via
# config.pad_token_id, so the prediction is still taken from the last real token.
tokenizer.padding_side = "right"

def tokenize_dataset(formatted_texts, labels, max_length=512):
    """
    Tokenizes prompt texts with the notebook-level tokenizer and bundles them with labels.

    Texts are truncated to max_length tokens and padded to a common length
    (padding=True), and the tokenizer returns PyTorch tensors.

    Args:
        formatted_texts: List of prompt strings.
        labels: List of integer labels, aligned with formatted_texts.
        max_length: Maximum number of tokens kept per text.

    Returns:
        TensorDataset of (input_ids, attention_mask, labels).
    """
    encoding = tokenizer(
        formatted_texts,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    return TensorDataset(
        encoding['input_ids'],
        encoding['attention_mask'],
        torch.tensor(labels),
    )

# Tokenize each split separately, using the shared max_length setting.
train_dataset = tokenize_dataset(
    formatted_texts=formatted_texts_train, labels=labels_train, max_length=max_length
)
val_dataset = tokenize_dataset(
    formatted_texts=formatted_texts_val, labels=labels_val, max_length=max_length
)
test_dataset = tokenize_dataset(
    formatted_texts=formatted_texts_test, labels=labels_test, max_length=max_length
)


## CREATING DATALOADERS

In [32]:
# Only the training split is shuffled; dev and test are iterated in stored order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

## SETTING UP DEVICE

In [33]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [34]:
# Display which device was selected.
device


device(type='cuda')

## MODEL AND OPTIMIZER

In [35]:
# Pretrained GPT-2 backbone with a 2-way classification head on top.
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
# The classification head needs the pad id to find the last non-pad token of each row.
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay are
# pinned to the transformers defaults (1e-6 and 0.0) so the optimisation settings are unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## TRAINING - EVAL LOOP WITH EARLY STOPPING

In [None]:
# Early-stopping state: lowest validation loss seen so far and the number of
# consecutive epochs without an improvement.
best_val_loss = float("inf")
epochs_unimproved = 0
# Average loss per epoch, kept for plotting once training is done.
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # ---- Training pass over the whole training split ----
    model.train()
    total_train_loss = 0
    for input_ids, attention_masks, batch_labels in tqdm(train_dataloader, desc = 'Training'):
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_masks,
            labels=batch_labels,
        )
        # Passing labels makes the model return the classification loss.
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"Training Loss (Average): {avg_train_loss}")

    # ---- Validation pass (no gradient tracking) ----
    model.eval()
    total_val_loss = 0
    val_predictions, val_true_labels = [], []

    with torch.no_grad():
        for input_ids, attention_masks, batch_labels in tqdm(val_dataloader, desc = 'Validation'):
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_masks,
                labels=batch_labels,
            )
            total_val_loss += outputs.loss.item()

            # Predicted class = index of the highest class probability.
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            pred_classes = torch.argmax(probs, dim=-1)

            val_predictions.extend(pred_classes.cpu().numpy())
            val_true_labels.extend(batch_labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)
    val_macro_f1 = f1_score(val_true_labels, val_predictions, average = 'macro')
    val_f1_score = f1_score(val_true_labels, val_predictions)

    print(f"  Validation loss: {avg_val_loss}")
    print(f"  Validation Macro F1 score: {val_macro_f1}")
    print(f"  Validation F1 score: {val_f1_score}")

    # ---- Early stopping on validation loss ----
    if avg_val_loss < best_val_loss:
        # New best epoch: reset the counter and checkpoint the weights.
        best_val_loss = avg_val_loss
        epochs_unimproved = 0
        torch.save(model.state_dict(), "best_model.pt")
        continue

    epochs_unimproved += 1
    print(f" No improvement in validation loss for {epochs_unimproved} epoch(s).")
    if epochs_unimproved >= patience:
        print("Stopping early due to no improvement in validation loss")
        break



Epoch 1/10


Training:  88%|████████▊ | 797/901 [10:34<01:22,  1.26it/s]

In [None]:
# Reload the best checkpoint saved during training. map_location remaps the saved tensors
# onto the active device, so a checkpoint written on a GPU also loads on a CPU-only runtime.
model.load_state_dict(torch.load("best_model.pt", map_location=device))
print("Loaded the best model for final evaluation.")

## Visualizing training and validation losses

In [None]:
import matplotlib.pyplot as plt

# One x position per completed epoch, numbered from 1.
epochs = range(1, len(train_losses) + 1)

# Draw both loss curves on a single explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(epochs, train_losses, label='Training Loss')
ax.plot(epochs, val_losses, label='Validation Loss')
ax.set_title("Training & Validation Loss by Epoch")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

## TESTING LOOP

In [None]:
# Put the model in evaluation mode and collect predictions for the whole test split.
model.eval()
test_predictions, test_true_labels = [], []

with torch.no_grad():
    for input_ids, attention_masks, batch_labels in tqdm(test_dataloader, desc="Testing"):
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        batch_labels = batch_labels.to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_masks,
            labels=batch_labels,
        )

        # Predicted class = index of the highest class probability.
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        pred_classes = torch.argmax(probs, dim=1)

        test_predictions.extend(pred_classes.cpu().numpy())
        test_true_labels.extend(batch_labels.cpu().numpy())



## Calculating Performance Metrics

In [None]:
# Short aliases for the ground-truth labels and predictions gathered in the test loop.
y_true, y_pred = test_true_labels, test_predictions

# Macro F1 averages the per-class F1 scores; the remaining scores use the
# scikit-learn defaults for binary targets.
test_macro_f1 = f1_score(y_true, y_pred, average='macro')
test_F1 = f1_score(y_true, y_pred)
test_accuracy = accuracy_score(y_true, y_pred)
test_precision = precision_score(y_true, y_pred)
test_recall = recall_score(y_true, y_pred)

print(f"\nTest Macro F1 score: {test_macro_f1}")
print(f"Test Regular F1 score: {test_F1}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print("\nClassification Report:")
print(classification_report(y_true, y_pred))