<a href="https://colab.research.google.com/github/deek2689/CERC_AI/blob/main/BERT_Probs_Groping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification
from google.colab import drive
from tqdm import tqdm
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
model_path = "/content/drive/MyDrive/groping_model"

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
model.eval()

test_df = pd.read_csv('https://raw.githubusercontent.com/deek2689/CERC_AI/refs/heads/main/SafeCity%20Datasets/Groping/test.csv')

test_texts = test_df['Description'].tolist()
true_labels = test_df['Category'].tolist()


In [9]:
batch_size = 16
predictions = []
probabilities = []


In [11]:
for i in tqdm(range(0, len(test_texts), batch_size), desc="Processing Batches"):
    batch_texts = test_texts[i:i+batch_size]

    encodings = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")

    encodings = {key: val.to(device) for key, val in encodings.items()}

    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits


    probs = F.softmax(logits, dim=1).cpu().numpy()

    preds = probs.argmax(axis=1)

    predictions.extend(preds.tolist())
    probabilities.extend(probs.tolist())

Processing Batches: 100%|██████████| 107/107 [00:11<00:00,  9.57it/s]


In [15]:
output_df = pd.DataFrame({
    "Text": test_texts,
    "True_Label": true_labels,
    "Predicted_Label": predictions,
    "Probability_Non_Commenting": [p[0] for p in probabilities],
    "Probability_Commenting": [p[1] for p in probabilities]
})

output_csv_path = "bert_finetuned_predictions.csv"
output_df.to_csv(output_csv_path, index=False)
print(f"Predictions saved to {output_csv_path}")


Predictions saved to bert_finetuned_predictions.csv
