In [None]:
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

df_train = pd.read_csv('/content/drive/MyDrive/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/test.csv')
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop = stopwords.words('english')
def update_dataframe(df):
  df['crimeaditionalinfo'] = df['crimeaditionalinfo'].fillna('')
  df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

  df_category = df[['category', 'sub_category']].sort_values(['category', 'sub_category'])
  df_category = df_category.drop_duplicates().reset_index()
  df_category['sub_category'] = df_category['sub_category'].fillna('')
  df_category['label'] = df_category['category'] + ' - ' + (df_category['sub_category'])

  df['sub_category'] = df['sub_category'].fillna('')
  df['label'] = df['category'] + ' - ' + (df['sub_category'])
  dx = df[['crimeaditionalinfo', 'label']]
  dx.columns = ['text', 'label']
  return dx, df_category[['label']]

print('Transforming Train DF')
df_train, df_category = update_dataframe(df_train)
print('Transforming Test DF')
df_test, _ = update_dataframe(df_test)
df_test

In [None]:
id2label = df_category[['label']].to_dict()['label']
label2id = {id: label for label, id in id2label.items()}
print(label2id)
print('-----')
print(id2label)

{'Any Other Cyber Crime - Other': 0, 'Child Pornography CPChild Sexual Abuse Material CSAM - ': 1, 'Cryptocurrency Crime - Cryptocurrency Fraud': 2, 'Cyber Attack/ Dependent Crimes - Data Breach/Theft': 3, 'Cyber Attack/ Dependent Crimes - Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks': 4, 'Cyber Attack/ Dependent Crimes - Hacking/Defacement': 5, 'Cyber Attack/ Dependent Crimes - Malware Attack': 6, 'Cyber Attack/ Dependent Crimes - Ransomware Attack': 7, 'Cyber Attack/ Dependent Crimes - SQL Injection': 8, 'Cyber Attack/ Dependent Crimes - Tampering with computer source documents': 9, 'Cyber Terrorism - Cyber Terrorism': 10, 'Hacking  Damage to computercomputer system etc - Damage to computer computer systems etc': 11, 'Hacking  Damage to computercomputer system etc - Email Hacking': 12, 'Hacking  Damage to computercomputer system etc - Tampering with computer source documents': 13, 'Hacking  Damage to computercomputer system etc - Unauthorised AccessData Breach

In [None]:
def load_dataset(df, label2id, is_train = True) -> Dataset:
    """Load dataset."""

    df["label"] = df["label"].astype(str)
    df["label"] = df["label"].map(
            label2id
        )


    return df


train_dataset_df = load_dataset(df_train, label2id)
test_dataset_df = load_dataset(df_test, label2id)

old_train = train_dataset_df.copy()
old_test = test_dataset_df.copy()


In [None]:
# prompt: help me balance this df so that the distribution of data is not so bad

from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Separate features (text) and target (label)
X = train_dataset_df['text'].values.reshape(-1, 1)  # Reshape for RandomOverSampler
y = train_dataset_df['label'].values

# Initialize RandomOverSampler
oversampler = RandomOverSampler(random_state=42)

# Resample the data
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Create a new balanced DataFrame
balanced_df = pd.DataFrame({'text': X_resampled.flatten(), 'label': y_resampled})

# Now balanced_df contains the oversampled data
print("Original class distribution:", Counter(y))
print("Resampled class distribution:", Counter(y_resampled))

# Use balanced_df for training instead of train_dataset_df
# Example:
# train_dataset_df = balanced_df

Original class distribution: Counter({23: 26856, 0: 10878, 18: 10805, 22: 8872, 21: 5803, 26: 4089, 20: 4047, 36: 2822, 28: 2299, 33: 2073, 25: 1988, 39: 1838, 38: 1552, 14: 1114, 31: 912, 19: 761, 5: 540, 7: 534, 6: 521, 9: 517, 8: 508, 4: 504, 3: 484, 2: 480, 24: 444, 34: 417, 1: 379, 12: 349, 17: 290, 16: 183, 10: 161, 27: 157, 32: 132, 11: 108, 15: 89, 35: 56, 13: 50, 29: 44, 30: 29, 37: 1})
Resampled class distribution: Counter({26: 26856, 21: 26856, 24: 26856, 31: 26856, 23: 26856, 22: 26856, 36: 26856, 0: 26856, 33: 26856, 18: 26856, 20: 26856, 3: 26856, 25: 26856, 4: 26856, 28: 26856, 2: 26856, 38: 26856, 39: 26856, 6: 26856, 17: 26856, 12: 26856, 5: 26856, 14: 26856, 8: 26856, 34: 26856, 7: 26856, 10: 26856, 1: 26856, 9: 26856, 19: 26856, 16: 26856, 32: 26856, 15: 26856, 11: 26856, 29: 26856, 27: 26856, 13: 26856, 35: 26856, 30: 26856, 37: 26856})


In [None]:
# prompt: jumble up the order indexes of balanced_df so data is not sorted as per label after 93000 which happens currently

import numpy as np

# Shuffle the index of the balanced_df DataFrame
shuffled_index = np.random.permutation(balanced_df.index)
balanced_df = balanced_df.loc[shuffled_index].reset_index(drop=True)
balanced_df

Unnamed: 0,text,label
0,It threatening mail business account fear clie...,30
1,I given facebook Id random player free fire ha...,33
2,facebook account hacked means profile informat...,33
3,They created whatsapp group told give health p...,0
4,Dear sir One loans apps online loan payment du...,34
...,...,...
1074235,Mujhe call aya tha ek typing job ke liye inhon...,31
1074236,My friend mentioned something like happened mo...,7
1074237,Sir I invest money online cryptocurrency calle...,2
1074238,I going married soon since marriage fixed I re...,30


In [None]:
# prompt: what do you suggest what preprocessing I should do on this before training a senternce transformer or distillbert

import re

def preprocess_text(text):
    # Lowercasing
    text = text.lower()

    # Removing special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Removing extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Apply preprocessing to the 'text' column of your DataFrame
balanced_df['text'] = balanced_df['text'].apply(preprocess_text)




In [None]:
print(balanced_df)

                                                      text  label  text_length
0        my admin id got hacked fund transferred unknow...     15          152
1        i dont getting type messages different numbers...     10          168
2        sir mere pass whatsapp ek link aya mene link p...     14          340
3        some android app hacked google account get con...     24          216
4        this person telling name tanisha whatsapp vide...     38          288
...                                                    ...    ...          ...
1074235  respected sir a serious matter i want inform f...     36         1126
1074236  dear sirmuhje pichle kuch dino se whatsapp pr ...     33          786
1074237  mere saath dhoka hua hai please meri help kro ...     14          209
1074238  the lady attached videos goes name swathi iyer...     37          372
1074239  ticket id email id abhishektiwari gmailcom amo...      2          944

[1074240 rows x 3 columns]


# Train the model frombalanced df


In [None]:
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
import os

import torch
import wandb
wandb.init(project="z21demo")

# Hyperparameters
MAX_LEN = 512
TRAIN_BATCH_SIZE = 256
VALID_BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 2e-05  # Smaller learning rate for SBERT fine-tuning


nltk.download('stopwords')
stop = stopwords.words('english')


# Define n_classes based on the label2id
#label2id = {label: idx for idx, label in enumerate(df_train['label'].unique())}
n_classes = len(label2id)


print(label2id)
print('-----')
print(id2label)
def load_dataset(df, label2id, is_train = True) -> Dataset:
    """Load dataset."""

    df["label"] = df["label"].astype(str)
    df["label"] = df["label"].map(
            label2id
        )


    return df


#train_dataset_df = load_dataset(df_train, label2id)
train_dataset_df = balanced_df
test_dataset_df = load_dataset(df_test, label2id)



# Load SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can replace with a different SBERT variant
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)



class MixtureOfExpertsClassifier(nn.Module):
  def __init__(self, embedding_dim, num_classes, num_experts=8):
        super(MixtureOfExpertsClassifier, self).__init__()
        self.fc1 = nn.Linear(embedding_dim, 512)  # First layer matches embedding dimension
        self.fc2 = nn.Linear(512, 512)  # Output layer
        self.fc3 = nn.Linear(512, 512)  # First layer matches embedding dimension
        self.fc4 = nn.Linear(512, 512)  # Output layer
        self.fc5 = nn.Linear(512, 512)  # First layer matches embedding dimension
        self.fc6 = nn.Linear(512, num_classes)  # Output layer
        # self.sigmoid = nn.Sigmoid()

  def forward(self, x):
        x = F.relu(self.fc1(x))  # Apply ReLU activation
        # x = F.relu(self.fc2(x))  # Output layer
        # x = F.relu(self.fc3(x))  # Apply ReLU activation
        # x = F.relu(self.fc4(x))  # Output layer
        # x = F.relu(self.fc5(x))  # Apply ReLU activation
        x = self.fc6(x)  # Output layer
        return x
  #     self.num_experts = num_experts

  #     # Define the experts (sub-networks)
  #     self.experts = nn.ModuleList([
  #         nn.Sequential(
  #             nn.Linear(embedding_dim, 512),
  #             nn.ReLU(),
  #             nn.Dropout(0.3),
  #             nn.Linear(512, num_classes)
  #         )
  #         for _ in range(num_experts)
  #     ])

  #     # Define the gating network
  #     self.gating_network = nn.Sequential(
  #         nn.Linear(embedding_dim, num_experts),  # Outputs scores for each expert
  #         nn.Softmax(dim=-1)                     # Normalize scores into probabilities
  #     )

  # def forward(self, embeddings):
  #     # Get gating weights
  #     gate_weights = self.gating_network(embeddings)  # Shape: (batch_size, num_experts)

  #     # Compute expert outputs
  #     expert_outputs = torch.stack([expert(embeddings) for expert in self.experts], dim=1)  # Shape: (batch_size, num_experts, num_classes)

  #     # Weight the expert outputs by the gate weights
  #     weighted_output = torch.einsum("be,bec->bc", gate_weights, expert_outputs)  # Shape: (batch_size, num_classes)

  #     return weighted_output

class SentenceTransformerModel(nn.Module):
  def __init__(self, model_name):
      super(SentenceTransformerModel, self).__init__()
      self.sentence_transformer = SentenceTransformer(model_name)

  def forward(self, sentences):
      # Get token embeddings for input sentences
      return self.sentence_transformer.encode(
          sentences, convert_to_tensor=True, show_progress_bar=False
      )

model = SentenceTransformerModel('all-MiniLM-L6-v2')
classifier = MixtureOfExpertsClassifier(embedding_dim=384, num_classes=n_classes)  # Use the appropriate embedding dimension
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
classifier.to(device)

#model.sentence_transformer.save('/content/drive/MyDrive/sentence_transformer_model')


def train_model():


    # Sample Dataset Preparation
    # Your dataset format: a list of (sentence, label) pairs


     # Define Dataset Class Compatible with DataFrames
    class SentenceDataset(Dataset):
        def __init__(self, dataframe, text_column, label_column):
            """
            Args:
            - dataframe (pd.DataFrame): Input DataFrame.
            - text_column (str): Column name for the text data.
            - label_column (str): Column name for the labels.
            """
            self.texts = dataframe[text_column].tolist()
            self.labels = dataframe[label_column].tolist()

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            return self.texts[idx], self.labels[idx]

    # Load datasets from DataFrame
    train_dataset = SentenceDataset(train_dataset_df, text_column="text", label_column="label")
    val_dataset = SentenceDataset(test_dataset_df, text_column="text", label_column="label")


    train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE)



    def train_model_int():
        # Initialize the model and classifier


        loss_fn = nn.CrossEntropyLoss()  # Typically used for multi-class classification
        # optimizer = optim.AdamW(model.parameters(), lr=0.00001)
        optimizer = optim.AdamW([
        {'params': model.parameters(), 'lr': 2e-5},
        {'params': classifier.parameters(), 'lr': 2e-3}
        ])
        # Assuming `train_dataloader` is already set up and provides batches of (sentences, labels)
        classifier.train()
        for epoch in tqdm(range(EPOCHS)):
            # model.train()
            epoch_loss = 0
            epoch_correct = 0
            epoch_accu = 0
            step_count = 0
            epoch_total = 0


            for sentences, labels in train_dataloader:
            # Prepare input data
              step_count += 1
              labels = torch.tensor(labels, dtype=torch.long).to(device)
              optimizer.zero_grad()

              # Compute embeddings and forward pass
              embeddings = model(sentences)  # Forward pass through SentenceTransformer
              logits = classifier(embeddings)

              # Compute loss and accuracy
              loss = loss_fn(logits, labels)
              loss.backward()

              # Gradient clipping
              torch.nn.utils.clip_grad_norm_(classifier.parameters(), max_norm=1.0)
              optimizer.step()

              preds = torch.argmax(logits, dim=1)
              epoch_correct += (preds == labels).sum().item()
              epoch_total += labels.size(0)
              epoch_loss += loss.item()

              # Log step-level loss to TensorBoard
              # writer.add_scalar("Loss/train_step", loss.item(), epoch * len(train_dataloader) + step_count)
              wandb.log({"step": step_count, "step_loss": loss.item()})


              if step_count % 100 == 0:
                    avg_loss = epoch_loss / epoch_total
                    avg_accuracy = (epoch_correct * 100)/ epoch_total
                    print(f"Step {step_count} - Loss: {avg_loss:.4f} - Accuracy: {avg_accuracy:.2f}%")
                    # print("_" * 20)
                    # print(labels)
                    # print("_" * 20)
                    # print(preds)
                    # print("_" * 20)
                    # print(logits)
                    # print("_" * 20)

            # Log epoch stats
            avg_loss = epoch_loss / len(train_dataloader)
            avg_accuracy = epoch_correct / epoch_total * 100
            print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}, Accuracy = {avg_accuracy:.2f}%")
            # writer.add_scalar("Loss/train_epoch", avg_loss, epoch)
            # writer.add_scalar("Accuracy/train_epoch", avg_accuracy, epoch)
            # Log to W&B
            wandb.log({"epoch": epoch + 1, "loss": avg_loss, "accuracy": avg_accuracy})
            # writer.flush()
            # writer.close()

            gradient_norm = torch.nn.utils.clip_grad_norm_(classifier.parameters(), max_norm=1.0)
            wandb.log({"gradient_norm": gradient_norm})




            # Save the Sentence Transformer model after each epoch
            #model_save_path = os.path.join(MODEL_DIR, f"a_sentence_transformer_epoch_{epoch+1}")
            #torch.save(classifier.state_dict(), model_save_path)  # Use the `save` method for SentenceTransformer
            #torch.save(model.state_dict(), model_save_path + "_sent_model")  # Use the `save` method for SentenceTransformer

            torch.save({'sentence_transformer': model, 'classifier': classifier.state_dict()}, f"/content/drive/MyDrive/sentence_transformer_modelnew_{epoch+1}")


            #print(f"Model saved to {model_save_path}")





    train_model_int()

    # Validation Loop

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'Any Other Cyber Crime - Other': 0, 'Child Pornography CPChild Sexual Abuse Material CSAM - ': 1, 'Cryptocurrency Crime - Cryptocurrency Fraud': 2, 'Cyber Attack/ Dependent Crimes - Data Breach/Theft': 3, 'Cyber Attack/ Dependent Crimes - Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks': 4, 'Cyber Attack/ Dependent Crimes - Hacking/Defacement': 5, 'Cyber Attack/ Dependent Crimes - Malware Attack': 6, 'Cyber Attack/ Dependent Crimes - Ransomware Attack': 7, 'Cyber Attack/ Dependent Crimes - SQL Injection': 8, 'Cyber Attack/ Dependent Crimes - Tampering with computer source documents': 9, 'Cyber Terrorism - Cyber Terrorism': 10, 'Hacking  Damage to computercomputer system etc - Damage to computer computer systems etc': 11, 'Hacking  Damage to computercomputer system etc - Email Hacking': 12, 'Hacking  Damage to computercomputer system etc - Tampering with computer source documents': 13, 'Hacking  Damage to computercomputer system etc - Unauthorised AccessData Breach

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

cuda


In [None]:
train_model()

  labels = torch.tensor(labels, dtype=torch.long).to(device)


Step 100 - Loss: 0.0104 - Accuracy: 25.21%
Step 200 - Loss: 0.0093 - Accuracy: 30.04%
Step 300 - Loss: 0.0088 - Accuracy: 33.00%
Step 400 - Loss: 0.0084 - Accuracy: 35.11%
Step 500 - Loss: 0.0081 - Accuracy: 36.87%
Step 600 - Loss: 0.0079 - Accuracy: 38.30%
Step 700 - Loss: 0.0077 - Accuracy: 39.59%
Step 800 - Loss: 0.0075 - Accuracy: 40.77%
Step 900 - Loss: 0.0074 - Accuracy: 41.79%
Step 1000 - Loss: 0.0072 - Accuracy: 42.70%
Step 1100 - Loss: 0.0071 - Accuracy: 43.58%
Step 1200 - Loss: 0.0070 - Accuracy: 44.41%
Step 1300 - Loss: 0.0069 - Accuracy: 45.19%
Step 1400 - Loss: 0.0068 - Accuracy: 45.89%
Step 1500 - Loss: 0.0067 - Accuracy: 46.60%
Step 1600 - Loss: 0.0066 - Accuracy: 47.25%
Step 1700 - Loss: 0.0065 - Accuracy: 47.88%
Step 1800 - Loss: 0.0064 - Accuracy: 48.47%
Step 1900 - Loss: 0.0064 - Accuracy: 49.00%
Step 2000 - Loss: 0.0063 - Accuracy: 49.55%
Step 2100 - Loss: 0.0062 - Accuracy: 50.06%
Step 2200 - Loss: 0.0061 - Accuracy: 50.55%
Step 2300 - Loss: 0.0061 - Accuracy: 51.0

 10%|█         | 1/10 [19:58<2:59:47, 1198.60s/it]

Step 100 - Loss: 0.0038 - Accuracy: 67.34%
Step 200 - Loss: 0.0038 - Accuracy: 67.42%
Step 300 - Loss: 0.0038 - Accuracy: 67.55%
Step 400 - Loss: 0.0038 - Accuracy: 67.74%
Step 500 - Loss: 0.0037 - Accuracy: 67.91%
Step 600 - Loss: 0.0037 - Accuracy: 68.02%
Step 700 - Loss: 0.0037 - Accuracy: 68.17%
Step 800 - Loss: 0.0037 - Accuracy: 68.27%
Step 900 - Loss: 0.0037 - Accuracy: 68.33%
Step 1000 - Loss: 0.0037 - Accuracy: 68.44%
Step 1100 - Loss: 0.0037 - Accuracy: 68.52%
Step 1200 - Loss: 0.0037 - Accuracy: 68.58%
Step 1300 - Loss: 0.0036 - Accuracy: 68.67%
Step 1400 - Loss: 0.0036 - Accuracy: 68.75%
Step 1500 - Loss: 0.0036 - Accuracy: 68.80%
Step 1600 - Loss: 0.0036 - Accuracy: 68.89%
Step 1700 - Loss: 0.0036 - Accuracy: 68.96%
Step 1800 - Loss: 0.0036 - Accuracy: 69.03%
Step 1900 - Loss: 0.0036 - Accuracy: 69.10%
Step 2000 - Loss: 0.0036 - Accuracy: 69.19%
Step 2100 - Loss: 0.0036 - Accuracy: 69.28%
Step 2200 - Loss: 0.0036 - Accuracy: 69.35%
Step 2300 - Loss: 0.0035 - Accuracy: 69.4

 20%|██        | 2/10 [40:13<2:41:03, 1207.98s/it]

Step 100 - Loss: 0.0030 - Accuracy: 73.68%
Step 200 - Loss: 0.0030 - Accuracy: 73.89%
Step 300 - Loss: 0.0030 - Accuracy: 74.04%
Step 400 - Loss: 0.0030 - Accuracy: 73.91%
Step 500 - Loss: 0.0030 - Accuracy: 74.02%
Step 600 - Loss: 0.0030 - Accuracy: 74.10%
Step 700 - Loss: 0.0030 - Accuracy: 74.12%
Step 800 - Loss: 0.0030 - Accuracy: 74.19%
Step 900 - Loss: 0.0030 - Accuracy: 74.22%
Step 1000 - Loss: 0.0030 - Accuracy: 74.24%
Step 1100 - Loss: 0.0030 - Accuracy: 74.29%
Step 1200 - Loss: 0.0030 - Accuracy: 74.29%
Step 1300 - Loss: 0.0029 - Accuracy: 74.29%
Step 1400 - Loss: 0.0029 - Accuracy: 74.35%
Step 1500 - Loss: 0.0029 - Accuracy: 74.39%
Step 1600 - Loss: 0.0029 - Accuracy: 74.45%
Step 1700 - Loss: 0.0029 - Accuracy: 74.49%
Step 1800 - Loss: 0.0029 - Accuracy: 74.53%
Step 1900 - Loss: 0.0029 - Accuracy: 74.59%
Step 2000 - Loss: 0.0029 - Accuracy: 74.65%
Step 2100 - Loss: 0.0029 - Accuracy: 74.70%
Step 2200 - Loss: 0.0029 - Accuracy: 74.72%
Step 2300 - Loss: 0.0029 - Accuracy: 74.7

 30%|███       | 3/10 [1:00:19<2:20:51, 1207.34s/it]

Step 100 - Loss: 0.0026 - Accuracy: 77.52%
Step 200 - Loss: 0.0026 - Accuracy: 77.58%
Step 300 - Loss: 0.0026 - Accuracy: 77.58%
Step 400 - Loss: 0.0026 - Accuracy: 77.60%
Step 500 - Loss: 0.0026 - Accuracy: 77.68%
Step 600 - Loss: 0.0026 - Accuracy: 77.63%
Step 700 - Loss: 0.0026 - Accuracy: 77.66%
Step 800 - Loss: 0.0026 - Accuracy: 77.68%
Step 900 - Loss: 0.0026 - Accuracy: 77.71%
Step 1000 - Loss: 0.0026 - Accuracy: 77.71%
Step 1100 - Loss: 0.0026 - Accuracy: 77.76%
Step 1200 - Loss: 0.0026 - Accuracy: 77.80%
Step 1300 - Loss: 0.0025 - Accuracy: 77.85%
Step 1400 - Loss: 0.0025 - Accuracy: 77.89%
Step 1500 - Loss: 0.0025 - Accuracy: 77.96%
Step 1600 - Loss: 0.0025 - Accuracy: 78.00%
Step 1700 - Loss: 0.0025 - Accuracy: 78.03%
Step 1800 - Loss: 0.0025 - Accuracy: 78.06%
Step 1900 - Loss: 0.0025 - Accuracy: 78.07%
Step 2000 - Loss: 0.0025 - Accuracy: 78.08%
Step 2100 - Loss: 0.0025 - Accuracy: 78.09%
Step 2200 - Loss: 0.0025 - Accuracy: 78.10%
Step 2300 - Loss: 0.0025 - Accuracy: 78.1

 40%|████      | 4/10 [1:20:21<2:00:30, 1205.09s/it]

Step 100 - Loss: 0.0023 - Accuracy: 80.33%
Step 200 - Loss: 0.0023 - Accuracy: 80.49%
Step 300 - Loss: 0.0023 - Accuracy: 80.30%
Step 400 - Loss: 0.0023 - Accuracy: 80.28%
Step 500 - Loss: 0.0023 - Accuracy: 80.32%
Step 600 - Loss: 0.0023 - Accuracy: 80.33%
Step 700 - Loss: 0.0023 - Accuracy: 80.32%
Step 800 - Loss: 0.0023 - Accuracy: 80.25%
Step 900 - Loss: 0.0023 - Accuracy: 80.28%
Step 1000 - Loss: 0.0023 - Accuracy: 80.34%
Step 1100 - Loss: 0.0023 - Accuracy: 80.29%
Step 1200 - Loss: 0.0023 - Accuracy: 80.29%
Step 1300 - Loss: 0.0023 - Accuracy: 80.32%
Step 1400 - Loss: 0.0023 - Accuracy: 80.33%
Step 1500 - Loss: 0.0023 - Accuracy: 80.36%
Step 1600 - Loss: 0.0023 - Accuracy: 80.36%
Step 1700 - Loss: 0.0023 - Accuracy: 80.37%
Step 1800 - Loss: 0.0023 - Accuracy: 80.40%
Step 1900 - Loss: 0.0023 - Accuracy: 80.42%
Step 2000 - Loss: 0.0023 - Accuracy: 80.42%
Step 2100 - Loss: 0.0023 - Accuracy: 80.42%
Step 2200 - Loss: 0.0023 - Accuracy: 80.45%


# Test on test_dataset


In [None]:
class SentenceDataset(Dataset):
  def __init__(self, dataframe, text_column, label_column):
      """
      Args:
      - dataframe (pd.DataFrame): Input DataFrame.
      - text_column (str): Column name for the text data.
      - label_column (str): Column name for the labels.
      """
      self.texts = dataframe[text_column].tolist()
      self.labels = dataframe[label_column].tolist()

  def __len__(self):
      return len(self.texts)

  def __getitem__(self, idx):
      return self.texts[idx], self.labels[idx]

# Load datasets from DataFrame
train_dataset = SentenceDataset(train_dataset_df, text_column="text", label_column="label")
val_dataset = SentenceDataset(test_dataset_df, text_column="text", label_column="label")


In [None]:
train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE)



In [None]:
def test_model():
        # Load the trained SentenceTransformer model

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()

        # Define the classifier with appropriate dimensions


        classifier.to(device)
        classifier.eval()


        all_preds = []
        all_labels = []
        all_sentences = []
        correct_predictions = 0  # To count correct predictions
        total_predictions = 0  # To count total predictions

        with torch.no_grad():
            for sentences, labels in val_dataloader:
                # Move labels to device
                labels = torch.tensor(labels, dtype=torch.long).to(device)

                # Compute embeddings
                embeddings = model.sentence_transformer.encode(sentences, convert_to_tensor=True).to(device)

                # Get predictions from the classifier
                logits = classifier(embeddings)

                # Calculate predicted classes
                preds = torch.argmax(logits, dim=1)

                # Update counts for accuracy calculation
                correct_predictions += (preds == labels).sum().item()
                total_predictions += labels.size(0)

                # Collect predictions, labels, and sentences for saving
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                all_sentences.extend(sentences)

        # Compute accuracy
        accuracy = (correct_predictions  * 100/ total_predictions)

        # Combine data into a DataFrame
        results_df = pd.DataFrame({
            'Sentence': all_sentences,
            'True_Label': all_labels,
            'Predicted_Label': all_preds
        })

        # Save to CSV
        csv_path = "test_predictions.csv"
        results_df.to_csv(csv_path, index=False)
        print(f"Predictions saved to '{csv_path}'")
        print(f"Test Accuracy: {accuracy:.2f}%")

        return accuracy

In [None]:
print(model)

In [None]:
test_model()