In [1]:
import pandas as pd
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_metric
import gc
from sklearn.metrics import precision_recall_fscore_support,accuracy_score
from torch.utils.data import DataLoader

train_df=pd.read_csv("train.csv.zip", index_col=False)
train_df=train_df.drop_duplicates(subset=["Tweet"]).reset_index(drop=True)
train_df=train_df.dropna(subset=["Tweet","Type"])
# train_df["label"] = train_df["Type"].apply(lambda x: 1 if x == "Spam" else 0)
train_df["label"] = train_df["Type"]
train_df["label"] =train_df["label"].astype(str)
train_df["Tweet"] =train_df["Tweet"].astype(str)
df=train_df[['Tweet',"label","Type"]]



In [2]:
def stratified_train_test_val_split(data, label_col="label", train_size=0.8, val_size=0.1, test_size=0.1, random_state=42):
    # First, split into 80% training and 20% for testing + validation
    train_data, temp_data = train_test_split(
        data, test_size=(1 - train_size), stratify=data[label_col], random_state=random_state
    )
    
    # Then, split the 20% (temp_data) into 10% validation and 10% testing
    relative_val_size = val_size / (val_size + test_size)  # Adjust size for the remaining 20%
    val_data, test_data = train_test_split(
        temp_data, test_size=(1 - relative_val_size), stratify=temp_data[label_col], random_state=random_state
    )
    
    return train_data, val_data, test_data

# Example usage:
train_data, val_data, test_data = stratified_train_test_val_split(df, label_col="label")

In [3]:
def finetune_tweet_classifier(train_data,val_data, model_name, num_labels=2, epochs=5, batch_size=8):
    # Map the "Type" column to 0 and 1, ensuring labels are integers
    label2id = {"Spam": 1, "Quality": 0}
    train_data["label"] = train_data["Type"].map(label2id).astype(int)
    val_data["label"]= val_data["Type"].map(label2id).astype(int)

    
    # Convert to Hugging Face Dataset
    train_dataset = Dataset.from_pandas(train_data)
    val_dataset = Dataset.from_pandas(val_data)
    
    # Tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to("cuda")

    # Tokenize function with padding and truncation
    def tokenize_function(example):
        return tokenizer(
            example["Tweet"],
            padding="max_length",  # Ensures all sequences are the same length
            truncation=True,       # Truncates sequences longer than model's max length
            max_length=512
        )
    
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # Ensure dataset format
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    
    # Metric calculation
    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)
        return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir='./logs',
        logging_steps=1000
    )

    # Early stopping callback
    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

    # Train model
    trainer.train()
    base_model_name = model_name.split("/")[-1]
    new_model_name = f"{base_model_name}_Twitter_spam_classification"
    
    # Save the model and tokenizer
    model.save_pretrained(f"models/{new_model_name}")
    tokenizer.save_pretrained(f"models/{new_model_name}")
    gc.collect()
    torch.cuda.empty_cache()
    
    return model

In [4]:
finetune_tweet_classifier(train_data, val_data, "FacebookAI/xlm-roberta-large")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9429 [00:00<?, ? examples/s]

Map:   0%|          | 0/1179 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2226,0.169904,0.960136,0.960102,0.961078,0.960136
2,0.1341,0.160637,0.965225,0.965219,0.965326,0.965225
3,0.1192,0.21069,0.969466,0.969456,0.96972,0.969466
4,0.0754,0.212541,0.964377,0.964378,0.964401,0.964377
5,0.0474,0.19572,0.970314,0.970311,0.970344,0.970314


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_fe

In [4]:
finetune_tweet_classifier(train_data, val_data, "microsoft/deberta-v3-large",num_labels=2, epochs=5, batch_size=4)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9429 [00:00<?, ? examples/s]

Map:   0%|          | 0/1179 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1466,0.169423,0.960984,0.960925,0.962875,0.960984
2,0.1174,0.18322,0.966073,0.966058,0.966493,0.966073
3,0.0668,0.197709,0.970314,0.970301,0.970693,0.970314
4,0.0517,0.166689,0.972858,0.972846,0.973288,0.972858
5,0.029,0.190577,0.97201,0.972001,0.972306,0.97201


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, element

In [4]:


def evaluate_model_on_test_data(test_data, model_path, batch_size=8):
    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to("cuda")
    model.eval()  # Set model to evaluation mode
    
    # Prepare the test data
    test_data["Tweet"] = test_data["Tweet"].astype(str)
    label2id = {"Spam": 1, "Quality": 0}
    test_data["label"] = test_data["Type"].map(label2id).astype(int)

    # Convert to Hugging Face Dataset and tokenize
    test_dataset = Dataset.from_pandas(test_data)
    def tokenize_function(example):
        return tokenizer(
            example["Tweet"],
            padding="max_length",
            truncation=True,
            max_length=512
        )
    
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    
    # DataLoader for test set
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to("cuda")
            attention_mask = batch['attention_mask'].to("cuda")
            
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy()  # Get predicted labels
            predictions.extend(preds)
    
    # Calculate evaluation metrics
    true_labels = test_data["label"].tolist()
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average="weighted")
    accuracy = accuracy_score(true_labels, predictions)
    
    # Print metrics
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")

    # Return metrics in case you need them
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

model_path = "models/xlm-roberta-large_Twitter_spam_classification"
metrics = evaluate_model_on_test_data(test_data, model_path)


Map:   0%|          | 0/1179 [00:00<?, ? examples/s]

Accuracy: 0.9745547073791349
Precision: 0.9745737843156286
Recall: 0.9745547073791349
F1-Score: 0.9745530226387936


In [5]:
model_path = "models/deberta-v3-large_Twitter_spam_classification"
metrics = evaluate_model_on_test_data(test_data, model_path)

Map:   0%|          | 0/1179 [00:00<?, ? examples/s]

Accuracy: 0.9779474130619169
Precision: 0.9780790632953945
Recall: 0.9779474130619169
F1-Score: 0.9779428095129591
