In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import Trainer

# Preprocessing

In [2]:
data_path = 'cleaned_cyberbullying_tweets.csv'
df = pd.read_csv(data_path)

if 'sentiment' not in df.columns or 'text_clean' not in df.columns:
    raise ValueError("Required columns are missing from the DataFrame.")

print("Initial data size:", df.shape)
print(df.head())

#Prompt
def add_prompt_to_text(text):
    return f"Please classify the label of this text as either 0, 1, or 2. 0 represents the sentiment 'not_cyberbullying', 1 represents the sentiment 'other_cyberbullying', 2 represents the sentiments ethnicity, religion, gender, or other. 0 is negative, 1 is neutral, and 2 is positive for cyberbullying: {text}"

# Apply the prompt to each text entry in the DataFrame
df['text_clean'] = df['text_clean'].apply(add_prompt_to_text)


print("Unique sentiment values before mapping:", df['sentiment'].unique())

# Mapping sentiments to labels
sentiment_mapping = {
    'not_cyberbullying': 0,  # Negative
    'other_cyberbullying': 1,      # Neutral
    'ethnicity': 2,          # Positive
    'religion': 2,           # Positive
    'gender': 2,             # Positive
    'other': 2               # Positive
}

df['label'] = df['sentiment'].map(sentiment_mapping)

if df['label'].isna().any():
    print("NaN values found after mapping. Check mapping keys and sentiment column values.")
    print(df[df['label'].isna()])
else:
    print("No NaN values after mapping.")

df = df.dropna(subset=['label'])
print("Data size after dropping NaNs:", df.shape)

if df.empty:
    print("DataFrame is empty after dropping NaNs. Adjust data cleaning or check data quality.")
else:
    print("Data is ready for further processing.")

df['label'] = df['label'].astype(int)

print("Distribution of labels:", df['label'].value_counts())

texts = df['text_clean'].tolist()
labels = df['label'].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, random_state=42)


Initial data size: (41408, 3)
                                                text          sentiment  \
0  In other words #katandandre, your food was cra...  not_cyberbullying   
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying   
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying   
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying   
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying   

                                          text_clean  
0             word katandandre food crapilicious mkr  
1  aussietv white mkr theblock imacelebrityau tod...  
2                    classy whore red velvet cupcake  
3  meh thanks head concerned another angry dude t...  
4  isi account pretending kurdish account like is...  
Unique sentiment values before mapping: ['not_cyberbullying' 'gender' 'religion' 'other_cyberbullying' 'age'
 'ethnicity']
NaN values found after mapping. Check mapping keys and sentiment c

In [10]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import numpy as np

train_texts = np.array(train_texts)
train_labels = np.array(train_labels)

train_labels_series = pd.Series(train_labels)

class_counts = train_labels_series.value_counts()
max_samples = class_counts.max()

sampling_strategy_over = {k: max_samples for k in class_counts.index}
sampling_strategy_under = {k: max_samples for k in class_counts.index}

over = SMOTE(sampling_strategy=sampling_strategy_over)
under = RandomUnderSampler(sampling_strategy=sampling_strategy_under)
steps = [('over', over), ('under', under)]
pipeline = Pipeline(steps=steps)

# Resample indices instead of texts
indices = np.arange(train_texts.shape[0]).reshape(-1, 1)
resampled_indices, resampled_labels = pipeline.fit_resample(indices, train_labels_series)

train_texts_resampled = train_texts[resampled_indices.flatten()]
train_labels_resampled = train_labels[resampled_indices.flatten()]

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

def tokenize_data(texts, tokenizer):
    texts = [str(text) for text in texts if text is not None]
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

try:
    train_encodings = tokenize_data(train_texts, tokenizer)
    val_encodings = tokenize_data(val_texts, tokenizer)
except Exception as e:
    print(f"An error occurred during tokenization: {str(e)}")

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_labels))



In [11]:
from torch.utils.data import Dataset
from torch.nn.functional import one_hot
from torch import nn

class CyberbullyingDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CyberbullyingDataset(train_encodings, train_labels)
val_dataset = CyberbullyingDataset(val_encodings, val_labels)

test_item = train_dataset[0]
print("Example of processed item:", test_item)
print("Labels tensor:", test_item['labels'])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

Example of processed item: {'input_ids': tensor([    0, 48759, 13562, 24786, 26293,   784,  1916,  3036, 16490, 44403,
            2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,  

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


# Training

In [12]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # Start with 3 epochs
    per_device_train_batch_size=16,  # Adjust based on your hardware capability
    per_device_eval_batch_size=64,  # Larger batches if your hardware supports it
    evaluation_strategy="steps",
    eval_steps=1000,  # Adjust based on the total number of training steps
    warmup_steps=500,  # Adjust if needed
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="steps",
    save_steps=1000,
    learning_rate=3e-5  # A typical starting learning rate for fine-tuning
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss
1000,0.3717,0.445682
2000,0.3326,0.406413
3000,0.3256,0.369864
4000,0.2637,0.413479
5000,0.2746,0.409042


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=5670, training_loss=0.35088898340861, metrics={'train_runtime': 54679.8058, 'train_samples_per_second': 1.659, 'train_steps_per_second': 0.104, 'total_flos': 5965636298501376.0, 'train_loss': 0.35088898340861, 'epoch': 3.0})

# Evaluation

In [13]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_model(model, dataloader, device):
    model.eval()  # Set model to evaluation mode
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {'input_ids': batch['input_ids'].to(device),
                      'attention_mask': batch['attention_mask'].to(device)}
            labels = batch['labels'].to(device)

            outputs = model(**inputs)
            _, preds = torch.max(outputs.logits, dim=1)

            predictions.extend(preds.view(-1).cpu().numpy())
            true_labels.extend(labels.view(-1).cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

val_metrics = evaluate_model(model, val_loader, device)
print("Validation Metrics:", val_metrics)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Validation Metrics: {'accuracy': 0.8493599285501637, 'precision': 0.8482269246189422, 'recall': 0.8493599285501637, 'f1': 0.8472000580954699}


NameError: name 'test_loader' is not defined