In [None]:
# Colab-specific setup: mount Google Drive at /content/drive so the cells below
# can read the datasets and write checkpoints under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import numpy as np
import pandas as pd
import random
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch.nn import CrossEntropyLoss

import transformers
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import re


In [None]:
# Project root on the mounted Google Drive. Paths below are built with plain
# string concatenation, so the trailing slash is required.
FOLDER_PATH = "/content/drive/MyDrive/Colab Notebooks/Machine learning/ML_project/"

# model parameters
# Maximum number of tokens per tweet handed to the tokenizer (longer inputs are truncated).
MAX_LEN = 512
# Checkpoint identifier passed to `from_pretrained` (plain string: nothing to interpolate).
TWITTER_ROBERTA_MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# NOTE(review): the four constants below are not read by any cell visible in this
# notebook (the training cell uses its own batch size and `num_epochs`) - confirm
# whether they are still needed.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 2
MODEL_PATH = "fold_model.bin"
# Tokenizer matching the checkpoint above; used by the batching helper and the prediction cell.
tokenizer = AutoTokenizer.from_pretrained(TWITTER_ROBERTA_MODEL)

In [41]:
class ROBERTAModel(nn.Module):
    """
    RoBERTa encoder followed by a dropout and a linear layer for text classification.

    Args:
        roberta_model: Checkpoint name or path forwarded to ``from_pretrained``.
        num_classes: Number of output classes (size of the last dimension of the
            returned logits). Defaults to 2 (binary classification).
        dropout_rate: Dropout probability applied to the sentence representation
            before the linear layer.
    """
    def __init__(self, roberta_model, num_classes=2, dropout_rate=0.3):
        super().__init__()
        # Local import keeps this class self-contained; `transformers` is already
        # a dependency of this notebook.
        from transformers import AutoModel

        # Load the bare encoder rather than a sequence-classification model: the
        # classification variant returns logits of size `num_labels`, which cannot
        # be fed into a Linear layer expecting `hidden_size` inputs.
        self.roberta = AutoModel.from_pretrained(roberta_model)
        self.drop = nn.Dropout(dropout_rate)
        self.out = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """
        Compute classification logits for a batch of token ids.

        Args:
            input_ids: Token id tensor for the batch.
            attention_mask: Optional mask marking real tokens vs. padding.
            labels: Accepted for backward compatibility and ignored; the loss is
                expected to be computed by the caller from the returned logits.

        Returns:
            Tensor of unnormalised class scores with ``num_classes`` entries per example.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Hidden state of the first token (<s>) serves as the sentence representation.
        sentence_repr = outputs.last_hidden_state[:, 0]
        return self.out(self.drop(sentence_repr))

In [42]:
def preprocess(text):
    """Strip ``<user>`` / ``<url>`` placeholders and every run of digits from a tweet.

    The three substitutions are applied one after another, each on the result of
    the previous one. Whitespace around the removed pieces is left untouched.
    """
    for pattern in (r'<user>', r'<url>', r'\d+'):
        text = re.sub(pattern, '', text)
    return text

In [43]:
def train(data, model, optimizer, device):
    """
    Train the model for one epoch.

    Args:
        data: Iterable of batches. Each batch is a mapping with the tensor entries
            'ids' (token ids), 'mask' (attention mask) and 'targets' (class indices).
            Any other entries (e.g. 'token_type_ids') are ignored.
        model: Module called as ``model(ids, attention_mask=mask)``. It may return
            either a logits tensor or an object exposing a ``.logits`` attribute.
        optimizer: Optimizer holding the model's parameters.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Mean cross-entropy loss over all batches of the epoch (0.0 when ``data``
        yields no batch).
    """
    model.train()
    running_loss = 0.0   # loss accumulated since the last progress message
    window_batches = 0   # number of batches accumulated in `running_loss`
    epoch_loss = 0.0
    num_batches = 0

    for batch_idx, d in enumerate(data):
        mask = d['mask'].to(device, dtype=torch.long)
        ids = d['ids'].to(device, dtype=torch.long)
        targets = d['targets'].to(device, dtype=torch.long)

        optimizer.zero_grad()
        # Pass the mask by keyword so it cannot land in another positional slot
        # (e.g. `labels`) of the model's forward signature.
        outputs = model(ids, attention_mask=mask)
        logits = outputs.logits if hasattr(outputs, 'logits') else outputs

        # Functional form: actually computes the loss instead of constructing a loss module.
        loss = torch.nn.functional.cross_entropy(logits, targets)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        window_batches += 1
        epoch_loss += batch_loss
        num_batches += 1

        if batch_idx % 10 == 0 and batch_idx != 0:
            temp = f'Batch index = {batch_idx}\tRunning Loss = {running_loss/window_batches}'
            print(temp)
            running_loss = 0.0
            window_batches = 0

    return epoch_loss / num_batches if num_batches else 0.0

In [44]:
pos_path = 'data/twitter-datasets/train_pos.txt'
neg_path = 'data/twitter-datasets/train_neg.txt'

# One tweet per line. The encoding is given explicitly so decoding does not
# depend on the platform default.
with open(FOLDER_PATH + pos_path, 'r', encoding='utf-8') as f:
    pos_tweets = f.readlines()
with open(FOLDER_PATH + neg_path, 'r', encoding='utf-8') as f:
    neg_tweets = f.readlines()

# preprocess data
pos_tweets = [preprocess(tweet) for tweet in pos_tweets]
neg_tweets = [preprocess(tweet) for tweet in neg_tweets]

# Label convention: 1 = tweet from the positive file, 0 = tweet from the negative file.
pos_labels = [1] * len(pos_tweets)
neg_labels = [0] * len(neg_tweets)
labels = pos_labels + neg_labels
tweets = pos_tweets + neg_tweets

# Combine labels and tweets into a list of tuples
data = list(zip(tweets, labels))

# Shuffle the data with a dedicated, seeded generator so the sample order (and
# therefore the content of every fold and batch) is the same on each run, without
# touching the global `random` state.
random.Random(42).shuffle(data)

# Unpack the shuffled data back into separate (tuple) sequences
train_tweets, train_labels = zip(*data)

# Use RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained(TWITTER_ROBERTA_MODEL)
config = AutoConfig.from_pretrained(TWITTER_ROBERTA_MODEL)

# Tokenize and convert to input IDs
def data_generator(tweets, labels, batch_size=32):
    """Lazily yield ``(encodings, label_tensor)`` pairs, one per mini-batch.

    Consecutive windows of ``batch_size`` items are sliced from ``tweets`` and
    ``labels``. Each window of tweets is tokenized with the module-level
    ``tokenizer`` (truncated to ``MAX_LEN`` tokens, padded, returned as PyTorch
    tensors) and the matching labels are wrapped in a tensor. The last batch may
    be smaller than ``batch_size``.
    """
    for start in range(0, len(tweets), batch_size):
        window = slice(start, start + batch_size)
        encoded = tokenizer(
            tweets[window],
            truncation=True,
            padding=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        yield encoded, torch.tensor(labels[window])

In [None]:
# Set k value
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
num_epochs = 2  # Number of training epochs

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(TWITTER_ROBERTA_MODEL)
model.to(device)
criterion = CrossEntropyLoss()
# Created once, after the model has been moved to its device.
optimizer = AdamW(model.parameters(), lr=5e-5)

# The scheduler is stepped once per mini-batch, so its horizon must be counted in
# optimizer steps, not in samples. Walk the folds once and add up the number of
# batches each fold's training split produces.
# `schedule_batch_size` must match the batch size used by the training loop.
schedule_batch_size = 32
total_training_steps = num_epochs * sum(
    -(-len(fold_train_indices) // schedule_batch_size)  # ceiling division
    for fold_train_indices, _ in kf.split(train_tweets)
)
scheduler = get_linear_schedule_with_warmup(optimizer,
                      num_warmup_steps=0,
                      num_training_steps=total_training_steps)


# Function to calculate accuracy
def calc_accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose highest-scoring column equals ``labels``.

    ``preds`` holds one row of class scores per example; the result is a scalar tensor.
    """
    predicted_classes = torch.max(preds, dim=1).indices
    hits = (predicted_classes == labels).float()
    return hits.sum() / hits.size(0)

# Validation labels / predictions pooled over all folds for the final metrics.
all_labels = []
all_predictions = []
best_accuracy = 0

# NOTE(review): `model`, `optimizer` and `scheduler` are carried over from one fold
# to the next, so from the second fold on the validation split contains tweets the
# model has already been trained on. The per-fold validation numbers are therefore
# optimistic; re-create the model per fold if unbiased cross-validation is needed.
for fold, (train_indices, val_indices) in enumerate(kf.split(train_tweets)):
    print(f"Fold {fold + 1}/{k_folds}")

    # Materialise the tweets / labels of the current fold
    train_fold_tweets = [train_tweets[i] for i in train_indices]
    train_fold_labels = [train_labels[i] for i in train_indices]
    val_fold_tweets = [train_tweets[i] for i in val_indices]
    val_fold_labels = [train_labels[i] for i in val_indices]

    # Training Loop
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        model.train()
        # Sample-weighted sums, so dividing by the number of samples below gives a true mean.
        total_train_loss = 0
        total_train_acc = 0

        # Use the data generator for training
        train_batches = data_generator(train_fold_tweets, train_fold_labels, batch_size=32)
        for i, (batch_encodings, batch_labels) in enumerate(train_batches, start=1):
            batch_encodings = {k: v.to(device) for k, v in batch_encodings.items()}
            batch_labels = batch_labels.to(device)
            samples_in_batch = batch_labels.size(0)

            # Forward pass
            outputs = model(**batch_encodings)
            loss = criterion(outputs.logits, batch_labels)
            acc = calc_accuracy(outputs.logits, batch_labels)

            # Backward pass
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # `loss` and `acc` are per-batch means: weight them by the batch size.
            total_train_loss += loss.item() * samples_in_batch
            total_train_acc += acc.item() * samples_in_batch

            if i%50==0:
              print(f"Batch {i} - Train loss: {loss.item()}, Accuracy: {acc.item()}")

        avg_train_loss = total_train_loss / len(train_fold_tweets)
        avg_train_acc = total_train_acc / len(train_fold_tweets)
        print(f"Fold {fold+1} - Train loss: {avg_train_loss}, Accuracy: {avg_train_acc}")

    # Validation Loop
    model.eval()
    total_val_accuracy = 0
    total_val_loss = 0

    with torch.no_grad():
        for batch_encodings, batch_labels in data_generator(val_fold_tweets, val_fold_labels, batch_size=32):
            batch_encodings = {k: v.to(device) for k, v in batch_encodings.items()}
            batch_labels = batch_labels.to(device)
            samples_in_batch = batch_labels.size(0)

            outputs = model(**batch_encodings)
            loss = criterion(outputs.logits, batch_labels)
            acc = calc_accuracy(outputs.logits, batch_labels)

            total_val_loss += loss.item() * samples_in_batch
            total_val_accuracy += acc.item() * samples_in_batch

            # for overall metrics calculation
            # (named `batch_true` so the global `labels` list is not overwritten)
            preds = outputs.logits.argmax(dim=1).cpu().numpy()
            batch_true = batch_labels.cpu().numpy()
            all_predictions.extend(preds)
            all_labels.extend(batch_true)

    avg_val_loss = total_val_loss / len(val_fold_tweets)
    avg_val_accuracy = total_val_accuracy / len(val_fold_tweets)
    print(f"Fold {fold+1} - Validation loss: {avg_val_loss}, Accuracy: {avg_val_accuracy}")
    # Keep only the checkpoint with the best validation accuracy seen so far.
    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy
        torch.save(model, FOLDER_PATH + 'manipulated/roberta.pth')

# After all folds, calculate overall metrics
precision = precision_score(all_labels, all_predictions, average='macro')
recall = recall_score(all_labels, all_predictions, average='macro')
f1 = f1_score(all_labels, all_predictions, average='macro')

print("Training complete!")
print(f"Overall Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Fold 1/5
Epoch 1/2
Batch 50 - Train loss: 0.40607067942619324, Accuracy: 0.875
Batch 100 - Train loss: 0.2631092965602875, Accuracy: 0.84375
Batch 150 - Train loss: 0.37577298283576965, Accuracy: 0.875
Batch 200 - Train loss: 0.32417452335357666, Accuracy: 0.875
Batch 250 - Train loss: 0.2881397306919098, Accuracy: 0.875
Batch 300 - Train loss: 0.18938297033309937, Accuracy: 0.9375
Batch 350 - Train loss: 0.1849631816148758, Accuracy: 0.90625
Batch 400 - Train loss: 0.18197394907474518, Accuracy: 0.9375
Batch 450 - Train loss: 0.6610467433929443, Accuracy: 0.75
Batch 500 - Train loss: 0.41847431659698486, Accuracy: 0.84375
Batch 550 - Train loss: 0.39904776215553284, Accuracy: 0.875
Batch 600 - Train loss: 0.24807271361351013, Accuracy: 0.84375
Batch 650 - Train loss: 0.2552882730960846, Accuracy: 0.96875
Batch 700 - Train loss: 0.32590916752815247, Accuracy: 0.875
Batch 750 - Train loss: 0.20261028409004211, Accuracy: 0.90625
Batch 800 - Train loss: 0.28382763266563416, Accuracy: 0.90

### Possible improvements
1. Don't accumulate every score in a single variable; track the scores per epoch and keep only the per-epoch average.
2. Save the model after each epoch.

In [None]:
# Persist the whole model object as it stands at this point of the notebook.
# `torch.save` on a module pickles the object itself, not just its state dict.
torch.save(model, FOLDER_PATH + 'manipulated/roberta_final.pth')

In [None]:
## predict
# Load the model saved by this notebook (the original pointed at 'bert.pth',
# which no cell here writes). The file is a fully pickled model object, hence
# `weights_only=False`; only load checkpoints you created yourself.
model = torch.load(
    FOLDER_PATH + 'manipulated/roberta_final.pth',
    map_location=device,
    weights_only=False,
)
model.to(device)

## use the test set
test_path = FOLDER_PATH +'data/twitter-datasets/test_data.txt'
with open(test_path, 'r', encoding='utf-8') as f:
    # Apply the same cleaning as for the training tweets so that the model sees
    # inputs in the format it was trained on.
    # NOTE(review): if the test lines carry a leading "<id>," prefix, it should be
    # stripped here as well - verify against the file.
    test_tweets = [preprocess(line) for line in f]

model.eval()
predictions = []
# Reuse the notebook's batching helper; the labels are unused placeholders.
placeholder_labels = [0] * len(test_tweets)
with torch.no_grad():
    for batch_encodings, _ in data_generator(test_tweets, placeholder_labels, batch_size=8):
        batch_encodings = {k: v.to(device) for k, v in batch_encodings.items()}
        outputs = model(**batch_encodings)
        preds = outputs.logits.argmax(dim=1).cpu().numpy()
        predictions.extend(preds)

In [None]:
# Turn the collected predictions into an array and relabel class 0 as -1;
# every other value is kept as it is.
predictions = np.array(predictions)
predictions = np.where(predictions == 0, -1, predictions)

In [None]:
# Quick visual sanity check of the remapped prediction array.
print(predictions)

In [None]:
# Build the submission table: 'Id' is the 1-based position of each prediction,
# 'Prediction' the corresponding predicted label. Written without the index column.
n_predictions = len(predictions)
submission = pd.DataFrame({
    'Id': range(1, n_predictions + 1),
    'Prediction': predictions,
})
submission_file = FOLDER_PATH + 'manipulated/roberta_submission.csv'
submission.to_csv(submission_file, index=False)