In [1]:
# Imports
import pandas as pd
import numpy as np
import os

# Preprocessing

In [2]:
# Load data

team_id = '20' #put your team id here
split = 'test_1' # replace by 'test_2' for FINAL submission

df = pd.read_csv('dataset/tweets_train.csv')
df_test = pd.read_csv(f'dataset/tweets_{split}.csv')

In [3]:
df['words_str'] = df['words'].apply(lambda words: ' '.join(eval(words)))
df_test['words_str'] = df_test['words'].apply(lambda words: ' '.join(eval(words)))

In [4]:
# def preprocess(text):
#     new_text = []
#     for t in text.split(" "):
#         t = '@user' if t.startswith('@') and len(t) > 1 else t
#         t = 'http' if t.startswith('http') else t
#         new_text.append(t)
#     return " ".join(new_text)

In [5]:
# df['words_str'] = df['text'].apply(preprocess)
# df_test['words_str'] = df_test['text'].apply(preprocess)

In [6]:
from sklearn import preprocessing
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, TrainingArguments, Trainer
from transformers import RobertaTokenizer, RobertaPreTrainedModel, RobertaModel, AutoTokenizer, AutoModel, PreTrainedModel
from transformers import TrainerCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch
import numpy as np
import torch.nn.functional as F
from torch import optim

2023-08-15 11:07:12.358218: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
X = df['words_str']
y_text = df['sentiment']
# y_text = df.sentiment.values
le = preprocessing.LabelEncoder()
le.fit(y_text)
print(f'Original classes {le.classes_}')
print(f'Corresponding numeric classes {le.transform(le.classes_)}')
y =le.transform(y_text)
print(f"X: {X.shape}")
print(f"y: {y.shape} {np.unique(y)}")

Original classes ['negative' 'neutral' 'positive']
Corresponding numeric classes [0 1 2]
X: (8000,)
y: (8000,) [0 1 2]


In [9]:
# Splitting
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Tokenize the input
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer_twitter = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base')
tokenizer_twitter_sentiment = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')
tokenizer_mpnet = AutoTokenizer.from_pretrained('sentence-transformers/stsb-mpnet-base-v2')
tokenizer_bert_twitter = AutoTokenizer.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')

In [11]:
tokenizer = tokenizer_bert_twitter
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

# Classification

In [12]:
class ClassificationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.astype('int') # Change to integer type

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long) # Change to long type for classification
        return item

    def __len__(self):
        return len(self.labels)

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        log_prob = F.log_softmax(inputs, dim=-1)
        prob = torch.exp(log_prob)
        return F.nll_loss(
            ((1 - prob) ** self.gamma) * log_prob,
            targets,
            reduction=self.reduction
        )


class BertClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = 3
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss, logits) if loss is not None else logits

class BertClassificationTwitter(nn.Module):
    def __init__(self):
        super(BertClassificationTwitter, self).__init__()
        self.num_labels = 3
        self.bert = AutoModel.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

    
class BertClassificationTwitter_2(nn.Module):
    def __init__(self):
        super(BertClassificationTwitter_2, self).__init__()
        self.bert = AutoModel.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size

        # Adding an additional hidden layer
        self.hidden_layer = nn.Linear(hidden_size, hidden_size//2)
        
        # Adding L2 regularization (weight decay) to the hidden layer
        self.regularization = nn.LayerNorm(hidden_size//2)
        
        # Final classification layer with 3 classes
        self.classifier = nn.Linear(hidden_size//2, 3)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        
        # Passing through the hidden layer with ReLU activation
        hidden_output = self.hidden_layer(pooled_output)
        hidden_output = F.relu(hidden_output)
        
        # Applying Layer Normalization (regularization)
        hidden_output = self.regularization(hidden_output)
        
        logits = self.classifier(hidden_output)
        
        loss = None
        if labels is not None:
            loss_fn = FocalLoss(alpha=0.25, gamma=2)
            loss = loss_fn(logits, labels)
        
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
    
class BertClassificationTwitter_3(nn.Module):
    def __init__(self):
        super(BertClassificationTwitter_3, self).__init__()
        self.bert = AutoModel.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')
        hidden_size = self.bert.config.hidden_size
        
        # Bi-directional LSTM
        self.lstm = nn.LSTM(hidden_size, hidden_size // 2, num_layers=2, batch_first=True, bidirectional=True, dropout=0.2)
        
        # Deeper Feed-Forward layers
        self.hidden1 = nn.Linear(hidden_size, hidden_size)
        self.hidden2 = nn.Linear(hidden_size, hidden_size//2)
        
        # Activation and regularization
        self.dropout1 = nn.Dropout(0.5)
        self.dropout2 = nn.Dropout(0.4)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size//2)
        
        # Classifier
        self.classifier = nn.Linear(hidden_size//2, 3)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        
        lstm_out, _ = self.lstm(sequence_output)
        lstm_out = lstm_out[:, -1, :]
        
        # Passing through deeper layers
        hidden_output = F.leaky_relu(self.hidden1(lstm_out))
        hidden_output = self.batchnorm1(hidden_output)
        hidden_output = self.dropout1(hidden_output)
        
        hidden_output = F.leaky_relu(self.hidden2(hidden_output))
        hidden_output = self.batchnorm2(hidden_output)
        hidden_output = self.dropout2(hidden_output)
        
        logits = self.classifier(hidden_output)
        
        loss = None
        if labels is not None:
            loss_fn = FocalLoss(alpha=0.25, gamma=2, reduction='mean')
            loss = loss_fn(logits, labels)
        
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
    
class BertClassificationTwitter_4(nn.Module):
    def __init__(self, num_classes=3, alpha=1, gamma=2):
        super(BertClassificationTwitter_4, self).__init__()
        self.bert = AutoModel.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size

        # Adding LSTM layer
        self.lstm = nn.LSTM(hidden_size, hidden_size // 2, batch_first=True)

        # Final classification layer
        self.classifier = nn.Linear(hidden_size//2, num_classes)
        self.focal_loss = FocalLoss(alpha=alpha, gamma=gamma) 

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)

        # Instead of using pooled_output, we'll utilize the last hidden state (sequence of embeddings)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)

        # LSTM
        lstm_out, _ = self.lstm(sequence_output)

        # Taking the last hidden state of LSTM for classification
        lstm_out = lstm_out[:, -1, :]
        logits = self.classifier(lstm_out)

        loss = None
        if labels is not None:
            loss = self.focal_loss(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

class RobertaClassification(RobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = 3
        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss, logits) if loss is not None else logits

    
class RobertaClassificationTwitter(nn.Module):
    def __init__(self):
        super(RobertaClassificationTwitter, self).__init__()
        self.num_labels = 3
        self.roberta = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

# Function to compute f1_macro
def f1_macro(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {'f1_macro': f1_score(labels, predictions, average='macro')}



class RobertaClassificationTwitter_2(nn.Module):
    def __init__(self):
        super(RobertaClassificationTwitter_2, self).__init__()
        self.roberta = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.roberta.config.hidden_size

        # Adding an additional hidden layer
        self.hidden_layer = nn.Linear(hidden_size, hidden_size//2)
        
        # Adding L2 regularization (weight decay) to the hidden layer
        self.regularization = nn.LayerNorm(hidden_size//2)
        
        # Final classification layer with 3 classes
        self.classifier = nn.Linear(hidden_size//2, 3)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        
        # Passing through the hidden layer with ReLU activation
        hidden_output = self.hidden_layer(pooled_output)
        hidden_output = F.relu(hidden_output)
        
        # Applying Layer Normalization (regularization)
        hidden_output = self.regularization(hidden_output)
        
        logits = self.classifier(hidden_output)
        
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}


class RobertaClassificationTwitter_3(nn.Module):
    def __init__(self):
        super(RobertaClassificationTwitter_3, self).__init__()
        self.roberta = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.roberta.config.hidden_size

        # Adding an additional hidden layer
        self.hidden_layer = nn.Linear(hidden_size, hidden_size//2)
        
        # Adding L2 regularization (weight decay) to the hidden layer
        self.regularization = nn.LayerNorm(hidden_size//2)
        
        # Final classification layer with 3 classes
        self.classifier = nn.Linear(hidden_size//2, 3)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        
        # Passing through the hidden layer with ReLU activation
        hidden_output = self.hidden_layer(pooled_output)
        hidden_output = F.relu(hidden_output)
        
        # Applying Layer Normalization (regularization)
        hidden_output = self.regularization(hidden_output)
        
        logits = self.classifier(hidden_output)
        
        loss = None
        if labels is not None:
            loss_fn = FocalLoss(alpha=0.25, gamma=2)
            loss = loss_fn(logits, labels)
        
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}


class MpnetClassification(nn.Module):
    def __init__(self):
        super(MpnetClassification, self).__init__()
        self.mpnet = AutoModel.from_pretrained('sentence-transformers/stsb-mpnet-base-v2')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.mpnet.config.hidden_size

        # Adding an additional hidden layer
        self.hidden_layer = nn.Linear(hidden_size, hidden_size//2)
        
        # Adding L2 regularization (weight decay) to the hidden layer
        self.regularization = nn.LayerNorm(hidden_size//2)
        
        # Final classification layer with 3 classes
        self.classifier = nn.Linear(hidden_size//2, 3)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.mpnet(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        
        # Passing through the hidden layer with ReLU activation
        hidden_output = self.hidden_layer(pooled_output)
        hidden_output = F.relu(hidden_output)
        
        # Applying Layer Normalization (regularization)
        hidden_output = self.regularization(hidden_output)
        
        logits = self.classifier(hidden_output)
        
        loss = None
        if labels is not None:
            loss_fn = FocalLoss(alpha=0.25, gamma=2)
            loss = loss_fn(logits, labels)
        
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

    
    
class ThresholdEarlyStoppingCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, metrics, **kwargs):
        f1 = metrics['eval_f1_macro']
        if f1 > 0.78:
            control.should_training_stop = True
        return control


In [13]:
train_dataset = ClassificationDataset(train_encodings, train_labels)
val_dataset = ClassificationDataset(val_encodings, val_labels)

In [14]:
model_bert = BertClassification.from_pretrained('bert-base-uncased')
model_roberta = RobertaClassification.from_pretrained('roberta-base')
model_twitter = RobertaClassificationTwitter_3()
model_mpnet = MpnetClassification()
model_bert_twitter = BertClassificationTwitter_3()

Some weights of BertClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
model = model_bert_twitter.to(device)

# Define training arguments and trainer
training_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    learning_rate=0.00001,
    num_train_epochs=1000,
    logging_dir='./logs',
    evaluation_strategy='steps',
    logging_steps=100,
    weight_decay=0.0001,
    lr_scheduler_type='cosine',  # Using a cosine scheduler
    warmup_steps=100  # Number of warmup steps
)




trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=f1_macro,
    callbacks=[ThresholdEarlyStoppingCallback()],
)


# Train the model
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)



Step,Training Loss,Validation Loss,F1 Macro
100,0.557,0.329238,0.654251
200,0.3273,0.230923,0.721448
300,0.1941,0.250575,0.73568
400,0.111,0.334922,0.737611
500,0.0638,0.313544,0.758616
600,0.0362,0.620169,0.708838
700,0.0223,0.532077,0.750848
800,0.0157,0.50372,0.749223
900,0.0128,0.517863,0.748215
1000,0.0116,0.617731,0.746763


{'eval_loss': 0.863010585308075, 'eval_f1_macro': 0.7682082053350711, 'eval_runtime': 0.8892, 'eval_samples_per_second': 1799.336, 'eval_steps_per_second': 7.872, 'epoch': 1000.0}


In [16]:
# Save the model
trainer.save_model('pretrained_models/bert-twitter-clf')

# model = BertRegression.from_pretrained("./path/to/save/directory")

# Test

In [17]:
# Define a dataset without labels for testing
class ClassificationTestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


In [18]:
# Tokenize the test sentences
sentences = list(df_test.words_str.values)
test_encodings = tokenizer(sentences, truncation=True, padding=True)

# Convert to a PyTorch Dataset (using the renamed class)
test_dataset = ClassificationTestDataset(test_encodings)

# Get predictions with the neural network
predictions = trainer.predict(test_dataset)
y_hat_prob_tensor = torch.tensor(predictions.predictions, dtype=torch.float32)

# Convert the probabilities to class labels
y_hat_labels = torch.argmax(y_hat_prob_tensor, dim=1).cpu().numpy()

# revert the label encoding
y_hat_labels = le.inverse_transform(y_hat_labels)

# Save the results with the specified format
directory = 'results'
np.save(os.path.join(directory, f'{team_id}__{split}__clf_pred.npy'), y_hat_labels)


In [19]:
# Load 20__test_1__reg_pred.npy

d = np.load('results/20__test_1__clf_pred.npy', allow_pickle=True)
d.shape

(1000,)