In [75]:
# Imports
import ast
import os

import numpy as np
import pandas as pd

# Preprocessing

In [76]:
# Load data

# Submission settings
team_id = '20'  # put your team id here
split = 'test_1'  # replace by 'test_2' for FINAL submission

# Training tweets, plus the tweets of whichever split is selected above
train_path = 'dataset/tweets_train.csv'
test_path = f'dataset/tweets_{split}.csv'
df = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [77]:
# Each 'words' cell is presumably the string form of a Python list of tokens
# (it is parsed and then joined with spaces) -- verify against the CSV.
# Parse with ast.literal_eval instead of eval: literal_eval only accepts
# Python literals, so a crafted CSV cell cannot execute arbitrary code.
def _join_words(serialized_words):
    """Parse a stringified sequence of tokens and join them with single spaces."""
    return ' '.join(ast.literal_eval(serialized_words))


df['words_str'] = df['words'].apply(_join_words)
df_test['words_str'] = df_test['words'].apply(_join_words)

In [78]:
# Tweet normalization: mask user mentions and links with fixed placeholders
def preprocess(text):
    """Replace mentions and links in a space-separated string with placeholders.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'; every other token is kept unchanged. The tokens
    are then joined back with single spaces.
    """
    def _mask(token):
        # A bare '@' is not a mention, hence the length check
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Mask mentions and links in the space-joined training tokens
df['preprocessed_text'] = df['words_str'].map(preprocess)

# Preview the 'text' column next to the preprocessed result
df[['text', 'preprocessed_text']].head()


Unnamed: 0,text,preprocessed_text
0,@xbresson British Alps :-),british alps
1,RT @Aistats2020: Videos presentations of paper...,videos presentations papers keynote talks aist...
2,I hope I would be able to talk more about this...,hope would able talk balcony tomorrow pm et al...
3,RT @dlbcnai: Keynote by Joan Bruna (@joanbruna...,keynote joan bruna geometric deep learning pro...
4,@annargrs @Michael_J_Black @AllenHW0 @CSProfKG...,process science relies much basic honesty part...


In [79]:
# Importing required libraries
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import torch

In [80]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [81]:
# Task name and the pretrained checkpoint identifier derived from it
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Tokenizer loaded from the same checkpoint identifier as the model
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the task's label mapping file and split it into lines
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

# Each line is tab-separated; keep the second field of every row that has
# at least two fields (rows with fewer fields are skipped)
labels = []
for fields in csv.reader(mapping_lines, delimiter='\t'):
    if len(fields) > 1:
        labels.append(fields[1])

# Sequence-classification model (PyTorch) loaded from the checkpoint
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Tokenize the preprocessed training text into padded PyTorch tensors.
# truncation=True needs an explicit max_length here: without one the tokenizer
# warns that the model has no predefined maximum length and falls back to no
# truncation, so an over-long input would reach the model untruncated.
# 512 tokens is the longest sequence the RoBERTa-base architecture accepts.
MAX_LENGTH = 512
encoded_train_input = tokenizer(
    df['preprocessed_text'].tolist(),
    padding=True,  # pad to the longest sequence in the batch, not to MAX_LENGTH
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors="pt",
)

# Displaying the keys of the encoded input to confirm the tokenization
encoded_train_input.keys()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


dict_keys(['input_ids', 'attention_mask'])

In [82]:
from sklearn.metrics import f1_score
from scipy.special import softmax
import numpy as np

In [83]:
# Run the classifier over the tokenized training tweets in mini-batches.
# A single forward pass over the whole training set builds activations for
# every tweet at once and can exhaust memory; batching bounds the peak usage.
BATCH_SIZE = 64

# `device` is selected earlier in the notebook but was never applied: move the
# model (and, below, each batch) onto it so a GPU is used when available.
model.to(device)

# Ensuring the model is in evaluation mode
model.eval()

num_examples = encoded_train_input['input_ids'].shape[0]
logit_batches = []
with torch.no_grad():
    for start in range(0, num_examples, BATCH_SIZE):
        # Slice the same rows out of every encoded tensor and move them to the device
        batch = {
            key: tensor[start:start + BATCH_SIZE].to(device)
            for key, tensor in encoded_train_input.items()
        }
        output = model(**batch)
        # output[0] holds the logits; bring them back to the CPU immediately
        # so device memory is released between batches
        logit_batches.append(output[0].cpu())

# Reassemble the per-batch logits in their original row order
logits = torch.cat(logit_batches).numpy()

# Applying softmax to obtain probabilities
probabilities = softmax(logits, axis=1)

# Getting the predicted labels
predicted_labels = np.argmax(probabilities, axis=1)

# Lookup from label name to its position in the downloaded label list
label_mapping = dict(zip(labels, range(len(labels))))

# Translate the 'sentiment' column into integer class indices
true_labels = df['sentiment'].map(label_mapping).values

# Macro-averaged F1 between the true and predicted class indices
f1_macro = f1_score(y_true=true_labels, y_pred=predicted_labels, average='macro')

f1_macro


0.6021846321800873