In [1]:
import pandas as pd
from transformers import pipeline
import torch
import numpy as np

from src.main.util.preprocessing import pre_process, preprocess_tweet

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the training and test splits from resources/data, then preview the
# first rows of the training frame.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably a row
# index that was saved into the CSV — confirm whether it should be dropped.
train = pd.read_csv("../resources/data/training.csv")
test = pd.read_csv("../resources/data/test.csv")
train.head()

Unnamed: 0,id,sentence,label
0,0,Those 2 drinks are part of the HK culture and ...,negative
1,1,I was told by the repair company that was doin...,negative
2,2,It is there to give them a good time .,neutral
3,3,Like leafing through an album of photos accomp...,negative
4,4,Johnny was a talker and liked to have fun.,positive


In [3]:
# Pick the compute device: CUDA when torch reports it as available,
# otherwise fall back to the CPU. The bare name at the end displays the choice.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [None]:
# TODO(review): notebook convention is to keep all imports in the first cell;
# consider moving this import there.
from src.main.model.roberta import train_roberta_sentiment

# Run the project's RoBERTa training helper on the training frame. It returns
# two values, bound here as `trainer` and `metrics`.
# NOTE(review): what the helper does internally (data split, checkpoint,
# hyper-parameters) is not visible in this notebook — see src/main/model/roberta.
trainer, metrics = train_roberta_sentiment(train)
print(trainer)
# Last expression of the cell: display the returned metrics.
metrics

Stringifying the column: 100%|██████████| 102097/102097 [00:00<00:00, 1907677.27 examples/s]
Casting to class labels: 100%|██████████| 102097/102097 [00:00<00:00, 1265282.06 examples/s]
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Tokenising train (num_proc=8): 100%|██████████| 91887/91887 [00:08<00:00, 11223.29 examples/s]
Toke

Epoch,Training Loss,Validation Loss


In [None]:
CANDIDATES = ["positive", "negative", "neutral"]

# Single source of truth for the checkpoint so model and tokenizer cannot drift apart.
ROBERTA_CHECKPOINT = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Clean every training sentence with the project's tweet pre-processor and
# collect the cleaned text as a plain list for the pipeline.
train['clean'] = train['sentence'].apply(preprocess_tweet)
sentences = train['clean'].tolist()


# Build the sentiment pipeline on the device selected earlier in the notebook
# (`device` is CUDA when available, CPU otherwise). The previous hard-coded
# `device=0` required a GPU and failed on CPU-only machines.
roberta_clf = pipeline(
    task="sentiment-analysis",
    model=ROBERTA_CHECKPOINT,
    tokenizer=ROBERTA_CHECKPOINT,
    device=device,
    batch_size=64,
    # Tokenizer options: pad each batch and cut inputs longer than 512 tokens.
    padding=True,
    max_length=512,
    truncation=True
)

# Predict a label for every cleaned sentence and store it lower-cased next to
# the gold label.
roberta_preds = roberta_clf(sentences, truncation=True)
train["pred"] = [p["label"].lower() for p in roberta_preds]

train.head()


In [None]:
# Numeric encoding of the three sentiment classes used by the score below.
_LABEL2NUM = {
    "negative": -1,
    "neutral": 0,
    "positive": 1
}


def _to_num(x):
    """
    Map a sentiment label to its numeric code.

    Strings are stripped, lower-cased and looked up in ``_LABEL2NUM``
    (an unknown label raises ``KeyError``). Non-string values are returned
    unchanged, so already-numeric labels pass through.
    """
    if isinstance(x, str):
        return _LABEL2NUM[x.strip().lower()]
    return x


def sentiment_score(y_true, y_pred):
    """
    Implements   L = 0.5 * ( 2 − 1/n Σ |y_i − ŷ_i| )

    Parameters
    ----------
    y_true, y_pred : iterable
        Gold and predicted labels, either as strings ("negative",
        "neutral", "positive"; case and surrounding whitespace are ignored)
        or as their numeric codes (-1, 0, 1). Both must have the same,
        non-zero length.

    Returns
    -------
    float
        A score in [0, 1] for labels in {-1, 0, 1}; 1.0 means every
        prediction matches.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    KeyError
        If a string label is not one of the three known classes.
    """
    y_t = np.fromiter((_to_num(t) for t in y_true), dtype=np.int8)
    y_p = np.fromiter((_to_num(p) for p in y_pred), dtype=np.int8)

    # Without this check a length-1 input would silently broadcast against the
    # other one and yield a plausible-looking but wrong score.
    if y_t.size != y_p.size:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_t.size} and {y_p.size}"
        )
    # The mean of an empty array is NaN; the score is undefined without samples.
    if y_t.size == 0:
        raise ValueError("sentiment_score requires at least one sample")

    mean_abs_diff = np.abs(y_t - y_p).mean()
    return 0.5 * (2.0 - mean_abs_diff)


# Compare the pipeline's predictions on the training frame with its gold
# labels and report the resulting score.
results = sentiment_score(y_true=train["label"], y_pred=train["pred"])
print("Sentiment score: ", results)


# RoBERTa is the chosen model: use it to label the test set

In [None]:
# Clean the test sentences the same way as the training sentences.
cleaned_test = test["sentence"].apply(preprocess_tweet)
test["clean"] = cleaned_test
test_sentences = test["clean"].tolist()

# Classify the cleaned test sentences with the RoBERTa pipeline and keep the
# lower-cased predicted label as the test frame's "label" column.
rob_test = roberta_clf(test_sentences, truncation=True)
test["label"] = [prediction["label"].lower() for prediction in rob_test]

test.head()

In [None]:
# Write only the id and predicted label of each test row to the result file,
# without the DataFrame index.
test.loc[:, ["id", "label"]].to_csv(
    "../resources/data/result.csv",
    index=False,
)