In [None]:
import os
import re
import shutil

import kagglehub
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from scipy.special import softmax
from sklearn.metrics import classification_report
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [10]:
# Point MLflow at the tracking server. The MLFLOW_TRACKING_URI environment
# variable wins when set, so the notebook can run against another server
# without editing this cell; the local default is unchanged.
mlflow.set_tracking_uri(uri=os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:8080"))

# Loading Data

In [None]:
# Location of the local copy of the dataset, relative to the notebook.
DATASET_DIR = "../dataset"
DATASET_FILE = "news.csv"
path = os.path.join(DATASET_DIR, DATASET_FILE)

# Check for the CSV itself rather than the directory: an empty or partially
# populated dataset/ folder must still trigger a download.
if not os.path.isfile(path):
    print("Dataset doesn't exist, downloading it...")
    os.makedirs(DATASET_DIR, exist_ok=True)
    download_dir = kagglehub.dataset_download("myrios/news-sentiment-analysis")
    print(f"Dataset original path: {download_dir}")

    # Locate the CSV inside the downloaded directory instead of assuming its
    # exact position in the kagglehub cache layout.
    candidates = [
        os.path.join(root, DATASET_FILE)
        for root, _dirs, files in os.walk(download_dir)
        if DATASET_FILE in files
    ]
    if not candidates:
        raise FileNotFoundError(
            f"{DATASET_FILE} not found in downloaded dataset at {download_dir}"
        )

    # Copy (not move) so the kagglehub cache stays intact and the file ends up
    # exactly at `path`, not nested inside a moved version directory.
    shutil.copy2(candidates[0], path)
    print("Dataset was downloaded and put in dataset/ directory.")

print("Loading news dataset... ")
print(f"Dataset path: {path}")

# Errors are deliberately not swallowed: a failed download must stop the
# notebook here instead of surfacing later as an undefined `path`.
df = pd.read_csv(path)
df.head(3)

Unnamed: 0,date,news,neg,neu,pos,compound,sentiment
0,2007-07-07,It was a long antipodean night. While there’s ...,0.059,0.878,0.064,0.0516,POSITIVE
1,2007-07-07,In Mexico there are no licensing or registrati...,0.044,0.956,0.0,-0.296,NEGATIVE
2,2007-07-07,The government has until Monday to protect the...,0.0,0.894,0.106,0.3818,POSITIVE


In [11]:
# Hugging Face model id used for both the tokenizer and the classifier.
# (Plain string: the previous f-string had no placeholders.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# Both calls fetch the weights/vocabulary from the hub (or the local cache).
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [16]:
def clean_text(text):
    """Strip bytes-literal wrapping and stray backslashes from a news string.

    Some rows look like the ``repr`` of a bytes object (``b'...'`` or
    ``b"..."``). This removes that wrapper, then normalises backslashes.

    Args:
        text: The raw news string.

    Returns:
        The cleaned string. Empty input is returned unchanged.
    """
    quotes = ('"', "'")

    # Drop the leading ``b`` only when it is really a bytes-literal prefix,
    # i.e. directly followed by a quote. Ordinary text that merely starts with
    # the letter "b" ("big news ...") must keep its first character.
    if len(text) >= 2 and text[0] == 'b' and text[1] in quotes:
        text = text[1:]

    # Remove the surrounding quotes only when they form a matching pair, so a
    # text that opens with a quotation does not lose its last character.
    if len(text) >= 2 and text[0] in quotes and text[-1] == text[0]:
        text = text[1:-1]

    # Collapse runs of backslashes into a single backslash ...
    text = re.sub(r'\\+', r'\\', text)
    # ... then drop every backslash that has no digit directly before it and
    # no digit directly after it (e.g. the escape in ``it\'s``).
    text = re.sub(r'(?<!\d)\\(?!\d)', '', text)
    return text

In [17]:
def polarity_scores_roberta(text, date, year, reference_year=2025, max_length=512):
    """Score one news text with the module-level RoBERTa sentiment model.

    Args:
        text: The (already cleaned) news text.
        date: Publication date; passed through unchanged into the result.
        year: Publication year, used for the recency bonus in ``score``.
        reference_year: Year treated as "now" for the recency bonus.
            Defaults to 2025, the value that was previously hard-coded.
        max_length: Maximum number of tokens fed to the model; longer inputs
            are truncated. 512 is assumed to be the model's limit
            (NOTE(review): confirm against the model config).

    Returns:
        dict with keys ``neg``, ``neu``, ``pos`` (softmax probabilities taken
        from logit indices 0, 1, 2 respectively), ``sentiment_score``
        (``pos - neg + 0.5 * neu``), ``date`` and ``score``
        (``5 * sentiment_score`` plus a recency bonus of ``10 / age``).
    """
    # Truncate explicitly: without it an over-long article makes the model
    # raise a RuntimeError instead of being scored.
    encoded_text = tokenizer(
        text, return_tensors='pt', truncation=True, max_length=max_length
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    sentiment_score = scores[2] - scores[0] + 0.5 * scores[1]

    # Age in years, counting the reference year itself as 1. Clamped so that a
    # date after `reference_year` cannot cause a division by zero or a negative
    # bonus; for year <= reference_year this equals the original formula.
    age = max(reference_year - year, 0) + 1

    scores_dict = {
        'neg' : scores[0],
        'neu' : scores[1],
        'pos' : scores[2],
        'sentiment_score' : sentiment_score,
        'date': date,
        'score': (sentiment_score) * 5 + (1 / age) * 10
    }
    return scores_dict

In [18]:
def get_predictions(df):
    """Run the sentiment model over every row of ``df``.

    Args:
        df: DataFrame with the columns ``id``, ``date`` and ``news``.

    Returns:
        dict mapping each row's ``id`` to the dict returned by
        ``polarity_scores_roberta``. Rows that fail are reported on stdout and
        left out, so the result may hold fewer entries than ``df`` has rows.
    """
    res = {}
    # Iterate the needed columns directly; this avoids building a Series per
    # row as ``iterrows`` does.
    rows = zip(df['id'], df['date'], df['news'])
    for myid, date, news in tqdm(rows, total=len(df)):
        # `myid` is bound before the try block, so the error message below can
        # always name the offending row.
        try:
            text = clean_text(news)
            year = pd.to_datetime(date).year
            res[myid] = polarity_scores_roberta(text, date, year)
        except (RuntimeError, IndexError, TypeError, ValueError) as err:
            # RuntimeError: model failure (e.g. over-long input);
            # IndexError/TypeError: empty or non-string news text;
            # ValueError: unparsable date. Skip the row, keep the run alive.
            print(f'Issue when handling news with id: {myid} ({type(err).__name__}: {err})')
    return res

In [19]:
### Testing the model
# Number of rows to evaluate; lower it for a quick smoke run (the full
# 100k-row run took about 2h10 in the recorded output).
N_SAMPLES = 100_000
SAMPLE_SEED = 0

# Guard against datasets smaller than N_SAMPLES, where `sample` would raise.
sample = df.sample(min(N_SAMPLES, len(df)), random_state=SAMPLE_SEED)
# Keep the original row index as `id` so predictions can be joined back.
input_df = sample[['date','news']].reset_index(names='id')
predictions = get_predictions(input_df)

100%|██████████| 100000/100000 [2:10:13<00:00, 12.80it/s] 


In [20]:
# NOTE(review): labels are derived from `score`, which adds a strictly
# positive recency bonus on top of the sentiment term, so the threshold at 0
# leans towards POSITIVE. Consider thresholding a pure sentiment value instead.

# Ids that were actually scored, in prediction order. Rows skipped by
# get_predictions are absent, so y_true must be selected by id: using the full
# `sample` would misalign (or mismatch in length) as soon as one row failed.
scored_ids = list(predictions)

predicted_labels = [
    'POSITIVE' if predictions[news_id].get('score', 0) > 0 else 'NEGATIVE'
    for news_id in scored_ids
]

y_pred = predicted_labels
y_true = sample.loc[scored_ids, 'sentiment']
report = classification_report(y_true, y_pred, target_names=['NEGATIVE', 'POSITIVE'])
print(report)

              precision    recall  f1-score   support

    NEGATIVE       0.82      0.44      0.57     50755
    POSITIVE       0.61      0.90      0.73     49245

    accuracy                           0.67    100000
   macro avg       0.72      0.67      0.65    100000
weighted avg       0.72      0.67      0.65    100000



In [21]:
# Same report as above, but as a nested dict so single values can be logged.
class_names = ['NEGATIVE', 'POSITIVE']
report_dict = classification_report(
    y_true,
    y_pred,
    target_names=class_names,
    output_dict=True,
)

# Headline metrics: overall accuracy plus the macro-averaged precision,
# recall and F1 taken from the report.
macro_avg = report_dict["macro avg"]
metrics = dict(
    accuracy=report_dict["accuracy"],
    precision=macro_avg["precision"],
    recall=macro_avg["recall"],
    f1_score=macro_avg["f1-score"],
)

In [22]:
mlflow.set_experiment("Roberta Model")

# One run per evaluation: headline metrics plus the full report as an artifact.
with mlflow.start_run():
    # Tag with the MODEL constant so the tag cannot drift from the model that
    # was actually loaded.
    mlflow.set_tag("Model Name", MODEL)
    mlflow.log_metrics(metrics)
    mlflow.log_dict(report_dict, "classification_report.json")

2025/03/05 00:49:13 INFO mlflow.tracking.fluent: Experiment with name 'Roberta Model' does not exist. Creating a new experiment.


🏃 View run overjoyed-mink-547 at: http://localhost:8080/#/experiments/518030901394168194/runs/20022f9dba95446e86cacafab222a81f
🧪 View experiment at: http://localhost:8080/#/experiments/518030901394168194


# Save Model

In [None]:
# save_path = "prediction_model/roberta_model"
# tokenizer.save_pretrained(save_path)
# model.save_pretrained(save_path)

# Load Model

In [None]:
# save_path = "prediction_model/roberta_model"
# tokenizer = AutoTokenizer.from_pretrained(save_path)
# model = AutoModelForSequenceClassification.from_pretrained(save_path)