In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import re

In [3]:
# Read the tweet CSV into a DataFrame and show summary statistics
# for its numeric columns.
tweets_csv = 'Cleaned_messi_tweets.csv'
df = pd.read_csv(tweets_csv)
df.describe()

Unnamed: 0,tweet_id,author_id,retweet_count,like_count,quote_count
count,465223.0,465223.0,465223.0,465223.0,465223.0
mean,1.52673e+18,9.984133e+17,3.101463,37.88555,0.29563
std,1.103667e+16,5.670279e+17,75.534405,830.177615,9.300193
min,1.512352e+18,54273.0,0.0,0.0,0.0
25%,1.516576e+18,7.925003e+17,0.0,0.0,0.0
50%,1.524858e+18,1.256989e+18,0.0,0.0,0.0
75%,1.533561e+18,1.433063e+18,0.0,2.0,0.0
max,1.55437e+18,1.554358e+18,14147.0,134246.0,2197.0


In [None]:
# Engagement is retweets plus quotes; `ratio` is the share of that engagement
# that came from quotes (0.0 = all retweets, 1.0 = all quotes).
df['engagement_count'] = df['retweet_count'] + df['quote_count']

# Keep only tweets with some engagement so the ratio below never divides by
# zero. The explicit .copy() makes the filtered frame independent of the
# original, so adding a column to it is not a chained assignment on a slice
# (the cause of pandas' SettingWithCopyWarning).
df = df[df['engagement_count'] != 0].copy()
df['ratio'] = df['quote_count'] / df['engagement_count']

# Downsample to a manageable subset for quicker experiments
MAX_ROWS = 20000
df = df.sample(n=min(len(df), MAX_ROWS), random_state=42).reset_index(drop=True)

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['engagement_count'] = df['retweet_count']+df['quote_count']


Unnamed: 0,tweet_id,author_id,content,lang,date,source,geo,retweet_count,like_count,quote_count,q/r ratio,engagement_count,ratio
5,1554369553505058817,826713582,psg without mbappe is much much better the che...,en,2022-08-02T07:32:35.000Z,Twitter for iPhone,-1,1,1,0,0.0,1,0.0
61,1554368303203684352,1322042125949698048,he s fan of justin bieber messi to but he dose...,en,2022-08-02T07:27:36.000Z,Twitter for Android,-1,1,2,0,0.0,1,0.0
105,1554367092119437312,1459674547167371268,she s a but she no get yansh asuu messi,en,2022-08-02T07:22:48.000Z,Twitter for Android,-1,2,2,0,0.0,2,0.0
117,1554366579776753666,1476856767006994432,few days ago cristiano ronaldo broke a young f...,en,2022-08-02T07:20:46.000Z,Twitter for Android,-1,4,33,1,0.2,5,0.2
130,1554366120018182145,766661911261896704,what better sound uplifts our spirits and assu...,en,2022-08-02T07:18:56.000Z,Twitter for iPhone,-1,1,3,0,0.0,1,0.0


In [None]:
# Summary statistics of the quote-share target column.
df.loc[:, 'ratio'].describe()

count    56529.000000
mean         0.164385
std          0.332329
min          0.000000
25%          0.000000
50%          0.000000
75%          0.088235
max          1.000000
Name: ratio, dtype: float64

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. DATA SETUP (Regression Data)
# Labels are FLOATS between 0.0 (Safe) and 1.0 (Controversial)
# Rows with a missing text or target are dropped first: pandas reads empty CSV
# fields as NaN (a float), which is not valid tokenizer input and would only
# fail once training reaches that row. When nothing is missing this is a no-op.
model_rows = df.dropna(subset=['content', 'ratio'])
X = model_rows['content'].astype(str)
y = model_rows['ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Materialise (text, score) pairs so they can be indexed by position.
train_data = list(zip(X_train, y_train))
test_data = list(zip(X_test, y_test))

class ControversyDataset(Dataset):
    """Dataset over ``(text, score)`` pairs that tokenizes each text on access.

    Args:
        data: Indexable collection of ``(text, score)`` pairs.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors=...)`` whose result
            exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, data, tokenizer, max_len=64):
        self.data, self.tokenizer, self.max_len = data, tokenizer, max_len

    def __len__(self):
        """Return the number of ``(text, score)`` pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the flattened encoding and float label for pair ``idx``."""
        text, score = self.data[idx]

        tokenizer_kwargs = dict(
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # Flatten each encoded tensor to 1-D so items can be batched.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        # Labels are stored as float because this is a regression target.
        sample['labels'] = torch.tensor(score, dtype=torch.float)
        return sample

# 2. MODEL SETUP
# Seed torch's global RNG before the model is built: the regression head is
# newly initialised (not loaded from the checkpoint) and a shuffling
# DataLoader draws from the same RNG, so without a seed results differ on
# every run. 42 matches the random_state used for sampling and splitting.
torch.manual_seed(42)

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# num_labels=1 turns this into a regressor
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1)

# 3. TRAINING LOOP
dataset = ControversyDataset(train_data, tokenizer)
loader = DataLoader(dataset, batch_size=8, shuffle=True)
optimizer = AdamW(model.parameters(), lr=2e-5)

print("Starting training...")
model.train()

for epoch in range(3):
    total_loss = 0.0
    num_batches = 0
    for batch in loader:
        optimizer.zero_grad()

        # Hugging Face automatically detects num_labels=1 and uses MSELoss (Mean Squared Error)
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels']
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Report the average batch loss rather than the running sum: the sum
    # grows with the number of batches and is not an MSE value. The max()
    # guard avoids a ZeroDivisionError if the loader is empty.
    mean_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1} Loss (MSE): {mean_loss:.4f}")

# 4. EVALUATION ON HOLD-OUT TEST SET
print("\n--- Evaluating on hold-out test set ---")
model.eval()

# Build test loader (shuffle=False keeps predictions aligned with X_test / y_test)
test_dataset = ControversyDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

preds, targets = [], []
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask']
        )
        logits = outputs.logits.squeeze(-1)
        # With num_labels=1 the training loss is MSE between the raw logit and
        # the label, so the logit itself is the predicted ratio. Passing it
        # through a sigmoid (as was done before) would squash a correct
        # prediction of 0.0 to 0.5 and 1.0 to ~0.73. Clamp instead, because
        # the target ratio is bounded to [0, 1] by construction.
        scores = logits.clamp(min=0.0, max=1.0)
        preds.extend(scores.cpu().numpy().tolist())
        targets.extend(batch['labels'].cpu().numpy().tolist())

mse = mean_squared_error(targets, preds)
mae = mean_absolute_error(targets, preds)
r2 = r2_score(targets, preds)
print(f"MSE: {mse:.4f}  MAE: {mae:.4f}  R2: {r2:.4f}")

# Side-by-side view of text, true ratio and predicted ratio.
comparison_df = pd.DataFrame({
    "text": X_test.reset_index(drop=True),
    "true_ratio": y_test.reset_index(drop=True),
    "pred_ratio": preds,
})
comparison_df.head()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


KeyboardInterrupt: 