In [None]:
# =====================
# Ensemble Training Script: RoBERTa + DistilBERT + Meta-Model
# =====================
import os
import lzma
import json
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.optim import AdamW
import joblib

# =====================
# Config
# =====================
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Hyperparameters shared by both base models.
BATCH_SIZE = 16  # examples per DataLoader batch
EPOCHS = 10      # default number of passes over the training loader
MAX_LEN = 256    # max_length handed to the tokenizers (longer texts are truncated)

# =====================
# Load dataset
# =====================
json_path = "2023_processed.json.xz"
# Pin the text encoding: JSON is UTF-8, and without an explicit encoding the
# decoder would depend on the machine's locale default.
with lzma.open(json_path, 'rt', encoding='utf-8') as f:
    data = json.load(f)

# Work on a reproducible random subset of at most 10,000 rows. The sample size
# is capped at the number of available rows because DataFrame.sample raises a
# ValueError when asked for more rows than exist (without replacement).
df = pd.DataFrame(data)
df = df.sample(n=min(10000, len(df)), random_state=42).reset_index(drop=True)

# Keep only rows with maintext and sentiment
df = df.dropna(subset=["maintext", "sentiment"])

# Convert sentiment dict to numeric label
def extract_label(sentiment_dict):
    """Map a sentiment score dict to a class id.

    Class ids are 0 = negative, 1 = neutral, 2 = positive.

    Args:
        sentiment_dict: Expected to be a dict with scores under the keys
            "negative", "neutral" and "positive". Missing keys and ``None``
            scores count as 0.

    Returns:
        The id of the highest-scoring class (the first one wins on ties).
        Falls back to 1 (neutral) when the input is not a dict or carries
        none of the three scores, so unusable rows are never silently
        labelled "negative".
    """
    neutral_id = 1
    if not isinstance(sentiment_dict, dict):
        return neutral_id  # neutral fallback

    class_names = ("negative", "neutral", "positive")
    raw_scores = [sentiment_dict.get(name) for name in class_names]
    if all(score is None for score in raw_scores):
        # No usable score at all: same fallback as for a non-dict input.
        return neutral_id

    scores = [0 if score is None else score for score in raw_scores]
    return int(scores.index(max(scores)))

# Derive the integer class id for every row from its sentiment entry.
df['label'] = df['sentiment'].apply(extract_label)

# Plain Python lists are what the tokenizers and the splitter consume below.
texts = df['maintext'].to_list()
labels = df['label'].to_list()

# Train-test split: 80/20, reproducible, stratified so both parts keep the
# same class proportions.
split_parts = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
train_texts, test_texts, train_labels, test_labels = split_parts

# =====================
# Tokenizers & Models
# =====================
# RoBERTa: tokenizer plus a 3-class sequence classifier on the chosen device.
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
roberta_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=3)
roberta_model = roberta_model.to(device)

# DistilBERT: same setup with its own tokenizer.
distil_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
distil_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
distil_model = distil_model.to(device)

# =====================
# Tokenization helper
# =====================
def tokenize_texts(tokenizer, texts, max_len=None):
    """Tokenize a list of texts into padded PyTorch tensors.

    Args:
        tokenizer: Callable tokenizer (here a Hugging Face tokenizer).
        texts: List of strings to encode.
        max_len: Maximum sequence length; longer texts are truncated.
            Defaults to the module-level ``MAX_LEN`` when omitted.

    Returns:
        Whatever the tokenizer returns for these options (the callers read
        its ``input_ids`` and ``attention_mask`` entries).
    """
    if max_len is None:
        max_len = MAX_LEN
    return tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors="pt")

def create_loader(texts, labels, tokenizer, shuffle=True):
    """Tokenize texts and wrap them, with their labels, in a DataLoader.

    Args:
        texts: List of input strings.
        labels: List of integer class ids, one per text.
        tokenizer: Tokenizer used both for encoding and by the padding collator.
        shuffle: Whether the loader reshuffles the examples on every pass.
            Keep the default for training; pass ``False`` whenever the
            batches must come back in dataset order (e.g. evaluation).

    Returns:
        A DataLoader yielding batches of ``BATCH_SIZE`` examples with the
        keys ``input_ids``, ``attention_mask`` and ``labels``.
    """
    enc = tokenize_texts(tokenizer, texts)
    dataset = Dataset.from_dict({
        'input_ids': enc['input_ids'],
        'attention_mask': enc['attention_mask'],
        'labels': torch.tensor(labels)
    })
    collator = DataCollatorWithPadding(tokenizer=tokenizer)
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle, collate_fn=collator)

# Training loaders shuffle every epoch; test loaders keep dataset order so
# per-row outputs stay lined up with test_labels.
train_loader_roberta = create_loader(train_texts, train_labels, roberta_tokenizer)
test_loader_roberta = create_loader(test_texts, test_labels, roberta_tokenizer, shuffle=False)
train_loader_distil = create_loader(train_texts, train_labels, distil_tokenizer)
test_loader_distil = create_loader(test_texts, test_labels, distil_tokenizer, shuffle=False)

# =====================
# Training helper
# =====================
def train_model(model, train_loader, epochs=EPOCHS):
    """Fine-tune a sequence classifier with AdamW and return it.

    Args:
        model: Model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and exposes ``.loss``.
        train_loader: Iterable of batches with those three keys.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same model object, updated in place.
    """
    model.train()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    batch_keys = ('input_ids', 'attention_mask', 'labels')

    for epoch_number in range(1, epochs + 1):
        progress = tqdm(train_loader, desc=f"Training Epoch {epoch_number}/{epochs}")
        for batch in progress:
            optimizer.zero_grad()
            # Move only the tensors the model consumes onto the target device.
            model_inputs = {key: batch[key].to(device) for key in batch_keys}
            loss = model(**model_inputs).loss
            loss.backward()
            optimizer.step()
            # Show the latest batch loss in the progress bar.
            progress.set_postfix(loss=loss.item())
    return model

# Train both models
# The messages report the configured epoch count rather than a hard-coded
# number, so they stay truthful when EPOCHS is changed.
print(f"Training RoBERTa for {EPOCHS} epochs...")
roberta_model = train_model(roberta_model, train_loader_roberta)

print(f"Training DistilBERT for {EPOCHS} epochs...")
distil_model = train_model(distil_model, train_loader_distil)

# =====================
#  Generate predictions for meta-model
# =====================
def get_preds(model, loader):
    """Return softmax class probabilities for every example, in dataset order.

    The predictions are later stacked next to another model's predictions and
    matched against a label list, so row ``i`` must always belong to example
    ``i`` of ``loader.dataset``. A loader created with ``shuffle=True`` yields
    a fresh random order on every pass, which would scramble that alignment;
    therefore the loader's dataset is re-iterated here without shuffling.

    Args:
        model: Classifier whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and exposes ``.logits``.
        loader: DataLoader whose dataset, batch size and collate function are
            reused; its own sampling order is ignored.

    Returns:
        NumPy array with one row per example and one column per class.
    """
    ordered_loader = DataLoader(
        loader.dataset,
        batch_size=loader.batch_size,
        shuffle=False,
        collate_fn=loader.collate_fn,
    )

    # Send inputs to wherever the model lives; fall back to the module-level
    # device for models that do not expose a ``device`` attribute.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = device

    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch in ordered_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            all_preds.append(probs)
    return np.vstack(all_preds)

# Meta-features: each base model's class probabilities placed side by side,
# RoBERTa columns first, DistilBERT columns second.
# NOTE(review): the rows match train_labels / test_labels only if get_preds
# returns its predictions in dataset order - confirm.
roberta_train_probs = get_preds(roberta_model, train_loader_roberta)
distil_train_probs = get_preds(distil_model, train_loader_distil)
X_train_meta = np.concatenate([roberta_train_probs, distil_train_probs], axis=1)

roberta_test_probs = get_preds(roberta_model, test_loader_roberta)
distil_test_probs = get_preds(distil_model, test_loader_distil)
X_test_meta = np.concatenate([roberta_test_probs, distil_test_probs], axis=1)

# Targets for the meta-model are the original class ids.
y_train_meta = np.asarray(train_labels)
y_test_meta = np.asarray(test_labels)

# =====================
# Train meta-model
# =====================
# Fit the stacking classifier on the base models' probability features.
meta_model = LogisticRegression(max_iter=2000).fit(X_train_meta, y_train_meta)

# Evaluate on the held-out split
y_pred_meta = meta_model.predict(X_test_meta)
meta_accuracy = accuracy_score(y_test_meta, y_pred_meta)
print("Meta-Model Accuracy:", meta_accuracy)

class_names = ["Negative", "Neutral", "Positive"]
print(classification_report(y_test_meta, y_pred_meta, target_names=class_names))

# =====================
# Save models & meta-model
# =====================
os.makedirs("saved_models", exist_ok=True)

# Each base model is stored in the same directory as its tokenizer, so that
# directory alone is enough to reload the pair.
for fitted_model, fitted_tokenizer, target_dir in (
    (roberta_model, roberta_tokenizer, "saved_models/roberta"),
    (distil_model, distil_tokenizer, "saved_models/distilbert"),
):
    fitted_model.save_pretrained(target_dir)
    fitted_tokenizer.save_pretrained(target_dir)

# The scikit-learn meta-model is pickled via joblib.
joblib.dump(meta_model, "saved_models/meta_model.pkl")
print(" All models and meta-model saved!")

# =====================
# Ensemble prediction function
# =====================
def _base_model_probs(model, tokenizer, text):
    """Return one base model's softmax class probabilities for a single text."""
    enc = tokenize_texts(tokenizer, [text])
    # Make sure inference runs in eval mode, whatever mode earlier code left
    # the model in.
    model.eval()
    with torch.no_grad():
        logits = model(
            enc['input_ids'].to(device),
            attention_mask=enc['attention_mask'].to(device),
        ).logits
    return torch.softmax(logits, dim=1).cpu().numpy()


def predict_ensemble(text: str):
    """Classify one text with the RoBERTa + DistilBERT + meta-model ensemble.

    Args:
        text: The text to classify.

    Returns:
        Dict with ``label_id`` (0/1/2), ``label`` ("Negative" / "Neutral" /
        "Positive") and ``confidence`` (the meta-model's probability for the
        predicted class).
    """
    # Same feature layout as at meta-model training time: RoBERTa
    # probabilities first, DistilBERT probabilities second.
    meta_input = np.hstack([
        _base_model_probs(roberta_model, roberta_tokenizer, text),
        _base_model_probs(distil_model, distil_tokenizer, text),
    ])

    # predict_proba columns follow meta_model.classes_, not the raw label
    # values, so take the best column and translate it through classes_
    # instead of indexing the probabilities with the class label.
    class_probs = meta_model.predict_proba(meta_input)[0]
    best_column = int(np.argmax(class_probs))
    pred_class = int(meta_model.classes_[best_column])

    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return {
        "label_id": pred_class,
        "label": label_map[pred_class],
        "confidence": float(class_probs[best_column]),
    }

# Example
# Smoke test: classify a single sentence and show the resulting dict.
sample_text = "The company's revenue exceeded expectations this quarter."
example_prediction = predict_ensemble(sample_text)
print(example_prediction)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training RoBERTa for 10 epochs...


Training Epoch 1/10: 100%|██████████| 500/500 [05:49<00:00,  1.43it/s, loss=0.336]
Training Epoch 2/10: 100%|██████████| 500/500 [05:52<00:00,  1.42it/s, loss=0.118]
Training Epoch 3/10: 100%|██████████| 500/500 [05:52<00:00,  1.42it/s, loss=0.269]
Training Epoch 4/10: 100%|██████████| 500/500 [05:52<00:00,  1.42it/s, loss=0.182]
Training Epoch 5/10: 100%|██████████| 500/500 [05:52<00:00,  1.42it/s, loss=0.452]
Training Epoch 6/10: 100%|██████████| 500/500 [05:52<00:00,  1.42it/s, loss=0.21]
Training Epoch 7/10: 100%|██████████| 500/500 [05:52<00:00,  1.42it/s, loss=0.0273]
Training Epoch 8/10: 100%|██████████| 500/500 [05:52<00:00,  1.42it/s, loss=0.00877]
Training Epoch 9/10: 100%|██████████| 500/500 [05:52<00:00,  1.42it/s, loss=0.00703]
Training Epoch 10/10: 100%|██████████| 500/500 [05:52<00:00,  1.42it/s, loss=0.15]


Training DistilBERT for 10 epochs...


Training Epoch 1/10: 100%|██████████| 500/500 [02:59<00:00,  2.78it/s, loss=0.489]
Training Epoch 2/10: 100%|██████████| 500/500 [02:56<00:00,  2.83it/s, loss=0.294]
Training Epoch 3/10: 100%|██████████| 500/500 [02:56<00:00,  2.83it/s, loss=0.54]
Training Epoch 4/10: 100%|██████████| 500/500 [02:56<00:00,  2.83it/s, loss=0.0406]
Training Epoch 5/10: 100%|██████████| 500/500 [02:56<00:00,  2.83it/s, loss=0.0181]
Training Epoch 6/10: 100%|██████████| 500/500 [02:56<00:00,  2.84it/s, loss=0.0974]
Training Epoch 7/10: 100%|██████████| 500/500 [02:56<00:00,  2.83it/s, loss=0.313]
Training Epoch 8/10: 100%|██████████| 500/500 [02:56<00:00,  2.83it/s, loss=0.0439]
Training Epoch 9/10: 100%|██████████| 500/500 [02:56<00:00,  2.83it/s, loss=0.224]
Training Epoch 10/10: 100%|██████████| 500/500 [02:56<00:00,  2.83it/s, loss=0.00184]


Meta-Model Accuracy: 0.537
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       487
     Neutral       0.00      0.00      0.00       439
    Positive       0.54      1.00      0.70      1074

    accuracy                           0.54      2000
   macro avg       0.18      0.33      0.23      2000
weighted avg       0.29      0.54      0.38      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


✅ All models and meta-model saved!
{'label_id': 2, 'label': 'Positive', 'confidence': 0.5365825570676341}


In [None]:
import os
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# =====================
# 1️⃣ Load saved models
# =====================
roberta_path = "saved_models/roberta"
distil_path = "saved_models/distilbert"
meta_path = "saved_models/meta_model.pkl"

# Fail early, before loading anything, if any of the three artifacts is missing.
required_artifacts = (roberta_path, distil_path, meta_path)
if not all(os.path.exists(artifact) for artifact in required_artifacts):
    raise FileNotFoundError("Saved models not found. Please train and save them first.")

# Load each base model together with the tokenizer stored next to it
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_path)
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta_path)
roberta_model = roberta_model.to(device)

distil_tokenizer = AutoTokenizer.from_pretrained(distil_path)
distil_model = AutoModelForSequenceClassification.from_pretrained(distil_path)
distil_model = distil_model.to(device)

# Load meta-model (joblib unpickles the file, so only load artifacts you trust)
meta_model = joblib.load(meta_path)

# =====================
# 2️⃣ Prediction function
# =====================
def tokenize_texts(tokenizer, texts, max_len=256):
    """Tokenize a list of texts into padded PyTorch tensors.

    Texts longer than ``max_len`` tokens are truncated; the result is whatever
    the tokenizer returns for these options.
    """
    encode_options = dict(
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt",
    )
    return tokenizer(texts, **encode_options)

def _base_model_probs(model, tokenizer, text):
    """Return one base model's softmax class probabilities for a single text."""
    enc = tokenize_texts(tokenizer, [text])
    # Make sure inference runs in eval mode, whatever mode earlier code left
    # the model in.
    model.eval()
    with torch.no_grad():
        logits = model(
            enc['input_ids'].to(device),
            attention_mask=enc['attention_mask'].to(device),
        ).logits
    return torch.softmax(logits, dim=1).cpu().numpy()


def predict_ensemble(text: str):
    """Classify one text with the RoBERTa + DistilBERT + meta-model ensemble.

    Args:
        text: The text to classify.

    Returns:
        Dict with ``label_id`` (0/1/2), ``label`` ("Negative" / "Neutral" /
        "Positive") and ``confidence`` (the meta-model's probability for the
        predicted class).
    """
    # Feature layout: RoBERTa probabilities first, DistilBERT probabilities
    # second.
    meta_input = np.hstack([
        _base_model_probs(roberta_model, roberta_tokenizer, text),
        _base_model_probs(distil_model, distil_tokenizer, text),
    ])

    # predict_proba columns follow meta_model.classes_, not the raw label
    # values, so take the best column and translate it through classes_
    # instead of indexing the probabilities with the class label.
    class_probs = meta_model.predict_proba(meta_input)[0]
    best_column = int(np.argmax(class_probs))
    pred_class = int(meta_model.classes_[best_column])

    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return {
        "label_id": pred_class,
        "label": label_map[pred_class],
        "confidence": float(class_probs[best_column]),
    }

# =====================
# 3️⃣ Predict sample paragraph
# =====================
# A deliberately mixed-sentiment paragraph used as a smoke test for the
# reloaded ensemble.
sample_text = """Global markets reacted cautiously as multiple central banks signaled diverging monetary policies.
While the Federal Reserve indicated a potential slowdown in interest rate hikes, citing moderating inflation,
the European Central Bank emphasized persistent price pressures in the eurozone, leaving investors uncertain
about future liquidity conditions. Meanwhile, tech giants reported robust quarterly earnings, beating analyst
expectations, yet supply chain disruptions and rising raw material costs cast a shadow over projected profit margins.
In emerging markets, currency fluctuations and geopolitical tensions further complicated cross-border investment strategies.
Analysts warn that although short-term gains are visible in certain sectors, systemic risks may offset these benefits,
making the overall outlook highly volatile and sentimentally ambiguous."""

prediction = predict_ensemble(sample_text)
# Heading and result dict are printed on consecutive lines.
print("\nSample Paragraph Prediction:", prediction, sep="\n")



Sample Paragraph Prediction:
{'label_id': 2, 'label': 'Positive', 'confidence': 0.5384260045009355}
