# Answering Exploratory Question 4: Does training the entire RoBERTa-BiLSTM network improve performance?

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('new_normalized_headlines.csv')

In [3]:
df

Unnamed: 0,url,news,headline,cleaned,lemmatized,stemmed,word_count
0,https://www.foxnews.com/lifestyle/jack-carrs-e...,Fox News,jack carr recalls eisenhower's d-day memo 'gre...,jack carr recalls eisenhowers dday memo great ...,jack carr recall eisenhower dday memo great no...,jack carr recal eisenhow dday memo great nobl ...,9
1,https://www.foxnews.com/entertainment/bruce-wi...,Fox News,"bruce willis, demi moore avoided one thing co-...",bruce willis demi moore avoided one thing copa...,bruce willis demi moore avoided one thing copa...,bruce willi demi moor avoid one thing copar da...,10
2,https://www.foxnews.com/politics/blinken-meets...,Fox News,"blinken meets qatar pm, says israeli actions '...",blinken meets qatar pm says israeli actions re...,blinken meet qatar pm say israeli action retal...,blinken meet qatar pm say isra action retali d...,11
3,https://www.foxnews.com/entertainment/emily-bl...,Fox News,emily blunt says ‘toes curl’ people tell kids ...,emily blunt says toes curl people tell kids wa...,emily blunt say toe curl people tell kid want ...,emili blunt say toe curl peopl tell kid want a...,15
4,https://www.foxnews.com/media/the-view-co-host...,Fox News,"'the view' co-host, cnn commentator ana navarr...",the view cohost cnn commentator ana navarro ho...,the view cohost cnn commentator ana navarro ho...,the view cohost cnn comment ana navarro host n...,12
...,...,...,...,...,...,...,...
3799,https://www.foxnews.com/food-drink/salad-alway...,Fox News,salad always better choice sandwich? think twice,salad always better choice sandwich think twice,salad always better choice sandwich think twice,salad alway better choic sandwich think twice,7
3800,https://www.foxnews.com/us/jocelyn-nungaray-fa...,Fox News,jocelyn nungaray sexually assaulted alleged mu...,jocelyn nungaray sexually assaulted alleged mu...,jocelyn nungaray sexually assaulted alleged mu...,jocelyn nungaray sexual assault alleg murder i...,10
3801,https://www.foxnews.com/politics/biden-gives-w...,Fox News,biden gives 3-word response asked debate trump,biden gives word response asked debate trump,biden give word response asked debate trump,biden give word respons ask debat trump,7
3802,https://www.foxnews.com/official-polls/fox-new...,Fox News,fox new poll: biden trump tie wisconsin head-t...,fox new poll biden trump tie wisconsin headtoh...,fox new poll biden trump tie wisconsin headtoh...,fox new poll biden trump tie wisconsin headtoh...,9


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Use the stemmed DataFrame
# Make sure df_stem has a 'stemmed' column and 'label' for target
X = df['lemmatized']
y = df['news']  # Replace 'label' with your actual column if different

# 2. Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

y_train = y_train.apply(lambda x: 1 if x == 'Fox News' else 0)
y_test = y_test.apply(lambda x: 1 if x == 'Fox News' else 0)

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import train_test_split
import pandas as pd

In [13]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [14]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts.tolist()  # Make sure it's a list
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': enc['input_ids'].squeeze(),
            'attention_mask': enc['attention_mask'].squeeze(),
            'label': torch.tensor(self.labels[idx], dtype=torch.float)
        }

In [15]:
train_dataset = TextDataset(X_train, y_train, tokenizer)
test_dataset = TextDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [16]:
class RobertaBiLSTMClassifier(nn.Module):
    def __init__(self, roberta_model_name='roberta-base', hidden_dim=256, lstm_layers=3, bidirectional=True, dropout=0.3):
        super(RobertaBiLSTMClassifier, self).__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.bidirectional = bidirectional

        input_dim = self.roberta.config.hidden_size  # 768 for roberta-base

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=0
        )

        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim

        self.classifier = nn.Sequential(
            nn.Linear(lstm_output_dim, 128),
            nn.ReLU(),
            # nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            # nn.Dropout(dropout),
            nn.Linear(64, 1)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        _, (hn, _) = self.lstm(sequence_output)

        if self.bidirectional:
            last_hidden = torch.cat((hn[-2], hn[-1]), dim=1)
        else:
            last_hidden = hn[-1]

        logits = self.classifier(last_hidden)
        return logits

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = RobertaBiLSTMClassifier().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

cuda


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
from sklearn.metrics import classification_report, accuracy_score
epochs = 10

for epoch in range(epochs):
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask).squeeze()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # ===== Validation Phase =====
    model.eval()
    total_val_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask).squeeze()
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()

            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).long()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(test_loader)
    acc = accuracy_score(all_labels, all_preds)
    # report = classification_report(all_labels, all_preds, target_names=["Fake", "Real"])

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Accuracy: {acc:.4f}")
    # print("Classification Report:\n", report)

Epoch 1/10
Train Loss: 0.6724 | Val Loss: 0.6779 | Accuracy: 0.5690
Epoch 2/10
Train Loss: 0.6171 | Val Loss: 0.5596 | Accuracy: 0.7385
Epoch 3/10
Train Loss: 0.5508 | Val Loss: 0.5207 | Accuracy: 0.7766
Epoch 4/10
Train Loss: 0.4581 | Val Loss: 0.4704 | Accuracy: 0.8003
Epoch 5/10
Train Loss: 0.3820 | Val Loss: 0.4325 | Accuracy: 0.8160
Epoch 6/10
Train Loss: 0.2983 | Val Loss: 0.4717 | Accuracy: 0.8200
Epoch 7/10
Train Loss: 0.2631 | Val Loss: 0.4498 | Accuracy: 0.8108
Epoch 8/10
Train Loss: 0.1970 | Val Loss: 0.5530 | Accuracy: 0.8081
Epoch 9/10
Train Loss: 0.1723 | Val Loss: 0.5347 | Accuracy: 0.8160
Epoch 10/10
Train Loss: 0.1251 | Val Loss: 0.5684 | Accuracy: 0.8226


In [19]:
from sklearn.metrics import classification_report, accuracy_score

def evaluate(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask).squeeze()
            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).long()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds)  # or use actual label names

    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:\n", report)

evaluate(model, test_loader, device)

Accuracy: 0.8226
Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.73      0.80       361
         1.0       0.79      0.91      0.84       400

    accuracy                           0.82       761
   macro avg       0.83      0.82      0.82       761
weighted avg       0.83      0.82      0.82       761



In [None]:
# torch.save(model, 'roberta_bilstm_model_full.pth')

In [20]:
torch.save(model.state_dict(), 'roberta_bilstm_model_e10.pt')

# Training the entire RoBERTa-BiLSTM network, including RoBERTa, helps significantly improve performance.