In [1]:
import pandas as pd

In [25]:
df = pd.read_csv('News Classification_Test Data_Project - Sheet1.csv')

In [26]:
df

Unnamed: 0,ID,Headline,Label(FoxNews/NBC)
0,1,Democrats' boiling pot: A look at their 2026 g...,
1,2,Appeals court restores hold on Trump admin’s p...,
2,3,"David Perdue, former senator and longtime Trum...",
3,4,Tesla arson suspect arrested in Arizona after ...,
4,5,Trump wants Eagles' tush push to remain in NFL...,
...,...,...,...
1168,1169,North Korea says it has deployed a new nuclear...,
1169,1170,"Prigozhin, Wagner Group chief, listed among th...",
1170,1171,5 best spin bike accessories for indoor cyclin...,
1171,1172,NBC Select Travel Awards 2024: The best hardsh...,


In [27]:
# Remove stopwords from the reviews
# From NLTK
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Get the list of English stopwords
stopwords = stopwords.words('english')
# stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
df['HeadlineStop'] = df['Headline'].apply(lambda x: ' '.join([word for word in x.lower().split() if word not in stopwords]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
import re

def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z\s]', '', text)
df['cleaned'] = df['HeadlineStop'].apply(remove_special_characters).str.lower()

In [29]:
df[['HeadlineStop', 'cleaned']].head()

Unnamed: 0,HeadlineStop,cleaned
0,democrats' boiling pot: look 2026 game plan,democrats boiling pot look game plan
1,appeals court restores hold trump admin’s plan...,appeals court restores hold trump admins plan ...
2,"david perdue, former senator longtime trump al...",david perdue former senator longtime trump all...
3,tesla arson suspect arrested arizona fiery ass...,tesla arson suspect arrested arizona fiery ass...
4,trump wants eagles' tush push remain nfl: 'exc...,trump wants eagles tush push remain nfl exciti...


In [30]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download NLTK data (only need to do this once)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the lemmatization function
def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

# Apply it to the 'cleaned' column
df['lemmatized'] = df['cleaned'].astype(str).apply(lemmatize_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [31]:
# Preview
df[['cleaned', 'lemmatized']].head()

Unnamed: 0,cleaned,lemmatized
0,democrats boiling pot look game plan,democrat boiling pot look game plan
1,appeals court restores hold trump admins plan ...,appeal court restores hold trump admins plan c...
2,david perdue former senator longtime trump all...,david perdue former senator longtime trump all...
3,tesla arson suspect arrested arizona fiery ass...,tesla arson suspect arrested arizona fiery ass...
4,trump wants eagles tush push remain nfl exciti...,trump want eagle tush push remain nfl exciting...


In [32]:
X_test = df['lemmatized']

In [33]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd

In [34]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [35]:
class HiddenTextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=128):
        self.texts = texts.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': enc['input_ids'].squeeze(),
            'attention_mask': enc['attention_mask'].squeeze()
        }

In [36]:
hidden_dataset = HiddenTextDataset(X_test, tokenizer)
hidden_loader = DataLoader(hidden_dataset, batch_size=16)

In [37]:
class RobertaBiLSTMClassifier(nn.Module):
    def __init__(self, roberta_model_name='roberta-base', hidden_dim=256, lstm_layers=3, bidirectional=True, dropout=0.3):
        super(RobertaBiLSTMClassifier, self).__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.bidirectional = bidirectional

        input_dim = self.roberta.config.hidden_size  # 768 for roberta-base

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=0
        )

        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim

        self.classifier = nn.Sequential(
            nn.Linear(lstm_output_dim, 128),
            nn.ReLU(),
            # nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            # nn.Dropout(dropout),
            nn.Linear(64, 1)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        _, (hn, _) = self.lstm(sequence_output)

        if self.bidirectional:
            last_hidden = torch.cat((hn[-2], hn[-1]), dim=1)
        else:
            last_hidden = hn[-1]

        logits = self.classifier(last_hidden)
        return logits

In [38]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RobertaBiLSTMClassifier().to(device)
model.load_state_dict(torch.load('roberta_bilstm_model_e10.pt', map_location=device))
model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaBiLSTMClassifier(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

In [39]:
hidden_preds = []

with torch.no_grad():
    for batch in hidden_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask).squeeze()
        probs = torch.sigmoid(outputs)
        preds = (probs > 0.5).long()

        hidden_preds.extend(preds.cpu().numpy())

# Save or view predictions
hidden_preds = pd.Series(hidden_preds)

In [40]:
df['Label(FoxNews/NBC)'] = hidden_preds

In [41]:
df.head()

Unnamed: 0,ID,Headline,Label(FoxNews/NBC),HeadlineStop,cleaned,lemmatized
0,1,Democrats' boiling pot: A look at their 2026 g...,0,democrats' boiling pot: look 2026 game plan,democrats boiling pot look game plan,democrat boiling pot look game plan
1,2,Appeals court restores hold on Trump admin’s p...,0,appeals court restores hold trump admin’s plan...,appeals court restores hold trump admins plan ...,appeal court restores hold trump admins plan c...
2,3,"David Perdue, former senator and longtime Trum...",1,"david perdue, former senator longtime trump al...",david perdue former senator longtime trump all...,david perdue former senator longtime trump all...
3,4,Tesla arson suspect arrested in Arizona after ...,0,tesla arson suspect arrested arizona fiery ass...,tesla arson suspect arrested arizona fiery ass...,tesla arson suspect arrested arizona fiery ass...
4,5,Trump wants Eagles' tush push to remain in NFL...,1,trump wants eagles' tush push remain nfl: 'exc...,trump wants eagles tush push remain nfl exciti...,trump want eagle tush push remain nfl exciting...


In [42]:
df['Label(FoxNews/NBC)'] = df['Label(FoxNews/NBC)'].map({1: 'FoxNews', 0: 'NBC'})

In [43]:
df.head()

Unnamed: 0,ID,Headline,Label(FoxNews/NBC),HeadlineStop,cleaned,lemmatized
0,1,Democrats' boiling pot: A look at their 2026 g...,NBC,democrats' boiling pot: look 2026 game plan,democrats boiling pot look game plan,democrat boiling pot look game plan
1,2,Appeals court restores hold on Trump admin’s p...,NBC,appeals court restores hold trump admin’s plan...,appeals court restores hold trump admins plan ...,appeal court restores hold trump admins plan c...
2,3,"David Perdue, former senator and longtime Trum...",FoxNews,"david perdue, former senator longtime trump al...",david perdue former senator longtime trump all...,david perdue former senator longtime trump all...
3,4,Tesla arson suspect arrested in Arizona after ...,NBC,tesla arson suspect arrested arizona fiery ass...,tesla arson suspect arrested arizona fiery ass...,tesla arson suspect arrested arizona fiery ass...
4,5,Trump wants Eagles' tush push to remain in NFL...,FoxNews,trump wants eagles' tush push remain nfl: 'exc...,trump wants eagles tush push remain nfl exciti...,trump want eagle tush push remain nfl exciting...


In [44]:
df.drop(columns=['HeadlineStop', 'cleaned', 'lemmatized'], inplace=True)

In [45]:
df.head()

Unnamed: 0,ID,Headline,Label(FoxNews/NBC)
0,1,Democrats' boiling pot: A look at their 2026 g...,NBC
1,2,Appeals court restores hold on Trump admin’s p...,NBC
2,3,"David Perdue, former senator and longtime Trum...",FoxNews
3,4,Tesla arson suspect arrested in Arizona after ...,NBC
4,5,Trump wants Eagles' tush push to remain in NFL...,FoxNews


In [46]:
df.to_csv('Dhruv_Kruthi_Predictions.csv', index=False)