# Answering Exploratory Question 2: Is RoBERTa better than TF-IDF?

In [1]:
import pandas as pd

In [2]:
# Read the headlines CSV into a DataFrame; the TF-IDF section below pulls its
# 'stemmed' and 'news' columns from this frame.
df = pd.read_csv(filepath_or_buffer='new_normalized_headlines.csv')

# TF-IDF - Stemmed Text

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Select features and target from `df`:
#    'stemmed' is the text column passed to the vectorizer below,
#    'news' is the source-outlet column used as the classification target.
X = df['stemmed']
y = df['news']

# 2. Train-test split (80/20). Stratifying on the outlet keeps the class
#    balance the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Binarize the labels: 1 for 'Fox News', 0 for every other value
#    (presumably only NBC -- TODO confirm against the dataset).
#    A vectorized comparison replaces the per-row Python lambda and yields
#    the same int64 0/1 Series.
y_train = y_train.eq('Fox News').astype('int64')
y_test = y_test.eq('Fox News').astype('int64')

# 4. TF-IDF vectorization, limited to 100 features with English stop words
#    removed. The vocabulary is fitted on the training text only and then
#    re-used to transform the test text, so no test information leaks in.
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

## Logistic Regression

In [4]:
# 5. Train logistic regression on the TF-IDF training matrix
#    (up to 1000 solver iterations, fixed seed for repeatability).
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

# 6. Predict on the held-out TF-IDF test matrix and evaluate.
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
# The emoji prefixes had been stored as mojibake (UTF-8 bytes decoded with a
# single-byte code page). They are written as unicode escapes so the source
# stays pure ASCII and cannot be garbled again:
#   \u2705      = check-mark emoji
#   \U0001F4CB  = clipboard emoji
print(f"\u2705 Accuracy (Stemmed): {accuracy:.4f}")
print("\U0001F4CB Classification Report (Stemmed):\n", classification_report(y_test, y_pred))

✅ Accuracy (Stemmed): 0.7148
📋 Classification Report (Stemmed):
               precision    recall  f1-score   support

           0       0.75      0.59      0.66       361
           1       0.69      0.82      0.75       400

    accuracy                           0.71       761
   macro avg       0.72      0.71      0.71       761
weighted avg       0.72      0.71      0.71       761



In [5]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# 5. Linear-kernel SVM (C=1.0, seeded) trained on the TF-IDF training matrix.
#    fit() hands back the estimator, so construction and training are chained.
svm_clf = SVC(kernel='linear', C=1.0, random_state=42).fit(X_train_tfidf, y_train)

# 6. Label the held-out TF-IDF test rows.
y_pred = svm_clf.predict(X_test_tfidf)

# 7. Score the predictions against the true test labels.
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy (Stemmed): {accuracy:.4f}")
print("Classification Report (Stemmed):\n", report)

Accuracy (Stemmed): 0.7057
Classification Report (Stemmed):
               precision    recall  f1-score   support

           0       0.76      0.55      0.64       361
           1       0.68      0.84      0.75       400

    accuracy                           0.71       761
   macro avg       0.72      0.70      0.70       761
weighted avg       0.72      0.71      0.70       761



In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 5. Random forest with 100 trees (seeded) trained on the TF-IDF training
#    matrix; fit() returns the estimator, so the two steps are chained.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_tfidf, y_train
)

# 6. Label the held-out TF-IDF test rows.
y_pred = rf_clf.predict(X_test_tfidf)

# 7. Score the predictions against the true test labels.
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)


Accuracy: 0.6833
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.61      0.65       361
           1       0.68      0.75      0.71       400

    accuracy                           0.68       761
   macro avg       0.68      0.68      0.68       761
weighted avg       0.68      0.68      0.68       761



In [7]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# 5. Initialize and train the XGBoost classifier on the TF-IDF training matrix.
#    `use_label_encoder` is no longer passed: the installed XGBoost reports it
#    as unused ('Parameters: { "use_label_encoder" } are not used.' in this
#    cell's recorded output), so dropping it silences the warning without
#    changing the model. The labels are already 0/1 integers.
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train_tfidf, y_train)

# 6. Make predictions on the held-out TF-IDF test matrix.
y_pred = xgb_clf.predict(X_test_tfidf)

# 7. Evaluate the model against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.6991
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.57      0.64       361
           1       0.68      0.82      0.74       400

    accuracy                           0.70       761
   macro avg       0.71      0.69      0.69       761
weighted avg       0.71      0.70      0.69       761



# RoBERTa - Lemmatization

In [8]:
import numpy as np
# Read the saved embedding matrices from their .npy files (presumably RoBERTa
# embeddings of the lemmatized headlines, judging by the file names -- the
# code that produced them is not in this section).
# NOTE(review): this rebinds X_train / X_test, which the TF-IDF section used
# for the raw-text split; those earlier values are no longer reachable here.
X_train = np.load(file='RoBERTa_train_lemma_embeddings.npy')
X_test = np.load(file='RoBERTa_test_lemma_embeddings.npy')

# Sanity check: report each matrix's dimensions and element type.
print(f"Train embeddings shape: {X_train.shape}, dtype: {X_train.dtype}")
print(f"Test embeddings shape: {X_test.shape}, dtype: {X_test.dtype}")

Train embeddings shape: (3043, 768), dtype: float32
Test embeddings shape: (761, 768), dtype: float32


In [9]:
# Read the saved label vectors that accompany the embedding matrices.
# NOTE(review): this rebinds y_train / y_test from the TF-IDF section; it is
# assumed these files use the same 0/1 encoding and row order as the
# embeddings -- TODO confirm against the code that wrote them.
y_train = np.load(file='y_train_roberta_lemma.npy')
y_test = np.load(file='y_test_roberta_lemma.npy')

# Sanity check: report each label vector's length and element type.
print(f"Train output shape: {y_train.shape}, dtype: {y_train.dtype}")
print(f"Test output shape: {y_test.shape}, dtype: {y_test.dtype}")

Train output shape: (3043,), dtype: int64
Test output shape: (761,), dtype: int64


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Logistic regression (up to 1000 solver iterations, seeded) trained on the
#    embedding matrix; fit() returns the estimator, so the steps are chained.
lr_clf = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# 2. Label the held-out test embeddings.
y_pred = lr_clf.predict(X_test)

# 3. Score the predictions against the true test labels.
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

Accuracy: 0.7451
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.68      0.72       361
           1       0.74      0.80      0.77       400

    accuracy                           0.75       761
   macro avg       0.75      0.74      0.74       761
weighted avg       0.75      0.75      0.74       761



In [11]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# 1. Linear-kernel SVM (C=1.0, seeded) trained on the embedding matrix;
#    fit() returns the estimator, so construction and training are chained.
svm_clf = SVC(kernel='linear', C=1.0, random_state=42).fit(X_train, y_train)

# 2. Label the held-out test embeddings.
y_pred = svm_clf.predict(X_test)

# 3. Score the predictions against the true test labels.
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

Accuracy: 0.7569
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.70      0.73       361
           1       0.75      0.81      0.78       400

    accuracy                           0.76       761
   macro avg       0.76      0.75      0.75       761
weighted avg       0.76      0.76      0.76       761



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Random forest with 100 trees (seeded) trained on the embedding matrix;
#    fit() returns the estimator, so construction and training are chained.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# 2. Label the held-out test embeddings.
y_pred = rf_clf.predict(X_test)

# 3. Score the predictions against the true test labels.
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

Accuracy: 0.7240
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.63      0.68       361
           1       0.71      0.81      0.76       400

    accuracy                           0.72       761
   macro avg       0.73      0.72      0.72       761
weighted avg       0.73      0.72      0.72       761



In [13]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Initialize and train the XGBoost model on the embedding matrix.
#    `use_label_encoder` is no longer passed: the installed XGBoost reports it
#    as unused ('Parameters: { "use_label_encoder" } are not used.' in this
#    cell's recorded output), so dropping it silences the warning without
#    changing the model. The labels are loaded as int64 values.
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train, y_train)

# 2. Make predictions on the held-out test embeddings.
y_pred = xgb_clf.predict(X_test)

# 3. Evaluate the model against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.7530
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.68      0.72       361
           1       0.74      0.82      0.78       400

    accuracy                           0.75       761
   macro avg       0.76      0.75      0.75       761
weighted avg       0.75      0.75      0.75       761



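# Conclusion: RoBERTa embeddings outperformed TF-IDF features for every corresponding classifier (test accuracy: Logistic Regression 0.7451 vs. 0.7148, SVM 0.7569 vs. 0.7057, Random Forest 0.7240 vs. 0.6833, XGBoost 0.7530 vs. 0.6991), so RoBERTa is the better representation on this test split.

Caveat: the TF-IDF features were capped at 100 terms on stemmed text, while the RoBERTa embeddings have 768 dimensions on lemmatized text, so the comparison is not fully like-for-like.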