# Answering Exploratory Question 1: Which normalization technique is best: stemming, lemmatization, or cleaned text?

In [None]:
import pandas as pd

In [None]:
# Load the pre-normalized headlines. The first CSV column has no header and
# mirrors the row index (it otherwise surfaces as a spurious 'Unnamed: 0'
# column), so read it back as the index instead of as a data column.
df = pd.read_csv('new_normalized_headlines.csv', index_col=0)

In [None]:
# Show the loaded frame as the cell output to eyeball the available text
# columns ('cleaned', 'lemmatized', 'stemmed') and the 'news' outlet column.
df

Unnamed: 0,url,news,headline,cleaned,lemmatized,stemmed,word_count
0,https://www.foxnews.com/lifestyle/jack-carrs-e...,Fox News,jack carr recalls eisenhower's d-day memo 'gre...,jack carr recalls eisenhowers dday memo great ...,jack carr recall eisenhower dday memo great no...,jack carr recal eisenhow dday memo great nobl ...,9
1,https://www.foxnews.com/entertainment/bruce-wi...,Fox News,"bruce willis, demi moore avoided one thing co-...",bruce willis demi moore avoided one thing copa...,bruce willis demi moore avoided one thing copa...,bruce willi demi moor avoid one thing copar da...,10
2,https://www.foxnews.com/politics/blinken-meets...,Fox News,"blinken meets qatar pm, says israeli actions '...",blinken meets qatar pm says israeli actions re...,blinken meet qatar pm say israeli action retal...,blinken meet qatar pm say isra action retali d...,11
3,https://www.foxnews.com/entertainment/emily-bl...,Fox News,emily blunt says â€˜toes curlâ€™ people tell kids ...,emily blunt says toes curl people tell kids wa...,emily blunt say toe curl people tell kid want ...,emili blunt say toe curl peopl tell kid want a...,15
4,https://www.foxnews.com/media/the-view-co-host...,Fox News,"'the view' co-host, cnn commentator ana navarr...",the view cohost cnn commentator ana navarro ho...,the view cohost cnn commentator ana navarro ho...,the view cohost cnn comment ana navarro host n...,12
...,...,...,...,...,...,...,...
3799,https://www.foxnews.com/food-drink/salad-alway...,Fox News,salad always better choice sandwich? think twice,salad always better choice sandwich think twice,salad always better choice sandwich think twice,salad alway better choic sandwich think twice,7
3800,https://www.foxnews.com/us/jocelyn-nungaray-fa...,Fox News,jocelyn nungaray sexually assaulted alleged mu...,jocelyn nungaray sexually assaulted alleged mu...,jocelyn nungaray sexually assaulted alleged mu...,jocelyn nungaray sexual assault alleg murder i...,10
3801,https://www.foxnews.com/politics/biden-gives-w...,Fox News,biden gives 3-word response asked debate trump,biden gives word response asked debate trump,biden give word response asked debate trump,biden give word respons ask debat trump,7
3802,https://www.foxnews.com/official-polls/fox-new...,Fox News,fox new poll: biden trump tie wisconsin head-t...,fox new poll biden trump tie wisconsin headtoh...,fox new poll biden trump tie wisconsin headtoh...,fox new poll biden trump tie wisconsin headtoh...,9


# TF-IDF

## Stemmed

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Features: the stemmed headline text. Target: the outlet name stored in 'news'.
X, y = df['stemmed'], df['news']

# Hold out 20% of the rows for testing; stratifying on the outlet keeps the
# class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Binarize the target after the split: 1 for 'Fox News', 0 for anything else.
y_train = (y_train == 'Fox News').astype('int64')
y_test = (y_test == 'Fox News').astype('int64')

# TF-IDF features capped at 100 terms with English stop words dropped. The
# vectorizer is fitted on the training split only and then reused, unchanged,
# to transform the test split.
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit the classifier on the training features.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Score the held-out split and print the summary metrics.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"âœ… Accuracy (Stemmed): {accuracy:.4f}")
print("ðŸ“‹ Classification Report (Stemmed):\n", report)

âœ… Accuracy (Stemmed): 0.7148
ðŸ“‹ Classification Report (Stemmed):
               precision    recall  f1-score   support

           0       0.75      0.59      0.66       361
           1       0.69      0.82      0.75       400

    accuracy                           0.71       761
   macro avg       0.72      0.71      0.71       761
weighted avg       0.72      0.71      0.71       761



## Lemmatized

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Features are the lemmatized headlines; the target is the outlet name in 'news'.
X = df['lemmatized']
y = df['news']

# 2. Split into train and test (80/20), stratified on the outlet so both
#    partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Convert labels to binary after the split: 1 = 'Fox News', 0 = any other outlet.
y_train = (y_train == 'Fox News').astype('int64')
y_test = (y_test == 'Fox News').astype('int64')

# 4. TF-IDF vectorization (at most 100 terms, English stop words removed).
#    Fit on the training split only, then apply the fitted vectorizer to the
#    test split.
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 5. Train logistic regression with the same settings as the stemmed TF-IDF
#    run (max_iter=1000, random_state=42) so the comparison against it is
#    like-for-like rather than confounded by different solver budgets.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

# 6. Evaluate on the held-out split.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

print(f"âœ… Accuracy (Lemmatized): {accuracy:.4f}")
print("ðŸ“‹ Classification Report (Lemmatized):\n", classification_report(y_test, y_pred))

âœ… Accuracy (Lemmatized): 0.7070
ðŸ“‹ Classification Report (Lemmatized):
               precision    recall  f1-score   support

           0       0.75      0.57      0.65       361
           1       0.68      0.83      0.75       400

    accuracy                           0.71       761
   macro avg       0.72      0.70      0.70       761
weighted avg       0.71      0.71      0.70       761



## Cleaned

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Features are the cleaned headlines (the 'cleaned' column, not the
#    lemmatized one); the target is the outlet name in 'news'.
X = df['cleaned']
y = df['news']

# 2. Split into train and test (80/20), stratified on the outlet so both
#    partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Convert labels to binary after the split: 1 = 'Fox News', 0 = any other outlet.
y_train = (y_train == 'Fox News').astype('int64')
y_test = (y_test == 'Fox News').astype('int64')

# 4. TF-IDF vectorization (at most 100 terms, English stop words removed).
#    Fit on the training split only, then apply the fitted vectorizer to the
#    test split.
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 5. Train logistic regression with the same settings as the stemmed TF-IDF
#    run (max_iter=1000, random_state=42) so the comparison against it is
#    like-for-like rather than confounded by different solver budgets.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

# 6. Evaluate on the held-out split.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

print(f"âœ… Accuracy: {accuracy:.4f}")
print("ðŸ“‹ Classification Report:\n", classification_report(y_test, y_pred))

âœ… Accuracy: 0.7004
ðŸ“‹ Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.57      0.64       361
           1       0.68      0.82      0.74       400

    accuracy                           0.70       761
   macro avg       0.71      0.69      0.69       761
weighted avg       0.71      0.70      0.69       761



# RoBERTa

In [None]:
import numpy as np

## Stemmed

In [None]:
# Read the train/test feature arrays from the 'stem' RoBERTa embedding files.
stem_train_embeddings_path = 'RoBERTa_train_stem_embeddings.npy'
stem_test_embeddings_path = 'RoBERTa_test_stem_embeddings.npy'
X_train = np.load(stem_train_embeddings_path)
X_test = np.load(stem_test_embeddings_path)

# Sanity check: report the shape and dtype of each array.
print(
    f"Train embeddings shape: {X_train.shape}, dtype: {X_train.dtype}",
    f"Test embeddings shape: {X_test.shape}, dtype: {X_test.dtype}",
    sep="\n",
)

Train embeddings shape: (3043, 768), dtype: float32
Test embeddings shape: (761, 768), dtype: float32


In [None]:
# Read the train/test target arrays that accompany the 'stem' embeddings.
stem_train_labels_path = 'y_train_roberta_stem.npy'
stem_test_labels_path = 'y_test_roberta_stem.npy'
y_train = np.load(stem_train_labels_path)
y_test = np.load(stem_test_labels_path)

# Sanity check: report the shape and dtype of each array.
print(
    f"Train output shape: {y_train.shape}, dtype: {y_train.dtype}",
    f"Test output shape: {y_test.shape}, dtype: {y_test.dtype}",
    sep="\n",
)

Train output shape: (3043,), dtype: int64
Test output shape: (761,), dtype: int64


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic-regression classifier on the currently loaded training arrays.
logreg = LogisticRegression(random_state=42, max_iter=1000)
logreg.fit(X_train, y_train)

# Predict labels for the held-out test array.
y_pred = logreg.predict(X_test)

# Compute the metrics first, then print accuracy, the per-class report, and
# the confusion matrix in that order.
stem_accuracy = accuracy_score(y_test, y_pred)
stem_report = classification_report(y_test, y_pred)
stem_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", stem_accuracy)
print("\nClassification Report:\n", stem_report)
print("\nConfusion Matrix:\n", stem_confusion)

Accuracy: 0.7109067017082786

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.63      0.67       361
           1       0.70      0.79      0.74       400

    accuracy                           0.71       761
   macro avg       0.71      0.71      0.71       761
weighted avg       0.71      0.71      0.71       761


Confusion Matrix:
 [[226 135]
 [ 85 315]]


## Lemmatized

In [None]:
# Read the train/test feature arrays from the 'lemma' RoBERTa embedding files.
lemma_train_embeddings_path = 'RoBERTa_train_lemma_embeddings.npy'
lemma_test_embeddings_path = 'RoBERTa_test_lemma_embeddings.npy'
X_train = np.load(lemma_train_embeddings_path)
X_test = np.load(lemma_test_embeddings_path)

# Sanity check: report the shape and dtype of each array.
print(
    f"Train embeddings shape: {X_train.shape}, dtype: {X_train.dtype}",
    f"Test embeddings shape: {X_test.shape}, dtype: {X_test.dtype}",
    sep="\n",
)

Train embeddings shape: (3043, 768), dtype: float32
Test embeddings shape: (761, 768), dtype: float32


In [None]:
# Read the train/test target arrays that accompany the 'lemma' embeddings.
lemma_train_labels_path = 'y_train_roberta_lemma.npy'
lemma_test_labels_path = 'y_test_roberta_lemma.npy'
y_train = np.load(lemma_train_labels_path)
y_test = np.load(lemma_test_labels_path)

# Sanity check: report the shape and dtype of each array.
print(
    f"Train output shape: {y_train.shape}, dtype: {y_train.dtype}",
    f"Test output shape: {y_test.shape}, dtype: {y_test.dtype}",
    sep="\n",
)

Train output shape: (3043,), dtype: int64
Test output shape: (761,), dtype: int64


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic-regression classifier on the currently loaded training arrays.
logreg = LogisticRegression(random_state=42, max_iter=1000)
logreg.fit(X_train, y_train)

# Predict labels for the held-out test array.
y_pred = logreg.predict(X_test)

# Compute the metrics first, then print accuracy, the per-class report, and
# the confusion matrix in that order.
lemma_accuracy = accuracy_score(y_test, y_pred)
lemma_report = classification_report(y_test, y_pred)
lemma_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", lemma_accuracy)
print("\nClassification Report:\n", lemma_report)
print("\nConfusion Matrix:\n", lemma_confusion)

Accuracy: 0.7450722733245729

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.68      0.72       361
           1       0.74      0.80      0.77       400

    accuracy                           0.75       761
   macro avg       0.75      0.74      0.74       761
weighted avg       0.75      0.75      0.74       761


Confusion Matrix:
 [[247 114]
 [ 80 320]]


## Cleaned

In [None]:
# Read the train/test feature arrays from the 'clean' RoBERTa embedding files.
clean_train_embeddings_path = 'RoBERTa_train_clean_embeddings.npy'
clean_test_embeddings_path = 'RoBERTa_test_clean_embeddings.npy'
X_train = np.load(clean_train_embeddings_path)
X_test = np.load(clean_test_embeddings_path)

# Sanity check: report the shape and dtype of each array.
print(
    f"Train embeddings shape: {X_train.shape}, dtype: {X_train.dtype}",
    f"Test embeddings shape: {X_test.shape}, dtype: {X_test.dtype}",
    sep="\n",
)

Train embeddings shape: (3043, 768), dtype: float32
Test embeddings shape: (761, 768), dtype: float32


In [None]:
# Read the train/test target arrays that accompany the 'clean' embeddings.
clean_train_labels_path = 'y_train_roberta_clean.npy'
clean_test_labels_path = 'y_test_roberta_clean.npy'
y_train = np.load(clean_train_labels_path)
y_test = np.load(clean_test_labels_path)

# Sanity check: report the shape and dtype of each array.
print(
    f"Train output shape: {y_train.shape}, dtype: {y_train.dtype}",
    f"Test output shape: {y_test.shape}, dtype: {y_test.dtype}",
    sep="\n",
)

Train output shape: (3043,), dtype: int64
Test output shape: (761,), dtype: int64


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic-regression classifier on the currently loaded training arrays.
logreg = LogisticRegression(random_state=42, max_iter=1000)
logreg.fit(X_train, y_train)

# Predict labels for the held-out test array.
y_pred = logreg.predict(X_test)

# Compute the metrics first, then print accuracy, the per-class report, and
# the confusion matrix in that order.
clean_accuracy = accuracy_score(y_test, y_pred)
clean_report = classification_report(y_test, y_pred)
clean_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", clean_accuracy)
print("\nClassification Report:\n", clean_report)
print("\nConfusion Matrix:\n", clean_confusion)

Accuracy: 0.7398160315374507

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.69      0.71       361
           1       0.74      0.79      0.76       400

    accuracy                           0.74       761
   macro avg       0.74      0.74      0.74       761
weighted avg       0.74      0.74      0.74       761


Confusion Matrix:
 [[248 113]
 [ 85 315]]


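# Conclusion: For TF-IDF, stemming is better. For RoBERTa, lemmatization is better.

With TF-IDF features, test accuracy was 0.7148 (stemmed), 0.7070 (lemmatized), and 0.7004 (cleaned). With RoBERTa embeddings, it was 0.7451 (lemmatized), 0.7398 (cleaned), and 0.7109 (stemmed). These gaps are small and come from a single train/test split with 761 test headlines, so they should be read as indicative rather than conclusive.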