## Imports

In [1]:
import pandas as pd
import re
from imblearn.over_sampling import SMOTE 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
nltk.download('stopwords')
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cvhuy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Dataset

In [2]:
bodies_path = r'\Users\cvhuy\Fake News Project\train_bodies.csv'
stances_path = r'\Users\cvhuy\Fake News Project\train_stances.csv'

bodies_df = pd.read_csv(bodies_path)
stances_df = pd.read_csv(stances_path)

# Merge datasets on Body ID
merged_df = stances_df.merge(bodies_df, on='Body ID', how='left')

In [3]:
merged_df.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's..."


## Creating Labels
### Map the stance labels to numerical values for machine learning purposes

In [130]:
# Define function to map stance to label
def map_stance_to_label(stance):
    if stance == 'agree':
        return 1  # Label for real
    elif stance == 'disagree':
        return 0  # Label for fake
    elif stance == 'discuss':
        return 2  # Label for neutral/discuss
    else:
        return None  # Exclude any unrelated categories

In [131]:
# Apply mapping and drop unrelated stances
merged_df['label'] = merged_df['Stance'].apply(map_stance_to_label)
merged_df = merged_df[merged_df['label'].notnull()]

## Text Preprocessing
### reprocesses the headline text by removing punctuation and stop words, ensuring the text is in a suitable format for vectorization

In [132]:
# Preprocess the Headline text
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text).lower()
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text
# Preprocess the Headline and Body text
merged_df['cleaned_headline'] = merged_df['Headline'].apply(preprocess_text)
merged_df['cleaned_body'] = merged_df['articleBody'].apply(preprocess_text)

## Vectorization
### convert the cleaned text into numerical features using the TF-IDF vectorization technique.

In [133]:
# Vectorization with n-grams
merged_df['combined_text'] = merged_df['cleaned_headline'] + ' ' + merged_df['cleaned_body']


vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Using unigrams and bigrams
X = vectorizer.fit_transform(merged_df['combined_text'])
y = merged_df['label']

## Train-Test Split
### divides the data into training (80%) and testing (20%) sets

In [134]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Feature Scaling
### scales the features to improve the model's convergence and performance
##### StandardScaler is used to standardize the features by removing the mean and scaling to unit variance, which helps the model converge faster


In [135]:
# Scale features
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Oversample the Minority Class to Address Class Imbalance

In [136]:
# Oversample the minority class using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)


### Model Training
#### The logistic regression model is trained with the scaled training data. The class_weight='balanced' parameter adjusts weights inversely proportional to class frequencies.

In [137]:
# Train a classification model
model = LogisticRegression(max_iter=5000, class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Model Evaluation
#### This section calculates the model's accuracy and generates a detailed classification report, providing metrics such as precision, recall, and F1-score for each class.

In [138]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9072970960536113
              precision    recall  f1-score   support

         0.0       0.56      0.60      0.58       187
         1.0       0.84      0.86      0.85       758
         2.0       0.98      0.96      0.97      1741

    accuracy                           0.91      2686
   macro avg       0.79      0.81      0.80      2686
weighted avg       0.91      0.91      0.91      2686



##### Highlights
##### High Overall Accuracy: At 90.7%, this accuracy is strong, especially given the multi-class nature of the problem.

#### Class-Specific Performance:

Disagree (0.0): Precision and recall are both around 0.6, which is a notable improvement from before, given that this class was initially challenging. Since this is a minority class, the metrics indicate that the model is now picking up on patterns within this label better than before.
Agree (1.0) and Discuss (2.0): These classes show very high precision and recall, especially “discuss,” which has an F1-score of 0.97. This balance indicates a strong ability to generalize.
Balanced Macro and Weighted Averages: The macro and weighted averages both perform well, reflecting that the model isn’t just good at predicting the majority classes but can generalize across all classes.

In [139]:
import joblib

# Save the model to a file
joblib.dump(model, '/Users/cvhuy/Fake News Project/fake_news_detection_model.pkl')


['/Users/cvhuy/Fake News Project/fake_news_detection_model.pkl']

In [140]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'vectorizer' is already defined in your training code
joblib.dump(vectorizer, '/Users/cvhuy/Fake News Project/tfidf_vectorizer.pkl')


['/Users/cvhuy/Fake News Project/tfidf_vectorizer.pkl']