In [None]:
import zipfile

# Unpack the raw dataset archive into a local working directory.
archive_path = "../data/archive.zip"
extract_dir = "unzipped_data"

with zipfile.ZipFile(archive_path, mode="r") as archive:
    archive.extractall(extract_dir)

print("Files extracted successfully!")

In [None]:
import pandas as pd

# Read the two extracted CSV files into separate frames.
fake_csv_path = "unzipped_data/Fake.csv"
true_csv_path = "unzipped_data/True.csv"

fake_df = pd.read_csv(fake_csv_path)
true_df = pd.read_csv(true_csv_path)

# Quick size check of each source before they are combined.
print("Fake News Dataset:", fake_df.shape)
print("True News Dataset:", true_df.shape)

# Preview the first rows of the fake-news frame.
fake_df.head()

In [None]:
# Merge and label

# Tag every row with the name of the source file it came from.
fake_df["label"], true_df["label"] = "FAKE", "TRUE"

# Stack both sources, then shuffle with a fixed seed so FAKE and TRUE rows
# are interleaved reproducibly; the index is rebuilt after each step.
data = (
    pd.concat([fake_df, true_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Check the structure: overall size, class counts, and a preview.
print(data.shape)
print(data["label"].value_counts())
data.head()

# Modelling
## Logistic Regression - Baseline Model

Logistic Regression (TF-IDF) on merged dataset

We train a baseline Logistic Regression model using TF-IDF features. Input: the merged `data` frame with columns such as `title`, `text`, and `label`, where `label` is either "FAKE" or "TRUE".


In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    roc_auc_score, roc_curve, ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt

### step 1:
We'll encode the labels and pick the features.
* Encode FAKE = 1, TRUE = 0.

In [None]:
# Encode labels: FAKE = 1, TRUE = 0
label_codes = {"FAKE": 1, "TRUE": 0}

# Only map while the column still holds the string labels. Series.map turns
# every value that is missing from the dict into NaN, so re-running this cell
# on the already-encoded 0/1 column would otherwise wipe out the target.
if not data["label"].isin(list(label_codes.values())).all():
    data["label"] = data["label"].map(label_codes)

# Fail fast if anything other than a clean 0/1 target came out
# (for example an unexpected raw label value that mapped to NaN).
assert data["label"].isin([0, 1]).all(), "label column must contain only 0/1 after encoding"


### Step 2:  Feature/Target Split

In [None]:
# Features are the raw article text; the target is the numeric label column.
X, y = data["text"], data["label"]


### Step 3: Train/Test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


### Step 4: TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF with English stop words removed and the vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)

# Learn the vocabulary and IDF weights from the training text only, then
# reuse that fitted vectorizer unchanged on the test text.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


### Step 5: Train Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: liblinear solver with an iteration cap of 1000.
log_reg = LogisticRegression(solver="liblinear", max_iter=1000)

# Fit on the TF-IDF training matrix (the fitted estimator is the cell output).
log_reg.fit(X_train_tfidf, y_train)


### Step 6: Evaluate
* accuracy, report, confusion matrix, ROC-AUC

In [None]:
# Hard class predictions for the held-out set.
y_pred = log_reg.predict(X_test_tfidf)

# Per-class probabilities; column 1 is the probability of FAKE = 1.
test_class_proba = log_reg.predict_proba(X_test_tfidf)
y_proba = test_class_proba[:, 1]


In [None]:
# Display names in encoded-label order: 0 -> TRUE, 1 -> FAKE.
class_names = ["TRUE", "FAKE"]

# Headline accuracy plus the per-class precision/recall/F1 table.
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
baseline_report = classification_report(y_test, y_pred, target_names=class_names)
print("\nClassification Report:\n", baseline_report)

# Confusion matrix rendered with integer cell counts.
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm, display_labels=class_names).plot(values_format="d")
plt.title("Logistic Regression — Confusion Matrix")
plt.show()

# ROC curve from the FAKE-class probabilities, with the chance diagonal for reference.
auc = roc_auc_score(y_test, y_proba)
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label=f"ROC-AUC = {auc:.3f}")
plt.plot([0, 1], [0, 1], "--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Logistic Regression — ROC Curve")
plt.legend()
plt.show()


### Model Performance Summary

Accuracy: 0.987 (~99%)

Precision: TRUE = 0.98, FAKE = 0.99

Recall: TRUE = 0.99, FAKE = 0.98

F1-score: Both classes ~0.99

ROC-AUC: 0.999 (excellent separation between classes)


### Confusion Matrix Insights

TRUE articles: 4241 correctly predicted, 43 misclassified as FAKE.

FAKE articles: 4619 correctly predicted, 77 misclassified as TRUE.

The model makes very few mistakes compared to the large sample size.

**Interpretation:**

Errors are roughly balanced between both classes (43 vs. 77), so the model is not strongly biased towards TRUE or FAKE.

Misclassifications (only ~120 out of ~9000) are acceptable in text classification at this stage.

### ROC Curve

ROC-AUC = 0.999 - that means the model can almost perfectly distinguish between fake and true news.

The curve hugs the top-left corner which is an indication of very high sensitivity and specificity.


## Key Takeaways

Strong baseline: Even without tuning, TF-IDF + Logistic Regression performs extremely well.

Low error rate: Only ~1% of articles are misclassified.

Next steps: Hyperparameter tuning (GridSearchCV) to confirm robustness and maybe squeeze out minor improvements.

### Save artifacts (model + vectorizer) for reuse/deployment

In [None]:
import os, joblib
# Make sure the output directory exists; an existing one is not an error.
artifact_dir = "models"
os.makedirs(artifact_dir, exist_ok=True)

# Persist the fitted classifier and its fitted vectorizer as separate files.
baseline_artifacts = (
    (log_reg, "models/logreg_model.joblib"),
    (tfidf, "models/tfidf_vectorizer.joblib"),
)
for fitted_object, target_path in baseline_artifacts:
    joblib.dump(fitted_object, target_path)

print("Saved: models/logreg_model.joblib, models/tfidf_vectorizer.joblib")


## Next Steps - (Hyperparameter Tuning with GridSearchCV)

Now that we have a strong baseline, we perform hyperparameter tuning to confirm robustness and test whether performance can be further optimized.

We tune Logistic Regression parameters using GridSearchCV:

- `tfidf__ngram_range`: unigrams vs bigrams  
- `tfidf__max_df` and `tfidf__min_df`: filter overly common/rare words  
- `clf__C`: regularization strength  
- `clf__solver` and `clf__penalty`: logistic regression optimization  

Evaluation metric: **F1-macro** (balances FAKE and TRUE equally).


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# TF-IDF and the classifier are chained in one Pipeline so the vectorizer
# settings can be searched together with the regularisation strength.
pipe = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(stop_words="english")),
        ("clf", LogisticRegression(max_iter=2000, random_state=42)),
    ]
)

# Search space: 3 x 2 x 2 x 4 x 1 x 1 = 48 candidate settings.
param_grid = {
    "tfidf__max_df": [0.5, 0.7, 0.9],
    "tfidf__min_df": [2, 5],
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "clf__C": [0.1, 1, 3, 10],
    "clf__solver": ["liblinear"],
    "clf__penalty": ["l2"],
}

# Stratified, shuffled 5-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by macro-averaged F1, using all available cores.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

print("Best CV f1_macro:", round(gs.best_score_, 4))
print("Best params:", gs.best_params_)

# Score the best pipeline on the held-out test set. It takes raw text,
# because the vectorizer is part of the pipeline.
best = gs.best_estimator_
pred = best.predict(X_test)
proba = best.predict_proba(X_test)[:, 1]

tuned_report = classification_report(y_test, pred, target_names=["TRUE", "FAKE"])
print("\nTest report (best):\n", tuned_report)
print("Test ROC-AUC:", round(roc_auc_score(y_test, proba), 4))




#### Hyperparameter Tuning Results

GridSearchCV tested **48 candidate parameter combinations**, each evaluated with **5-fold cross-validation**, totalling **240 fits**.  
This process helps ensure the best model is not chosen by chance and generalizes well.

#### Key Outputs:
- **Best Parameters (`gs.best_params_`)**  
  Shows the optimal settings for TF-IDF and Logistic Regression (e.g., n-grams, min/max document frequency, and regularization strength).  
  We found:
  * `clf__C` = 10
  * `clf__penalty` = `l2`
  * `clf__solver` = `liblinear`
  * `tfidf__max_df` = 0.5
  * `tfidf__min_df` = 5
  * `tfidf__ngram_range` = (1, 2)

- **Best Cross-Validated Score (CV F1-macro):** 0.99
  - Confirms that the chosen parameters perform consistently across folds.

- **Test Set Evaluation (Best Model):**    
  - **Classification Report** (precision, recall, F1 for both TRUE/FAKE)  
  - **ROC-AUC Score** 0.99 (excellent probability-based performance)

#### Interpretation:
- The tuned model gave almost the same results as the baseline (~0.99).
This shows that the baseline Logistic Regression was already close to optimal.
- Running GridSearchCV was still valuable because it confirmed the model's stability and robustness.

### Save Tuned Model Artifacts - For Deployment 

After hyperparameter tuning, we save the best model and vectorizer.  
These artifacts will be reused in deployment (Streamlit app) and reporting (Tableau dashboard).


In [None]:
import os, joblib

# Make sure the output directory exists; an existing one is not an error.
os.makedirs("models", exist_ok=True)

# Save tuned model and vectorizer.
# `best` is the whole fitted Pipeline (TF-IDF + classifier), so the "model"
# file already contains its vectorizer and predicts directly from raw text.
# NOTE(review): the baseline "model" artifact is the bare classifier instead;
# confirm the consumer does not apply the separately saved vectorizer before
# calling this pipeline.
tuned_pipeline = best
tuned_vectorizer = best.named_steps["tfidf"]
joblib.dump(tuned_pipeline, "models/logreg_tuned_model.joblib")
joblib.dump(tuned_vectorizer, "models/tfidf_tuned_vectorizer.joblib")

print("Saved: models/logreg_tuned_model.joblib, models/tfidf_tuned_vectorizer.joblib")


## Baseline vs Tuned Model Comparison

We compare baseline Logistic Regression with the tuned GridSearchCV model to highlight improvements.

| Model                      | Accuracy | F1 (Macro) | ROC-AUC |
|-----------------------------|----------|------------|---------|
| Logistic Regression (Base)  | 0.9866   | ~0.9866    | ~0.999  |
| Logistic Regression (Tuned) | 0.9899   | ~0.9899   | ~0.9997 |

**Interpretation:**
- The baseline model already performs very well.
- Hyperparameter tuning slightly improves F1/ROC-AUC, showing robustness.
- Either model could be deployed, but the tuned model is safer for production since parameters were validated across multiple folds.
