## Imports & configs


In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

import re

import nltk
import scipy.stats as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK packages one after another, in a fixed order
# (NLTK reports packages that are already present as up to date).
for _resource in ("punkt", "wordnet", "omw-1.4", "stopwords"):
    nltk.download(_resource)

# Single seed shared by the data split, the classifiers and the CV search.
RANDOM_STATE = 42


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diogo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\diogo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\diogo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\diogo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load dataset

In [2]:
# Folder holding the lab data. It defaults to the original local folder, but
# can be redirected without editing the notebook by setting the LAB_DATA_DIR
# environment variable before starting the kernel.
DATA_DIR = Path(os.environ.get(
    "LAB_DATA_DIR",
    r"C:\Users\diogo\IRONHACK\Bootcamp\WEEK 7\DAY 2\LAB\data",
))
TRAIN_PATH = DATA_DIR / "training_data.csv"
TEST_PATH = DATA_DIR / "testing_data.csv"


def load_headlines(path):
    """Read one headline file into a two-column DataFrame.

    The file is parsed as tab-separated text without a header row; the two
    columns are named ``label`` and ``headline``. ``utf-8-sig`` decoding is
    used so that a leading byte-order mark, if present, is dropped.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``["label", "headline"]``.
    """
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["label", "headline"],
        encoding="utf-8-sig",
    )


# training_data.csv / testing_data.csv: tab-separated, no header -> [label, headline]
train_df = load_headlines(TRAIN_PATH)
test_raw_df = load_headlines(TEST_PATH)

print("Train shape:", train_df.shape)
print("Test shape: ", test_raw_df.shape)
print(train_df.head())


Train shape: (34152, 2)
Test shape:  (9984, 2)
   label                                           headline
0      0  donald trump sends out embarrassing new year‚s...
1      0  drunk bragging trump staffer started russian c...
2      0  sheriff david clarke becomes an internet joke ...
3      0  trump is so obsessed he even has obama‚s name ...
4      0  pope francis just called out donald trump duri...


## Split train into train/validation

In [12]:
import torch

# Query CUDA availability once and reuse the answer for both printed lines.
_cuda_ok = torch.cuda.is_available()
print(_cuda_ok)
if _cuda_ok:
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU")


False
No GPU


In [3]:
# Features are the raw headline strings; the target is the label column.
X, y = train_df["headline"], train_df["label"]

# Hold out 20% of the rows for validation, stratified on the label so both
# splits keep the same class proportions; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}")


Train size: 27321, Validation size: 6831


## Baseline preprocessing & feature extraction

In [4]:
# Settings common to both vectorizers: lower-casing, scikit-learn's built-in
# English stop-word list, a word-boundary token pattern and the
# document-frequency cut-offs.
_shared_vect_kwargs = dict(
    lowercase=True,
    stop_words="english",
    token_pattern=r"\b\w+\b",
    max_df=0.9,
    min_df=5,
)

# Bag-of-Words (unigrams)
bow_vectorizer = CountVectorizer(**_shared_vect_kwargs)

# TF-IDF (1–2 grams), limited to 20000 features
tfidf_vectorizer = TfidfVectorizer(
    **_shared_vect_kwargs,
    ngram_range=(1, 2),
    max_features=20000,
)

# NLTK English stop-word set and WordNet lemmatizer, both read by custom_tokenizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    """Turn a headline into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is replaced by a space, and the result is split on whitespace.
    Tokens found in the module-level ``stop_words`` set are dropped; the rest
    are passed through the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw headline.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    normalized = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    return [
        lemmatizer.lemmatize(word)
        for word in normalized.split()
        if word not in stop_words
    ]


## Define several baseline models

In [5]:
# Every pipeline gets its own unfitted copy of the vectorizer (same
# parameters). Putting one shared vectorizer instance into several pipelines
# would make them all point at the same fitted object, so fitting any one
# pipeline would silently refit the feature extractor inside the others.
baseline_models = {
    # Logistic regression on TF-IDF features
    "LogReg_TFIDF": Pipeline([
        ("vect", clone(tfidf_vectorizer)),
        ("clf", LogisticRegression(
            max_iter=1000,
            random_state=RANDOM_STATE
        ))
    ]),

    # Linear support-vector classifier on TF-IDF features
    "LinearSVC_TFIDF": Pipeline([
        ("vect", clone(tfidf_vectorizer)),
        ("clf", LinearSVC(
            random_state=RANDOM_STATE
        ))
    ]),

    # Multinomial naive Bayes on raw term counts
    "MultinomialNB_BOW": Pipeline([
        ("vect", clone(bow_vectorizer)),
        ("clf", MultinomialNB())
    ]),

    # Random forest (300 trees, all CPU cores) on TF-IDF features
    "RandomForest_TFIDF": Pipeline([
        ("vect", clone(tfidf_vectorizer)),
        ("clf", RandomForestClassifier(
            n_estimators=300,
            random_state=RANDOM_STATE,
            n_jobs=-1
        ))
    ])
}




## Evaluate all baseline models

In [6]:
# One {"model", "accuracy"} record per baseline, collected for the summary table.
results = []

for name, model in baseline_models.items():
    print(f"\n===== Training {name} =====")

    # Pipeline.fit returns the pipeline itself, so fitting on the training
    # split and predicting the validation split can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_val)
    acc = accuracy_score(y_val, y_pred)

    print(f"{name} accuracy: {acc:.4f}")
    print(classification_report(y_val, y_pred, digits=4))

    results.append(dict(model=name, accuracy=acc))

# Rank the baselines from most to least accurate on the validation split.
results_df = pd.DataFrame(results).sort_values(by="accuracy", ascending=False)
print("\nBaseline model comparison:")
print(results_df)



===== Training LogReg_TFIDF =====
LogReg_TFIDF accuracy: 0.9417
              precision    recall  f1-score   support

           0     0.9482    0.9380    0.9431      3515
           1     0.9350    0.9457    0.9403      3316

    accuracy                         0.9417      6831
   macro avg     0.9416    0.9418    0.9417      6831
weighted avg     0.9418    0.9417    0.9417      6831


===== Training LinearSVC_TFIDF =====
LinearSVC_TFIDF accuracy: 0.9469
              precision    recall  f1-score   support

           0     0.9521    0.9442    0.9482      3515
           1     0.9414    0.9496    0.9455      3316

    accuracy                         0.9469      6831
   macro avg     0.9467    0.9469    0.9468      6831
weighted avg     0.9469    0.9469    0.9469      6831


===== Training MultinomialNB_BOW =====
MultinomialNB_BOW accuracy: 0.9340
              precision    recall  f1-score   support

           0     0.9313    0.9411    0.9362      3515
           1     0.9369   

## Hyperparameter tuning

In [7]:
from sklearn.model_selection import StratifiedKFold

# TF-IDF + logistic regression. All text normalisation is delegated to
# custom_tokenizer (which already lower-cases and removes stop words), so the
# vectorizer's own lower-casing, stop-word list and token pattern are disabled.
logreg_pipeline = Pipeline(steps=[
    (
        "vect",
        TfidfVectorizer(
            tokenizer=custom_tokenizer,
            token_pattern=None,
            stop_words=None,
            lowercase=False,
        ),
    ),
    ("clf", LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
])

# Search space; keys follow the "<step>__<parameter>" pipeline convention.
param_distributions = {
    # Vectorizer: n-gram span, document-frequency cut-offs, vocabulary cap
    "vect__ngram_range": [(1, 1), (1, 2)],
    "vect__max_df": [0.7, 0.85, 0.95],
    "vect__min_df": [2, 5, 10],
    "vect__max_features": [10000, 20000, None],

    # Classifier: C drawn log-uniformly from [0.01, 100], L2 penalty only,
    # with and without class re-weighting
    "clf__C": st.loguniform(1e-2, 1e2),
    "clf__penalty": ["l2"],
    "clf__class_weight": [None, "balanced"],
}

# Shuffled, seeded 3-fold stratified cross-validation.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# 20 sampled candidates x 3 folds, scored by accuracy, run on all cores.
logreg_search = RandomizedSearchCV(
    logreg_pipeline,
    param_distributions,
    n_iter=20,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=RANDOM_STATE,
)
logreg_search.fit(X_train, y_train)

print("\nBest params:", logreg_search.best_params_)
print("Best CV accuracy:", logreg_search.best_score_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best params: {'clf__C': np.float64(58.72283616443724), 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'vect__max_df': 0.85, 'vect__max_features': 20000, 'vect__min_df': 2, 'vect__ngram_range': (1, 2)}
Best CV accuracy: 0.9359833095421104


## Evaluate tuned model on validation set

In [8]:
# Take the best pipeline found by the search and score it on the validation
# split, which the search itself never saw (it was fitted on X_train only).
best_logreg = logreg_search.best_estimator_
y_val_pred_tuned = best_logreg.predict(X_val)
tuned_acc = accuracy_score(y_val, y_val_pred_tuned)

print("\nTUNED Logistic Regression accuracy on validation set: {:.4f}".format(tuned_acc))

# Heading and payload are emitted by one print call each, separated by a newline.
print(
    "\nClassification report (tuned):",
    classification_report(y_val, y_val_pred_tuned, digits=4),
    sep="\n",
)
print(
    "\nConfusion matrix (tuned):",
    confusion_matrix(y_val, y_val_pred_tuned),
    sep="\n",
)



TUNED Logistic Regression accuracy on validation set: 0.9400

Classification report (tuned):
              precision    recall  f1-score   support

           0     0.9452    0.9377    0.9414      3515
           1     0.9345    0.9424    0.9384      3316

    accuracy                         0.9400      6831
   macro avg     0.9399    0.9400    0.9399      6831
weighted avg     0.9400    0.9400    0.9400      6831


Confusion matrix (tuned):
[[3296  219]
 [ 191 3125]]


## Retrain best model on full training data

In [9]:
# The tuned logistic-regression pipeline is taken as the chosen model and is
# refitted on ALL labeled data (train + validation) for maximum performance.
#
# The refit is done on a clone (an unfitted copy with the same parameters)
# rather than on logreg_search.best_estimator_ itself: that object is also what
# best_logreg refers to, so fitting it in place would overwrite the model that
# was evaluated on the validation split and make that evaluation leak
# validation data if its cell were run again.
best_model = clone(logreg_search.best_estimator_)
best_model.fit(X, y)


0,1,2
,steps,"[('vect', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,<function cus...001DD47C4BCC0>
,analyzer,'word'
,stop_words,
,token_pattern,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(58.72283616443724)
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


## Predict on testing_data.csv and create the final predictions file

In [10]:
# 1. Predict on the test headlines using the tuned best_model
X_test_final = test_raw_df["headline"]
test_preds = best_model.predict(X_test_final)

print("Unique predictions:", np.unique(test_preds))

# 2. Build the output frame with the label column replaced by the predictions
#    (0 or 1). assign() returns a new frame, so test_raw_df keeps the values
#    that were loaded from disk and this cell can be re-run safely.
# 3. Keep only the two original columns in the correct order.
final_test_df = test_raw_df.assign(label=test_preds)[["label", "headline"]]

# 4. Save in the same layout as the input test file:
#    - tab-separated
#    - NO header, no index column
OUTPUT_PATH = "testing_predictions.csv"

final_test_df.to_csv(
    OUTPUT_PATH,
    sep="\t",
    index=False,
    header=False
)

print(f"Saved final predictions to: {OUTPUT_PATH}")


Unique predictions: [0 1]
Saved final predictions to: testing_predictions.csv


In [11]:
# Debugging aid: lists the names currently defined in the interactive namespace.
who


CountVectorizer	 LinearSVC	 LogisticRegression	 MultinomialNB	 OUTPUT_PATH	 Pipeline	 RANDOM_STATE	 RandomForestClassifier	 RandomizedSearchCV	 
StratifiedKFold	 TEST_PATH	 TRAIN_PATH	 TfidfVectorizer	 WordNetLemmatizer	 X	 X_test_final	 X_train	 X_val	 
acc	 accuracy_score	 baseline_models	 best_logreg	 best_model	 bow_vectorizer	 classification_report	 confusion_matrix	 custom_tokenizer	 
cv	 final_test_df	 lemmatizer	 logreg_pipeline	 logreg_search	 model	 name	 nltk	 np	 
param_distributions	 pd	 re	 results	 results_df	 st	 stop_words	 stopwords	 test_preds	 
test_raw_df	 tfidf_vectorizer	 train_df	 train_test_split	 tuned_acc	 y	 y_pred	 y_train	 y_val	 
y_val_pred_tuned	 
