# Baseline model for claim classification

In this notebook, we'll explore basic ML models to train and make inferences about the veracity of environment-related claims. These models, such as _logistic regression_ and _naive bayes_, are simpler, faster and more easily interpretable than more advanced deep learning approaches.

The results from this exercise will be used as a reference point for the claim classification project.

In [None]:
%load_ext autoreload
%autoreload 2

# import required libraries
import numpy as np 
import pandas as pd
import sys
import os

# Put the directory one level above the notebook's working directory on the
# import path (only once), so the `src.*` imports used further down resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

### Load data

Import data from previously produced csv files

In [None]:
# Read each split from its CSV and drop rows containing NaN straight away:
# missing values cannot be vectorized.
train_df = pd.read_csv("../data/train_data.csv").dropna()
test_df = pd.read_csv("../data/test_data.csv").dropna()
val_df = pd.read_csv("../data/val_data.csv").dropna()

In [None]:
# Quick look at the first rows of the validation split (after NaN removal).
val_df.head()

### Separate features and labels

We'll train on the _text_ column and predict _label_

In [None]:
# Columns used as model input and as prediction target.
x_list = ['text']
y_list = ['label']

# Features are kept as single-column DataFrames, one per split.
x_train, x_test, x_val = (
    frame[x_list] for frame in (train_df, test_df, val_df)
)

# Labels are flattened to 1D arrays, one per split.
y_train, y_test, y_val = (
    frame[y_list].values.ravel() for frame in (train_df, test_df, val_df)
)

# Sanity check: feature frame shape, then label vector shape.
print(x_train.shape, y_train.shape, sep="\n")

### Baseline pipeline

We'll use **TF-IDF** (Term Frequency - Inverse Document Frequency) with **Logistic Regression**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

#### Model parameter optimization
Through grid scan

In [None]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.svm import LinearSVC
import pandas as pd

# Hand-picked settings to compare, written as
# (ngram_range, max_features, class_weight).
param_grid = [
    {"ngram_range": ngrams, "max_features": vocab_size, "class_weight": weighting}
    for ngrams, vocab_size, weighting in (
        ((1,1), 5000, None),
        ((1,2), 5000, None),
        ((1,2), 10000, None),
        ((1,2), 20000, None),
        ((1,2), 5000, "balanced"),
        ((1,2), 10000, "balanced"),
    )
]

results = []

for params in param_grid:
    # TF-IDF features feeding a linear SVM, configured from this grid entry.
    clf_temp = Pipeline(steps=[
        (
            "tfidf",
            TfidfVectorizer(
                ngram_range=params["ngram_range"],
                max_features=params["max_features"],
                stop_words="english",
            ),
        ),
        (
            "svc",
            LinearSVC(max_iter=2000, class_weight=params["class_weight"]),
        ),
    ])

    # Fit on the training text and predict the validation text.
    y_pred_temp = clf_temp.fit(x_train["text"], y_train).predict(x_val["text"])

    # One result row: the settings followed by the validation metrics.
    results.append({
        **params,
        "accuracy": accuracy_score(y_val, y_pred_temp),
        "f1 macro": f1_score(y_val, y_pred_temp, average="macro"),
        "accuracy balanced": balanced_accuracy_score(y_val, y_pred_temp),
        "f1 weighted": f1_score(y_val, y_pred_temp, average="weighted"),
    })

# Collect all rows in a table and print it
results_df = pd.DataFrame(results)
print(results_df)

Choose parameters based on the trade-off between accuracy and F1.

In [None]:
# Settings carried forward to the model comparison: uni+bigram TF-IDF capped at
# 10000 features, with balanced class weights.
ngram_range, max_features, class_weight = (1,2), 10000, "balanced"

### Baseline performance

Choose a model among logistic regression, random forest, linear SVC, and XGBoost based on overall performance.

In [None]:
from sklearn.metrics import classification_report

In [None]:
from src.models_xgb import *
from sklearn.preprocessing import LabelEncoder

# XGBoost needs encoded labels: learn the label -> id mapping on the training
# labels, then apply that same mapping to the training and validation labels.
le = LabelEncoder()
le.fit(y_train)
y_train_enc = le.transform(y_train)
y_val_enc = le.transform(y_val)

# Show which id each original label was assigned.
print("Classes mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

In [None]:
from src.utils import compute_metrics
import joblib
import json

# Settings shared by every candidate model.
shared_kwargs = dict(
    ngram_range=ngram_range,
    max_features=max_features,
    class_weight=class_weight,
)

log_model = train_classic_model(x_train, y_train_enc, "logreg", **shared_kwargs)
rf_model = train_classic_model(x_train, y_train_enc, "rf", **shared_kwargs)
svc_model = train_classic_model(x_train, y_train_enc, "svc", **shared_kwargs)
xgb_model = train_classic_model(x_train, y_train_enc, "xgb", **shared_kwargs)

# Short name -> fitted model.
models = {
    "log": log_model,  # Logistic Regression
    "rf": rf_model,    # Random Forest
    "svc": svc_model,  # Linear SVC
    "xgb": xgb_model,  # XGBoost
}

results = []

for name, model in models.items():

    # Persist the fitted model under its own sub-directory.
    model_path = f"./results/baseline_model/{name}/"
    os.makedirs(model_path, exist_ok=True)
    joblib.dump(model, os.path.join(model_path, f"{name}.joblib"))

    preds = model.predict(x_val)

    # Weighted-average validation metrics for the summary table.
    acc = accuracy_score(y_val_enc, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_val_enc, preds, average="weighted", zero_division=0
    )

    # Project helper; it is handed a (predictions, labels) tuple with 1D preds.
    metrics_weighted = compute_metrics((preds, np.array(y_val_enc)))
    print(f"Metrics for {name} (weighted):")
    print(json.dumps(metrics_weighted, indent=2))

    # One summary row per model, every metric rounded to three decimals.
    results.append({
        "Model": name,
        **{
            label: round(value, 3)
            for label, value in (
                ("Accuracy", acc),
                ("Precision", prec),
                ("Recall", rec),
                ("F1", f1),
            )
        },
    })

pd.DataFrame(results)

Picking model to set baseline. Might re-optimize parameters later.

In [None]:
from src.viz import * 
from src.scoring import get_false_class_scores
from src.threshold_xgb import find_optimal_threshold_from_scores

import json  # hoisted out of the loop body; used to persist the tuned threshold

# For every trained model: tune a decision threshold on the validation scores,
# save it next to the model, then print a classification report and draw a
# confusion matrix for the validation split.
for name, model in models.items():

    # Per-sample scores for the possibly_false class, so the decision is not
    # tied to the 0.5 threshold hard-coded in scikit-learn's predict().
    scores_val = get_false_class_scores(model, x_val, name)

    result = find_optimal_threshold_from_scores(
        y_true=y_val_enc,
        scores=scores_val,
        false_label_id=0,  # assumes the LabelEncoder mapped the "false" class to id 0 -- TODO confirm
        return_curve=True
    )
    best_threshold = result["best_threshold"]
    print("best threshold ", name, " :", best_threshold)

    # NOTE(review): the report below still uses a fixed 0.5 cut-off; the tuned
    # threshold is only saved to disk. Compare against `best_threshold` here
    # instead of 0.5 to evaluate with the tuned value.
    y_pred = np.where(scores_val >= 0.5, 0, 1)

    # Fix: this used the undefined name `this_name` (NameError on a fresh
    # kernel); the loop variable is `name`.
    model_path = "./results/baseline_model/"+name+"/"
    # Create the directory here too, so this cell does not depend on an
    # earlier cell having created it.
    os.makedirs(model_path, exist_ok=True)

    # Save the tuned threshold alongside the model
    with open(os.path.join(model_path, "threshold.json"), "w") as f:
        json.dump({"best_threshold": best_threshold}, f)

    # Decode predictions and ground truth back into the original string labels
    # so the report and the plot use readable class names.
    y_pred_str = le.inverse_transform(y_pred)
    y_val_str = le.inverse_transform(y_val_enc)

    # Classification Report
    print(classification_report(y_val_str, y_pred_str, zero_division=0))

    labels_in_val = list(np.unique(y_val_str))

    # Overall confusion matrix
    plotly_confusion_matrix(y_val_str, y_pred_str, labels=labels_in_val, title="Overall Confusion Matrix")

#### Confusion matrix

#### Feature importance

Feature importances (or coefficients, for linear models) tell us which words are most indicative of _FAKE_ vs _REAL_ claims.

In [None]:
from src.utils import * 

# Pull feature importances from the random forest (top_n=20, presumably the 20
# highest-ranked terms -- confirm in src.utils), then pass them to the two
# project plotting helpers.
fw = get_feature_importance(rf_model, top_n=20)   # per the original note: works with RF, XGB or LogReg
plot_feature_importance(fw, "Random Forest Feature Importance")
plot_wordcloud(fw, model_type="rf")