#### Read TSV and label columns

In [336]:
import pandas as pd
from pandas import DataFrame
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
import pickle

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

pd.options.display.max_seq_items = None



import numpy as np
np.set_printoptions(threshold=np.inf)


In [362]:
party_ohe = None
title_ohe = None
mlb_subject = None
tfidf = None

In [338]:

def load_data(path):
    cols = ["id", 
            "label", 
            "statement", 
            "subject", 
            "speaker", 
            "speaker_title", 
            "state", 
            "party", 
            "true_count",
            "false_count",
            "half_true_count",
            "mostly_true_count",
            "pof_count",
            "context"
    ]
    cols_map = {}
    for i in range(len(cols)):
        cols_map[i] = cols[i]

    df = pd.read_table(path, header=None).rename(columns=cols_map)
    # df.drop(["id", "context"], axis=1, inplace=True)
    return df

In [339]:
def split_XY(df):
    return df.drop(["binary_label"], axis=1, inplace=False), df["binary_label"]

In [340]:
def impute_zeros(df: DataFrame):
    cols_to_impute = ["true_count", "false_count", "half_true_count", "mostly_true_count", "pof_count"]
    df[cols_to_impute] = \
        df[cols_to_impute].fillna(0, inplace=False)
    return df

#### One-hot Encode for *party*

In [341]:
def one_hot_party(df):
    # Fill missing
    df["party"] = df["party"].fillna("unknown")

    # One-hot encode
    global party_ohe
    if party_ohe is None:
        party_ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
        party_encoded = party_ohe.fit_transform(df[["party"]])
    else:
        party_encoded = party_ohe.transform(df[["party"]])

    # Convert to DataFrame
    party_df = pd.DataFrame(party_encoded, columns=party_ohe.get_feature_names_out(["party"]))

    # Concatenate with original dataframe
    df = pd.concat([df.reset_index(drop=True), party_df.reset_index(drop=True)], axis=1)
    df.drop(["party"], axis=1, inplace=True)
    return df

#### Multi-hot encoding for *subject* column


In [342]:
def multi_hot_subject(df):
    df["subject"] = df["subject"].fillna("")
    df["subject_split"] = df["subject"].str.split(",")

    global mlb_subject
    if mlb_subject is None:
        mlb_subject = MultiLabelBinarizer()
        subject_encoded = mlb_subject.fit_transform(df["subject_split"])
    else:
        subject_encoded = mlb_subject.transform(df["subject_split"])
    subject_df = pd.DataFrame(subject_encoded, columns=mlb_subject.classes_)

    subject_df = subject_df.reindex(columns=mlb_subject.classes_, fill_value=0)

    df.drop(["subject", "subject_split"], axis=1, inplace=True)
    df = pd.concat([df, subject_df], axis=1)
    return df



#### One hot encoding for Speaker Title (only top 10)

In [343]:
def one_hot_speaker_title(df):
    # Fill missing first
    df["speaker_title"] = df["speaker_title"].fillna("unknown")

    # Get top 10 most frequent titles
    top_titles = df["speaker_title"].value_counts().nlargest(10).index

    # Replace others with "other"
    df["title"] = df["speaker_title"].where(df["speaker_title"].isin(top_titles), "other")

    # One-hot encode
    global title_ohe
    if title_ohe is None:
        title_ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
        title_encoded = title_ohe.fit_transform(df[["title"]])
    else:
        title_encoded = title_ohe.transform(df[["title"]])

    # Use consistent input column name!
    title_df = pd.DataFrame(title_encoded, columns=title_ohe.get_feature_names_out(["title"]))

    # Concatenate and clean up
    df = pd.concat([df.reset_index(drop=True), title_df.reset_index(drop=True)], axis=1)
    df.drop(["title", "speaker_title"], axis=1, inplace=True)
    return df

#### Map labels to binary

In [344]:
def binarize_labels(df):
    def map_label(label):
        if label in ['pants-fire', 'false', 'barely-true']:
            return 0
        elif label in ['half-true', 'mostly-true', 'true']:
            return 1

    df['binary_label'] = df['label'].apply(map_label)
    df.drop(["label"], axis=1, inplace=True)
    return df

#### Convert *Statement* to TF-IDF

In [366]:
def generate_tfidf(statement: pd.DataFrame) -> pd.DataFrame:
    global tfidf
    statement_series = statement["statement"]
    if tfidf is None:
        tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
        tfidf_matrix = tfidf.fit_transform(statement_series)
    else:
        tfidf_matrix = tfidf.transform(statement_series)

    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out()).reset_index(drop=True)
    return tfidf_df

#### Pipeline

In [381]:
def base_pipeline(path):
    df = load_data(path)
    df = one_hot_party(df)
    df = one_hot_speaker_title(df)
    df = multi_hot_subject(df)
    df.drop(["id", "speaker", "state", "context"], axis=1, inplace=True)
    vectors = generate_tfidf(df[["statement"]].copy())
    df.drop(["statement"], axis=1, inplace=True)
    df = df.reset_index(drop=True)
    df = pd.concat([df, vectors], axis=1)
    df = binarize_labels(df)
    # print("Nulls before imputation" + df.isnull().sum(), sep="\n")
    df = impute_zeros(df)
    X_train, y_train = split_XY(df)
    return X_train, y_train

def statements_only(path):
    df = load_data(path)
    df = binarize_labels(df)
    
    # Split X and y
    y = df["binary_label"]
    # Vectorize statements (X only)
    X = generate_tfidf(df[["statement"]])
    
    return X, y

def history_only(path):
    df = load_data(path)
    df = binarize_labels(df)
    df = impute_zeros(df)
    y = df["binary_label"]
    X = df[["true_count",
            "false_count",
            "half_true_count",
            "mostly_true_count",
            "pof_count"]]
    
    return X, y

#### Evaluation

In [None]:
def train_and_evaluate(X_train, y_train, X_valid, y_valid):
    # model = LogisticRegression(max_iter=10000)
    model = LogisticRegression(
        penalty="l1",          # or "elasticnet"
        solver="saga",         # required for l1 or elasticnet
        # l1_ratio=0.5,          # only for elasticnet
        # C=0.5,                 # play around with this
        max_iter=10000,
        class_weight="balanced",
        n_jobs=-1              # if using CV later
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_valid)

    print("Train Set Class Dist: ", print(y_train.value_counts(normalize=True)))
    # Accuracy
    print("Accuracy:", accuracy_score(y_valid, y_pred))

    # Detailed metrics like precision, recall, f1-score
    print(classification_report(y_valid, y_pred))

    # Confusion matrix
    print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred))
    return model

In [389]:
# X_train, y_train = statements_only("train.tsv")
# X_valid,y_valid = statements_only("valid.tsv")

X_train, y_train = base_pipeline("train.tsv")
X_valid,y_valid = base_pipeline("valid.tsv")

# X_train, y_train = history_only("train.tsv")
# X_valid,y_valid = history_only("valid.tsv")

model = train_and_evaluate(X_train, y_train, X_valid, y_valid)

binary_label
1    0.561719
0    0.438281
Name: proportion, dtype: float64
Train Set Class Dist:  None
Accuracy: 0.6440809968847352
              precision    recall  f1-score   support

           0       0.67      0.52      0.58       616
           1       0.63      0.76      0.69       668

    accuracy                           0.64      1284
   macro avg       0.65      0.64      0.64      1284
weighted avg       0.65      0.64      0.64      1284

Confusion Matrix:
 [[319 297]
 [160 508]]


**Base Pipeline**
```
Accuracy: 0.6425233644859814
              precision    recall  f1-score   support

           0       0.66      0.51      0.58       616
           1       0.63      0.76      0.69       668

    accuracy                           0.64      1284
   macro avg       0.65      0.64      0.63      1284
weighted avg       0.65      0.64      0.64      1284

Confusion Matrix:
 [[317 299]
 [160 508]]
```
---

**Statement only**
```
Accuracy: 0.5950155763239875
              precision    recall  f1-score   support

           0       0.60      0.46      0.52       616
           1       0.59      0.72      0.65       668

    accuracy                           0.60      1284
   macro avg       0.60      0.59      0.59      1284
weighted avg       0.60      0.60      0.59      1284

Confusion Matrix:
 [[285 331]
 [189 479]]
```
---
**History only**
```
Accuracy: 0.5669781931464174
              precision    recall  f1-score   support

           0       0.73      0.15      0.25       616
           1       0.55      0.95      0.70       668

    accuracy                           0.57      1284
   macro avg       0.64      0.55      0.47      1284
weighted avg       0.64      0.57      0.48      1284

Confusion Matrix:
 [[ 94 522]
 [ 34 634]]

In [None]:
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

result = permutation_importance(model, X_valid, y_valid, n_repeats=10, random_state=42)

sorted_idx = result.importances_mean.argsort()[-20:]

plt.barh(X_valid.columns[sorted_idx], result.importances_mean[sorted_idx])
plt.title("Permutation Feature Importance")
plt.show()