#### Read TSV and label columns

In [3]:
import pandas as pd
from pandas import DataFrame
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
import pickle

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

pd.options.display.max_seq_items = None



import numpy as np
np.set_printoptions(threshold=np.inf)


In [4]:
party_ohe = None
title_ohe = None
mlb_subject = None
tfidf = None

In [5]:

def load_data(path):
    cols = ["id", 
            "label", 
            "statement", 
            "subject", 
            "speaker", 
            "speaker_title", 
            "state", 
            "party", 
            "true_count",
            "false_count",
            "half_true_count",
            "mostly_true_count",
            "pof_count",
            "context"
    ]
    cols_map = {}
    for i in range(len(cols)):
        cols_map[i] = cols[i]

    df = pd.read_table(path, header=None).rename(columns=cols_map)
    # df.drop(["id", "context"], axis=1, inplace=True)
    return df

In [6]:
def split_XY(df):
    return df.drop(["binary_label"], axis=1, inplace=False), df["binary_label"]

In [7]:
def impute_zeros(df: DataFrame):
    cols_to_impute = ["true_count", "false_count", "half_true_count", "mostly_true_count", "pof_count"]
    df[cols_to_impute] = \
        df[cols_to_impute].fillna(0, inplace=False)
    return df

#### One-hot Encode for *party*

In [None]:
# def one_hot_party(df):
#     # Fill missing
#     df["party"] = df["party"].fillna("unknown")

#     # One-hot encode
#     global party_ohe
#     if party_ohe is None:
#         party_ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
#         party_encoded = party_ohe.fit_transform(df[["party"]])
#     else:
#         party_encoded = party_ohe.transform(df[["party"]])

#     # Convert to DataFrame
#     party_df = pd.DataFrame(party_encoded, columns=party_ohe.get_feature_names_out(["party"]))

#     # Concatenate with original dataframe
#     df = pd.concat([df.reset_index(drop=True), party_df.reset_index(drop=True)], axis=1)
#     df.drop(["party"], axis=1, inplace=True)
#     return df

def one_hot_party(df):
    global party_ohe

    # Lowercase and fill NAs
    df["party"] = df["party"].fillna("unknown").str.lower()

    if party_ohe is None:
        party_ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
        party_encoded = party_ohe.fit_transform(df[["party"]])
    else:
        party_encoded = party_ohe.transform(df[["party"]])

    # Prefix already added by get_feature_names_out
    columns = party_ohe.get_feature_names_out(["party"])
    party_df = pd.DataFrame(party_encoded, columns=columns)

    return party_df

#### Multi-hot encoding for *subject* column


In [124]:
def multi_hot_subject(df: pd.DataFrame, top_n: int = 15) -> pd.DataFrame:
    global mlb_subject

    # Fill missing and split cleanly
    subject_series = df["subject"].fillna("").apply(
        lambda s: [x.strip().lower() for x in s.split(",") if x.strip()]
    )

    # If top_n specified, keep only top N subjects
    if top_n is not None:
        all_subjects = pd.Series([item for sublist in subject_series for item in sublist])
        top_subjects = set(all_subjects.value_counts().nlargest(top_n).index)
        subject_series = subject_series.apply(lambda sublist: [x if x in top_subjects else "other" for x in sublist])

    # Fit or transform
    if mlb_subject is None:
        mlb_subject = MultiLabelBinarizer()
        subject_encoded = mlb_subject.fit_transform(subject_series)
    else:
        subject_encoded = mlb_subject.transform(subject_series)

    subject_df = pd.DataFrame(subject_encoded, columns=mlb_subject.classes_)
    return subject_df.reset_index(drop=True)

#### One hot encoding for Speaker Title (only top 10)

In [98]:
# def one_hot_speaker_title(df):
#     # Fill missing first
#     df["speaker_title"] = df["speaker_title"].fillna("unknown")

#     # Get top 10 most frequent titles
#     top_titles = df["speaker_title"].value_counts().nlargest(10).index

#     # Replace others with "other"
#     df["title"] = df["speaker_title"].where(df["speaker_title"].isin(top_titles), "other")

#     # One-hot encode
#     global title_ohe
#     if title_ohe is None:
#         title_ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
#         title_encoded = title_ohe.fit_transform(df[["title"]])
#     else:
#         title_encoded = title_ohe.transform(df[["title"]])

#     # Use consistent input column name!
#     title_df = pd.DataFrame(title_encoded, columns=title_ohe.get_feature_names_out(["title"]))

#     # Concatenate and clean up
#     df = pd.concat([df.reset_index(drop=True), title_df.reset_index(drop=True)], axis=1)
#     df.drop(["title", "speaker_title"], axis=1, inplace=True)
#     return df

from sklearn.preprocessing import OneHotEncoder
import pandas as pd

title_ohe = None  # Global encoder for reuse

def one_hot_speaker_title(df: pd.DataFrame) -> pd.DataFrame:
    global title_ohe

    # Normalize: lowercase
    df["speaker_title"] = df["speaker_title"].fillna("unknown").str.lower()

    # Get top 10 frequent titles
    top_titles = df["speaker_title"].value_counts().nlargest(15).index

    # Replace less frequent ones with "unknown"
    df["title"] = df["speaker_title"].where(df["speaker_title"].isin(top_titles), "unknown")

    # One-hot encode only the mapped titles
    if title_ohe is None:
        title_ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
        encoded = title_ohe.fit_transform(df[["title"]])
    else:
        encoded = title_ohe.transform(df[["title"]])

    encoded_df = pd.DataFrame(encoded, columns=title_ohe.get_feature_names_out(["title"]))

    return encoded_df.reset_index(drop=True)

#### Map labels to binary

In [11]:
def binarize_labels(df):
    def map_label(label):
        if label in ['pants-fire', 'false', 'barely-true']:
            return 0
        elif label in ['half-true', 'mostly-true', 'true']:
            return 1

    df['binary_label'] = df['label'].apply(map_label)
    df.drop(["label"], axis=1, inplace=True)
    return df

#### Convert *Statement* to TF-IDF

In [12]:
def generate_tfidf(statement: pd.DataFrame) -> pd.DataFrame:
    global tfidf
    statement_series = statement["statement"]
    if tfidf is None:
        tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
        tfidf_matrix = tfidf.fit_transform(statement_series)
    else:
        tfidf_matrix = tfidf.transform(statement_series)

    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out()).reset_index(drop=True)
    return tfidf_df

#### Convert to Word2Vec

In [16]:
# %pip install gensim

import pandas as pd
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess
import numpy as np

# Load pretrained Word2Vec (Google News - 300 dim)
# Download from: https://code.google.com/archive/p/word2vec/ (or use Gensim's API)
# NOTE: It's ~1.5GB
word2vec = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

def generate_word2vec_embeddings(df: pd.DataFrame, column: str = "statement") -> pd.DataFrame:
    def get_average_vector(tokens):
        vectors = [word2vec[word] for word in tokens if word in word2vec]
        if not vectors:
            return np.zeros(word2vec.vector_size)
        return np.mean(vectors, axis=0)

    # Tokenize each statement using Gensim's simple tokenizer
    tokenized_statements = df[column].fillna("").apply(simple_preprocess)

    # Compute average vector per statement
    embeddings = tokenized_statements.apply(get_average_vector)

    # Convert to DataFrame
    embedding_df = pd.DataFrame(embeddings.tolist())
    embedding_df.columns = [f"w2v_{i}" for i in range(word2vec.vector_size)]
    
    return embedding_df

#### Pipeline

In [130]:
def base_pipeline(path):
    df = load_data(path)
    df = one_hot_party(df)
    df = one_hot_speaker_title(df)
    df = multi_hot_subject(df)
    df.drop(["id", "speaker", "state", "context"], axis=1, inplace=True)
    vectors = generate_tfidf(df[["statement"]].copy())
    df.drop(["statement"], axis=1, inplace=True)
    df = df.reset_index(drop=True)
    df = pd.concat([df, vectors], axis=1)
    df = binarize_labels(df)
    # print("Nulls before imputation" + df.isnull().sum(), sep="\n")
    df = impute_zeros(df)
    X_train, y_train = split_XY(df)
    return X_train, y_train

def statements_only(path):
    df = load_data(path)
    df = binarize_labels(df)
    
    # Split X and y
    y = df["binary_label"]
    # Vectorize statements (X only)
    X = generate_tfidf(df[["statement"]])
    
    return X, y

def history_only(path):
    df = load_data(path)
    df = binarize_labels(df)
    df = impute_zeros(df)
    y = df["binary_label"]
    X = df[["true_count",
            "false_count",
            "half_true_count",
            "mostly_true_count",
            "pof_count"]]
    
    return X, y

def word2vec_avg(path):
    df = load_data(path)
    df = binarize_labels(df)
    df = impute_zeros(df).reset_index(drop=True)
    w2v_embeddings = generate_word2vec_embeddings(df).reset_index(drop=True)
    y = df["binary_label"]
    return w2v_embeddings, y

def word2vec_avg_party(path):
    df = load_data(path)
    df = binarize_labels(df)
    # print(df.isna().sum()[df.isna().sum() > 0])
    df = impute_zeros(df).reset_index(drop=True)
    w2v_embeddings = generate_word2vec_embeddings(df).reset_index(drop=True)
    parties = one_hot_party(df).reset_index(drop=True)
    y = df["binary_label"]
    X = pd.concat([w2v_embeddings, parties], axis=1)
    # print(X.isna().sum()[X.isna().sum() > 0])
    return X, y

def word2vec_avg_party_speakerTitle(path):
    df = load_data(path)
    df = binarize_labels(df)
    # print(df.isna().sum()[df.isna().sum() > 0])
    df = impute_zeros(df).reset_index(drop=True)
    w2v_embeddings = generate_word2vec_embeddings(df).reset_index(drop=True)
    parties = one_hot_party(df).reset_index(drop=True)
    speakerTitle = one_hot_speaker_title(df).reset_index(drop=True)
    y = df["binary_label"]
    X = pd.concat([w2v_embeddings, parties, speakerTitle], axis=1)
    print(X.isna().sum()[X.isna().sum() > 0])
    return X, y

def word2vec_avg_party_subject(path):
    df = load_data(path)
    df = binarize_labels(df)
    # print(df.isna().sum()[df.isna().sum() > 0])
    df = impute_zeros(df).reset_index(drop=True)
    w2v_embeddings = generate_word2vec_embeddings(df).reset_index(drop=True)
    parties = one_hot_party(df).reset_index(drop=True)
    subjects = multi_hot_subject(df)
    y = df["binary_label"]
    X = pd.concat([w2v_embeddings, parties, subjects], axis=1)
    print(X.isna().sum()[X.isna().sum() > 0])
    return X, y

def word2vec_avg_party_speakerTitle_subject(path):
    df = load_data(path)
    df = binarize_labels(df)
    # print(df.isna().sum()[df.isna().sum() > 0])
    df = impute_zeros(df).reset_index(drop=True)
    w2v_embeddings = generate_word2vec_embeddings(df).reset_index(drop=True)
    parties = one_hot_party(df).reset_index(drop=True)
    speakerTitle = one_hot_speaker_title(df).reset_index(drop=True)
    subjects = multi_hot_subject(df).reset_index(drop=True)
    y = df["binary_label"]
    X = pd.concat([w2v_embeddings, parties, speakerTitle, subjects], axis=1)
    print(X.isna().sum()[X.isna().sum() > 0])
    return X, y

#### Evaluation

In [136]:
# %pip install xgboost
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

def train_and_evaluate(X_train, y_train, X_valid, y_valid):
    # model = LogisticRegression(max_iter=10000)
    # model = LogisticRegression(
    #     penalty="l1",          # or "elasticnet"
    #     solver="saga",         # required for l1 or elasticnet
    #     # l1_ratio=0.5,          # only for elasticnet
    #     # C=0.5,                 # play around with this
    #     max_iter=10000,
    #     class_weight="balanced",
    #     n_jobs=-1              # if using CV later
    # )

    # model = LinearSVC(
    # C=5.0,             # Regularization strength (try 0.1, 0.5, 1, 5...)
    # class_weight="balanced",  # Handle class imbalance
    # max_iter=100000     # Prevent convergence issues
    # )

    # model = SVC(
    # kernel="rbf",         # Radial Basis Function kernel
    # C=1.0,
    # gamma='scale',
    # class_weight='balanced'
    # )

    # model = RandomForestClassifier(
    #     n_estimators=100,
    #     max_depth=None,               # allow full growth
    #     min_samples_split=5,
    #     min_samples_leaf=2,
    #     max_features='sqrt',
    #     class_weight='balanced',
    #     n_jobs=-1,
    #     random_state=42
    # )

    from xgboost import XGBClassifier
    model = XGBClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric="logloss"
    )

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=True
    )
    
    
    # model.fit(X_train, y_train)

    y_pred = model.predict(X_valid)

    # print("Train Set Class Dist: ", print(y_train.value_counts(normalize=True)))
    # Accuracy
    print("Accuracy:", accuracy_score(y_valid, y_pred))

    # Detailed metrics like precision, recall, f1-score
    print(classification_report(y_valid, y_pred))

    # Confusion matrix
    print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred))
    return model

In [137]:
# X_train, y_train = statements_only("train.tsv")
# X_valid,y_valid = statements_only("valid.tsv")

# X_train, y_train = base_pipeline("train.tsv")
# X_valid,y_valid = base_pipeline("valid.tsv")

# X_train, y_train = history_only("train.tsv")
# X_valid,y_valid = history_only("valid.tsv")

# X_train, y_train = word2vec_avg("train.tsv")
# X_valid, y_valid = word2vec_avg("valid.tsv")

# X_train, y_train = word2vec_avg_party("train.tsv")
# X_valid, y_valid = word2vec_avg_party("valid.tsv")

# X_train, y_train = word2vec_avg_party_speakerTitle("train.tsv")
# X_valid, y_valid = word2vec_avg_party_speakerTitle("valid.tsv")

# X_train, y_train = word2vec_avg_party_subject("train.tsv")
# X_valid, y_valid = word2vec_avg_party_subject("valid.tsv")

X_train, y_train = word2vec_avg_party_speakerTitle_subject("train.tsv")
X_valid, y_valid = word2vec_avg_party_speakerTitle_subject("valid.tsv")

# X_train.columns

model = train_and_evaluate(X_train, y_train, X_valid, y_valid)



Series([], dtype: int64)
Series([], dtype: int64)
[0]	validation_0-logloss:0.69276
[1]	validation_0-logloss:0.69064
[2]	validation_0-logloss:0.68833
[3]	validation_0-logloss:0.68622
[4]	validation_0-logloss:0.68398
[5]	validation_0-logloss:0.68196
[6]	validation_0-logloss:0.67989
[7]	validation_0-logloss:0.67872
[8]	validation_0-logloss:0.67698
[9]	validation_0-logloss:0.67515
[10]	validation_0-logloss:0.67362
[11]	validation_0-logloss:0.67208
[12]	validation_0-logloss:0.67068
[13]	validation_0-logloss:0.66961
[14]	validation_0-logloss:0.66887
[15]	validation_0-logloss:0.66805
[16]	validation_0-logloss:0.66674


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[17]	validation_0-logloss:0.66593
[18]	validation_0-logloss:0.66485
[19]	validation_0-logloss:0.66406
[20]	validation_0-logloss:0.66344
[21]	validation_0-logloss:0.66285
[22]	validation_0-logloss:0.66182
[23]	validation_0-logloss:0.66103
[24]	validation_0-logloss:0.66053
[25]	validation_0-logloss:0.65980
[26]	validation_0-logloss:0.65927
[27]	validation_0-logloss:0.65862
[28]	validation_0-logloss:0.65817
[29]	validation_0-logloss:0.65729
[30]	validation_0-logloss:0.65648
[31]	validation_0-logloss:0.65593
[32]	validation_0-logloss:0.65519
[33]	validation_0-logloss:0.65433
[34]	validation_0-logloss:0.65380
[35]	validation_0-logloss:0.65327
[36]	validation_0-logloss:0.65281
[37]	validation_0-logloss:0.65235
[38]	validation_0-logloss:0.65195
[39]	validation_0-logloss:0.65100
[40]	validation_0-logloss:0.65032
[41]	validation_0-logloss:0.65024
[42]	validation_0-logloss:0.64952
[43]	validation_0-logloss:0.64919
[44]	validation_0-logloss:0.64868
[45]	validation_0-logloss:0.64813
[46]	validatio

**Base Pipeline TF-IDF**
```
Accuracy: 0.6425233644859814
              precision    recall  f1-score   support

           0       0.66      0.51      0.58       616
           1       0.63      0.76      0.69       668

    accuracy                           0.64      1284
   macro avg       0.65      0.64      0.63      1284
weighted avg       0.65      0.64      0.64      1284

Confusion Matrix:
 [[317 299]
 [160 508]]
```
---

**Statement only TF-IDF**
```
Accuracy: 0.5950155763239875
              precision    recall  f1-score   support

           0       0.60      0.46      0.52       616
           1       0.59      0.72      0.65       668

    accuracy                           0.60      1284
   macro avg       0.60      0.59      0.59      1284
weighted avg       0.60      0.60      0.59      1284

Confusion Matrix:
 [[285 331]
 [189 479]]
```
---
**History only**
```
Accuracy: 0.5669781931464174
              precision    recall  f1-score   support

           0       0.73      0.15      0.25       616
           1       0.55      0.95      0.70       668

    accuracy                           0.57      1284
   macro avg       0.64      0.55      0.47      1284
weighted avg       0.64      0.57      0.48      1284

Confusion Matrix:
 [[ 94 522]
 [ 34 634]]

----
#### Word2Vec

**Statement only**

```
binary_label
1    0.561719
0    0.438281
Name: proportion, dtype: float64
Train Set Class Dist:  None
Accuracy: 0.6144859813084113
              precision    recall  f1-score   support

           0       0.59      0.63      0.61       616
           1       0.64      0.60      0.62       668

    accuracy                           0.61      1284
   macro avg       0.62      0.62      0.61      1284
weighted avg       0.62      0.61      0.61      1284

Confusion Matrix:
 [[389 227]
 [268 400]]
 ```

**Statement Only LinearSVC**
```
Accuracy: 0.6144859813084113
              precision    recall  f1-score   support

           0       0.59      0.62      0.61       616
           1       0.64      0.61      0.62       668

    accuracy                           0.61      1284
   macro avg       0.61      0.61      0.61      1284
weighted avg       0.62      0.61      0.61      1284

Confusion Matrix:
 [[384 232]
 [263 405]]
 ```

**Statement Only SVC**
```
Accuracy: 0.632398753894081
              precision    recall  f1-score   support

           0       0.61      0.66      0.63       616
           1       0.66      0.61      0.63       668

    accuracy                           0.63      1284
   macro avg       0.63      0.63      0.63      1284
weighted avg       0.63      0.63      0.63      1284

Confusion Matrix:
 [[405 211]
 [261 407]]
 ```

**Statement Only RandomForest**
```
Accuracy: 0.6129283489096573
              precision    recall  f1-score   support

           0       0.65      0.41      0.51       616
           1       0.60      0.80      0.68       668

    accuracy                           0.61      1284
   macro avg       0.62      0.61      0.59      1284
weighted avg       0.62      0.61      0.60      1284

Confusion Matrix:
 [[254 362]
 [135 533]]
```

**Statement Only XGBoost**
```
Accuracy: 0.6261682242990654
              precision    recall  f1-score   support

           0       0.63      0.52      0.57       616
           1       0.62      0.72      0.67       668

    accuracy                           0.63      1284
   macro avg       0.63      0.62      0.62      1284
weighted avg       0.63      0.63      0.62      1284

Confusion Matrix:
 [[322 294]
 [186 482]]
```

**Statement+Party SVC**
```
Accuracy: 0.6386292834890965
              precision    recall  f1-score   support

           0       0.62      0.62      0.62       616
           1       0.65      0.65      0.65       668

    accuracy                           0.64      1284
   macro avg       0.64      0.64      0.64      1284
weighted avg       0.64      0.64      0.64      1284

Confusion Matrix:
 [[385 231]
 [233 435]]
 ```

 **Statement+Party XGBoost**
 ```
 Accuracy: 0.6448598130841121
              precision    recall  f1-score   support

           0       0.65      0.55      0.60       616
           1       0.64      0.73      0.68       668

    accuracy                           0.64      1284
   macro avg       0.65      0.64      0.64      1284
weighted avg       0.65      0.64      0.64      1284

Confusion Matrix:
 [[339 277]
 [179 489]]
 ```

**Statement + Party + SpeakerTitle RandomForest**
```
Accuracy: 0.632398753894081
              precision    recall  f1-score   support

           0       0.67      0.46      0.55       616
           1       0.61      0.79      0.69       668

    accuracy                           0.63      1284
   macro avg       0.64      0.63      0.62      1284
weighted avg       0.64      0.63      0.62      1284

Confusion Matrix:
 [[284 332]
 [140 528]]
 ```

 **Statement+Party+SpeakerTitle XGBoost**
 ```
 Accuracy: 0.6386292834890965
              precision    recall  f1-score   support

           0       0.65      0.54      0.59       616
           1       0.63      0.73      0.68       668

    accuracy                           0.64      1284
   macro avg       0.64      0.63      0.63      1284
weighted avg       0.64      0.64      0.64      1284

Confusion Matrix:
 [[333 283]
 [181 487]]
 ```

**Statement+Party+Subject XGBoost**
```
Accuracy: 0.632398753894081
              precision    recall  f1-score   support

           0       0.64      0.54      0.58       616
           1       0.63      0.72      0.67       668

    accuracy                           0.63      1284
   macro avg       0.63      0.63      0.63      1284
weighted avg       0.63      0.63      0.63      1284

Confusion Matrix:
 [[331 285]
 [187 481]]
```


**Statement + Party + Subject SVC**
```
Accuracy: 0.6425233644859814
              precision    recall  f1-score   support

           0       0.63      0.63      0.63       616
           1       0.66      0.65      0.65       668

    accuracy                           0.64      1284
   macro avg       0.64      0.64      0.64      1284
weighted avg       0.64      0.64      0.64      1284

Confusion Matrix:
 [[390 226]
 [233 435]]
```

**Statement + Party + SpeakerTitle + Subject SVC**
```
Accuracy: 0.6386292834890965
              precision    recall  f1-score   support

           0       0.62      0.64      0.63       616
           1       0.66      0.63      0.65       668

    accuracy                           0.64      1284
   macro avg       0.64      0.64      0.64      1284
weighted avg       0.64      0.64      0.64      1284

Confusion Matrix:
 [[397 219]
 [245 423]]
```

**Statement + Party + SpeakerTitle + Subject XGBoost**
```
Accuracy: 0.6339563862928349
              precision    recall  f1-score   support

           0       0.64      0.54      0.59       616
           1       0.63      0.72      0.67       668

    accuracy                           0.63      1284
   macro avg       0.63      0.63      0.63      1284
weighted avg       0.63      0.63      0.63      1284

Confusion Matrix:
 [[334 282]
 [188 480]]
```