In [11]:
from classification import Pipeline, MachineLearningClassifier
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import json
from IPython.display import display, HTML


def get_metrics(train_y_true, train_y_pred, test_y_true, test_y_pred):
    """Score one set of predictions on both the train and the test split.

    Each split is scored with ``accuracy_score`` and ``f1_score`` (called
    with their default arguments).

    Returns:
        A flat list ordered as
        ``[train accuracy, train F1, test accuracy, test F1]``.
    """
    splits = (
        (train_y_true, train_y_pred),
        (test_y_true, test_y_pred),
    )
    scores = []
    for y_true, y_pred in splits:
        # Accuracy first, then F1, so the flat list keeps the documented order.
        scores.append(accuracy_score(y_true, y_pred))
        scores.append(f1_score(y_true, y_pred))
    return scores

def run_tests(dataset_params: dict, dataset_path: str = "data/Horne2017_FakeNewsData/Buzzfeed"):
    """Fit the classifiers on one dataset configuration and tabulate their scores.

    Args:
        dataset_params: Keyword arguments forwarded unchanged to ``Pipeline``.
        dataset_path: Path handed to ``load_dataset_from_file``. Defaults to
            the Buzzfeed path this function previously hard-coded, so
            existing calls behave as before.

    Returns:
        A ``pandas.DataFrame`` with one row per entry of
        ``MachineLearningClassifier.best_params()`` and the columns
        ``Method``, ``Params`` (the parameters as indented JSON text),
        ``train_accuracy``, ``train_f1``, ``test_accuracy`` and ``test_f1``.
        Rows are sorted by ``test_f1`` in descending order and keep the
        positional label (0, 1, ...) they had before sorting.
    """
    dataset = Pipeline(**dataset_params).load_dataset_from_file(dataset_path)
    train_X, train_y, test_X, test_y = dataset.train_test

    clfs = MachineLearningClassifier()
    clfs.fit(train_X, train_y)
    # Both prediction results are indexed below by the same keys that
    # best_params() yields.
    preds_train = clfs.predict(train_X)
    preds_test = clfs.predict(test_X)

    columns = ["Method", "Params", "train_accuracy", "train_f1", "test_accuracy", "test_f1"]
    # Collect all rows first and build the frame once, rather than enlarging
    # an empty frame one ``.loc`` assignment at a time.
    rows = [
        [
            method,
            json.dumps(best, indent=2),
            *get_metrics(train_y, preds_train[method], test_y, preds_test[method]),
        ]
        for method, best in clfs.best_params().items()
    ]
    results = pd.DataFrame(rows, columns=columns)

    # Non-mutating sort; the pre-sort row labels are kept on purpose.
    return results.sort_values(by="test_f1", ascending=False)


def pretty_print(df):
    """Display ``df`` as an HTML table with ``<br>`` line breaks inside cells.

    The markup from ``df.to_html()`` has every literal backslash-n sequence
    replaced by ``<br>`` before being wrapped in ``HTML`` and passed to
    ``display``. Presumably this makes multi-line cell text (such as
    indented JSON) render on separate lines.

    Returns:
        Whatever ``display`` returns.
    """
    markup = df.to_html()
    # "\\n" is the two-character sequence backslash + n, not a newline.
    markup_with_breaks = markup.replace("\\n", "<br>")
    return display(HTML(markup_with_breaks))

# BERT Only

In [12]:
# BERT-only run: no non-latent config is passed and similarity is switched off.
pretty_print(run_tests(dict(non_latent=None, similarity=False)))

Fitting models: 100%|██████████| 4/4 [00:24<00:00,  6.01s/it]


Unnamed: 0,Method,Params,train_accuracy,train_f1,test_accuracy,test_f1
0,Logistic Regression,"{  ""C"": 1.2,  ""solver"": ""liblinear"" }",0.961039,0.962963,0.6,0.666667
1,SVC,"{  ""C"": 1.0,  ""class_weight"": null,  ""gamma"": 0.01,  ""kernel"": ""poly"" }",0.935065,0.938272,0.55,0.608696
2,Decision Tree,"{  ""criterion"": ""entropy"",  ""max_depth"": 5,  ""max_features"": null,  ""min_samples_split"": 4 }",1.0,1.0,0.55,0.571429
3,XGBoost,"{  ""alpha"": 0.0,  ""eta"": 0.4,  ""lambda"": 0.8,  ""max_depth"": 4 }",1.0,1.0,0.5,0.545455


# BERT + non-latent

In [13]:
# Same run with a default NonLatentConfig passed in; similarity stays off.
pretty_print(run_tests(dict(non_latent=Pipeline.NonLatentConfig(), similarity=False)))

Fitting models: 100%|██████████| 4/4 [00:22<00:00,  5.65s/it]


Unnamed: 0,Method,Params,train_accuracy,train_f1,test_accuracy,test_f1
2,Decision Tree,"{  ""criterion"": ""entropy"",  ""max_depth"": 5,  ""max_features"": 0.3,  ""min_samples_split"": 2 }",0.987013,0.987952,0.9,0.909091
3,XGBoost,"{  ""alpha"": 0.0,  ""eta"": 0.3,  ""lambda"": 1.2,  ""max_depth"": 1 }",1.0,1.0,0.9,0.9
1,SVC,"{  ""C"": 1.2,  ""class_weight"": null,  ""gamma"": 0.01,  ""kernel"": ""rbf"" }",1.0,1.0,0.7,0.785714
0,Logistic Regression,"{  ""C"": 1.0,  ""solver"": ""lbfgs"" }",0.974026,0.975,0.8,0.777778


# BERT + non-latent + similarity

In [14]:
# Default NonLatentConfig passed in and similarity switched on.
pretty_print(run_tests(dict(non_latent=Pipeline.NonLatentConfig(), similarity=True)))

Fitting models: 100%|██████████| 4/4 [00:21<00:00,  5.44s/it]


Unnamed: 0,Method,Params,train_accuracy,train_f1,test_accuracy,test_f1
3,XGBoost,"{  ""alpha"": 0.0,  ""eta"": 0.3,  ""lambda"": 1.2,  ""max_depth"": 1 }",1.0,1.0,0.9,0.9
2,Decision Tree,"{  ""criterion"": ""gini"",  ""max_depth"": 3,  ""max_features"": ""sqrt"",  ""min_samples_split"": 2 }",0.896104,0.894737,0.8,0.8
1,SVC,"{  ""C"": 1.2,  ""class_weight"": null,  ""gamma"": 0.01,  ""kernel"": ""rbf"" }",1.0,1.0,0.7,0.785714
0,Logistic Regression,"{  ""C"": 1.0,  ""solver"": ""lbfgs"" }",0.974026,0.975,0.8,0.777778
