In [1]:
from classification import Pipeline, MachineLearningClassifier
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import json
from IPython.display import display, HTML


def get_metrics(train_y_true, train_y_pred, test_y_true, test_y_pred):
    """Score a set of predictions on both the train and the test split.

    Accuracy and F1 are computed with ``accuracy_score`` and ``f1_score``
    using their default arguments.

    Args:
        train_y_true: Ground-truth labels of the training split.
        train_y_pred: Predicted labels for the training split.
        test_y_true: Ground-truth labels of the test split.
        test_y_pred: Predicted labels for the test split.

    Returns:
        list: ``[train_accuracy, train_f1, test_accuracy, test_f1]``.
    """
    splits = (
        (train_y_true, train_y_pred),
        (test_y_true, test_y_pred),
    )
    scores = []
    for y_true, y_pred in splits:
        # Accuracy first, then F1, so the output order matches the
        # "<split>_accuracy, <split>_f1" layout callers expect.
        scores.append(accuracy_score(y_true, y_pred))
        scores.append(f1_score(y_true, y_pred))
    return scores

def run_tests(dataset_params: dict, dataset_path: str = "data/Horne2017_FakeNewsData/Buzzfeed"):
    """Fit the classifier suite on one dataset configuration and tabulate scores.

    A ``Pipeline`` is built from ``dataset_params``, the dataset is loaded
    from ``dataset_path``, a ``MachineLearningClassifier`` is fitted on the
    training split, and each method reported by ``best_params()`` is scored
    on both splits with ``get_metrics``.

    Args:
        dataset_params: Keyword arguments forwarded verbatim to ``Pipeline``.
        dataset_path: Location passed to ``Pipeline.load_dataset_from_file``.
            Defaults to the Buzzfeed dataset so existing calls are unaffected.

    Returns:
        pandas.DataFrame: One row per method with the columns ``Method``,
        ``Params`` (the best parameters as an indented JSON string),
        ``train_accuracy``, ``train_f1``, ``test_accuracy`` and ``test_f1``,
        sorted by ``test_f1`` in descending order. The index keeps each
        method's position in ``best_params()``.
    """
    dataset = Pipeline(**dataset_params).load_dataset_from_file(dataset_path)
    train_X, train_y, test_X, test_y = dataset.train_test

    clfs = MachineLearningClassifier()
    clfs.fit(train_X, train_y)
    # The predictions are indexed below by the same keys that best_params()
    # yields, i.e. one set of predictions per method.
    preds_train = clfs.predict(train_X)
    preds_test = clfs.predict(test_X)

    columns = ["Method", "Params", "train_accuracy", "train_f1", "test_accuracy", "test_f1"]
    # Collect all rows first and build the frame once, instead of enlarging
    # it row by row; the metric columns then get numeric dtypes inferred.
    rows = [
        [
            method,
            json.dumps(best, indent=2),
            *get_metrics(train_y, preds_train[method], test_y, preds_test[method]),
        ]
        for method, best in clfs.best_params().items()
    ]
    results = pd.DataFrame(rows, columns=columns)

    # Return a sorted copy rather than mutating in place.
    return results.sort_values(by="test_f1", ascending=False)


def pretty_print(df):
    """Display ``df`` as an HTML table in the notebook output.

    Every literal backslash-``n`` pair (two characters) in the generated HTML
    is replaced by ``<br>`` before rendering. Presumably these come from the
    multi-line JSON in the ``Params`` column, so each parameter ends up on its
    own line inside the cell.

    Args:
        df: The frame to render; only its ``to_html()`` method is used.

    Returns:
        Whatever ``IPython.display.display`` returns for the rendered HTML.
    """
    table_html = df.to_html()
    # "\\n" is the two-character sequence backslash + n, not a real newline.
    table_html = table_html.replace("\\n", "<br>")
    return display(HTML(table_html))

  from .autonotebook import tqdm as notebook_tqdm
2023-08-05 23:12:21.217466: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-05 23:12:23.706087: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-05 23:12:23.860438: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-05 23:12:23.860512: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0

# BERT only

In [2]:
# Baseline run: no non-latent configuration and similarity switched off.
pretty_print(
    run_tests({"non_latent": None, "similarity": False})
)

Fitting models: 100%|██████████| 4/4 [00:24<00:00,  6.09s/it]


Unnamed: 0,Method,Params,train_accuracy,train_f1,test_accuracy,test_f1
0,Logistic Regression,"{  ""C"": 1.2,  ""solver"": ""lbfgs"" }",0.974026,0.97561,0.8,0.833333
1,SVC,"{  ""C"": 0.2,  ""gamma"": ""scale"",  ""kernel"": ""rbf"" }",0.532468,0.694915,0.55,0.709677
3,XGBoost,"{  ""alpha"": 0.0,  ""eta"": 0.4,  ""lambda"": 1.6,  ""max_depth"": 2 }",1.0,1.0,0.7,0.666667
2,Decision Tree,"{  ""criterion"": ""gini"",  ""max_depth"": 3,  ""max_features"": null,  ""min_samples_split"": 2 }",0.961039,0.962963,0.55,0.571429


# BERT + non-latent features

In [3]:
# Adds a default-constructed NonLatentConfig; similarity stays switched off.
pretty_print(
    run_tests({"non_latent": Pipeline.NonLatentConfig(), "similarity": False})
)

Fitting models: 100%|██████████| 4/4 [00:20<00:00,  5.09s/it]


Unnamed: 0,Method,Params,train_accuracy,train_f1,test_accuracy,test_f1
0,Logistic Regression,"{  ""C"": 0.2,  ""solver"": ""lbfgs"" }",0.935065,0.936709,0.85,0.88
3,XGBoost,"{  ""alpha"": 0.2,  ""eta"": 0.4,  ""lambda"": 1.0,  ""max_depth"": 1 }",1.0,1.0,0.85,0.857143
1,SVC,"{  ""C"": 0.6,  ""gamma"": ""scale"",  ""kernel"": ""sigmoid"" }",0.766234,0.763158,0.8,0.833333
2,Decision Tree,"{  ""criterion"": ""entropy"",  ""max_depth"": 3,  ""max_features"": ""sqrt"",  ""min_samples_split"": 2 }",0.922078,0.921053,0.7,0.7


# BERT + non-latent features + similarity

In [4]:
# Default-constructed NonLatentConfig with similarity switched on as well.
pretty_print(
    run_tests({"non_latent": Pipeline.NonLatentConfig(), "similarity": True})
)

Fitting models: 100%|██████████| 4/4 [00:20<00:00,  5.14s/it]


Unnamed: 0,Method,Params,train_accuracy,train_f1,test_accuracy,test_f1
0,Logistic Regression,"{  ""C"": 0.2,  ""solver"": ""lbfgs"" }",0.935065,0.936709,0.85,0.88
3,XGBoost,"{  ""alpha"": 0.0,  ""eta"": 0.5,  ""lambda"": 1.2,  ""max_depth"": 1 }",1.0,1.0,0.85,0.869565
1,SVC,"{  ""C"": 0.6,  ""gamma"": ""scale"",  ""kernel"": ""sigmoid"" }",0.766234,0.763158,0.8,0.833333
2,Decision Tree,"{  ""criterion"": ""gini"",  ""max_depth"": 3,  ""max_features"": 0.3,  ""min_samples_split"": 2 }",0.948052,0.95,0.7,0.769231
