# Machine Learning

**Note:** Jim (who was responsible for machine learning) lost his Linux distro about a week before the code and report were due. This notebook and some features had to be rebuilt. After a bunch of testing, we couldn't reproduce the results we got during the project demo, so we decided to copy the results from the demo into our report and discussion.

In [7]:
from classification import Pipeline, MachineLearningClassifier
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import numpy as np
import json
from IPython.display import display, HTML


def get_metrics(train_y_true, train_y_pred, test_y_true, test_y_pred):
    """Score one model's predictions on the train and test splits.

    Each split's labels and predictions are passed to ``accuracy_score`` and
    ``f1_score`` with their default settings.

    Returns:
        A list ordered as
        ``[train accuracy, train F1, test accuracy, test F1]``.
    """
    label_pairs = {
        "train": (train_y_true, train_y_pred),
        "test": (test_y_true, test_y_pred),
    }
    # Both accuracies are computed before both F1 scores; the returned list
    # interleaves them per split.
    accuracy = {split: accuracy_score(*pair) for split, pair in label_pairs.items()}
    f1 = {split: f1_score(*pair) for split, pair in label_pairs.items()}
    return [accuracy["train"], f1["train"], accuracy["test"], f1["test"]]

def run_tests(dataset_params: dict,
              dataset_path: str = "data/Horne2017_FakeNewsData/Buzzfeed"):
    """Fit the classifiers on one feature configuration and tabulate their scores.

    Args:
        dataset_params: Keyword arguments forwarded to ``Pipeline(...)``.
        dataset_path: Location handed to ``load_dataset_from_file``. Defaults
            to the Buzzfeed dataset path this notebook has always used.

    Returns:
        A DataFrame with one row per key of ``clfs.best_params()`` and the
        columns ``Method``, ``Params`` (the parameters as indented JSON),
        ``train_accuracy``, ``train_f1``, ``test_accuracy`` and ``test_f1``,
        sorted by ``test_f1`` in descending order. The frame is empty (but
        keeps these columns) when ``best_params()`` returns nothing.

    Side effects:
        Prints the shape of the training feature matrix.
    """
    dataset = Pipeline(**dataset_params).load_dataset_from_file(dataset_path)
    train_X, train_y, test_X, test_y = dataset.train_test
    print(train_X.shape)

    clfs = MachineLearningClassifier()
    clfs.fit(train_X, train_y)
    # predict() results are indexed by the same keys as best_params()
    # (presumably one prediction array per fitted method -- confirm in
    # classification.MachineLearningClassifier).
    preds_train = clfs.predict(train_X)
    preds_test = clfs.predict(test_X)

    columns = ["Method", "Params", "train_accuracy", "train_f1", "test_accuracy", "test_f1"]
    # Collect all rows first and build the frame once, rather than enlarging
    # an empty DataFrame one row at a time.
    rows = [
        [
            method,
            json.dumps(method_params, indent=2),
            *get_metrics(train_y, preds_train[method], test_y, preds_test[method]),
        ]
        for method, method_params in clfs.best_params().items()
    ]
    results = pd.DataFrame(rows, columns=columns)

    # Return a sorted copy instead of sorting in place.
    return results.sort_values(by="test_f1", ascending=False)


def pretty_print(df):
    """Display a DataFrame as an HTML table in the notebook output.

    The frame is converted with ``to_html`` and every two-character sequence
    backslash + "n" in the resulting markup is replaced with ``<br>`` before
    the markup is shown.

    Returns:
        Whatever ``display`` returns.
    """
    table_markup = df.to_html()
    # Presumably this is how the multi-line JSON in the "Params" column ends
    # up in the markup -- verify against pandas' to_html output.
    table_markup = table_markup.replace("\\n", "<br>")
    rendered = HTML(table_markup)
    return display(rendered)


def print_as_latex(df, feature):
    """Print the scores in ``df`` as the body rows of a LaTeX ``\\multirow`` block.

    Args:
        df: Frame with a ``Method`` column and the score columns
            ``train_accuracy``, ``train_f1``, ``test_accuracy``, ``test_f1``.
            It must contain rows for "Logistic Regression", "SVC",
            "Decision Tree" and "XGBoost".
        feature: Text placed in the ``\\multirow`` cell of the first row.

    Raises:
        IndexError: If any of the four methods has no row in ``df``. Nothing
            is printed in that case.
    """
    score_columns = ["train_accuracy", "train_f1", "test_accuracy", "test_f1"]

    def cells_for(method_name):
        # Use the first row matching the method; each score is rounded to two
        # decimals and stringified as-is (so 0.8 stays "0.8", not "0.80").
        first_match = df[df["Method"] == method_name][score_columns].values[0]
        return " & ".join([str(np.round(score, 2)) for score in first_match])

    rule = "\\cmidrule{2-6}"
    # All four rows are formatted before anything is printed.
    table_lines = [
        f"\\multirow{{4}}{{*}}{{{feature}}} & LR & {cells_for('Logistic Regression')}\\\\",
        rule,
        f"& SVC & {cells_for('SVC')}\\\\",
        rule,
        f"& DT & {cells_for('Decision Tree')}\\\\",
        rule,
        f"& \\textbf{{XGB}} & {cells_for('XGBoost')}\\\\",
    ]
    print("\n".join(table_lines))

## BERT Only

In [8]:
# "BERT Only" run: non-latent features unset (None) and similarity switched off.
bert_only_params = {"non_latent": None, "similarity": False}
df1 = run_tests(bert_only_params)
print_as_latex(df1, "BERT")
pretty_print(df1)

(77, 768)


Fitting models: 100%|██████████| 4/4 [00:23<00:00,  5.98s/it]

\multirow{4}{*}{BERT} & LR & 0.97 & 0.98 & 0.8 & 0.83\\
\cmidrule{2-6}
& SVC & 0.53 & 0.69 & 0.55 & 0.71\\
\cmidrule{2-6}
& DT & 0.96 & 0.96 & 0.55 & 0.57\\
\cmidrule{2-6}
& \textbf{XGB} & 1.0 & 1.0 & 0.7 & 0.67\\





Unnamed: 0,Method,Params,train_accuracy,train_f1,test_accuracy,test_f1
0,Logistic Regression,"{  ""C"": 1.2,  ""solver"": ""lbfgs"" }",0.974026,0.97561,0.8,0.833333
1,SVC,"{  ""C"": 0.2,  ""gamma"": ""scale"",  ""kernel"": ""rbf"" }",0.532468,0.694915,0.55,0.709677
3,XGBoost,"{  ""alpha"": 0.0,  ""eta"": 0.4,  ""lambda"": 1.6,  ""max_depth"": 2 }",1.0,1.0,0.7,0.666667
2,Decision Tree,"{  ""criterion"": ""gini"",  ""max_depth"": 3,  ""max_features"": null,  ""min_samples_split"": 2 }",0.961039,0.962963,0.55,0.571429


## BERT + non latent

In [9]:
# "BERT + non latent" run: a default NonLatentConfig, similarity still off.
bert_non_latent_params = {"non_latent": Pipeline.NonLatentConfig(), "similarity": False}
df2 = run_tests(bert_non_latent_params)
bert_non_latent_label = "\\shortstack{BERT\\\\+ Non-Latent}"
print_as_latex(df2, bert_non_latent_label)
pretty_print(df2)

(77, 795)


Fitting models: 100%|██████████| 4/4 [00:21<00:00,  5.44s/it]

\multirow{4}{*}{\shortstack{BERT\\+ Non-Latent}} & LR & 0.94 & 0.94 & 0.85 & 0.88\\
\cmidrule{2-6}
& SVC & 0.77 & 0.76 & 0.8 & 0.83\\
\cmidrule{2-6}
& DT & 0.92 & 0.92 & 0.7 & 0.7\\
\cmidrule{2-6}
& \textbf{XGB} & 1.0 & 1.0 & 0.85 & 0.86\\





Unnamed: 0,Method,Params,train_accuracy,train_f1,test_accuracy,test_f1
0,Logistic Regression,"{  ""C"": 0.2,  ""solver"": ""lbfgs"" }",0.935065,0.936709,0.85,0.88
3,XGBoost,"{  ""alpha"": 0.2,  ""eta"": 0.4,  ""lambda"": 1.0,  ""max_depth"": 1 }",1.0,1.0,0.85,0.857143
1,SVC,"{  ""C"": 0.6,  ""gamma"": ""scale"",  ""kernel"": ""sigmoid"" }",0.766234,0.763158,0.8,0.833333
2,Decision Tree,"{  ""criterion"": ""entropy"",  ""max_depth"": 3,  ""max_features"": ""sqrt"",  ""min_samples_split"": 2 }",0.922078,0.921053,0.7,0.7


## BERT + non latent + similarity

In [10]:
# "BERT + non latent + similarity" run: a default NonLatentConfig with similarity on.
bert_full_params = {"non_latent": Pipeline.NonLatentConfig(), "similarity": True}
df3 = run_tests(bert_full_params)
bert_full_label = "\\shortstack{BERT\\\\+ Non-Latent\\\\+ Similarity}"
print_as_latex(df3, bert_full_label)
pretty_print(df3)

(77, 796)


Fitting models: 100%|██████████| 4/4 [00:21<00:00,  5.33s/it]

\multirow{4}{*}{\shortstack{BERT\\+ Non-Latent\\+ Similarity}} & LR & 0.94 & 0.94 & 0.85 & 0.88\\
\cmidrule{2-6}
& SVC & 0.77 & 0.76 & 0.8 & 0.83\\
\cmidrule{2-6}
& DT & 0.95 & 0.95 & 0.7 & 0.77\\
\cmidrule{2-6}
& \textbf{XGB} & 1.0 & 1.0 & 0.85 & 0.87\\





Unnamed: 0,Method,Params,train_accuracy,train_f1,test_accuracy,test_f1
0,Logistic Regression,"{  ""C"": 0.2,  ""solver"": ""lbfgs"" }",0.935065,0.936709,0.85,0.88
3,XGBoost,"{  ""alpha"": 0.0,  ""eta"": 0.5,  ""lambda"": 1.2,  ""max_depth"": 1 }",1.0,1.0,0.85,0.869565
1,SVC,"{  ""C"": 0.6,  ""gamma"": ""scale"",  ""kernel"": ""sigmoid"" }",0.766234,0.763158,0.8,0.833333
2,Decision Tree,"{  ""criterion"": ""gini"",  ""max_depth"": 3,  ""max_features"": 0.3,  ""min_samples_split"": 2 }",0.948052,0.95,0.7,0.769231
