# Trying AutoML for protest event detection

To be run with Python 3.9, dependencies not included in the Poetry project.

In [1]:
import pandas as pd

glpn_path = "../../datasets/glpn_v1.1/"
data_files = {
    "train": str(glpn_path + "glpn_train.csv"),
    "dev": str(glpn_path + "glpn_dev.csv"),
    "test": str(glpn_path + "glpn_test.csv"),
    "test.time": str(glpn_path + "glpn_test-time.csv"),
    "test.loc": str(glpn_path + "glpn_test-loc.csv"),
}
glpn = {k: pd.read_csv(v) for k, v in data_files.items()}
# convert the "label" column to 0 and 1 instead of "irrelevant" and "relevant" in each dataset
for k, v in glpn.items():
    v["label"] = v["labels"].apply(lambda x: 0 if x == "irrelevant" else 1)

In [2]:
glpn["train"].head()

Unnamed: 0,id,newspaper,labels,year,excerpt,label
0,1,Stuttgarter Zeitung,relevant,2010.0,Stuttgarter Zeitung 2010-07-06 Parkschützer st...,1
1,3,Stuttgarter Zeitung,relevant,2010.0,Stuttgarter Zeitung 2010-08-27 DER SCHWABENSTR...,1
2,4,Leipziger Volkszeitung,irrelevant,2015.0,"""Ohne uns hätte es keinen Wiederaufbau gegeben...",0
3,5,Leipziger Volkszeitung,irrelevant,2015.0,Mini-Meisterschaft im ATSV-Stadion Wurzen. Ein...,0
4,6,Leipziger Volkszeitung,irrelevant,2015.0,Kurstädter von energischem Einsatz beeindruckt...,0


In [9]:
from pycaret.nlp import *

# initialize the setup
nlp1 = setup(data=glpn["train"], target="excerpt")

Description,Value
session_id,5283
Documents,1914
Vocab Size,12898
Custom Stopwords,False


In [16]:
from pycaret.classification import *
from sklearn.metrics import f1_score

# train the model
clf = setup(
    data=glpn["train"],
    target="label",
    text_features=["excerpt"],
    ignore_features=["labels", "newspaper", "year", "id"],
    test_data=glpn["dev"],
)

Unnamed: 0,Description,Value
0,Session id,7043
1,Target,label
2,Target type,Binary
3,Original data shape,"(2188, 12)"
4,Transformed data shape,"(2188, 5)"
5,Transformed train set shape,"(1914, 5)"
6,Transformed test set shape,"(274, 5)"
7,Ignore features,7
8,Numeric features,4
9,Rows with missing values,12.9%


In [18]:
best = compare_models(sort="F1")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.6673,0.0,0.8415,0.6745,0.7471,0.2769,0.2937,0.005
lr,Logistic Regression,0.6667,0.6708,0.8388,0.675,0.7462,0.2764,0.2927,0.018
lda,Linear Discriminant Analysis,0.6657,0.6707,0.837,0.6743,0.7452,0.2745,0.2901,0.005
svm,SVM - Linear Kernel,0.6396,0.0,0.907,0.6349,0.7443,0.1875,0.2397,0.005
ada,Ada Boost Classifier,0.6647,0.7009,0.8289,0.6764,0.7428,0.2741,0.2872,0.019
gbc,Gradient Boosting Classifier,0.6699,0.7031,0.8146,0.6858,0.7428,0.2909,0.2999,0.04
nb,Naive Bayes,0.6641,0.6708,0.8209,0.6778,0.7405,0.2756,0.2883,0.005
dummy,Dummy Classifier,0.5836,0.5,1.0,0.5836,0.737,0.0,0.0,0.004
lightgbm,Light Gradient Boosting Machine,0.6583,0.6807,0.7635,0.6881,0.7222,0.2802,0.2852,0.016
knn,K Neighbors Classifier,0.6332,0.6423,0.7367,0.6689,0.7001,0.2297,0.2329,0.009


In [None]:
tuned_best = tune_model(best, optimize="F1")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8438,0.0,0.9196,0.8306,0.8729,0.6715,0.6774
1,0.7865,0.0,0.8839,0.7795,0.8285,0.5486,0.5563
2,0.7604,0.0,0.8839,0.75,0.8115,0.4889,0.5014
3,0.7344,0.0,0.8571,0.7328,0.7901,0.4344,0.4444
4,0.7644,0.0,0.8571,0.768,0.8101,0.5022,0.5075
5,0.8377,0.0,0.9196,0.824,0.8692,0.6571,0.664
6,0.8063,0.0,0.8304,0.8378,0.8341,0.6014,0.6014
7,0.8743,0.0,0.9369,0.8595,0.8966,0.7373,0.7418
8,0.8482,0.0,0.8739,0.8661,0.87,0.6876,0.6876
9,0.8429,0.0,0.8468,0.8785,0.8624,0.6796,0.6802


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [None]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
# predict on the test set
pred_holdout = predict_model(best, data=glpn["test"])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.7861,0.7675,0.8576,0.8017,0.8287,0.5449,0.5471


In [None]:
# predict on the test.time set
pred_holdout = predict_model(best, data=glpn["test.time"])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.7314,0.7208,0.7458,0.8581,0.798,0.403,0.4119


In [None]:
# predict on the test.loc set
pred_holdout = predict_model(best, data=glpn["test.loc"])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.666,0.7134,0.7889,0.3318,0.4671,0.2786,0.3342


In [None]:
save_model(best, "pycaret_glpn")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=/var/folders/6v/w9nn6c_n4qdbrjwfnq7695n00000gn/T/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None, include=[],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               missing_values=nan,
                                                               strategy='mean',
                                                               verbose='deprecated'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None, incl...
                                                               strategy='most_frequent',
                                                               verbose='deprecated'))),
                 ('text_embedding',
     