# Simple approaches to event detection

In [2]:
# Load the GLPN dataset through the project helper. The cells below index the
# result by split name ("train", "dev", "test", "test.time", "test.loc") and
# read the "excerpt", "label" and "newspaper" columns of each split.
from protest_impact.data.protests.detection import load_glpn_dataset

# NOTE(review): the recorded log suggests the data comes from a local Hugging Face
# `datasets` CSV cache — confirm in `load_glpn_dataset`.
glpn = load_glpn_dataset()

Using custom data configuration default-552cac500ccb144a
Found cached dataset csv (/Users/david/.cache/huggingface/datasets/csv/default-552cac500ccb144a/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/5 [00:00<?, ?it/s]

Loading cached processed dataset at /Users/david/.cache/huggingface/datasets/csv/default-552cac500ccb144a/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-ceafcab5597f7c17.arrow
Loading cached processed dataset at /Users/david/.cache/huggingface/datasets/csv/default-552cac500ccb144a/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-41bc297fe33d9da3.arrow
Loading cached processed dataset at /Users/david/.cache/huggingface/datasets/csv/default-552cac500ccb144a/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-201012f13e291afc.arrow
Loading cached processed dataset at /Users/david/.cache/huggingface/datasets/csv/default-552cac500ccb144a/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-b939ffa2b84d33a0.arrow
Loading cached processed dataset at /Users/david/.cache/huggingface/datasets/csv/default-552cac500ccb144a/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317

In [3]:
# Fit a stand-alone TF-IDF vectorizer on the training excerpts.
# The grid-search cell constructs its own TfidfVectorizer inside a Pipeline, and
# no later cell shown in this notebook references this fitted `tfidf` instance.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_settings = {
    "max_df": 0.95,  # drop terms occurring in more than 95% of the excerpts
    "min_df": 2,  # drop terms occurring in fewer than 2 excerpts
    "max_features": 1000,  # cap the vocabulary size
    "ngram_range": (1, 3),  # unigrams up to trigrams
}
tfidf = TfidfVectorizer(**tfidf_settings)

# Kept as the last expression so the notebook displays the fitted vectorizer.
tfidf.fit(glpn["train"]["excerpt"])

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Baseline classifier: TF-IDF features feeding a logistic regression.
# The step names ("tfidf", "clf") are the prefixes used by the search space below.
pipe = Pipeline(steps=[("tfidf", TfidfVectorizer()), ("clf", LogisticRegression())])

# Search space: 2 * 2 * 2 * 3 * 3 = 72 candidate settings.
param_grid = dict(
    tfidf__max_df=[0.95, 0.99],
    tfidf__min_df=[2, 5],
    tfidf__max_features=[1000, 2000],
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    clf__C=[0.1, 1, 10],
)

# Exhaustive 5-fold cross-validated search on the training split (n_jobs=-1: all cores).
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
train_split = glpn["train"]
grid.fit(train_split["excerpt"], train_split["label"])

# Bare expression: the notebook renders the winning parameter combination.
grid.best_params_

Fitting 5 folds for each of 72 candidates, totalling 360 fits


{'clf__C': 10,
 'tfidf__max_df': 0.99,
 'tfidf__max_features': 2000,
 'tfidf__min_df': 5,
 'tfidf__ngram_range': (1, 1)}

In [5]:
# Show the 50 features with the largest logistic-regression coefficients
# of the best pipeline found by the grid search.
#
# pandas is imported in this cell because no other cell shown in this notebook
# imports it; without the import the cell fails with NameError on a fresh kernel.
import pandas as pd

best_pipeline = grid.best_estimator_

# `coef_` becomes a one-row frame with one column per vocabulary term; after
# transposing, the terms form the index and the coefficients sit in column 0.
pd.DataFrame(
    best_pipeline.named_steps["clf"].coef_,
    columns=best_pipeline.named_steps["tfidf"].get_feature_names_out(),
).T.sort_values(0, ascending=False).head(50)

Unnamed: 0,0
demonstranten,8.927157
kundgebung,8.420434
demonstration,7.042979
unterschriften,6.819035
legida,6.692747
streik,6.616384
gegen,6.529716
demo,5.549451
bürgerbegehren,5.39145
2011,4.751349


In [6]:
# Report per-class precision / recall / F1 of the tuned pipeline on the
# dev, test, test.time and test.loc splits.
from sklearn.metrics import classification_report

held_out_splits = ("dev", "test", "test.time", "test.loc")
for split_name in held_out_splits:
    print(split_name)
    subset = glpn[split_name]
    y_true = subset["label"]
    y_pred = grid.predict(subset["excerpt"])
    print(classification_report(y_true, y_pred))

dev
              precision    recall  f1-score   support

           0       0.81      0.74      0.77       122
           1       0.80      0.86      0.83       152

    accuracy                           0.81       274
   macro avg       0.81      0.80      0.80       274
weighted avg       0.81      0.81      0.81       274

test
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       217
           1       0.89      0.87      0.88       330

    accuracy                           0.85       547
   macro avg       0.85      0.85      0.85       547
weighted avg       0.85      0.85      0.85       547

test.time
              precision    recall  f1-score   support

           0       0.55      0.77      0.64       217
           1       0.89      0.75      0.81       535

    accuracy                           0.75       752
   macro avg       0.72      0.76      0.73       752
weighted avg       0.79      0.75      0.76       752



In [2]:
import optuna

from protest_impact.data.protests.detection.simple_classification import objective

# Attach to the Optuna study "glpn-3" persisted in the local SQLite file.
# With load_if_exists=True an already stored study of that name is reused
# instead of raising a duplicate-study error.
study = optuna.create_study(
    study_name="glpn-3",
    storage="sqlite:///db.sqlite3",
    load_if_exists=True,
    direction="maximize",
)

# The search itself is left commented out; uncomment to run 100 trials:
# study.optimize(objective, n_trials=100)

[I 2023-02-04 12:50:08,962] Using an existing study with name 'glpn-3' instead of creating a new one.


In [8]:
# Hyperparameters of the best trial in `study`
# (bare expression, so the notebook renders the dict as the cell output).
study.best_params

{'classifier': 'XGBClassifier',
 'feature_extraction': 'tfidf',
 'max_depth': 32,
 'max_features': 702,
 'n_estimators': 73,
 'ngram_range': 2}

In [9]:
# Objective value of the best trial in `study` (the study direction is "maximize").
# NOTE(review): which metric `objective` returns is not visible here — confirm
# in simple_classification.objective.
study.best_value

0.8377214235718516

In [10]:
# Call the project's `objective` on the study's best trial with return_model=True.
# NOTE(review): presumably this rebuilds and fits the best configuration and returns
# the fitted estimator (the next cell calls `model.predict`) — confirm in
# simple_classification.objective.
model = objective(study.best_trial, return_model=True)

In [12]:
# Evaluate the model obtained from the best trial on the test, test.time
# and test.loc splits.

from sklearn.metrics import classification_report

for split_name in ("test", "test.time", "test.loc"):
    print(split_name)
    part = glpn[split_name]
    expected = part["label"]
    predicted = model.predict(part["excerpt"])
    print(classification_report(expected, predicted))

test
              precision    recall  f1-score   support

           0       0.81      0.77      0.79       217
           1       0.86      0.88      0.87       330

    accuracy                           0.84       547
   macro avg       0.83      0.83      0.83       547
weighted avg       0.84      0.84      0.84       547

test.time
              precision    recall  f1-score   support

           0       0.57      0.83      0.67       217
           1       0.91      0.74      0.82       535

    accuracy                           0.77       752
   macro avg       0.74      0.79      0.75       752
weighted avg       0.81      0.77      0.78       752

test.loc
              precision    recall  f1-score   support

           0       0.91      0.90      0.91       395
           1       0.59      0.61      0.60        90

    accuracy                           0.85       485
   macro avg       0.75      0.76      0.75       485
weighted avg       0.85      0.85      0.85       485

In [13]:
# Show how newspapers and labels are distributed within each split.
from collections import Counter

for split_name in ("train", "dev", "test", "test.time", "test.loc"):
    print(split_name)
    # One Counter per column, printed in the order newspaper -> label.
    for column in ("newspaper", "label"):
        print(Counter(glpn[split_name][column]))

train
Counter({'Stuttgarter Zeitung': 1280, 'Leipziger Volkszeitung': 517, 'Weser Kurier': 117})
Counter({1: 1117, 0: 797})
dev
Counter({'Stuttgarter Zeitung': 187, 'Leipziger Volkszeitung': 69, 'Weser Kurier': 18})
Counter({1: 152, 0: 122})
test
Counter({'Stuttgarter Zeitung': 365, 'Leipziger Volkszeitung': 143, 'Weser Kurier': 39})
Counter({1: 330, 0: 217})
test.time
Counter({'Leipziger Volkszeitung': 391, 'Weser Kurier': 361})
Counter({1: 535, 0: 217})
test.loc
Counter({'Sächsische Zeitung': 485})
Counter({0: 395, 1: 90})


In [19]:
# Second study, tuned with cross-validation ("glpn.cv").
# Note that this rebinds `study`, so the cells below refer to this study.

study = optuna.create_study(
    study_name="glpn.cv",
    storage="sqlite:///db.sqlite3",
    load_if_exists=True,
    direction="maximize",
)

# The search itself is left commented out. Re-running it additionally needs
# `from functools import partial`, which no cell shown in this notebook imports:
# study.optimize(partial(objective, eval="cv"), n_trials=100)

[I 2023-02-02 16:07:24,297] Using an existing study with name 'glpn.cv' instead of creating a new one.


In [20]:
# Hyperparameters of the best trial in `study`
# (bare expression, so the notebook renders the dict as the cell output).
study.best_params

{'C': 4.602833645797326,
 'classifier': 'LogisticRegression',
 'feature_extraction': 'tfidf',
 'max_features': 669,
 'ngram_range': 1}

In [21]:
# Objective value of the best trial in `study` (the study direction is "maximize").
# NOTE(review): presumably a cross-validated score, since the study was tuned
# with eval="cv" — confirm in simple_classification.objective.
study.best_value

0.7476544079909743

In [22]:
# Evaluate the cross-validation-tuned configuration on test, test.time and test.loc.
# NOTE(review): presumably `objective(..., return_model=True, eval="cv")` returns the
# fitted estimator for the given trial — confirm in simple_classification.objective.
# NOTE(review): the recorded `test` report below is identical to the one recorded for
# the "glpn-3" model — confirm it is not a stale output.

best_cv_trial = study.best_trial
model = objective(best_cv_trial, return_model=True, eval="cv")

for split_name in ("test", "test.time", "test.loc"):
    print(split_name)
    part = glpn[split_name]
    expected = part["label"]
    predicted = model.predict(part["excerpt"])
    print(classification_report(expected, predicted))

test
              precision    recall  f1-score   support

           0       0.81      0.77      0.79       217
           1       0.86      0.88      0.87       330

    accuracy                           0.84       547
   macro avg       0.83      0.83      0.83       547
weighted avg       0.84      0.84      0.84       547

test.time
              precision    recall  f1-score   support

           0       0.56      0.78      0.66       217
           1       0.90      0.75      0.82       535

    accuracy                           0.76       752
   macro avg       0.73      0.77      0.74       752
weighted avg       0.80      0.76      0.77       752

test.loc
              precision    recall  f1-score   support

           0       0.94      0.75      0.83       395
           1       0.42      0.80      0.55        90

    accuracy                           0.76       485
   macro avg       0.68      0.77      0.69       485
weighted avg       0.85      0.76      0.78       485