# Event type classification

## Imports

In [None]:
import csv
import json
import sys

import joblib
import networkx as nx
import numpy as np
import pandas as pd
import sklearn
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## Loading

### Event embeddings and anchors

In [None]:
# all.json is read line by line: every line is parsed as its own JSON document.
with open('all.json') as ff:
    data = [json.loads(line) for line in ff]

# Sanity check: the record stored at position k must carry idx == k, so that
# list position and record index can be used interchangeably afterwards.
for position, record in enumerate(data):
    assert record['idx'] == position

In [None]:
# The prototype names are the keys of a record's 'embedding' mapping; sorting
# them fixes one column order that is shared by every record.
prototypes = sorted(data[0]['embedding'])
# Every record must expose exactly the same set of prototypes.
assert all(sorted(record['embedding'].keys()) == prototypes for record in data)

# Build one embedding vector per record and store it at the slot given by the
# record's own idx, so the resulting list is ordered by idx.
embeddings = [None] * len(data)
for record in data:
    vector = [record['embedding'][name] for name in prototypes]
    embeddings[record['idx']] = vector
# No slot may be left unfilled.
assert not any(vector is None for vector in embeddings)

In [None]:
# Number of prototypes, i.e. the length of every embedding vector built above.
len(prototypes)

32

In [None]:
# One row per record (row position == record idx) and one column per prototype,
# in sorted prototype-name order; columns keep pandas' default integer labels.
dataset = pd.DataFrame(embeddings)

In [None]:
# Preview the first rows of the embedding matrix.
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,215.901003,216.309764,213.523169,161.595067,213.222548,213.328332,139.734635,212.444329,213.426005,216.58355,...,285.857375,284.059703,212.90983,213.546108,213.47141,179.341547,286.449943,135.692731,135.429587,162.814418
1,243.986593,243.872966,242.690744,191.457743,240.383261,242.334809,204.472943,242.080778,242.387419,243.592036,...,286.477682,283.863694,243.021083,242.64597,240.132782,208.710766,285.320822,164.48708,164.760553,162.666314
2,219.678998,219.986343,218.58829,161.837329,217.764351,218.63221,105.009719,218.548864,218.266195,219.199046,...,257.100417,256.232567,218.575548,218.657557,217.305159,185.309093,257.935326,63.308232,62.951722,84.342912
3,214.958109,214.410619,213.523838,161.996479,213.510981,213.275588,139.005606,213.644129,213.720719,214.917097,...,257.967859,256.43812,213.354627,213.594662,174.630941,179.718875,256.653712,174.316705,175.009398,162.427797
4,223.845106,223.753555,223.633124,153.327309,223.627844,223.247642,77.586831,223.294104,223.462733,223.708567,...,236.212629,234.1026,222.594378,223.408114,223.517863,181.915837,235.871011,125.739669,126.295838,103.00914


### Event types as classes

In [None]:
# Rebuild the sampled graphs from their node-link JSON representation.
# Only the parsing needs the file handle, so it is released right afterwards.
with open('../../stratified_samples/stratified_1000_all.json', 'r') as file:
    graphs = [nx.node_link_graph(g) for g in json.load(file)]

# Root node id of every graph, read from the graph-level 'root' attribute.
roots = [g.graph['root'] for g in graphs]
# (node, type) pairs of every graph; kept for inspection / later cells.
nodes_list = [list(g.nodes.data('type')) for g in graphs]

# The class of a graph is the 'type' attribute of its root node (None when the
# root carries no 'type', as with `nodes.data('type')`).
# The root is looked up directly instead of scanning all nodes: a root id that
# is missing from its graph now raises KeyError instead of being skipped
# silently, which would leave `classes` shorter than `graphs` and shift every
# following label against its feature row.
# NOTE(review): assumes graphs are stored in the same order as the `idx` of the
# embedding records -- TODO confirm against the sampling script.
classes = [g.nodes[root].get('type') for g, root in zip(graphs, roots)]

In [None]:
# Work on copies so that changes made to X / y while modelling do not touch
# `dataset` / `classes`.
# NOTE(review): assumes row i of `dataset` and `classes[i]` describe the same
# event -- TODO confirm.
X = dataset.copy()  # feature matrix: one embedding vector per row
y = classes.copy()  # labels: type of each graph's root node

## Classification

### SVM

In [None]:
# Hold out 30% of the samples for validation; the split is stratified on the
# class labels and seeded so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
# Number of cross-validation folds used during hyper-parameter search.
K = 10
# Stratified, shuffled K-fold splitter with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

In [None]:
# Two-step pipeline: a feature scaler followed by a support vector classifier.
svm_steps = [
    ("scaler", StandardScaler()),
    ("svc", SVC(random_state=42, max_iter=1_000_000)),
]
model = Pipeline(steps=svm_steps)

# Search space: scaling on/off (None disables the step), the kernel family and
# the regularisation strength C over nine powers of ten from 1e1 to 1e9.
grid = {
    "scaler": [None, StandardScaler()],
    "svc__kernel": ["linear", "rbf", "poly", "sigmoid"],
    "svc__C": np.logspace(1, 9, num=9),
}

In [None]:
# training
gs_svm = GridSearchCV(model, grid, cv=skf, n_jobs=-1)
%time gs_svm.fit(X_train, y_train)



Wall time: 29.7 s


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('svc',
                                        SVC(max_iter=1000000,
                                            random_state=42))]),
             n_jobs=-1,
             param_grid={'scaler': [None, StandardScaler()],
                         'svc__C': array([1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07, 1.e+08,
       1.e+09]),
                         'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid']})

In [None]:
# Parameter combination selected by the grid search.
print('Best parameters:', gs_svm.best_params_)

# Score the fitted search object on the training and the validation split.
acc_train, acc_val = gs_svm.score(X_train, y_train), gs_svm.score(X_val, y_val)
print(f'Accuracy on train {acc_train}')
print(f'Accuracy on val {acc_val}')

Best parameters: {'scaler': None, 'svc__C': 100.0, 'svc__kernel': 'poly'}
Accuracy on train 0.982373678025852
Accuracy on val 0.8191780821917808


In [None]:
# Predicted classes for the held-out validation samples.
y_pred = gs_svm.predict(X_val)

In [None]:
# F1 on the validation split, averaged over classes with "weighted" averaging.
f1 = f1_score(y_true=y_val, y_pred=y_pred, average="weighted")
print(f"f1-score: {f1}")

f1-score: 0.8104892779733482


In [None]:
from sklearn.linear_model import LogisticRegression

model = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(random_state=42, max_iter=1_000_000))
])

grid = {'scaler': [None, StandardScaler()]}

gs_svm = GridSearchCV(model, grid, cv=skf, n_jobs=-1)
%time gs_svm.fit(X_train, y_train)



Wall time: 52.3 s


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('lr',
                                        LogisticRegression(max_iter=1000000,
                                                           random_state=42))]),
             n_jobs=-1, param_grid={'scaler': [None, StandardScaler()]})

In [None]:
# At this point `gs_svm` holds the logistic-regression search fitted in the
# previous cell. Score it on the training and the validation split.
acc_train, acc_val = gs_svm.score(X_train, y_train), gs_svm.score(X_val, y_val)
print(f'Accuracy on train {acc_train}')
print(f'Accuracy on val {acc_val}')

# Validation predictions and their F1 with "weighted" averaging.
y_pred = gs_svm.predict(X_val)
f1 = f1_score(y_true=y_val, y_pred=y_pred, average="weighted")
print(f"f1-score: {f1}")

Accuracy on train 0.8495887191539365
Accuracy on val 0.8246575342465754
f1-score: 0.7971569048730222
