# Dataset classification

## Imports

In [None]:
import csv
import json
import sys

import joblib
import numpy as np
import pandas as pd
import sklearn
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## Loading

### Event embeddings and anchors

In [None]:
# 'all.json' is read as JSON-lines: every line is parsed as its own JSON document.
with open('all.json') as handle:
    data = [json.loads(line) for line in handle]

# Each record's 'idx' must equal its position in the file, so positional
# indexing and 'idx'-based indexing are interchangeable below.
for position, record in enumerate(data):
    assert record['idx'] == position

In [None]:
# Canonical (sorted) list of embedding keys, taken from the first record.
prototypes = sorted(data[0]['embedding'])
# Every record must expose exactly the same set of embedding keys.
assert all(sorted(record['embedding']) == prototypes for record in data)

# Build one vector per record, with components ordered like `prototypes`,
# and store it in the slot given by the record's 'idx'.
embeddings = [None] * len(data)
for record in data:
    vector = [record['embedding'][key] for key in prototypes]
    embeddings[record['idx']] = vector
# No slot may be left unfilled.
assert all(vector is not None for vector in embeddings)

In [None]:
# One row per record (row position == record 'idx'); columns are integer
# positions that follow the sorted order of `prototypes`.
dataset = pd.DataFrame(data=embeddings)

### Event types as classes

In [None]:
# The class label of each sample is the 'dataset' field of its 'graph' entry.
with open('../../stratified_samples/stratified_1000_all.json', 'r') as labels_file:
    samples = json.load(labels_file)
classes = [sample['graph']['dataset'] for sample in samples]

In [None]:
# Features (`dataset`) and labels (`classes`) are loaded from two different
# files and are paired purely by position, so fail early and clearly if the
# two sources do not even have the same number of samples.
# NOTE(review): equal length does not prove equal ordering - confirm that both
# files list the samples in the same order.
assert len(dataset) == len(classes), (
    f"feature/label mismatch: {len(dataset)} embeddings vs {len(classes)} labels"
)

# Work on copies so the loaded objects stay untouched by later cells.
X = dataset.copy()
y = classes.copy()

## Classification

### SVM

In [None]:
# Hold out 30% of the samples for validation; the split is stratified on the
# labels and seeded so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Number of cross-validation folds used by every grid search below.
K = 10
# Shuffled, seeded stratified K-fold splitter shared by all searches.
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

In [None]:
# Two-step pipeline: a (searchable) scaling step followed by a support vector classifier.
model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(random_state=42, max_iter=1_000_000)),
    ]
)

# Search space: scaling switched off (None) or on, four kernels, and nine
# values of C spaced logarithmically from 1e1 to 1e9.
grid = dict(
    scaler=[None, StandardScaler()],
    svc__kernel=["linear", "rbf", "poly", "sigmoid"],
    svc__C=np.logspace(1, 9, num=9),
)

In [None]:
# training
gs_svm = GridSearchCV(model, grid, cv=skf, n_jobs=-1)
%time gs_svm.fit(X_train, y_train)

Wall time: 3min 35s


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('svc',
                                        SVC(max_iter=1000000,
                                            random_state=42))]),
             n_jobs=-1,
             param_grid={'scaler': [None, StandardScaler()],
                         'svc__C': array([1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07, 1.e+08,
       1.e+09]),
                         'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid']})

In [None]:
# Hyper-parameter combination selected by the search.
print('Best parameters:', gs_svm.best_params_)

# Score the fitted search object on the training split and on the held-out split.
acc_train, acc_val = (
    gs_svm.score(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)
print(f'Accuracy on train {acc_train}')
print(f'Accuracy on val {acc_val}')

Best parameters: {'scaler': StandardScaler(), 'svc__C': 100.0, 'svc__kernel': 'linear'}
Accuracy on train 0.7038777908343126
Accuracy on val 0.6054794520547945


In [None]:
# confusion matrix
y_pred = gs_svm.predict(X_val)
cm = confusion_matrix(y_val, y_pred)
pd.DataFrame(cm, index=gs_svm.best_estimator_.classes_, columns=gs_svm.best_estimator_.classes_)

Unnamed: 0,cg,epi,ge11,ge13,gro13,id11,mlee,pc,st09
cg,34,0,0,0,0,1,12,13,0
epi,0,28,2,0,0,0,0,0,0
ge11,0,3,12,7,0,1,0,0,19
ge13,0,2,9,10,0,2,0,0,7
gro13,0,0,0,0,40,0,0,0,0
id11,0,2,7,2,0,13,0,0,5
mlee,9,0,0,0,0,0,23,12,0
pc,2,0,0,0,0,0,2,48,0
st09,0,2,18,4,0,1,0,0,13


In [None]:
# F1 on the validation split, averaged over classes with average="weighted".
f1 = f1_score(y_true=y_val, y_pred=y_pred, average="weighted")
print(f"f1-score: {f1}")

f1-score: 0.6012808560002124


In [None]:
from sklearn.linear_model import LogisticRegression

model = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(random_state=42))
])

grid = {}

gs_svm = GridSearchCV(model, grid, cv=skf, n_jobs=-1)
%time gs_svm.fit(X_train, y_train)

Wall time: 409 ms


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('lr',
                                        LogisticRegression(random_state=42))]),
             n_jobs=-1, param_grid={})

In [None]:
# Evaluate the most recently fitted search object (still named `gs_svm`) on
# the training split and on the held-out validation split.
acc_train = gs_svm.score(X_train, y_train)
acc_val = gs_svm.score(X_val, y_val)
print(f'Accuracy on train {acc_train}')
print(f'Accuracy on val {acc_val}')

# F1 of the validation predictions, averaged with average="weighted".
y_pred = gs_svm.predict(X_val)
f1 = f1_score(y_true=y_val, y_pred=y_pred, average="weighted")
print(f"f1-score: {f1}")

Accuracy on train 0.6392479435957696
Accuracy on val 0.5506849315068493
f1-score: 0.5392121730679921
