In [1]:
import numpy as np
import pandas as pd
import pickle
import regex as re
from sklearn.model_selection import cross_val_predict

In [2]:
from preprocessing.parse_rs3_make_objects import EDUPair, EDU
from make_features import load_pairs_target, generate_feature_matrix, all_classification_reports, smote_oversampling

In [3]:
# Load the EDU pairs and their labels via the project helper
# (load_pairs_target takes no arguments here; where it reads from is defined in make_features).
pairs, target = load_pairs_target()

In [4]:
# Turn the loaded pairs into the feature matrix used by every model below.
# In the recorded run the shape shown under this cell is (4676, 6252).
X = generate_feature_matrix(pairs)

(4676, 6252)


In [5]:
# Oversample the feature matrix and labels; in the recorded run the row count
# grows from 4676 to 52248 while the 6252 columns are unchanged.
# NOTE(review): this resampling happens on the full data set, before any
# cross-validation split. Every cross_val_predict call below therefore splits
# the already-oversampled X_res, so rows generated from a training sample can
# end up in the held-out fold (presumably SMOTE-style synthetic neighbours,
# judging by the helper's name -- confirm in make_features). That would inflate
# the reported scores; consider resampling inside each training fold only.
X_res, target_res = smote_oversampling(X, target)

(52248, 6252)


In [6]:
# Shared seed, passed as random_state to the estimators in this notebook.
SEED = 669

In [9]:
## SVM
# Support-vector classifier with library-default hyper-parameters (only the
# seed is set), scored through 5-fold cross-validated predictions on the
# oversampled data and summarised by the project's reporting helper.
from sklearn.svm import SVC
svc = SVC(random_state=SEED)
pred_svc_cv = cross_val_predict(
    estimator=svc,
    X=X_res,
    y=target_res,
    cv=5,
    verbose=10,  # log progress after each fold; a fold took ~23 min in the recorded run
)
all_classification_reports(target_res, pred_svc_cv)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 23.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 46.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 70.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 93.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 118.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 118.1min finished


All together:
                              precision    recall  f1-score   support

               antithesis_NS       0.94      1.00      0.97      1866
              attribution_NS       0.45      0.45      0.45      1866
              attribution_SN       0.27      0.26      0.27      1866
               background_NS       0.35      0.42      0.38      1866
               background_SN       0.38      0.57      0.45      1866
             cause-effect_NS       0.20      0.01      0.02      1866
             cause-effect_SN       0.31      0.27      0.29      1866
                comparison_M       0.30      0.28      0.29      1866
               concession_NS       0.67      0.64      0.65      1866
               concession_SN       0.81      0.65      0.72      1866
                condition_NS       0.50      0.54      0.52      1866
                condition_SN       0.32      0.27      0.29      1866
                  contrast_M       0.78      0.03      0.05      1866
     

In [7]:
# ## KNN
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors=11)
# pred_knn_cv = cross_val_predict(knn, X_res, target_res, cv=5, verbose=10)
# all_classification_reports(target_res, pred_knn_cv)

In [8]:
## XGBoost (scikit-learn wrapper)
# Gradient-boosted trees via XGBClassifier, scored through 5-fold
# cross-validated predictions on the oversampled data.
# NOTE(review): X_res was oversampled before the folds are split, so the
# near-perfect scores recorded for this cell may be optimistic -- confirm
# by resampling inside the training folds only.
from xgboost import XGBClassifier
xgb = XGBClassifier(
    objective="multi:softmax",
    random_state=SEED,
)
pred_xgb_cv = cross_val_predict(
    estimator=xgb,
    X=X_res,
    y=target_res,
    cv=5,
    verbose=10,  # log progress after each fold
)
all_classification_reports(target_res, pred_xgb_cv)

  if diff:
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 14.0min remaining:    0.0s
  if diff:
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 28.7min remaining:    0.0s
  if diff:
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 42.4min remaining:    0.0s
  if diff:
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 56.1min remaining:    0.0s
  if diff:
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 69.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 69.7min finished


All together:
                              precision    recall  f1-score   support

               antithesis_NS       1.00      1.00      1.00      1866
              attribution_NS       0.99      0.98      0.98      1866
              attribution_SN       0.93      0.94      0.93      1866
               background_NS       0.99      0.98      0.99      1866
               background_SN       0.99      0.98      0.99      1866
             cause-effect_NS       0.99      0.96      0.98      1866
             cause-effect_SN       0.99      0.96      0.98      1866
                comparison_M       0.98      0.98      0.98      1866
               concession_NS       0.99      0.99      0.99      1866
               concession_SN       0.99      0.99      0.99      1866
                condition_NS       0.98      0.99      0.99      1866
                condition_SN       0.98      0.98      0.98      1866
                  contrast_M       0.94      0.94      0.94      1866
     

In [9]:
## Random Forest
# 1500-tree random forest, scored through 5-fold cross-validated predictions
# on the oversampled data and summarised by the project's reporting helper.
# n_jobs=-1 builds and queries the trees on all available cores; the recorded
# single-threaded run needed 83.6 min. With random_state fixed, the fitted
# forest should be the same as the single-threaded one.
# NOTE(review): X_res was oversampled before the folds are split, so the
# near-perfect scores recorded for this cell may be optimistic -- confirm
# by resampling inside the training folds only.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1500, random_state=SEED, n_jobs=-1)
pred_rf_cv = cross_val_predict(rf, X_res, target_res, cv=5, verbose=10)
all_classification_reports(target_res, pred_rf_cv)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 14.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 31.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 48.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 66.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 83.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 83.6min finished


All together:
                              precision    recall  f1-score   support

               antithesis_NS       1.00      1.00      1.00      1866
              attribution_NS       1.00      0.99      0.99      1866
              attribution_SN       0.99      0.95      0.97      1866
               background_NS       1.00      0.99      0.99      1866
               background_SN       1.00      0.99      0.99      1866
             cause-effect_NS       1.00      0.97      0.98      1866
             cause-effect_SN       1.00      0.97      0.99      1866
                comparison_M       1.00      0.98      0.99      1866
               concession_NS       0.99      0.99      0.99      1866
               concession_SN       1.00      0.99      1.00      1866
                condition_NS       0.99      0.99      0.99      1866
                condition_SN       0.99      0.98      0.99      1866
                  contrast_M       0.98      0.96      0.97      1866
     