In [1]:
from utils import *
from preprocessing import preprocess
from windowfy import windowfy
from featurizing import featurize
from tfidf_featurizer import combine_features, tfidf_featurize
from training import train, do_ensemble, do_train
from training_traditional import train_and_evaluate
from eval_erisk import evaluate
from IPython.display import display, Markdown
from itertools import product
from numpy.random import seed
import tensorflow



In [2]:
# Pin both random sources used by this notebook to the same fixed value:
# TensorFlow's global seed and numpy's legacy global RNG (`seed` is
# numpy.random.seed, imported at the top of the notebook).
tensorflow.random.set_seed(42)
seed(42)
# `logger` is not imported by name here -- presumably it comes from the
# `from utils import *` line; confirm.
logger("Initialized numpy random and tensorflow random seed at 42")

Initialized numpy random and tensorflow random seed at 42


In [4]:
# --- 1. Window the raw user data ---------------------------------------------
# windowfy hands back seven values; the unpacking order below mirrors its
# return order and the resulting names are reused by the cells that follow.
(
    train_users,
    y_train,
    test_users,
    y_test,
    train_samples,
    X_train,
    X_test,
) = windowfy(
    window_size=10,
    max_size=20,
    sample_weights_size=20,
    is_oversample=False,
    include_new_data=True,
)

# --- 2. Text-derived features (first-person pronouns, sentiment, NSSI) -------
feats_text_train, feats_text_test = featurize(
    calculate_feats=True,
    include_feats=["first_prons", "sentiment", "nssi"],
    train_users=train_users,
    test_users=test_users,
    discretize=True,
)

# --- 3. TF-IDF features, capped at 50000 features ----------------------------
tfidf_train, tfidf_test = tfidf_featurize(
    train_users,
    test_users,
    max_features=50000,
)

# --- 4. Merge both feature families (TF-IDF first, text features second) -----
# NOTE(review): the saved output of this cell prints
# "Is the combined different from tfidf: False" -- confirm that the text
# features really end up in the combined matrix.
feats_train, feats_test = combine_features(
    [tfidf_train, feats_text_train],
    [tfidf_test, feats_text_test],
)

# --- 5. Fit and score the "svm" classifier with the "weights" strategy -------
y_pred, classifier = train_and_evaluate(
    feats_train,
    y_train,
    feats_test,
    y_test,
    train_samples,
    classifier_name="svm",
    strategy="weights",
)

# Left as the bare last expression so the notebook renders the returned
# metrics. NOTE(review): the meaning of the positional 1 and 10 and of the
# {"test": "test"} placeholder is not visible from this notebook -- confirm
# against eval_erisk.evaluate.
evaluate(
    1,
    10,
    {"test": "test"},
    y_pred=y_pred,
    test_users=test_users,
)


Windowfying training users
Windowfying test users

Finished windowfying
Featurizing calculate_feats=True, normalize=False, discretize=True, discretize_size=10, include_feats=['first_prons', 'sentiment', 'nssi']
Initialized numpy random and tensorflow random seed at 42
Data size: 5424, 5424
Data size: 4650, 4650
Calculating first prons
Calculating sentiment
Calculating NSSI words
Calculating first prons
Calculating sentiment
Calculating NSSI words
Discretizing


  "replaced with 0." % jj)
  centers = km.fit(column[:, None]).cluster_centers_[:, 0]
  'decreasing the number of bins.' % jj)


Is the combined different from tfidf: False
              precision    recall  f1-score   support

           0       0.86      0.87      0.87      3602
           1       0.55      0.53      0.54      1048

    accuracy                           0.80      4650
   macro avg       0.71      0.70      0.70      4650
weighted avg       0.79      0.80      0.79      4650

[[3142  460]
 [ 491  557]]
{'precision': 0.4550898203592814, 'recall': 0.7307692307692307, 'F1': 0.5608856088560886, 'ERDE_5': 0.2983885451186414, 'ERDE_50': 0.11908634150976129, 'median_latency_tps': 11.0, 'median_penalty_tps': 0.03898023902249159, 'speed': 0.9610197609775084, 'latency_weighted_f1': 0.5390221537586025}


{'precision': 0.4550898203592814,
 'recall': 0.7307692307692307,
 'F1': 0.5608856088560886,
 'ERDE_5': 0.2983885451186414,
 'ERDE_50': 0.11908634150976129,
 'median_latency_tps': 11.0,
 'median_penalty_tps': 0.03898023902249159,
 'speed': 0.9610197609775084,
 'latency_weighted_f1': 0.5390221537586025}

In [5]:
# Same experiment with the "naive_bayes" classifier. This cell reuses the
# feature matrices, labels and train_samples produced by the earlier
# feature-building cell, so that cell has to be run first.
# Note that y_pred and classifier are overwritten here.
y_pred, classifier = train_and_evaluate(
    feats_train,
    y_train,
    feats_test,
    y_test,
    train_samples,
    classifier_name="naive_bayes",
    strategy="weights",
)

# Bare last expression so the notebook shows the metrics that evaluate returns.
evaluate(
    1,
    10,
    {"test": "test"},
    y_pred=y_pred,
    test_users=test_users,
)

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      3602
           1       0.52      0.51      0.52      1048

    accuracy                           0.78      4650
   macro avg       0.69      0.69      0.69      4650
weighted avg       0.78      0.78      0.78      4650

[[3111  491]
 [ 510  538]]
{'precision': 0.44805194805194803, 'recall': 0.6634615384615384, 'F1': 0.5348837209302326, 'ERDE_5': 0.2949095034861263, 'ERDE_50': 0.13214738807013127, 'median_latency_tps': 11.0, 'median_penalty_tps': 0.03898023902249159, 'speed': 0.9610197609775084, 'latency_weighted_f1': 0.5140338256391325}


{'precision': 0.44805194805194803,
 'recall': 0.6634615384615384,
 'F1': 0.5348837209302326,
 'ERDE_5': 0.2949095034861263,
 'ERDE_50': 0.13214738807013127,
 'median_latency_tps': 11.0,
 'median_penalty_tps': 0.03898023902249159,
 'speed': 0.9610197609775084,
 'latency_weighted_f1': 0.5140338256391325}

In [6]:
# Same experiment with the "linear_svm" classifier. This cell reuses the
# feature matrices, labels and train_samples produced by the earlier
# feature-building cell, so that cell has to be run first.
# Note that y_pred and classifier are overwritten here.
y_pred, classifier = train_and_evaluate(
    feats_train,
    y_train,
    feats_test,
    y_test,
    train_samples,
    classifier_name="linear_svm",
    strategy="weights",
)

# Bare last expression so the notebook shows the metrics that evaluate returns.
evaluate(
    1,
    10,
    {"test": "test"},
    y_pred=y_pred,
    test_users=test_users,
)

              precision    recall  f1-score   support

           0       0.87      0.87      0.87      3602
           1       0.55      0.55      0.55      1048

    accuracy                           0.80      4650
   macro avg       0.71      0.71      0.71      4650
weighted avg       0.80      0.80      0.80      4650

[[3121  481]
 [ 470  578]]
{'precision': 0.4375, 'recall': 0.7403846153846154, 'F1': 0.55, 'ERDE_5': 0.30303327075402364, 'ERDE_50': 0.12137216437804965, 'median_latency_tps': 11.0, 'median_penalty_tps': 0.03898023902249159, 'speed': 0.9610197609775084, 'latency_weighted_f1': 0.5285608685376296}


{'precision': 0.4375,
 'recall': 0.7403846153846154,
 'F1': 0.55,
 'ERDE_5': 0.30303327075402364,
 'ERDE_50': 0.12137216437804965,
 'median_latency_tps': 11.0,
 'median_penalty_tps': 0.03898023902249159,
 'speed': 0.9610197609775084,
 'latency_weighted_f1': 0.5285608685376296}