In [2]:
from utils import *
from preprocessing import preprocess
from windowfy import windowfy
from featurizing import featurize
from tfidf_featurizer import combine_features, tfidf_featurize
from training import train, do_ensemble, do_train
from training_traditional import train_and_evaluate
from eval_erisk import evaluate
from IPython.display import display, Markdown
from itertools import product
from numpy.random import seed
import tensorflow



In [3]:
# Reproducibility: seed the global RNGs from one constant so the value that is
# logged can never drift away from the value that is actually applied.
SEED = 42

seed(SEED)                        # `seed` is numpy.random.seed (see the import cell)
tensorflow.random.set_seed(SEED)  # TensorFlow's global random seed
# `logger` is not imported by name; presumably it comes from `from utils import *`.
logger("Initialized numpy random and tensorflow random seed at {}".format(SEED))

Initialized numpy random and tensorflow random seed at 42


In [4]:
# --- 1. Window the users' data ---------------------------------------------
# The unpacking order below mirrors the tuple returned by the project helper
# `windowfy`; every keyword argument is passed through unchanged.
(
    train_users,
    y_train,
    test_users,
    y_test,
    train_samples,
    X_train,
    X_test,
) = windowfy(
    window_size=10,
    max_size=100,
    sample_weights_size=100,
    is_oversample=False,
    include_new_data=True,
)

# --- 2. Hand-crafted text features -----------------------------------------
feats_text_train, feats_text_test = featurize(
    calculate_feats=True,
    include_feats=["first_prons", "sentiment", "nssi"],
    train_users=train_users,
    test_users=test_users,
    discretize=True,
)

# --- 3. TF-IDF features ----------------------------------------------------
tfidf_train, tfidf_test = tfidf_featurize(
    train_users,
    test_users,
    max_features=50000,
)

# --- 4. Merge both feature families ----------------------------------------
# TF-IDF comes first, hand-crafted features second, for train and test alike.
# NOTE(review): the recorded run logs "Is the combined different from tfidf:
# False" -- confirm the hand-crafted features really end up in the matrix.
train_feature_blocks = [tfidf_train, feats_text_train]
test_feature_blocks = [tfidf_test, feats_text_test]
feats_train, feats_test = combine_features(train_feature_blocks, test_feature_blocks)

# --- 5. Fit and score the SVM ----------------------------------------------
svm_options = {"classifier_name": "svm", "strategy": "weights"}
y_pred, classifier = train_and_evaluate(
    feats_train,
    y_train,
    feats_test,
    y_test,
    train_samples,
    **svm_options,
)

# --- 6. eRisk-style evaluation ---------------------------------------------
# NOTE(review): the positional arguments `1, 10` are unexplained here; the 10
# may be meant to match window_size above -- confirm against eval_erisk.evaluate.
# Kept as the cell's last expression so its return value is still displayed.
evaluate(1, 10, {"test": "test"}, y_pred=y_pred, test_users=test_users)


Windowfying training users
Windowfying test users

Finished windowfying
Featurizing calculate_feats=True, normalize=False, discretize=True, discretize_size=10, include_feats=['first_prons', 'sentiment', 'nssi']
Initialized numpy random and tensorflow random seed at 42
Data size: 32892, 32892
Data size: 25611, 25611
Calculating first prons
Calculating sentiment
Calculating NSSI words
Calculating first prons
Calculating sentiment
Calculating NSSI words
Discretizing


  "replaced with 0." % jj)


Is the combined different from tfidf: False
              precision    recall  f1-score   support

           0       0.90      0.91      0.91     21304
           1       0.53      0.49      0.51      4307

    accuracy                           0.84     25611
   macro avg       0.72      0.70      0.71     25611
weighted avg       0.84      0.84      0.84     25611

[[19472  1832]
 [ 2205  2102]]
{'precision': 0.3879310344827586, 'recall': 0.8653846153846154, 'F1': 0.5357142857142857, 'ERDE_5': 0.3280570051854223, 'ERDE_50': 0.12621841939396872, 'median_latency_tps': 11.0, 'median_penalty_tps': 0.03898023902249159, 'speed': 0.9610197609775084, 'latency_weighted_f1': 0.5148320148093795}


{'precision': 0.3879310344827586,
 'recall': 0.8653846153846154,
 'F1': 0.5357142857142857,
 'ERDE_5': 0.3280570051854223,
 'ERDE_50': 0.12621841939396872,
 'median_latency_tps': 11.0,
 'median_penalty_tps': 0.03898023902249159,
 'speed': 0.9610197609775084,
 'latency_weighted_f1': 0.5148320148093795}

In [5]:
# Same features as the previous cell, this time scored with the
# "naive_bayes" classifier and the "weights" strategy.
# `y_pred` / `classifier` are rebound here, replacing the previous model's values.
nb_options = {"classifier_name": "naive_bayes", "strategy": "weights"}
y_pred, classifier = train_and_evaluate(
    feats_train,
    y_train,
    feats_test,
    y_test,
    train_samples,
    **nb_options,
)

# Last expression of the cell, so the returned metrics are displayed.
evaluate(1, 10, {"test": "test"}, y_pred=y_pred, test_users=test_users)

              precision    recall  f1-score   support

           0       0.90      0.91      0.90     21304
           1       0.53      0.48      0.50      4307

    accuracy                           0.84     25611
   macro avg       0.71      0.69      0.70     25611
weighted avg       0.83      0.84      0.84     25611

[[19457  1847]
 [ 2258  2049]]
{'precision': 0.4321608040201005, 'recall': 0.8269230769230769, 'F1': 0.5676567656765675, 'ERDE_5': 0.3112049952721925, 'ERDE_50': 0.11189325165030806, 'median_latency_tps': 11.0, 'median_penalty_tps': 0.03898023902249159, 'speed': 0.9610197609775084, 'latency_weighted_f1': 0.5455293692677604}


{'precision': 0.4321608040201005,
 'recall': 0.8269230769230769,
 'F1': 0.5676567656765675,
 'ERDE_5': 0.3112049952721925,
 'ERDE_50': 0.11189325165030806,
 'median_latency_tps': 11.0,
 'median_penalty_tps': 0.03898023902249159,
 'speed': 0.9610197609775084,
 'latency_weighted_f1': 0.5455293692677604}

In [6]:
# Same features again, scored with the "linear_svm" classifier and the
# "weights" strategy.
# `y_pred` / `classifier` are rebound here, replacing the previous model's values.
linear_svm_options = {"classifier_name": "linear_svm", "strategy": "weights"}
y_pred, classifier = train_and_evaluate(
    feats_train,
    y_train,
    feats_test,
    y_test,
    train_samples,
    **linear_svm_options,
)

# Last expression of the cell, so the returned metrics are displayed.
evaluate(1, 10, {"test": "test"}, y_pred=y_pred, test_users=test_users)

              precision    recall  f1-score   support

           0       0.90      0.90      0.90     21304
           1       0.50      0.48      0.49      4307

    accuracy                           0.83     25611
   macro avg       0.70      0.69      0.69     25611
weighted avg       0.83      0.83      0.83     25611

[[19229  2075]
 [ 2237  2070]]
{'precision': 0.33088235294117646, 'recall': 0.8653846153846154, 'F1': 0.4787234042553191, 'ERDE_5': 0.3513385895091939, 'ERDE_50': 0.1453432079738996, 'median_latency_tps': 11.0, 'median_penalty_tps': 0.03898023902249159, 'speed': 0.9610197609775084, 'latency_weighted_f1': 0.46006265153178594}


{'precision': 0.33088235294117646,
 'recall': 0.8653846153846154,
 'F1': 0.4787234042553191,
 'ERDE_5': 0.3513385895091939,
 'ERDE_50': 0.1453432079738996,
 'median_latency_tps': 11.0,
 'median_penalty_tps': 0.03898023902249159,
 'speed': 0.9610197609775084,
 'latency_weighted_f1': 0.46006265153178594}