In [25]:
from utils import *
from preprocessing import preprocess
from windowfy import windowfy
from featurizing import featurize
from tfidf_featurizer import combine_features, tfidf_featurize
from training import train, do_ensemble, do_train
from training_traditional import train_and_evaluate
from eval_erisk import evaluate, ensemble_vote
from IPython.display import display, Markdown
from itertools import product
from numpy.random import seed
import tensorflow
import numpy as np

In [7]:
seed(42)
tensorflow.random.set_seed(42) 
logger("Initialized numpy random and tensorflow random seed at 42")

Initialized numpy random and tensorflow random seed at 42


### With max_size = 10 and new data, sample_weights=10, oversample False, include_new_data=True

In [8]:
y_preds = {}

In [9]:
# prepare training data

In [10]:
train_users, y_train, test_users, y_test, train_samples, X_train, X_test = windowfy(window_size=10, max_size=50, sample_weights_size=20, is_oversample=False, include_new_data=True)
feats_train, feats_test = featurize(calculate_feats=True, include_feats=["first_prons","sentiment","nssi"], 
                       train_users=train_users, test_users=test_users, discretize=False)
tfidf_train, tfidf_test = tfidf_featurize(train_users, test_users, max_features=50000)

feats_train_comb, feats_test_comb = combine_features([tfidf_train, feats_train], [tfidf_test, feats_test])

Windowfying training users
Windowfying test users

Finished windowfying
Featurizing calculate_feats=True, normalize=False, discretize=False, discretize_size=10, include_feats=['first_prons', 'sentiment', 'nssi']
Initialized numpy random and tensorflow random seed at 42
Data size: 5424, 5424
Data size: 4650, 4650
Calculating first prons
Calculating sentiment
Calculating NSSI words
Calculating first prons
Calculating sentiment
Calculating NSSI words
Is the combined different from tfidf: False


In [11]:
# traditional classifiers

In [15]:
y_pred, classifier = train_and_evaluate(feats_train_comb, y_train, feats_test_comb, y_test, train_samples, 
                                        classifier_name="svm", strategy="weights")
eval_resul = evaluate(1, 10, {"test":"test"}, y_pred=y_pred, test_users=test_users)
y_preds["svm"] = y_pred

              precision    recall  f1-score   support

           0       0.87      0.77      0.82      3602
           1       0.43      0.61      0.51      1048

    accuracy                           0.73      4650
   macro avg       0.65      0.69      0.66      4650
weighted avg       0.77      0.73      0.75      4650

[[2771  831]
 [ 411  637]]


In [23]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
feats_train_comb_minimax = feats_train_comb.tocsc().toarray()
feats_test_comb_minimax = feats_test_comb.tocsc().toarray()
scaler.fit(feats_train_comb_minimax)
feats_train_comb_minmax = scaler.transform(feats_train_comb_minimax)
feats_test_comb_minmax = scaler.transform(feats_test_comb_minimax)

y_pred, classifier = train_and_evaluate(feats_train_comb_minmax, y_train, feats_test_comb_minmax, y_test, train_samples, 
                                        classifier_name="bayes", strategy="weights")
eval_resul = evaluate(1, 10, {"test":"test"}, y_pred=y_pred, test_users=test_users)
y_preds["bayes"] = y_pred

AttributeError: 'numpy.ndarray' object has no attribute 'tocsc'

In [18]:
# deep learning classifiers
batch_size=32
model_name = "cnn_model"
iterations = 10

In [19]:
model_resuls = {}
for i in range(0, iterations):
    y_pred = do_train(model_name=model_name, maxlen=1000, epochs=100, batch_size=batch_size,
                 shuffle=True, patience=30, feats_train=feats_train, feats_test=feats_test, 
                 X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, train_sample_weights=train_samples)
    eval_resul = evaluate(1, 10, {"test":"test"}, y_pred=y_pred, test_users=test_users)
    model_resuls[eval_resul['latency_weighted_f1']] = y_pred

y_preds[model_name] = model_resuls[max(model_resuls.keys())]

Starting training with cnn_model=cnn_model and maxlen=1000 and batch size=32
Generating embeddings
Data size: 5424
Training with callback
Restoring model weights from the end of the best epoch.
Epoch 00038: early stopping
Evaluating
Test Score: 0.97948157787323
Test Accuracy: 687.0
Entered here
              precision    recall  f1-score   support

           0       0.88      0.72      0.79      3602
           1       0.41      0.66      0.50      1048

    accuracy                           0.71      4650
   macro avg       0.64      0.69      0.65      4650
weighted avg       0.77      0.71      0.73      4650

[[2594 1008]
 [ 361  687]]
Finished training and evaluation
{'precision': 0.38425925925925924, 'recall': 0.7980769230769231, 'F1': 0.5187499999999999, 'ERDE_5': 0.32275047412298585, 'ERDE_50': 0.12694979572903334, 'median_latency_tps': 11.0, 'median_penalty_tps': 0.03898023902249159, 'speed': 0.9610197609775084, 'latency_weighted_f1': 0.4985290010070824}
Starting training wi

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Data size: 5424
Training with callback
Restoring model weights from the end of the best epoch.
Epoch 00051: early stopping
Evaluating
Test Score: 0.6318933963775635
Test Accuracy: 534.0
Entered here
              precision    recall  f1-score   support

           0       0.86      0.87      0.87      3602
           1       0.54      0.51      0.52      1048

    accuracy                           0.79      4650
   macro avg       0.70      0.69      0.69      4650
weighted avg       0.79      0.79      0.79      4650

[[3139  463]
 [ 514  534]]
Finished training and evaluation
{'precision': 0.5285714285714286, 'recall': 0.7115384615384616, 'F1': 0.6065573770491803, 'ERDE_5': 0.2838774569434117, 'ERDE_50': 0.10928357057827473, 'median_latency_tps': 11.0, 'median_penalty_tps': 0.03898023902249159, 'speed': 0.9610197609775084, 'latency_weighted_f1': 0.5829136255109477}
Starting training with cnn_model=cnn_model and maxlen=1000 and batch size=32
Generating embeddings
Data size: 5424
Trai

In [36]:
# deep learning classifiers
batch_size=32
model_name = "lstm_model_32"
iterations = 1

In [37]:
model_resuls = {}
for i in range(0, iterations):
    y_pred = do_train(model_name=model_name, maxlen=1000, epochs=100, batch_size=batch_size,
                 shuffle=True, patience=30, feats_train=feats_train, feats_test=feats_test, 
                 X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, train_sample_weights=train_samples)
    eval_resul = evaluate(1, 10, {"test":"test"}, y_pred=y_pred, test_users=test_users)
    model_resuls[eval_resul['latency_weighted_f1']] = y_pred

y_preds[model_name] = model_resuls[max(model_resuls.keys())]

Starting training with cnn_model=lstm_model_32 and maxlen=1000 and batch size=32
Generating embeddings
Data size: 5424
Training with callback
Restoring model weights from the end of the best epoch.
Epoch 00063: early stopping
Evaluating
Test Score: 2.5221431255340576
Test Accuracy: 935.0
Entered here
              precision    recall  f1-score   support

           0       0.93      0.42      0.57      3602
           1       0.31      0.89      0.46      1048

    accuracy                           0.52      4650
   macro avg       0.62      0.65      0.52      4650
weighted avg       0.79      0.52      0.55      4650

[[1499 2103]
 [ 113  935]]
Finished training and evaluation
{'precision': 0.2836676217765043, 'recall': 0.9519230769230769, 'F1': 0.4370860927152318, 'ERDE_5': 0.3906358952825279, 'ERDE_50': 0.15712936416120366, 'median_latency_tps': 11.0, 'median_penalty_tps': 0.03898023902249159, 'speed': 0.9610197609775084, 'latency_weighted_f1': 0.42004837234778514}


In [29]:
print(y_preds["cnn_model"].flatten())

[0 0 0 ... 0 0 0]


In [None]:
# ensemble

In [33]:
y_pred = ensemble_vote(np.array([y_preds["svm"].flatten(), y_preds["cnn_model"].flatten(), y_preds["cnn_model"].flatten()]))
evaluate(1, 10, {"test":"test"}, y_pred=y_pred, test_users=test_users)

{'precision': 0.5285714285714286, 'recall': 0.7115384615384616, 'F1': 0.6065573770491803, 'ERDE_5': 0.2838774569434117, 'ERDE_50': 0.10928357057827473, 'median_latency_tps': 11.0, 'median_penalty_tps': 0.03898023902249159, 'speed': 0.9610197609775084, 'latency_weighted_f1': 0.5829136255109477}


{'precision': 0.5285714285714286,
 'recall': 0.7115384615384616,
 'F1': 0.6065573770491803,
 'ERDE_5': 0.2838774569434117,
 'ERDE_50': 0.10928357057827473,
 'median_latency_tps': 11.0,
 'median_penalty_tps': 0.03898023902249159,
 'speed': 0.9610197609775084,
 'latency_weighted_f1': 0.5829136255109477}