In [1]:
from utils import *
from experiment_utils import *
from preprocessing import preprocess
from windowfy import windowfy
from featurizing import featurize
from tfidf_featurizer import combine_features, tfidf_featurize
from training import train, do_ensemble, do_train
from training_traditional import train_and_evaluate
from eval_erisk import evaluate, ensemble_vote
from IPython.display import display, Markdown
from itertools import product
from numpy.random import seed
import tensorflow
import numpy as np



In [2]:
# Seed NumPy's and TensorFlow's global random generators with the same value
# before any stochastic work runs, then record that in the log.
for seed_rng in (seed, tensorflow.random.set_seed):
    seed_rng(42)
logger("Initialized numpy random and tensorflow random seed at 42")

Initialized numpy random and tensorflow random seed at 42


In [3]:
# Experiment parameter grids.
# Every key maps to the list of candidate values for that setting; the grids are
# expanded by traverse() in the experiments cell.

# Settings handed to experiment.prepare_data() (data preparation / featurizing).
first_part = dict(
    include_feats=[
        ["first_prons", "nssi"],
        ["first_prons", "sentiment", "nssi"],
    ],
    feat_window_size=[10],
    max_size=[20],
    sample_weights_size=[20],
    oversample=[True],
    include_new_data=[False],
    tfidf_max_features=[5000, 50000],
    scale=[False, True],
    normalize=[True, False],
    discretize=[True, False],
    discretize_size=[50, 75],
    dis_strategy=["quantile"],
)

# Settings merged with each first_part combination before training/evaluation.
second_part = dict(
    eval_window_size=[1],
    maxlen=[1000],
    batch_size=[32],
    epochs=[100],
    patience=[10],
    iterations=[5],
    shuffle=[True, False],
)

# Model identifiers given to Experiment, and the groups of them to ensemble.
models = ["svm", "bayes", "cnn_model"]
ensemble_combinations = [["svm", "bayes", "cnn_model"]]

# Weight vectors passed to experiment.train_and_evaluate_model().
# NOTE(review): presumably one weight per model of an ensemble combination,
# in the same order -- confirm against the Experiment implementation.
weights = [
    [1, 1, 1],
    [2, 1, 1],
    [1, 2, 1],
    [2, 2, 1],
    [1, 1, 2],
    [3, 3, 1],
    [5, 5, 1],
    [1, 5, 1],
]

# File name given to Experiment for the evaluation results.
eval_filename = "experiments_20_oversample.csv"

## Experiments

In [None]:
import traceback  # used only by the failure logging in this cell


def _log_failure(label, subject, error):
    """Log a banner-framed failure message plus the active exception's traceback.

    Must be called from inside an ``except`` block so that
    ``traceback.format_exc()`` can see the exception being handled.

    Args:
        label: Message prefix, e.g. "Error during experiment".
        subject: The parameter set that was being processed when the error occurred.
        error: The caught exception, rendered into the message via ``str.format``.
    """
    banner = "*************************************"
    logger(banner)
    logger("{} {}: {}".format(label, subject, error))
    # The formatted message alone drops the exception type and the place it was
    # raised (a KeyError, for instance, renders as just the quoted key), so the
    # full traceback is logged as well.
    logger(traceback.format_exc())
    logger(banner)


experiment = Experiment(models, ensemble_combinations, eval_filename)

# Outer loop: one pass per item produced from the first_part grid. Data is
# prepared once per pass and then reused by every second_part item.
for feature_params in traverse(first_part):
    try:
        logger("********** CALCULATING FEATURES FOR {} ***********".format(feature_params))
        display(Markdown("#### Calculating features for {}".format(feature_params)))

        experiment.prepare_data(feature_params)

        # Inner loop: one training/evaluation run per item from the second_part grid.
        for training_params in traverse(second_part):
            # Start from the second_part values and overlay the first_part ones
            # (on a key clash the first_part value wins).
            params = training_params.copy()
            params.update(feature_params)
            logger("************ STARTING EXPERIMENT {} ***************".format(params))
            display(Markdown("#### Experiment {}".format(params)))
            try:
                experiment.train_and_evaluate_model(params, weights)
                logger("************ FINISHED EXPERIMENT {} ************* \n".format(params))
            except Exception as e:
                # Best effort: a failed run must not stop the remaining runs.
                _log_failure("Error during experiment", params, e)
    except Exception as e:
        # Best effort: a failure outside a single run (e.g. in data preparation)
        # skips this first_part item and moves on to the next one.
        _log_failure("General error during experiment", feature_params, e)

********** CALCULATING FEATURES FOR {'include_feats': ['first_prons', 'nssi'], 'feat_window_size': 10, 'max_size': 10, 'sample_weights_size': 10, 'oversample': True, 'include_new_data': False, 'tfidf_max_features': 5000, 'scale': False, 'normalize': True, 'discretize': True, 'discretize_size': 50, 'dis_strategy': 'quantile'} ***********


#### Calculating features for {'include_feats': ['first_prons', 'nssi'], 'feat_window_size': 10, 'max_size': 10, 'sample_weights_size': 10, 'oversample': True, 'include_new_data': False, 'tfidf_max_features': 5000, 'scale': False, 'normalize': True, 'discretize': True, 'discretize_size': 50, 'dis_strategy': 'quantile'}

Windowfying training users
Windowfying test users
Oversampling train users
After oversample: positive messages: 478, negative messages: 598
Data size: 1076

Finished windowfying
Featurizing calculate_feats=True, normalize=True, discretize=True, discretize_size=50, include_feats=['first_prons', 'nssi']
Initialized numpy random and tensorflow random seed at 42
Data size: 1076, 1076
Data size: 841, 841
Calculating first prons
Calculating NSSI words
Calculating first prons
Calculating NSSI words
Normalizing features
Discretizing


  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  "replaced with 0." % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)


Is the combined the same from tfidf: False
************ STARTING EXPERIMENT {'eval_window_size': 1, 'maxlen': 1000, 'batch_size': 32, 'epochs': 100, 'patience': 10, 'iterations': 10, 'shuffle': True, 'include_feats': ['first_prons', 'nssi'], 'feat_window_size': 10, 'max_size': 10, 'sample_weights_size': 10, 'oversample': True, 'include_new_data': False, 'tfidf_max_features': 5000, 'scale': False, 'normalize': True, 'discretize': True, 'discretize_size': 50, 'dis_strategy': 'quantile'} ***************


#### Experiment {'eval_window_size': 1, 'maxlen': 1000, 'batch_size': 32, 'epochs': 100, 'patience': 10, 'iterations': 10, 'shuffle': True, 'include_feats': ['first_prons', 'nssi'], 'feat_window_size': 10, 'max_size': 10, 'sample_weights_size': 10, 'oversample': True, 'include_new_data': False, 'tfidf_max_features': 5000, 'scale': False, 'normalize': True, 'discretize': True, 'discretize_size': 50, 'dis_strategy': 'quantile'}

Starting training traditional
              precision    recall  f1-score   support

           0       0.78      0.99      0.87       636
           1       0.79      0.15      0.25       205

    accuracy                           0.78       841
   macro avg       0.79      0.57      0.56       841
weighted avg       0.78      0.78      0.72       841

[[628   8]
 [175  30]]
{'precision': 0.7692307692307693, 'recall': 0.19230769230769232, 'F1': 0.3076923076923077, 'ERDE_5': 0.2492555420179793, 'ERDE_50': 0.20206897708029445, 'median_latency_tps': 11.0, 'median_penalty_tps': 0.03898023902249159, 'speed': 0.9610197609775084, 'latency_weighted_f1': 0.2956983879930795}
Writing results to CSV file
{'precision': 0.8333333333333334, 'recall': 0.09615384615384616, 'F1': 0.1724137931034483, 'ERDE_5': 0.2470038185729965, 'ERDE_50': 0.22338469448775772, 'median_latency_tps': 12.0, 'median_penalty_tps': 0.042873701496841665, 'speed': 0.9571262985031583, 'latency_weighted_f1': 0.16502177560399284

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Generating embeddings
