# Packages

In [0]:
#%tensorflow_version 1.x
#!pip install -q pipreqs


#----- magic trio + special guest
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#----- optimizer
#!pip install -q optuna
import optuna
#!pip install -q scikit-optimize
import skopt

#----- dl
from keras.utils import to_categorical
from keras import backend as K
from keras.models import Sequential, clone_model
from keras.layers import Dense, Dropout, ReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam, Nadam, RMSprop
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

#!pip install -q livelossplot
from livelossplot.keras import PlotLossesCallback
from sklearn.preprocessing import MinMaxScaler, minmax_scale


#----- generic utils
import os
import smtplib
import time
import warnings
import pickle
import sys
import pprint
import copy
from tqdm.autonotebook import tqdm

#----- sql server
import sqlite3
from sqlite3 import Error

In [0]:
# from google.colab import drive
# drive.mount('/content/drive')

# Import data & define functions

In [0]:
# Load the preprocessed dataset from Google Drive. The path lives under /content/drive,
# so the Drive mount (see the commented-out drive.mount cell) must be active.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("/content/drive/My Drive/Colab Notebooks/TM&S/df_preprocessed_eng.pckle", "rb") as infile:
  # presumably a pandas DataFrame (later cells read `data.category`) — TODO confirm
  data = pickle.load(infile)

In [0]:
#----- remapping categories
# Each pair is (original category, broader category it is folded into).
# Categories that do not appear here keep their original label.
category_remap_dict = dict([
    ("LGBT", "socializing"),
    ("singles", "socializing"),
    ("fashion/beauty", "health/wellbeing"),
    ("movies/film", "socializing"),
    ("book clubs", "education/learning"),
    ("sci-fi/fantasy", "games"),
    ("support", "health/wellbeing"),
    ("cars/motorcycles", "outdoors/adventure"),
])


def _remap_category(label):
    """Return the broader category for `label`, or `label` itself when unmapped."""
    return category_remap_dict.get(label, label)


data["remap_category"] = data.category.map(_remap_category)


In [0]:
#----- k-folds cross validation
def load_data_kfold(k, X_train, y_train, random_state = None):
    """Build the index splits of a shuffled, stratified k-fold cross validation.

    Parameters
    ----------
    k : int
        Number of folds (forwarded to ``StratifiedKFold`` as ``n_splits``).
    X_train : array-like
        Samples to split.
    y_train : array-like
        Class labels used to stratify the folds.
    random_state : int or None, default None
        Seed for the shuffling. ``None`` (the default) keeps the previous
        unseeded behaviour; pass an integer to get reproducible folds.

    Returns
    -------
    list of (train_indices, test_indices) tuples, one per fold.
    """
    splitter = StratifiedKFold(n_splits = k, shuffle = True, random_state = random_state)
    # materialise the generator so the folds can be iterated more than once
    return list(splitter.split(X_train, y_train))


In [0]:
#----- connection with SQLite database
def create_connection(db_file):
    """Open the SQLite database at ``db_file`` and close it again.

    The connection is always closed before returning and nothing is returned,
    so this only checks that the database can be opened.

    Parameters
    ----------
    db_file : str
        Path of the database file handed to ``sqlite3.connect``.

    Returns
    -------
    None. ``sqlite3.Error`` exceptions are printed instead of being raised.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        # sqlite_version is the version of the SQLite library itself;
        # sqlite3.version was the module version and is deprecated since Python 3.12.
        print('Version sqlite: {}'.format(sqlite3.sqlite_version))
        print('Connected')
    except Error as e:
        print(e)
    finally:
        if conn:
            conn.close()

In [0]:
#----- top-k accuracy score
def top_k_acc(model, X_test, y_true, k_top = 3, ml = False):
  """Top-k accuracy: fraction of samples whose true class is among the k best scores.

  Parameters
  ----------
  model : object
      Fitted model. ``predict_proba`` is called when ``ml`` is True,
      ``predict`` otherwise; either must return one score per class and sample.
  X_test : array-like
      Samples passed to the model.
  y_true : array-like of int
      One integer class index per sample (not one-hot encoded).
  k_top : int, default 3
      How many of the highest-scoring classes count as a hit.
  ml : bool, default False
      Selects ``predict_proba`` instead of ``predict``.

  Returns
  -------
  float
      Mean hit rate over the samples.

  Raises
  ------
  ValueError
      If ``k_top`` is smaller than 1 (a ``-0:`` slice would select every
      class and report a perfect score).
  """
  if k_top < 1:
    raise ValueError("k_top must be a positive integer, got {}".format(k_top))

  if ml:
    probs = model.predict_proba(X_test)
  else:
    probs = model.predict(X_test)

  # argsort is ascending, so the last k_top columns are the k_top best classes
  topn = np.argsort(probs, axis = 1)[:, -k_top:]
  # compare every candidate column against the true label of its row at once
  labels = np.asarray(y_true).reshape(-1, 1)
  hits = (topn == labels).any(axis = 1)
  return np.mean(hits)

# Optimization process

In [0]:
#----- load best preprocessing and representation
# Reads the 'X' array stored in the .npz archive; it is used as the feature matrix below.
# The file name suggests doc2vec vectors of lemmatized text without bad words — TODO confirm.
train_vect = np.load("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/d2v_lemm_no_badwords.npz")['X']

In [0]:
# Number of distinct remapped categories; it sizes the one-hot targets and the
# softmax output layer in the cells below.
num_classes = len(np.unique(data.remap_category))
num_classes

In [0]:
#----- objective function

def _build_kfold_model(input_dim, hidden_units, dropout_rates, activation, n_outputs):
  """Stack Dense -> activation -> Dropout once per hidden layer, then a softmax output."""
  model = Sequential()
  for position, (units, rate) in enumerate(zip(hidden_units, dropout_rates)):
    if position == 0:
      # only the first layer declares the input size
      model.add(Dense(units, input_dim = input_dim))
    else:
      model.add(Dense(units))
    model.add(ReLU() if activation == 'relu' else LeakyReLU())
    model.add(Dropout(rate))
  model.add(Dense(n_outputs, activation = "softmax"))
  return model


def _make_optimizer(name, learning_rate):
  """Create a brand-new optimizer instance for the sampled optimizer name."""
  optimizer_classes = {'adam': Adam, 'nadam': Nadam, 'rmsprop': RMSprop}
  return optimizer_classes[name](lr = learning_rate)


def train_fnc(trial):
  """Optuna objective: cross-validated score of a 3-hidden-layer dense classifier.

  Samples layer sizes, dropout rates, optimizer, activation and learning rate
  from ``trial``, trains one model per stratified fold and evaluates it on the
  held-out fold.

  Reads the module-level names ``k`` (number of folds), ``train_vect``,
  ``data`` and ``num_classes``; all of them must exist before
  ``study.optimize`` calls this function.

  Parameters
  ----------
  trial : optuna trial object used to sample hyper-parameters and report
      intermediate values for pruning.

  Returns
  -------
  float
      ``1 - mean macro-F1`` over the folds (lower is better, matching a study
      created with ``direction = 'minimize'``).
  """
  K.clear_session()

  loss = 'categorical_crossentropy'
  metrics = ['acc']
  batch_size = 2048
  epochs = 100

  #----- hyper-parameters
  units1 = trial.suggest_categorical(name = 'units1', choices = [64, 128, 256, 512, 1024, 2048])
  units2 = trial.suggest_categorical(name = 'units2', choices = [64, 128, 256, 512, 1024, 2048])
  units3 = trial.suggest_categorical(name = 'units3', choices = [64, 128, 256, 512, 1024, 2048])
  # NOTE(review): the range includes a dropout rate of 1.0 — confirm this is intended.
  d_rate1 = trial.suggest_discrete_uniform(name = 'd_rate1', low = 0, high = 1, q = .1)
  d_rate2 = trial.suggest_discrete_uniform(name = 'd_rate2', low = 0, high = 1, q = .1)
  d_rate3 = trial.suggest_discrete_uniform(name = 'd_rate3', low = 0, high = 1, q = .1)
  optimizer = trial.suggest_categorical(name = 'optimizer', choices = ['adam', 'nadam', 'rmsprop'])
  activation = trial.suggest_categorical(name = 'activation', choices = ['relu', 'leaky_relu'])
  learning_rate = trial.suggest_loguniform(name = 'lr', low = 1e-4, high = 1e-1)

  #----- cross-validation cycle
  acc = []
  top_3 = []
  f1_macro = []
  f1_weighted = []

  # Encode the labels once; every fold only indexes into the encoded vector.
  lb = LabelEncoder()
  y_encoded = lb.fit_transform(data.remap_category.values)

  folds = load_data_kfold(k, train_vect, data.remap_category.values)
  for train_idx, test_idx in tqdm(folds):
    X_train_cv = train_vect[train_idx]
    X_test_cv = train_vect[test_idx]
    y_train_cv = to_categorical(y_encoded[train_idx], num_classes = num_classes)
    # integer labels of the held-out fold are the ground truth for the metrics
    y_true = y_encoded[test_idx]

    #----- creation of the validation set monitored during training
    X_train_cv, X_val, y_train_cv, y_val = train_test_split(X_train_cv, y_train_cv, test_size = .1,
                                                        #random_state = 42,
                                                        stratify = y_train_cv)

    kfold_model = _build_kfold_model(train_vect.shape[1],
                                     (units1, units2, units3),
                                     (d_rate1, d_rate2, d_rate3),
                                     activation,
                                     num_classes)

    #----- Compile
    # A fresh optimizer per fold: rebinding `optimizer` to an instance would make
    # every later fold skip the name lookup and share one optimizer across models.
    fold_optimizer = _make_optimizer(optimizer, learning_rate)
    kfold_model.compile(optimizer = fold_optimizer, loss = loss, metrics = metrics)

    # The study minimizes its objective, so the value reported for pruning must
    # also be "lower is better": monitor val_loss rather than val_acc.
    # NOTE(review): the callback reports once per epoch, so every fold reports
    # the same step numbers — check how the installed Optuna handles repeats.
    kfold_model.fit(X_train_cv, y_train_cv,
                    batch_size = batch_size,
                    epochs = epochs,
                    verbose = 0,
                    validation_data = (X_val, y_val),
                    callbacks = [optuna.integration.KerasPruningCallback(trial, 'val_loss')],
                    use_multiprocessing = True)

    #----- evaluation
    # argmax over the softmax output replaces the deprecated predict_classes
    y_pred = np.argmax(kfold_model.predict(X_test_cv), axis = 1)
    f1_macro.append(f1_score(y_true, y_pred, average = 'macro'))
    f1_weighted.append(f1_score(y_true, y_pred, average = 'weighted'))
    acc.append(accuracy_score(y_true, y_pred))
    top_3.append(top_k_acc(kfold_model, X_test_cv, y_true, k_top = 3))


  print("Acc: {} (+/- {})\nTop-3: {} (+/- {})\nF1-Macro: {} (+/- {})\nF1-Weighted: {} (+/- {})\n".format(round(np.mean(acc), 3), round(np.std(acc), 3),
                                round(np.mean(top_3), 3), round(np.std(top_3), 3),
                                round(np.mean(f1_macro), 3), round(np.std(f1_macro), 3),
                                round(np.mean(f1_weighted), 3), round(np.std(f1_weighted), 3)))

  return (1-np.mean(f1_macro))

In [0]:
# if __name__ == '__main__':
#     create_connection('/content/drive/My Drive/sqlite/db/optuna.db')

In [0]:
# Name given to the Optuna study created in the next cell.
study_name = 'optimizing_d2v_lemm_no_badwords4'

In [0]:
# Create the study that minimizes the value returned by the objective.
# Sampling goes through the scikit-optimize integration ('RF' base estimator,
# 'LCB' acquisition function) with a TPE sampler for independent sampling.
# No storage is passed: the sqlite storage argument below is commented out.
study = optuna.create_study(study_name = study_name,
                            direction = 'minimize',
                            sampler = optuna.integration.SkoptSampler(independent_sampler = optuna.samplers.TPESampler(),
                                                                      warn_independent_sampling = True,
                                                                      skopt_kwargs={'base_estimator':'RF', 'acq_func':'LCB'}),
                                  # storage='sqlite:////content/drive/My Drive/sqlite/db/optuna.db', load_if_exists=True
                            )

In [0]:
#----- init k-fold cross validation and optimizer using n trials
# `k` is read as a global inside train_fnc, so it has to be assigned before
# optimize() starts calling the objective.
k = 5
# Run the objective for 50 trials.
study.optimize(train_fnc, n_trials = 50)

In [0]:
# Show the study's trials as a DataFrame.
study.trials_dataframe()

In [0]:
# Plot the optimization history of the study.
optuna.visualization.plot_optimization_history(study)

In [0]:
# Parallel-coordinate plot of the study.
optuna.visualization.plot_parallel_coordinate(study)

In [0]:
# Hyper-parameters of the best trial found by the study.
study.best_params

# Optimized model

In [0]:
#----- optimized parameters
# Values hard-coded for the final model (presumably copied from
# study.best_params of an earlier run — TODO confirm).
units1 = 1024
units2 = 1024
units3 = 128
d_rate1 = .1
d_rate2 = .5
d_rate3 = .0
activation = 'relu'
lr = 0.00015
# NOTE(review): the compile cell further down builds its own Nadam(lr = lr),
# so this instance is not the one the final model is compiled with.
optimizer = Nadam(lr = lr)
loss = 'categorical_crossentropy'
metrics = ['acc']
batch_size = 2048
epochs = 100

In [0]:
#----- nn model
K.clear_session()

# Three Dense + Dropout stages followed by the softmax classifier,
# declared as a single layer list instead of successive add() calls.
model_opt = Sequential([
    Dense(units = units1, input_dim = train_vect.shape[1], activation = activation),
    Dropout(rate = d_rate1),
    Dense(units = units2, activation = activation),
    Dropout(rate = d_rate2),
    Dense(units = units3, activation = activation),
    Dropout(rate = d_rate3),
    Dense(num_classes, activation = "softmax"),
])

model_opt.summary()

In [0]:
# Compile the final model with a new Nadam optimizer at the chosen learning rate.
model_opt.compile(optimizer = Nadam(lr = lr), loss = loss, metrics = metrics)

In [0]:
# Encode the remapped categories as integers, one-hot them and hold out 20% as test set.
lb = LabelEncoder()
y = lb.fit_transform(data.remap_category)
# Use the computed class count (it also sizes the model's output layer) instead
# of a hard-coded 24, so targets and model cannot drift apart.
y_cat = to_categorical(y, num_classes = num_classes)
X_train, X_test, y_train, y_test = train_test_split(train_vect, y_cat, test_size = .2,
                                                        random_state = 42,
                                                        stratify = y_cat)

In [0]:
# Train the final model on the training split; validation_split = .1 holds out
# 10% of it for validation, and PlotLossesCallback (livelossplot) plots the
# training curves while verbose = 0 silences the text log.
model_opt.fit(X_train, y_train,
              batch_size = batch_size,
              epochs = epochs,
              validation_split = .1,
              verbose = 0,
              callbacks = [PlotLossesCallback()])

In [0]:
#----- evaluation
# Predicted class = argmax over the softmax output. This replaces
# Sequential.predict_classes, which is deprecated and missing from newer Keras
# releases, while giving the same labels for a multi-class softmax model.
y_pred = np.argmax(model_opt.predict(X_test), axis = 1)
# y_test is one-hot encoded; recover the integer labels
y_true = np.argmax(y_test, axis = 1)
f1_macro = f1_score(y_true, y_pred, average = 'macro')
f1_weighted = f1_score(y_true, y_pred, average = 'weighted')
acc = accuracy_score(y_true, y_pred)
top_3 = top_k_acc(model_opt, X_test, y_true, k_top = 3)


print("Acc: {}\nTop-3: {}\nF1-Macro: {}\nF1-Weighted: {}\n".format(round(acc, 3),
                            round(top_3, 3),
                            round(f1_macro, 3),
                            round(f1_weighted, 3)))




In [0]:
#----- classification report
# `lb` was fitted on the full label column, so its classes_ array lists the
# class names in encoded order (what inverse_transform of all codes returns).
print(classification_report(y_true, y_pred, target_names = lb.classes_))

In [0]:
#----- confusion matrix
# normalize = 'true' normalizes over the true labels, i.e. each row sums to 1.
cm = confusion_matrix(y_true, y_pred, normalize = 'true')

In [0]:
# Heatmap of the normalized confusion matrix with class names on both axes.
class_names = lb.inverse_transform(np.unique(y))

plt.figure(figsize=(20, 20))
chart = sns.heatmap(
    cm,
    annot = True,
    fmt = '.2f',
    cmap = 'BuPu',
    xticklabels = class_names,
    yticklabels = class_names,
    vmin = 0,
    vmax = 1,
)

# axis titles
chart.set_xlabel('Predicted', fontsize = 20)
chart.set_ylabel('Actual', fontsize = 20)

# tilt the x tick labels so long class names stay readable
chart.set_xticklabels(chart.get_xticklabels(), rotation = 45, horizontalalignment = 'right')
plt.yticks(fontsize = 14)
plt.xticks(fontsize = 14)

# match the colorbar tick size to the axis ticks
cbar = chart.collections[0].colorbar
cbar.ax.tick_params(labelsize = 14)
chart