## Model training
This notebook is meant to be run in Google Colab because we do not have a local GPU. It trains a model and saves it. You need the train and test sequence `.npy` files built with the `tokenizer.py` script. Set the variable `segment` to one of [GROUPS, A, B, C, D, E, F] to create the corresponding model `.h5` file.

You are also provided with code for plotting the confusion matrix and calculating precision, recall and F1 scores.

In [None]:
## Colab only: run this cell to mount your Google Drive under '/content/drive'
## so the data files stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
import tensorflow as tf

# Abort early unless TensorFlow reports the GPU device this notebook expects.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print(f'Found GPU at: {device_name}')
else:
    raise SystemError('GPU device not found')

In [None]:
%matplotlib inline

# Libraries added by us
import re
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn

#from unidecode import unidecode


# Libraries included by default in the extended case
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, GlobalMaxPooling1D, Dropout, LSTM, Conv1D, RNN  ## add as many types of layer you wanna try on. Be concient that due to size of data set, even with GPU training may take a while
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

# Seed TensorFlow and NumPy so that runs are reproducible.
# Older TensorFlow exposes tf.set_random_seed; newer versions use
# tf.random.set_seed instead. A missing attribute is the only failure we
# expect here, so catch just AttributeError rather than hiding every error.
try:
    tf.set_random_seed(1337)
except AttributeError:
    tf.random.set_seed(1337)
np.random.seed(1337)

from nltk.corpus import stopwords

In [None]:
# Which model to train: one of GROUPS, A, B, C, D, E, F.
# The value is substituted into the data and model file names used below.
segment = 'GROUPS'

In [None]:
# Load the training sequences and the 'target' label column for the chosen segment.
train_sequences = np.load(
    f'/content/drive/My Drive/datos_model/train_sequences_{segment}.npy'
)
y_train = pd.read_csv(
    f'/content/drive/My Drive/datos_model/y_train_{segment}.csv',
    usecols=['target'],
)

In [None]:
## The output layer gets one unit per distinct label found in the training targets.
num_neurons = y_train['target'].nunique()

In [None]:
# NOTE: despite the name, this is the length of the first training sequence;
# it is passed to the Embedding layer as `input_length` when the model is built.
emb_size = len(train_sequences[0])

In [None]:
# The model is built here. Layer types and sizes can be changed to experiment.

model = Sequential()

# Embedding layer: maps each token id (input dimension 20000) to a
# 256-dimensional dense vector. This is a learned embedding, not one-hot encoding.
model.add(Embedding(20000, 256, input_length=emb_size))

# Two fully connected layers with 128 units each.
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))

# Collapse the sequence axis by taking the maximum over all positions.
model.add(GlobalMaxPooling1D())

# One unit per class. Softmax (rather than sigmoid) produces a probability
# distribution over mutually exclusive classes, which is what
# sparse_categorical_crossentropy expects from a non-logit output.
model.add(Dense(num_neurons, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for at most 10 epochs, holding out 20% of the training data for validation.
# Training stops once validation accuracy has not improved for 2 epochs, and the
# best weights seen are restored.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='auto',
    restore_best_weights=True,
    patience=2,
)
model.fit(
    train_sequences,
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=1024,
    callbacks=[early_stopping],
)

In [None]:

# Save the trained model in HDF5 format. The '.h5' suffix is part of the file
# name so that it matches the 'group_<segment>.h5' path used to load it back.
model.save(filepath='/content/drive/My Drive/datos_model/group_{}.h5'.format(segment), save_format='h5')

In [None]:
# If you don't have a lot of RAM, restart your kernel and just load the saved
# model from disk here instead of re-running the training cells.
model = tf.keras.models.load_model('/content/drive/My Drive/datos_model/group_{}.h5'.format(segment))

In [None]:
## Load the test labels ('target' column) and test sequences for the chosen segment.
y_test = pd.read_csv(
    f'/content/drive/My Drive/datos_model/y_test_{segment}.csv',
    usecols=['target'],
)
test_sequences = np.load(
    f'/content/drive/My Drive/datos_model/test_sequences_{segment}.npy'
)

In [None]:
# Predict the class of every test sequence.
# Sequential.predict_classes has been removed from recent TensorFlow releases,
# so take the index of the highest class score from model.predict instead.
y_predict = np.argmax(model.predict(test_sequences), axis=1)



In [None]:
# Build the confusion matrix (rows: true labels, columns: predicted labels).
conf_mat = confusion_matrix(y_test, y_predict)

In [None]:
# Number of true samples per class, ordered by class label; used to normalize
# each row of the confusion matrix by the count of real values.
# value_counts() tallies every class in a single pass and avoids calling int()
# on a one-element Series, which pandas has deprecated.
number_y_real = y_test.target.value_counts().sort_index().to_numpy()

In [None]:
## Normalized confusion matrix: each row divided by the number of true samples of its class.
conf_matrix = [conf_mat[row] / total for row, total in enumerate(number_y_real)]

In [None]:
# Plot the normalized confusion matrix as a heatmap to see more clearly
# where the hits and the errors are concentrated.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt=".2f", cmap="Blues", ax=ax)
ax.set_ylabel('100% rial no fake')
ax.set_xlabel('pred')

In [None]:
# Plot the raw-count confusion matrix as a heatmap to see more clearly
# where the hits and the errors are concentrated.
plt.figure(figsize=(10, 8))
raw_counts = confusion_matrix(y_test, y_predict)
sns.heatmap(raw_counts, annot=True, fmt="d", cmap="Blues")

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Accuracy plus the per-class precision, recall and F1 score table.
# scikit-learn metrics take (y_true, y_pred) in that order; with the arguments
# swapped, the report's precision and recall columns are exchanged.
print('accuracy %s' % accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

In [None]:
# Load the previously saved model of each segment from the shared model folder.
# NOTE(review): the first file name is lowercase ('group_groups.h5') while
# segment is set to 'GROUPS' elsewhere -- confirm the file name on disk.
# NOTE(review): no model is loaded for segment A -- confirm this is intended.
_model_dir = '/content/drive/My Drive/datos_model/'
model_group = tf.keras.models.load_model(_model_dir + 'group_groups.h5')
model_b = tf.keras.models.load_model(_model_dir + 'group_B.h5')
model_c = tf.keras.models.load_model(_model_dir + 'group_C.h5')
model_d = tf.keras.models.load_model(_model_dir + 'group_D.h5')
model_e = tf.keras.models.load_model(_model_dir + 'group_E.h5')
model_f = tf.keras.models.load_model(_model_dir + 'group_F.h5')


In [None]:
## Install Keras Tuner first if it is missing: pip install -U keras-tuner

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch

In [None]:
## On Colab this search may take longer than the session time Google allows (~12 h); without a GPU it may take far longer still. Try reducing the search parameters.
## Keras Tuner is a hyper-parameter search library for Keras.
def tune_nn_model(hp):
    """Build and compile one candidate model for the hyper-parameter search.

    Args:
        hp: hyper-parameter object supplied by the tuner. It is queried for the
            number of tunable blocks ('num_layers'), and for the width
            ('units_<i>') and dropout rate ('drop_rate_<i>') of each block.

    Returns:
        A compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential()
    model.add(keras.layers.Dense(units=128, activation="relu"))

    # Between 1 and 6 tunable blocks, each a Dense layer followed by Dropout.
    for i in range(hp.Int('num_layers', 1, 6)):
        units = hp.Int('units_' + str(i), min_value=8, max_value=64, step=8)
        model.add(keras.layers.Dense(units=units, activation='relu'))

        drop_rate = hp.Choice(
            'drop_rate_' + str(i),
            [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        )
        model.add(keras.layers.Dropout(rate=drop_rate))

    # Softmax gives a probability distribution over mutually exclusive classes,
    # matching the sparse_categorical_crossentropy loss used below.
    # NOTE(review): the output size is fixed at 7 classes -- confirm this matches
    # the number of distinct labels of the segment being tuned.
    model.add(keras.layers.Dense(7, activation='softmax'))

    model.compile(
        optimizer="adam",
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Random-search settings: how many hyper-parameter combinations to try, and how
# many times each combination is trained.
MAX_TRIALS = 15
EXECUTIONS_PER_TRIAL = 5

# Candidates are ranked by validation accuracy; the seed fixes the sampling.
tuner = RandomSearch(
    tune_nn_model,
    objective='val_accuracy',
    max_trials=MAX_TRIALS,
    executions_per_trial=EXECUTIONS_PER_TRIAL,
    seed=42,
)

In [None]:
TRAIN_EPOCHS = 15

# Run the search. Validation uses a held-out 20% slice of the training data
# (the same split fraction as the main training cell) rather than the test set:
# selecting hyper-parameters against the test set would leak it into model
# selection and make the later test metrics optimistic.
# Each trial stops early once val_loss has not improved for 2 epochs.
tuner.search(
    x=train_sequences,
    y=y_train,
    epochs=TRAIN_EPOCHS,
    validation_split=0.2,
    batch_size=1024,
    callbacks=[tf.keras.callbacks.EarlyStopping('val_loss', patience=2)],
)