# Setting Up

### Move to base folder

In [None]:
# Move into the notebooks folder on the mounted Drive. The path is relative, so
# this cell presumably expects to be run once from /content with Drive already
# mounted (the recorded output shows /content/drive/MyDrive/Colab Notebooks).
%cd ./drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


### Installing Talos

In [None]:
% pip install talos

# Utils

## compute_metrics()

## load_encoded_corpus()

In [None]:
## LOAD ENCODED CORPUS
def load_encoded_corpus(encoding, scaling = None):
  """
  Load a pre-encoded tweet corpus and its labels from ``dataset_files/``
  (paths are relative to the current working directory) and return train and
  test sets ready to be fed into a training algorithm.

  Input:
  encoding -- str, one of 'FT1', 'FT2', 'FT3', 'W2V1', 'W2V2',
              'GloVe_300d', 'GloVe_100d'
  scaling  -- None (default), 'StandardScaling' or 'MinMaxScaling'. The
              scaler is fitted on the training set only and then applied to
              both sets. Any other value leaves the data unscaled.

  Output:
  (X_train, y_train, X_test, y_test) -- tuple containing the encoded dataset.
      X_train / X_test are ndarrays obtained by concatenating the pickled
      items along axis 0; y_train / y_test are returned exactly as unpickled.
  None -- when `encoding` is not a known identifier (a message is printed).

  """
  import pickle
  import numpy as np

  encodings_info = {'FT1':'FastText 1 - Common Crawl + Wikipedia',
                    'FT2':'FastText 2 - Esp. Wikipedia',
                    'FT3':'FastText 3 - Spanish Unannotated Corpora',
                    'W2V1':'W2V 1 - Spanish Unannotated Corpora',
                    'W2V2':'W2V 2 - Spanish CoNLL',
                    'GloVe_300d':'GloVe 300d - Spanish Billion Word Corpus',
                    'GloVe_100d':'GloVe 100d - Spanish Billion Word Corpus'
                    }

  # Every identifier maps to the (train, test) files of the same-named
  # embedding. Previously 'W2V1' pointed at the W2V2 files and 'W2V2' at the
  # GloVe300d files, so two identifiers silently loaded the wrong encoding.
  # NOTE(review): the W2V1 file names follow the naming pattern of the other
  # entries -- confirm they exist on Drive under exactly these names.
  paths_dict = {'FT1': ('dataset_files/EncodedTrainTweets_FastText1_pr2',
                        'dataset_files/EncodedTestTweets_FastText1_pr2'),

                'FT2': ('dataset_files/EncodedTrainTweets_FastText2_pr2',
                        'dataset_files/EncodedTestTweets_FastText2_pr2'),

                'FT3': ('dataset_files/EncodedTrainTweets_FastText3_pr2',
                        'dataset_files/EncodedTestTweets_FastText3_pr2'),

                'W2V1': ('dataset_files/EncodedTrainTweets_W2V1_pr2',
                         'dataset_files/EncodedTestTweets_W2V1_pr2'),

                'W2V2': ('dataset_files/EncodedTrainTweets_W2V2_pr2',
                         'dataset_files/EncodedTestTweets_W2V2_pr2'),

                'GloVe_300d': ('dataset_files/EncodedTrainTweets_GloVe300d_pr2',
                               'dataset_files/EncodedTestTweets_GloVe300d_pr2'),

                'GloVe_100d': ('dataset_files/EncodedTrainTweets_GloVe100d_pr2',
                               'dataset_files/EncodedTestTweets_GloVe100d_pr2')
                }

  # Unknown identifier: report it and give up (callers receive None).
  if encoding not in encodings_info:
    print('Unknown encoding. Function has been terminated.')
    return

  encoding_info = encodings_info[encoding]
  train_file_path, test_file_path = paths_dict[encoding]

  print(encoding_info)
  print('Encoded tweets and labels are being loaded.')

  def _unpickle(path):
    # Read one pickled object back from disk.
    # SECURITY: pickle.load can execute arbitrary code -- only point this at
    # files you created yourself.
    with open(path, 'rb') as filehandle:
      return pickle.load(filehandle)

  ## Retrieve encoded Tweets
  X_train = _unpickle(train_file_path)
  X_test  = _unpickle(test_file_path)

  ## Retrieve labels
  y_train = _unpickle('dataset_files/TrainLabels')
  y_test  = _unpickle('dataset_files/TestLabels')

  # Turn X_train and X_test into ndarrays by stacking the per-tweet arrays
  # along axis 0 (assumes every pickled item is a 2D array with the same
  # number of columns -- TODO confirm against the encoding notebook).
  X_train = np.concatenate(X_train, axis=0)
  X_test  = np.concatenate(X_test, axis=0)

  # Encoding dimension = number of feature columns.
  encodings_dim = X_train.shape[1]

  # Scaling: pick the scaler (sklearn is imported lazily, only when needed).
  if scaling == 'StandardScaling':
    print('Standard scaling.')
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
  elif scaling == 'MinMaxScaling':
    print('MinMax scaling.')
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
  else:
    print('No scaling applied.')
    scaler = None

  if scaler is not None:
    # Fit on the training data only so no test-set statistics leak in.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test  = scaler.transform(X_test)

  print('\nComplete')
  print('{} training instances\n {} test instances'.format(len(X_train), len(X_test)))
  print('\nencodings_dim = {}'.format(encodings_dim))

  return (X_train, y_train, X_test, y_test)
    

# Loading encoded Tweets and labels

In [None]:
# Embedding identifier and feature scaling used for this experiment.
encoding, scaling = 'W2V1', 'StandardScaling'

# `data` is the tuple returned by load_encoded_corpus:
# (X_train, y_train, X_test, y_test).
data = load_encoded_corpus(encoding=encoding, scaling=scaling)

# Number of feature columns of the training matrix.
encodings_dim = data[0].shape[1]

W2V 1 - Spanish Unannotated Corpora
Encoded tweets and labels are being loaded.
Standard scaling.

Complete
4500 training instances
 500 test instances

encodings_dim = 100


# Talos Testing

## Parameters dictionary

In [None]:
from tensorflow.keras.activations import relu, elu

# Hyperparameter grid handed to the Talos scan; the trailing comment on each
# line is the number of candidate values for that hyperparameter.
p = dict(
    first_layer_size=[500, 400, 300, 200, 100],             # 5
    second_layer_size=[200, 150, 125, 100, 75, 50],         # 6
    activation=['relu', 'elu'],                             # 2
    p_dropout=[0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],          # 7
    batch_size=[32, 128, 256, 512, 768, 1024],              # 6
    epochs=[10, 20, 30, 40],                                # 4
    optimizer=['adam', 'rmsprop'],                          # 2
)

# Size of the full grid: product of the number of options per hyperparameter.
search_space_size = 1
for v in p.values():
  search_space_size = search_space_size * len(v)

print('{} permutations in the search space.'.format(search_space_size))

20160 permutations in the search space.


## Model definition

In [None]:
# add input parameters to the function
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout

def test_model(x_train, y_train, x_val, y_val, params):
    """Build, compile and fit one candidate network for a single scan round.

    The network is: Dense(first_layer_size) -> Dense(second_layer_size) ->
    Dropout(p_dropout) -> Dense(1, sigmoid), trained with binary
    cross-entropy and tracking the 'acc' metric.

    Input:
    x_train, y_train -- training features and labels
    x_val, y_val     -- data passed to Keras as `validation_data`
    params           -- dict with the keys 'first_layer_size',
                        'second_layer_size', 'activation', 'p_dropout',
                        'optimizer', 'epochs' and 'batch_size'

    Output:
    (out, model) -- the object returned by `model.fit` and the fitted model
    """
    hidden_activation = params['activation']

    # Same stack as adding the layers one by one, declared in a single list.
    model = Sequential([
        Dense(units=params['first_layer_size'], activation=hidden_activation),
        Dense(units=params['second_layer_size'], activation=hidden_activation),
        Dropout(rate=params['p_dropout']),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer=params['optimizer'],
        metrics=['acc'],
    )

    # The fit result must be returned together with the model (see return).
    out = model.fit(
        x=x_train,
        y=y_train,
        validation_data=(x_val, y_val),
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        verbose=0,
    )

    return out, model

## Scanning

In [None]:
import talos
# Unpack the tuple produced by load_encoded_corpus. Its third and fourth
# items (the test split) are used as the validation data for the scan.
x_train, y_train, x_val, y_val = data

# Run the hyperparameter scan over grid `p` with `test_model` as the model
# builder. NOTE(review): fraction_limit=0.1 presumably restricts the scan to
# a tenth of the grid, and 'quantum' selects Talos' quantum random sampling --
# confirm both against the installed Talos version.
t = talos.Scan(
    x=x_train,
    y=y_train,
    x_val=x_val,
    y_val=y_val,
    params=p,
    model=test_model,
    experiment_name='SNN_experiment_1',
    random_method='quantum',
    fraction_limit=0.1,
    print_params=True,
)


## Results analysis

In [None]:
# Show the 30 scan rounds with the highest validation accuracy, best first.
(
    t.data
    .sort_values(by='val_acc', ascending=False)
    .head(30)
)

In [None]:
# Build the report directly from the finished Scan object `t`. A bare
# `Reporting` name is never imported in this notebook, and
# 'experiment_log.csv' is not a file the scan above is known to produce.
r = talos.Reporting(t)

# results dataframe (first rows only, to keep the output readable)
display(r.data.head())

# highest value reached for 'val_acc' -- the model is compiled with
# metrics=['acc'] only, so a 'val_fmeasure' column is never recorded
print('Best val_acc:', r.high('val_acc'))

# number of rounds it took to find the best model
print('Rounds to best val_acc:', r.rounds2high('val_acc'))

# draws a histogram for 'val_acc'
r.plot_hist('val_acc')

# Resources

* Mini batch size selection: [1](https://datascience.stackexchange.com/questions/18414/are-there-any-rules-for-choosing-the-size-of-a-mini-batch), [2](https://www.quora.com/In-deep-learning-why-dont-we-use-the-whole-training-set-to-compute-the-gradient), [3](https://stats.stackexchange.com/questions/164876/what-is-the-trade-off-between-batch-size-and-number-of-iterations-to-train-a-neu)
* Optimizers: [1](https://ai.stackexchange.com/questions/18206/what-kind-of-optimizer-is-suggested-to-use-for-binary-classification-of-similar)
* RMSprop: [1](https://towardsdatascience.com/understanding-rmsprop-faster-neural-network-learning-62e116fcf29a)
* Number of Hidden Layers: [1](https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw)

    
