In [None]:
! wget https://raw.githubusercontent.com/PAE-ITBA-ML-2019/Regularizacion/master/reg_helper.py
! wget https://raw.githubusercontent.com/PAE-ITBA-ML-2019/Regularizacion/master/fnn_helper.py
! wget https://raw.githubusercontent.com/PAE-ITBA-ML-2019/Regularizacion/master/draw_nn.py
! wget https://raw.githubusercontent.com/PAE-ITBA-ML-2019/Regularizacion/master/regularization_helper.py
 
! wget https://github.com/PAE-ITBA-ML-2019/Regularizacion/raw/master/data.zip
! unzip data.zip


In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell runs, so edits to the helper .py files take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from regularization_helper import plot_it, get_simple_dataset, get_polynimial_set, plot_classifier, save_dataset
from sklearn import model_selection
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras import optimizers
from fnn_helper import PlotLosses
from keras.callbacks import ModelCheckpoint 
from keras import regularizers

## Pre-procesamiento

### Cargo dataset y lo grafico

In [None]:
# Load the 200-sample dataset; the first two columns are used as inputs
# and the third column as the target.
dataset_200 = np.load('200_samples_OK.npy')
X, y = dataset_200[:, :2], dataset_200[:, 2]
plot_it(X, y)
plt.show()

### Divido en training y testing set

In [None]:
# Hold out 40% of the samples as the test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.4,
    shuffle=True,
    random_state=500,
)
y_train = np.array(y_train)
# One shape per line: training inputs, test inputs, training targets.
print(X_train.shape, X_test.shape, y_train.shape, sep='\n')

## Modelo de una sola capa

### Armo arquitectura de la red

In [None]:
def get_basic_model(input_shape, output_size, lr=0.1):
    """Build and compile a model made of a single sigmoid Dense layer.

    Args:
        input_shape: Number of input features (forwarded to Dense as ``input_dim``).
        output_size: Number of units in the output layer.
        lr: Learning rate passed to the SGD optimizer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, accuracy metric).
    """
    net = Sequential()
    optimizer = optimizers.SGD(lr=lr)
    output_layer = Dense(
        output_size,
        input_dim=input_shape,
        activation='sigmoid',
        kernel_initializer='normal',
        name='Salida',
    )
    net.add(output_layer)
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net


# Build the model once with the default learning rate to inspect its architecture.
basic_model = get_basic_model(input_shape=X_train.shape[1], output_size=1)
basic_model.summary()

### Entreno el modelo
- Usar distintos valores de lr: 0.1, 0.05, 0.01

In [None]:
# Rebuild the single-layer model with lr=0.01 and train it for 5000 epochs in batches of 25.
# The test split is passed both as Keras validation data and to the PlotLosses callback.
# NOTE(review): PlotLosses comes from fnn_helper; presumably it redraws the loss curves
# every `plot_interval` epochs — confirm against that module.
plot_losses = PlotLosses(plot_interval=50, evaluate_interval=None, x_val=X_test, y_val_categorical=y_test)
basic_model = get_basic_model(X_train.shape[1], 1, lr=0.01)
basic_model.fit(X_train, 
          y_train, batch_size = 25,
          epochs=5000, 
          verbose=0, 
          validation_data=(X_test, y_test), 
          callbacks=[plot_losses],
         )

### Grafico resultados

In [None]:
def plot_results(model, X_train, y_train, X_test, y_test, N=200, figsize=(20,5)):
    """Print train/test loss and accuracy and draw the classifier on both splits.

    Args:
        model: Object exposing ``evaluate(X, y, verbose=0)`` and ``predict``.
        X_train, y_train: Training inputs and targets.
        X_test, y_test: Test inputs and targets.
        N: Forwarded to ``plot_classifier`` as its ``N`` keyword.
        figsize: Size of the two-panel figure (train on the left, test on the right).

    Returns:
        Tuple ``(score_train, score_test)`` with the values returned by ``model.evaluate``.
    """
    def _report(title, scores):
        # scores[0] is printed as the loss and scores[1] as the accuracy.
        print(title)
        print("loss: ", scores[0])
        print("accuracy: ", scores[1])

    score_train = model.evaluate(X_train, y_train, verbose=0)
    _report('Train set', score_train)
    score_test = model.evaluate(X_test, y_test, verbose=0)
    print()
    _report('Test set', score_test)

    _, axes = plt.subplots(1, 2, sharex='col', sharey='row', figsize=figsize)
    splits = ((X_train, y_train), (X_test, y_test))
    for axis, (features, targets) in zip(axes, splits):
        plot_classifier(features, targets, model.predict, 1, N=N, ax=axis)
    plt.show()
    return score_train, score_test

In [None]:
# Report metrics and decision regions of the single-layer model on both splits.
score_basic_model = plot_results(
    model=basic_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

### Conclusiones
- ¿Es posible mejorar el modelo?
- UNDERFITTING
- Proponga dos posibles soluciones para mejorar el modelo: Agrandar arquitectura de la red, Regresión polinomial

## Modelo de dos capas

### Armo arquitectura de la red (10 hidden units)

In [None]:
def get_two_layer_model(input_shape, output_size, hidden_units= 10, lr=0.1, decay=0.0):
    """Build and compile a network with one sigmoid hidden layer and a sigmoid output.

    Args:
        input_shape: Number of input features (forwarded to Dense as ``input_dim``).
        output_size: Number of units in the output layer.
        hidden_units: Number of units in the hidden layer.
        lr: Learning rate passed to the SGD optimizer.
        decay: Learning-rate decay passed to the SGD optimizer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, accuracy metric).
    """
    net = Sequential()
    optimizer = optimizers.SGD(lr=lr, decay=decay)
    hidden_layer = Dense(hidden_units, input_dim=input_shape, activation='sigmoid')
    net.add(hidden_layer)
    # The output layer's kernel starts at zero; the hidden layer keeps the Dense default initializer.
    output_layer = Dense(
        output_size,
        activation='sigmoid',
        kernel_initializer='zeros',
        name='Salida',
    )
    net.add(output_layer)
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net
# Build the two-layer network once (lr=1) to inspect its architecture.
two_layer_model = get_two_layer_model(input_shape=X_train.shape[1], output_size=1, lr=1)
two_layer_model.summary()

### Entreno el modelo

**Notar la diferencia de lr entre este modelo y el anterior**

In [None]:
# Rebuild the two-layer model with lr=2 and train it for 3500 epochs in batches of 25.
# ModelCheckpoint with save_best_only=True writes two_layer_model.hdf5 only when its monitored
# quantity improves (Keras monitors val_loss by default), so the file ends up holding the best
# weights seen during training; the "Early stop" section loads that file.
plot_losses = PlotLosses(plot_interval=50, evaluate_interval=None, x_val=X_test, y_val_categorical=y_test)
checkpointer = ModelCheckpoint(filepath='two_layer_model.hdf5', verbose=0, save_best_only=True)
two_layer_model = get_two_layer_model(X_train.shape[1], 1, lr=2)
two_layer_model.fit(X_train, 
          y_train, batch_size = 25,
          epochs=3500, 
          verbose=0, 
          validation_data=(X_test, y_test), 
          callbacks=[plot_losses, checkpointer],
         )

### Grafico resultados

In [None]:
# Report metrics and decision regions of the two-layer model after the full training run.
score_two_layer_model = plot_results(
    model=two_layer_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

### Conclusiones:
- OVERFITTING
- ¿Posibles soluciones? ¿Early stop? ¿Salir a buscar más datos? ¿Regularización? ¿Dropout?

## Soluciones para el overfitting

### Early stop

In [None]:
# "Early stop" after the fact: rebuild the same architecture and restore the weights
# that the checkpoint callback saved to two_layer_model.hdf5 during training.
two_layer_model_early_stop = get_two_layer_model(input_shape=X_train.shape[1], output_size=1, lr=2)
two_layer_model_early_stop.load_weights('two_layer_model.hdf5')
score_two_layer_model_early_stop = plot_results(
    model=two_layer_model_early_stop,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Display the restored parameters; as the cell's last expression, the returned list is shown as output.
two_layer_model_early_stop.get_weights()

### ¿Tenemos más datos?
Cantidad de muestras mucho mayor a la cantidad de parámetros

In [None]:
# Load the 1000-sample dataset; same layout as before: two input columns, then the target column.
dataset_1000 = np.load('1000_samples.npy')
X_1000, y_1000 = dataset_1000[:, :2], dataset_1000[:, 2]
plot_it(X_1000, y_1000)
plt.show()

In [None]:
# Same 60/40 split and random_state as for the 200-sample dataset.
X_1000_train, X_1000_test, y_1000_train, y_1000_test = model_selection.train_test_split(
    X_1000,
    y_1000,
    test_size=0.4,
    shuffle=True,
    random_state=500,
)
# One shape per line: training inputs, test inputs, training targets.
print(X_1000_train.shape, X_1000_test.shape, y_1000_train.shape, sep='\n')

In [None]:
# Train the two-layer model on the 1000-sample dataset (lr=2, 2000 epochs, batches of 25).
# The input size is read from the split that is actually used for training
# instead of from the unrelated 200-sample split.
plot_losses = PlotLosses(plot_interval=50, evaluate_interval=None, x_val=X_1000_test, y_val_categorical=y_1000_test)
# save_best_only=True: the file is rewritten only when the monitored quantity improves.
checkpointer = ModelCheckpoint(filepath='two_layer_model.1000.hdf5', verbose=0, save_best_only=True)
two_layer_model_1000 = get_two_layer_model(X_1000_train.shape[1], 1, lr=2)
two_layer_model_1000.fit(
    X_1000_train,
    y_1000_train,
    batch_size=25,
    epochs=2000,
    verbose=0,
    validation_data=(X_1000_test, y_1000_test),
    callbacks=[plot_losses, checkpointer],
)

In [None]:
# Report metrics and decision regions on the 1000-sample train/test splits.
score_two_layer_model_1000 = plot_results(
    model=two_layer_model_1000,
    X_train=X_1000_train,
    y_train=y_1000_train,
    X_test=X_1000_test,
    y_test=y_1000_test,
)

#### Si lo comparamos contra el test set original:

In [None]:
# Score the model trained on 1000 samples against the test split of the 200-sample dataset.
score_two_layer_model_1000_test = two_layer_model_1000.evaluate(x=X_test, y=y_test, verbose=0)
print("loss: ", score_two_layer_model_1000_test[0])
print("accuracy: ", score_two_layer_model_1000_test[1])

#### ¿Cómo soluciono el ruido? Learning rate decay

In [None]:
# Repeat the 1000-sample experiment with learning-rate decay (decay=1e-3).
plot_losses = PlotLosses(plot_interval=50, evaluate_interval=None, x_val=X_1000_test, y_val_categorical=y_1000_test)
# This run gets its own checkpoint file so it does not overwrite the weights
# saved by the run without decay (two_layer_model.1000.hdf5).
checkpointer = ModelCheckpoint(filepath='two_layer_model.1000.lrd.hdf5', verbose=0, save_best_only=True)
# The input size is read from the split that is actually used for training.
two_layer_model_1000_lrd = get_two_layer_model(X_1000_train.shape[1], 1, lr=2, decay=1e-3)
two_layer_model_1000_lrd.fit(
    X_1000_train,
    y_1000_train,
    batch_size=25,
    epochs=2000,
    verbose=0,
    validation_data=(X_1000_test, y_1000_test),
    callbacks=[plot_losses, checkpointer],
)

In [None]:
# Report metrics and decision regions of the learning-rate-decay run on the 1000-sample splits.
score_two_layer_model_1000_lrd = plot_results(
    model=two_layer_model_1000_lrd,
    X_train=X_1000_train,
    y_train=y_1000_train,
    X_test=X_1000_test,
    y_test=y_1000_test,
)

In [None]:
# Score the learning-rate-decay model against the test split of the 200-sample dataset.
score_two_layer_model_1000__lrd_test = two_layer_model_1000_lrd.evaluate(x=X_test, y=y_test, verbose=0)
print("loss: ", score_two_layer_model_1000__lrd_test[0])
print("accuracy: ", score_two_layer_model_1000__lrd_test[1])

### Achico red neuronal (3 hidden units) - Volvemos al dataset de 200 muestras

In [None]:
# Back on the 200-sample dataset: train a smaller network (3 hidden units) with lr=2 and
# decay=1e-3 for 3500 epochs. The checkpoint file two_layer_model.h3.hdf5 is rewritten only
# when the monitored quantity improves (save_best_only=True).
plot_losses = PlotLosses(plot_interval=50, evaluate_interval=None, x_val=X_test, y_val_categorical=y_test)
checkpointer = ModelCheckpoint(filepath='two_layer_model.h3.hdf5', verbose=0, save_best_only=True)
two_layer_model_h3 = get_two_layer_model(X_train.shape[1], 1, hidden_units=3,lr=2, decay=1e-3)
two_layer_model_h3.fit(X_train, 
          y_train, batch_size = 25,
          epochs=3500, 
          verbose=0, 
          validation_data=(X_test, y_test), 
          callbacks=[plot_losses, checkpointer],
         )

In [None]:
# Evaluate the 3-hidden-unit model. The result is stored under its own name so that the
# scores of the learning-rate-decay run (score_two_layer_model_1000_lrd) are not overwritten.
score_two_layer_model_h3 = plot_results(two_layer_model_h3, X_train, y_train, X_test, y_test)

### Dropout (10 hidden units)

In [None]:
def get_two_layer_model_dropout(input_shape, output_size, hidden_units=10, lr=0.1, prob=0.1, decay=0.0):
    """Build and compile the two-layer sigmoid network with Dropout after the hidden layer.

    Args:
        input_shape: Number of input features (forwarded to Dense as ``input_dim``).
        output_size: Number of units in the output layer.
        hidden_units: Number of units in the hidden layer.
        lr: Learning rate passed to the SGD optimizer.
        prob: Rate passed to the Dropout layer.
        decay: Learning-rate decay passed to the SGD optimizer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, accuracy metric).
    """
    net = Sequential()
    optimizer = optimizers.SGD(lr=lr, decay=decay)
    hidden_layer = Dense(hidden_units, input_dim=input_shape, activation='sigmoid')
    net.add(hidden_layer)
    # Dropout sits between the hidden layer and the output layer.
    net.add(Dropout(prob))
    output_layer = Dense(
        output_size,
        activation='sigmoid',
        kernel_initializer='zeros',
        name='Salida',
    )
    net.add(output_layer)
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net

In [None]:
# Try varying prob: 0.1, 0.2, 0.5
# Train the dropout model (default 10 hidden units) with lr=2, no decay and prob=0.5 for
# 5000 epochs. The checkpoint file two_layer_model.dropout.hdf5 is rewritten only when the
# monitored quantity improves (save_best_only=True).
plot_losses = PlotLosses(plot_interval=50, evaluate_interval=None, x_val=X_test, y_val_categorical=y_test)
checkpointer = ModelCheckpoint(filepath='two_layer_model.dropout.hdf5', verbose=0, save_best_only=True)
two_layer_model_dropout = get_two_layer_model_dropout(X_train.shape[1], 1, lr=2, decay=0.0, prob=0.5)
two_layer_model_dropout.fit(X_train, 
          y_train, batch_size = 25,
          epochs=5000, 
          verbose=0, 
          validation_data=(X_test, y_test), 
          callbacks=[plot_losses, checkpointer],
         )

In [None]:
# Evaluate the dropout model. The result is stored under its own name so that the
# scores of the learning-rate-decay run (score_two_layer_model_1000_lrd) are not overwritten.
score_two_layer_model_dropout = plot_results(two_layer_model_dropout, X_train, y_train, X_test, y_test)

### Learning rate = 1

In [None]:
# Try varying prob: 0.1, 0.2, 0.5
# Same dropout experiment as the previous run, but with lr=1 instead of lr=2.
# NOTE(review): despite the `_decay` suffix in the variable and checkpoint names,
# decay is 0.0 here; the learning rate is the only setting that changes.
plot_losses = PlotLosses(plot_interval=50, evaluate_interval=None, x_val=X_test, y_val_categorical=y_test)
checkpointer = ModelCheckpoint(filepath='two_layer_model.dropout.decay.hdf5', verbose=0, save_best_only=True)
two_layer_model_dropout_decay = get_two_layer_model_dropout(X_train.shape[1], 1, lr=1, decay=0.0, prob=0.5)
two_layer_model_dropout_decay.fit(X_train, 
          y_train, batch_size = 25,
          epochs=5000, 
          verbose=0, 
          validation_data=(X_test, y_test), 
          callbacks=[plot_losses, checkpointer],
         )

In [None]:
# Report metrics and decision regions of the lr=1 dropout model; the returned scores are discarded.
_ = plot_results(
    model=two_layer_model_dropout_decay,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

## Regularización L2

In [None]:
def get_two_layer_model_L2(input_shape, output_size, hidden_units=10, lr=0.1, l2_lambda=0.1, decay=0.0):
    """Build and compile the two-layer sigmoid network with L2 penalties.

    The same L2 regularizer object is applied to the kernel and the bias of both layers.

    Args:
        input_shape: Number of input features (forwarded to Dense as ``input_dim``).
        output_size: Number of units in the output layer.
        hidden_units: Number of units in the hidden layer.
        lr: Learning rate passed to the SGD optimizer.
        l2_lambda: Factor passed to ``regularizers.l2``.
        decay: Learning-rate decay passed to the SGD optimizer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, accuracy metric).
    """
    net = Sequential()
    optimizer = optimizers.SGD(lr=lr, decay=decay)
    penalty = regularizers.l2(l2_lambda)
    hidden_layer = Dense(
        hidden_units,
        input_dim=input_shape,
        activation='sigmoid',
        kernel_regularizer=penalty,
        bias_regularizer=penalty,
    )
    net.add(hidden_layer)
    output_layer = Dense(
        output_size,
        activation='sigmoid',
        kernel_initializer='zeros',
        name='Salida',
        kernel_regularizer=penalty,
        bias_regularizer=penalty,
    )
    net.add(output_layer)
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net

In [None]:
# Try varying lambda: 0.1, 0.01, 0.005, 0.001
# Train the L2-regularized model (default 10 hidden units) with lr=2, no decay and
# l2_lambda=0.0001 for 5000 epochs. The checkpoint file two_layer_model.l2.hdf5 is rewritten
# only when the monitored quantity improves (save_best_only=True).
plot_losses = PlotLosses(plot_interval=50, evaluate_interval=None, x_val=X_test, y_val_categorical=y_test)
checkpointer = ModelCheckpoint(filepath='two_layer_model.l2.hdf5', verbose=0, save_best_only=True)
two_layer_model_L2 = get_two_layer_model_L2(X_train.shape[1], 1, lr=2, decay=0.0, l2_lambda=0.0001)
two_layer_model_L2.fit(X_train, 
          y_train, batch_size = 25,
          epochs=5000, 
          verbose=0, 
          validation_data=(X_test, y_test), 
          callbacks=[plot_losses, checkpointer],
         )

In [None]:
# Report metrics and decision regions of the L2-regularized model; the returned scores are discarded.
_ = plot_results(
    model=two_layer_model_L2,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
def plot_hist_weights(model, bins=100):
    """Plot a histogram of every parameter value (kernels and biases) in ``model``.

    All arrays returned by ``model.get_weights()`` are flattened and pooled in the
    order they are returned, so the function works for any number of layers rather
    than only for models with exactly two Dense layers.

    Args:
        model: Object exposing ``get_weights()`` that returns a list of NumPy arrays.
        bins: Number of histogram bins (defaults to the previously hard-coded 100).

    Raises:
        ValueError: Raised by ``np.concatenate`` when the model has no weight arrays.
    """
    pooled = np.concatenate([w.flatten() for w in model.get_weights()])
    plt.hist(pooled, bins)
    plt.show()

In [None]:
# Parameter distribution of the unregularized two-layer model, for comparison.
plot_hist_weights(model=two_layer_model)

In [None]:
# Parameter distribution of the L2-regularized model.
plot_hist_weights(model=two_layer_model_L2)

## Regularización L1

In [None]:
def get_two_layer_model_L1(input_shape, output_size, hidden_units=10, lr=0.1, l1_lambda=0.1, decay=0.0):
    """Build and compile the two-layer sigmoid network with L1 penalties.

    The same L1 regularizer object is applied to the kernel and the bias of both layers.

    Args:
        input_shape: Number of input features (forwarded to Dense as ``input_dim``).
        output_size: Number of units in the output layer.
        hidden_units: Number of units in the hidden layer.
        lr: Learning rate passed to the SGD optimizer.
        l1_lambda: Factor passed to ``regularizers.l1``.
        decay: Learning-rate decay passed to the SGD optimizer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, accuracy metric).
    """
    net = Sequential()
    optimizer = optimizers.SGD(lr=lr, decay=decay)
    penalty = regularizers.l1(l1_lambda)
    hidden_layer = Dense(
        hidden_units,
        input_dim=input_shape,
        activation='sigmoid',
        kernel_regularizer=penalty,
        bias_regularizer=penalty,
    )
    net.add(hidden_layer)
    output_layer = Dense(
        output_size,
        activation='sigmoid',
        kernel_initializer='zeros',
        name='Salida',
        kernel_regularizer=penalty,
        bias_regularizer=penalty,
    )
    net.add(output_layer)
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net

In [None]:
# Try varying lambda: 0.001, 0.005
# Train the L1-regularized model (default 10 hidden units) with lr=2, no decay and
# l1_lambda=0.0005 for 5000 epochs. The checkpoint file two_layer_model.l1.hdf5 is rewritten
# only when the monitored quantity improves (save_best_only=True).
plot_losses = PlotLosses(plot_interval=50, evaluate_interval=None, x_val=X_test, y_val_categorical=y_test)
checkpointer = ModelCheckpoint(filepath='two_layer_model.l1.hdf5', verbose=0, save_best_only=True)
two_layer_model_L1 = get_two_layer_model_L1(X_train.shape[1], 1, lr=2, decay=0.0, l1_lambda=0.0005)
two_layer_model_L1.fit(X_train, 
          y_train, batch_size = 25,
          epochs=5000, 
          verbose=0, 
          validation_data=(X_test, y_test), 
          callbacks=[plot_losses, checkpointer],
         )

In [None]:
# Report metrics and decision regions of the L1-regularized model; the returned scores are discarded.
_ = plot_results(
    model=two_layer_model_L1,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Parameter distribution of the L1-regularized model.
plot_hist_weights(model=two_layer_model_L1)