In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.optim import Adam, SGD
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import tensorflow as tf

In [2]:
def vectorise(target,num):
    """Creates a one hot encoded matrix based on the class of each observation.

    The encoding is computed directly with numpy: every value of ``target`` is
    compared against the class ids ``0 .. num-1``.

    Parameters
    ----------
    target : pd.Series or array-like
        The target variable to be transformed to one hot vectors

    num : int
        The number of different classes (the width of every one hot vector)

    Returns
    ----------
    vectors : np.ndarray
        A float array of shape (len(target), num). Row i holds 1.0 in the column
        whose id equals target[i]; a value outside 0..num-1 yields an all-zero row.

    """
    labels = np.asarray(target).reshape(-1)
    class_ids = np.arange(num)
    # Broadcasting (n, 1) against (1, num) builds the whole matrix in one comparison.
    return (labels[:, None] == class_ids[None, :]).astype(float)

In [5]:
def normalizeData(y,min_val,max_val):
    """Min-max scale the target so that min_val maps to 0 and max_val maps to 1.

    Used so that a sigmoid output activation can be applied to the target.

    Parameters
    ----------
    y : array-like
        The target values to rescale

    min_val, max_val : number
        The values that are mapped to 0 and 1 respectively

    Returns
    ----------
    np.ndarray
        The rescaled values
    """
    value_range = max_val - min_val
    shifted = np.array(y) - min_val
    return shifted / value_range

def logData(y):
    """Apply the log(1 + y) transform (np.log1p) to the target variable.

    Parameters
    ----------
    y : number or array-like
        The target values

    Returns
    ----------
    The element-wise np.log1p of y
    """
    logged = np.log1p(y)
    return logged

def expData(data):
    """Apply exp(data) - 1 (np.expm1), the inverse of the np.log1p transform.

    Parameters
    ----------
    data : number or array-like
        Values on the log1p scale

    Returns
    ----------
    The element-wise np.expm1 of data
    """
    restored = np.expm1(data)
    return restored

def denormalizeData(data,min_val,max_val):
    """Map values from the 0-1 range back to the min_val..max_val range.

    Parameters
    ----------
    data : array-like
        Values on the 0-1 scale

    min_val, max_val : number
        The values that 0 and 1 are mapped back to

    Returns
    ----------
    list
        The rescaled values, converted with ``ndarray.tolist()``
    """
    value_range = max_val - min_val
    rescaled = (np.array(data) * value_range) + min_val
    return rescaled.tolist()

def sigmoid(x,a=1,b=0):
    """Logistic function 1 / (1 + exp(a * (b - x))).

    Parameters
    ----------
    x : number or array-like
        The input value(s)

    a : number, optional
        Multiplier applied to the shifted input (default = 1)

    b : number, optional
        The value of x at which the result equals 0.5 (default = 0)
    """
    exponent = a * (-x + b)
    denominator = 1.0 + np.exp(exponent)
    return 1.0 / denominator

In [10]:
def my_plot(epochs, train, evals, ylabel):
    """Plots the train and evaluation curves of a metric over the epochs.

    A dashed vertical line marks the first position at which the evaluation
    score is greater than the train score. When that never happens (or the
    inputs are empty) no line is drawn.

    Parameters
    ----------
    epochs : list
        A list with the epochs (x-axis values)

    train : list
        A list with the prediction score on the train set

    evals : list
        A list with the prediction score on the evaluation set

    ylabel : str
        The label of the y-axis
    """
    # Position of the first epoch where eval > train, or None when there is no such epoch.
    crossing = next((pos for pos, (train_score, eval_score) in enumerate(zip(train, evals))
                     if eval_score > train_score), None)

    plt.plot(epochs, train, label='Train')
    plt.plot(epochs, evals, label='Eval')
    if crossing is not None:
        # Positions are 0-based while the marker is placed on a 1-based epoch number.
        plt.vlines(x = crossing+1, ls='--', ymin = 0, ymax = max(max(train),max(evals)),
                   colors = 'grey', label = 'x = '+str(crossing+1))
    plt.xlabel('epochs')
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()

In [None]:
def transformations(data, model_type, test=None, evaluation=False, transformation_list=['scaling'], embeddings=None):
    """Splits the dataset and executes scaling, augmentation, target transforms and label encoding.

    The last column of ``data`` (and of ``test``) is treated as the target,
    all other columns as features.

    Parameters
    ----------
    data : dataframe
        A dataframe containing all the data or the data of training.
        If test is not given, data is randomly split in 80-20% train and test set

    model_type : str
        The type of the model to be implemented.
        Could be 'class_regression' or 'mosquito_regression' or 'classification'.
        For 'classification' the targets are one hot encoded.

    test : dataframe, optional
        A dataframe containing all the data for testing (default = None)

    evaluation : boolean, optional
        If True, 20% of the observations of train set will be held for evaluation set (default = False)

    transformation_list : list of str, optional
        The transformations to apply (default = ['scaling']). Recognised entries:
        'scaling' (StandardScaler fitted on the train features),
        'augmentation' (the train set is extended by a 40% sample drawn with replacement,
        weighted by the target value),
        'normalization' (targets are min-max scaled with the train min and max),
        'log' (targets are transformed with log1p).
        The default list is only read, never modified.

    embeddings : dataframe-like, optional
        An object exposing ``.columns``; those column names identify the categorical
        features that are label encoded and returned separately (default = None)

    Returns
    ----------
    train_X : list
        [continuous features] or [continuous features, label encoded features] for training

    train_y : pd.Series or numpy array
        The dependent variable (target) for training

    eval_X, eval_y :
        Same structure as the train outputs; only returned when evaluation is True

    test_X : list
        [continuous features] or [continuous features, label encoded features] for test

    test_y : pd.Series or numpy array
        The dependent variable (target) for test

    """

    if test is None:
        X, y = data.iloc[:,:-1], data.iloc[:,-1]
        train_X,test_X,train_y,test_y = train_test_split(X, y, test_size=0.20, random_state=1)
    else:
        data = data.sample(frac=1,random_state=1).reset_index(drop=True)
        train_X, train_y = data.iloc[:,:-1], data.iloc[:,-1]
        test_X, test_y = test.iloc[:,:-1], test.iloc[:,-1]

    train_X = train_X.reset_index(drop=True)
    train_y = train_y.reset_index(drop=True)
    test_X = test_X.reset_index(drop=True)
    test_y = test_y.reset_index(drop=True)

    if evaluation:
        train_X,eval_X,train_y,eval_y = train_test_split(train_X, train_y, test_size=0.20, random_state=1)
        # The split shuffles the rows; restore a 0..n-1 index so that the labels used
        # below (augmentation sampling) coincide with row positions.
        train_X = train_X.reset_index(drop=True)
        train_y = train_y.reset_index(drop=True)
        eval_X = eval_X.reset_index(drop=True)
        eval_y = eval_y.reset_index(drop=True)

    # Cap the train target at its (rounded) 95th percentile.
    percentile = round(np.percentile(train_y, 95))
    train_y.loc[train_y >= percentile] = percentile

    if 'augmentation' in transformation_list:
        augment_index = train_y.sample(frac=0.4, weights=train_y, random_state=1, replace=True).index
        train_X = pd.concat([train_X,train_X.iloc[augment_index,:]]).reset_index(drop=True)
        train_y = pd.concat([train_y,train_y.iloc[augment_index]]).reset_index(drop=True)

    if model_type == 'classification':
        all_targets = [train_y, test_y]
        if evaluation:
            all_targets.append(eval_y)
        num_classes = len(pd.concat(all_targets).unique())
        if evaluation:
            eval_y = vectorise(eval_y,num_classes)
        train_y = vectorise(train_y,num_classes)
        test_y = vectorise(test_y,num_classes)

    # `embeddings` is dataframe-like, so it must be compared by identity:
    # `embeddings != None` would be evaluated element-wise and cannot be used in an `if`.
    use_embeddings = embeddings is not None

    if use_embeddings:
        embedding_cols = embeddings.columns.tolist()
        # Copies, so that the encoded values are written to independent frames.
        embedded_columns_train = train_X.loc[:,embedding_cols].copy() #categorical columns
        train_X = train_X.drop(columns=embedding_cols)

        embedded_columns_test = test_X.loc[:,embedding_cols].copy() #categorical columns
        test_X = test_X.drop(columns=embedding_cols)

        if evaluation:
            embedded_columns_eval = eval_X.loc[:,embedding_cols].copy() #categorical columns
            eval_X = eval_X.drop(columns=embedding_cols)

        label_encoder = LabelEncoder()
        for col in embedding_cols:
            # Fit on every split so that each category seen anywhere gets a code.
            seen_values = [embedded_columns_train[col], embedded_columns_test[col]]
            if evaluation:
                seen_values.append(embedded_columns_eval[col])
            label_encoder.fit(pd.concat(seen_values, axis=0))
            if evaluation:
                embedded_columns_eval[col] = label_encoder.transform(embedded_columns_eval[col])
            embedded_columns_train[col] = label_encoder.transform(embedded_columns_train[col])
            embedded_columns_test[col] = label_encoder.transform(embedded_columns_test[col])

        train_X_emb = embedded_columns_train.values
        test_X_emb = embedded_columns_test.values
        if evaluation:
            eval_X_emb = embedded_columns_eval.values

    if 'scaling' in transformation_list:
        # The scaler is fitted on the train features only and re-used for the other splits.
        scaler = StandardScaler()
        train_X = scaler.fit_transform(train_X)
        test_X = scaler.transform(test_X)
        if evaluation:
            eval_X = scaler.transform(eval_X)

    if 'normalization' in transformation_list:
        min_val = train_y.min()
        max_val = train_y.max()
        test_y = normalizeData(test_y, min_val, max_val)
        train_y = normalizeData(train_y, min_val, max_val)
        if evaluation:
            eval_y = normalizeData(eval_y, min_val, max_val)

    if 'log' in transformation_list:
        test_y = logData(test_y)
        train_y = logData(train_y)
        if evaluation:
            eval_y = logData(eval_y)

    if use_embeddings:
        train_X = [train_X,train_X_emb]
        test_X = [test_X, test_X_emb]
        if evaluation:
            eval_X = [eval_X,eval_X_emb]
    else:
        train_X = [train_X]
        test_X = [test_X]
        if evaluation:
            eval_X = [eval_X]

    if evaluation:
        return train_X, train_y, eval_X, eval_y, test_X, test_y
    else:
        return train_X, train_y, test_X, test_y

In [None]:
class FeedforwardNeuralNetModel(tf.keras.Model):
    """Feed-forward network with optional embeddings for categorical features.

    Parameters
    ----------
    output_len: int
        Number of units of the output Dense layer

    hidden_layers: list of int
        Number of units of every hidden Dense layer

    model_type: str
        'classification' uses categorical cross-entropy and a softmax output;
        any other value uses mean squared error and a sigmoid output when
        'normalization' is in transformation_list, otherwise a relu output

    learning_rate: float
        Learning rate of the Adam optimizer created for the model

    epochs, batch_size: int
        Stored on the instance; not used inside the class itself

    embedding_data: dataframe, optional
        Categorical columns (accessed through ``.iloc``). One Embedding layer is created
        per column, sized from the number of unique values of that column (default=None)

    dropout: float or list of float, optional
        Dropout rate(s) in [0,1]. A single number is used for every dropout layer.
        A list must hold one rate per hidden layer, preceded by one extra rate for the
        embeddings when embedding_data is given (default=None, no dropout)

    transformation_list: list of str, optional
        Transformations applied to the data; only 'normalization' is read by the model (default=[])

    early_stop: tuple or other, optional
        A tuple (monitor, min_delta, patience) is converted to a
        tf.keras.callbacks.EarlyStopping; any other value is stored as given (default=False)

    l1_weight: int or float, optional
        Weight parameter for L1 regularization (default=0)
    
    l2_weight: int or float, optional
        Weight parameter for L2 regularization (default=0)

    val_metrics: list, optional
        Stored on the instance and returned by get_config (default=[])
    """    
    def __init__(self, output_len, hidden_layers, model_type, learning_rate, epochs, batch_size, embedding_data=None,
                 dropout=None, transformation_list=[], early_stop=False, l1_weight=0, l2_weight=0, val_metrics=[]):
        
        super().__init__()
        
        if (isinstance(dropout, float)) and (dropout < 0 or dropout > 1):
            raise ValueError('Dropout rate argument must be a float number in [0,1]')
        elif isinstance(dropout, list):
            if any(i<0 or i>1 for i in dropout):
                raise ValueError('All elements of the dropout list must be in [0,1]')
            if (embedding_data is None) and (len(dropout)!=len(hidden_layers)):
                raise ValueError('Dropout list and hidden_layers list must be of the same size')
            if (embedding_data is not None) and (len(dropout)!=len(hidden_layers)+1):
                raise ValueError('Dropout list must be one element greater than the hidden layers list')
        
        self.output_len = output_len
        self.hidden_layers = hidden_layers
        self.model_type = model_type
        # Copied so that instances never share (or alter) the default list objects.
        self.transformation_list = list(transformation_list)
        self.epochs = epochs
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.early_stop = early_stop
        self.embedding_data = embedding_data
        self.dropout = dropout
        self.l1_weight = l1_weight
        self.l2_weight = l2_weight
        if isinstance(early_stop, tuple):
            self.early_stop = tf.keras.callbacks.EarlyStopping(monitor=early_stop[0],min_delta=early_stop[1],patience=early_stop[2])
        self.val_metrics = list(val_metrics)
        
        tf.keras.utils.set_random_seed(0)
        self.activation_relu = tf.keras.layers.Activation(tf.keras.activations.relu)
        self.activation_softmax = tf.keras.layers.Activation(tf.keras.activations.softmax)
        self.activation_sigmoid = tf.keras.layers.Activation(tf.keras.activations.sigmoid)
        self.initializer = tf.keras.initializers.HeNormal()
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        if self.model_type == 'classification':
            self.loss = tf.keras.losses.CategoricalCrossentropy()
        else:
            self.loss = tf.keras.losses.MeanSquaredError()
        
        self.linear_layers = [tf.keras.layers.Dense(units, kernel_initializer=self.initializer,
                                                    kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_weight, l2=l2_weight))
                              for units in hidden_layers]
        self.batchnorm_layers = [tf.keras.layers.BatchNormalization() for i in range(len(self.linear_layers))]
        self.dropout_layers =[]
    
        if embedding_data is not None:
            self.embedding_layers = []
            for i in range(len(embedding_data.iloc[0,:])):
                n_categories = len(embedding_data.iloc[:,i].unique())
                # Half the cardinality, capped at 10 and never below 1
                # (a single-valued column would otherwise ask for a 0-wide embedding).
                emb_dim = max(1, min(10, int(n_categories/2)))
                self.embedding_layers.append(
                    tf.keras.layers.Embedding(input_dim=n_categories, output_dim=emb_dim, input_length=1))
        
        if dropout is not None:
            if isinstance(dropout, list):
                # One rate per slot was supplied (length validated above): use it as given.
                rates = list(dropout)
            else:
                # A single rate: one dropout per hidden layer, plus one for the embeddings.
                n_slots = len(self.linear_layers) + (1 if embedding_data is not None else 0)
                rates = [dropout for i in range(n_slots)]
        
            self.dropout_layers = [tf.keras.layers.Dropout(rate) for rate in rates]
        
        self.linear_layers.append(tf.keras.layers.Dense(output_len, kernel_initializer=self.initializer,kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_weight, l2=l2_weight)))
        
    def call(self, inputs):
        """Forward pass.

        ``inputs[0]`` holds the continuous features; when ``inputs`` has two elements,
        ``inputs[1]`` holds the label encoded categorical features (one column per
        embedding layer), which are embedded and concatenated in front of the
        continuous features.
        """
        inputs_cont=inputs[0]
        # Dropout layers applied after the hidden Dense layers, indexed by hidden layer.
        hidden_dropouts = self.dropout_layers
        
        if len(inputs)==2:
            inputs_cat = inputs[1]
            embedding_dropout = None
            if len(self.dropout_layers) != 0:
                # The first dropout layer belongs to the embeddings and is shared by all of
                # them; the remaining ones line up with the hidden layers.
                embedding_dropout = self.dropout_layers[0]
                hidden_dropouts = self.dropout_layers[1:]

            cat=[]
            for i in range(len(self.embedding_layers)):
                # tf.shape(t).shape[0] is the rank of t: pick column i when the input is 2-D.
                if tf.shape(inputs_cat).shape[0]>1:
                    x = self.embedding_layers[i](inputs_cat[:,i])
                else:
                    x = self.embedding_layers[i](inputs_cat)
                if embedding_dropout is not None:
                    x = embedding_dropout(x)
                cat.append(x)
            combined = tf.keras.layers.concatenate(cat)
            inputs_cont = tf.keras.layers.concatenate([combined,inputs_cont], axis=1)
            
        x = self.linear_layers[0](inputs_cont)
        x = self.activation_relu(x)
        if (len(hidden_dropouts) != 0):
            x = hidden_dropouts[0](x)

        # Remaining hidden layers: batch norm -> dense -> relu -> dropout.
        for i in range(1,len(self.linear_layers)-1):
            x = self.batchnorm_layers[i-1](x)
            x = self.linear_layers[i](x)
            x = self.activation_relu(x)
            if (len(hidden_dropouts) != 0):
                x = hidden_dropouts[i](x)
                
        x = self.batchnorm_layers[-1](x)        
        x = self.linear_layers[-1](x)
        if self.model_type=='classification':
            x = self.activation_softmax(x)
        else:
            if 'normalization' in self.transformation_list:
                x = self.activation_sigmoid(x)
            else:
                x = self.activation_relu(x)
        return x
    
    def get_config(self):
        """Returns the base config extended with the constructor arguments stored on the instance."""
        config = super().get_config()
        config.update(
            {
                "output_len": self.output_len,
                "hidden_layers": self.hidden_layers,
                "model_type" : self.model_type,
                "transformation_list": self.transformation_list,
                "epochs": self.epochs,
                "batch_size": self.batch_size,
                "learning_rate": self.learning_rate,
                "early_stop": self.early_stop,
                "embedding_data": self.embedding_data,
                "dropout": self.dropout,
                "l1_weight": self.l1_weight,
                "l2_weight": self.l2_weight,
                "val_metrics": self.val_metrics
            }
        )
        return config

In [12]:
def train_nn(model, train_X, train_y, test_X, test_y, max_val, learning_rate=None, 
             epochs=None, batch_size=None, early_stop=False, sample_weights=False):
    """ Training of the model
    
    Compiles and fits the model, plots the loss and every metric in
    model.val_metrics for the train and validation data, and returns the
    predictions on both sets transformed back to the original target scale.

    Parameters
    ----------
    model : FeedforwardNeuralNetModel
        A FeedforwardNeuralNetModel model
        
    train_X, train_y :
        The features and the target of the train set
        
    test_X, test_y :
        The features and the target of the test set (also used as validation data during fitting)

    max_val : number
        Upper bound of the target. Used as the maximum when undoing the 'normalization'
        transform (the minimum is taken as 0) and as the upper clipping limit of the predictions
        
    learning_rate : int, optional
        Currently not used by this function: the model is compiled with model.optimizer,
        which determines the learning rate. (default = None)
        
    epochs : int, optional
        The number of epochs for the training. If None, model.epochs is used. (default = None)
        
    batch_size : int, optional
        The size of each batch in each iteration. If None, model.batch_size is used. (default = None)
        
    early_stop : callback or False, optional
        A callback passed to model.fit. If False, model.early_stop is used,
        and if that is False too no callback is used. (default = False)
        
    sample_weights: boolean, optional
        If True, each sample is weighted based on the mosquito number (default = False)
        
    Returns
    ----------
    results_train: DataFrame
        A Dataframe containing the actual and the predicted values on the train set
        
    results_test: DataFrame
        A Dataframe containing the actual and the predicted values on the test set
        
    model : FeedforwardNeuralNetModel
        A trained FeedforwardNeuralNetModel model 
    """    
    if epochs is None:
        epochs = model.epochs
        
    if batch_size is None:
        batch_size = model.batch_size
        
    if early_stop == False:
        early_stop = model.early_stop
    
    model.compile(loss = model.loss, optimizer = model.optimizer, metrics = model.val_metrics)
    
    if sample_weights:
        # Vectorised; positional, so it does not depend on the index labels of train_y.
        weights = sigmoid(np.asarray(train_y), 0.005, 1)
    else:
        weights = np.ones(len(train_y), dtype=int)
        
    callbacks = [] if early_stop == False else [early_stop]
    history = model.fit(train_X, train_y, batch_size=batch_size, epochs=epochs,
                        sample_weight=weights, validation_data=(test_X,test_y), callbacks=callbacks)

    # The history holds one entry per epoch that actually ran, whether or not
    # early stopping was triggered, so it gives the correct length of the x-axis.
    epochs_run = len(history.history['loss'])
    epoch_axis = np.arange(1, epochs_run + 1)

    for i in ['loss'] + list(model.val_metrics):
        my_plot(epoch_axis, history.history[i], history.history['val_'+i] ,i)

    train_predict = model.predict(train_X)
    test_predict = model.predict(test_X)
    
    if model.model_type == 'classification':
        # One hot vectors / class probabilities -> class ids.
        train_predict = np.argmax(train_predict, axis=1)
        test_predict = np.argmax(test_predict, axis=1)
        train_y = np.argmax(train_y, axis=1)
        test_y = np.argmax(test_y, axis=1)

    else:
        if 'normalization' in model.transformation_list:
            train_y = denormalizeData(train_y,min_val=0,max_val=max_val)
            train_predict = denormalizeData(train_predict,min_val=0,max_val=max_val)
            test_y = denormalizeData(test_y,min_val=0,max_val=max_val)
            test_predict = denormalizeData( test_predict,min_val=0,max_val=max_val)
            
        if 'log' in model.transformation_list:
            train_y = expData(train_y)
            train_predict = expData(train_predict)
            test_y = expData(test_y)
            test_predict = expData(test_predict)
            
        train_y = np.round(train_y)
        # Every prediction is a one-element row; keep its single value.
        train_predict = [np.round(e[0]) for e in train_predict]
        test_y = np.round(test_y)
        test_predict = [np.round(e[0]) for e in test_predict]

    results_train = pd.DataFrame.from_dict({'actual': train_y, 'prediction': train_predict})
    results_test = pd.DataFrame.from_dict({'actual': test_y, 'prediction': test_predict})
    
    # Clip the predictions to the [0, max_val] range.
    for results in (results_test, results_train):
        results.loc[results['prediction'] < 0,'prediction'] = 0
        results.loc[results['prediction'] > max_val,'prediction'] = max_val
        
    return results_train, results_test, model

In [None]:
def give_predictions(model, test_X, max_val, test_y=None):
    """ Returns predictions of a nn model on a set of features.
    
    Parameters
    ----------
    model : FeedforwardNeuralNetModel
        A trained FeedforwardNeuralNetModel model 
        
    test_X :
        The features to predict on (passed to model.predict)

    max_val : number
        Upper bound of the target. Used as the maximum when undoing the 'normalization'
        transform (the minimum is taken as 0) and as the upper clipping limit of the predictions

    test_y : array-like, optional
        The actual targets, on the same scale the model was trained on. When given, they are
        transformed back like the predictions and returned next to them (default = None)
                
    Returns
    ----------
    results_test: DataFrame
        A Dataframe with a 'prediction' column clipped to [0, max_val],
        plus an 'actual' column when test_y is given
        
    """    
    # Identity check: `test_y != None` is element-wise for arrays/Series
    # and cannot be used as a condition.
    has_actual = test_y is not None

    test_predict = model.predict(test_X)
       
    if model.model_type == 'classification':
        # Class probabilities / one hot vectors -> class ids.
        test_predict = np.argmax(test_predict, axis=1)
        if has_actual:
            test_y = np.argmax(test_y, axis=1)

    else:
        if 'normalization' in model.transformation_list:
            if has_actual:
                test_y = denormalizeData(test_y,min_val=0,max_val=max_val)
            test_predict = denormalizeData(test_predict,min_val=0,max_val=max_val)
            
        if 'log' in model.transformation_list:
            if has_actual:
                test_y = expData(test_y)
            test_predict = expData(test_predict)
            
        if has_actual:
            test_y = np.round(test_y)
        # Every prediction is a one-element row; keep its single value.
        test_predict = [np.round(e[0]) for e in test_predict]
  
    if has_actual:
        results_test = {'actual': test_y, 'prediction': test_predict}
    else:
        results_test = {'prediction': test_predict}
    results_test = pd.DataFrame.from_dict(results_test)
        
    # Clip the predictions to the [0, max_val] range.
    results_test.loc[results_test['prediction'] < 0,'prediction'] = 0
    results_test.loc[results_test['prediction'] > max_val,'prediction'] = max_val
        
    return results_test