In [1]:
import pandas as pd
import numpy as np
import datetime
from tensorflow.keras.models import clone_model
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import random
from ipynb.fs.defs.Neural_Network_Module import  transformations, FeedforwardNeuralNetModel, train_nn

In [4]:
def plot_error_dist(actual, predictions, model_type, case=''):
    """Plots the distribution of the absolute prediction error
    Parameters
    --------
    actual : pd.Series
        A Series with the actual value of each prediction
        
    predictions : pd.Series
        A Series with the predicted value of each prediction
        
    model_type : str
        Could be 'class_regression' or 'mosquito_regression' or 'classification'.
        For 'mosquito_regression' matplotlib chooses the bins; for every other
        type one unit-wide bin per possible class distance is used
    
    case: str, optional
        Title of the plot (default= '')
    """
    error = np.abs(actual - predictions).tolist()
    if model_type != 'mosquito_regression':
        n_classes = len(actual.unique())
        # n_classes + 1 edges give one bin centred on every integer error
        # 0 .. n_classes-1 (the same edges plot_hist builds). With only
        # n_classes edges the largest error lies past the last edge and
        # matplotlib silently leaves it out of the histogram.
        bins = np.arange(n_classes + 1) - 0.5
        plt.hist(error, bins)
        plt.xticks(range(n_classes))
    else:
        plt.hist(error)
    plt.xlabel('abs(error)')
    plt.title('Error Distribution \n' + case)
    plt.show()

In [11]:
def plot_hist(actual, predictions, model_type, case=''):
    """Draws overlaid histograms of the actual and the predicted values
    Parameters
    --------
    actual : pd.Series
        A Series with the actual value of each prediction
        
    predictions : pd.Series
        A Series with the predicted value of each prediction
        
    model_type : str
        Could be 'class_regression' or 'mosquito_regression' or 'classification'.
        Every type except 'mosquito_regression' gets one unit-wide bin per class
    
    case: str, optional
        Title of the plot (default= '')
    """
    per_class_bins = model_type != 'mosquito_regression'
    hist_options = {'alpha': 0.5}

    plt.figure(figsize=(10, 8))
    if per_class_bins:
        class_count = len(actual.unique())
        # edges at -0.5, 0.5, ... so every integer class sits in the middle of its bin
        hist_options['bins'] = np.arange(class_count + 1) - 0.5

    # both series share the same bins and transparency, only the legend entry differs
    for values, legend_name in ((actual, 'actual'), (predictions, 'prediction')):
        plt.hist(values, label=legend_name, **hist_options)

    if per_class_bins:
        plt.xticks(range(class_count))
    plt.legend()
    plt.title('Histogram of actual vs predicted values \n' + case)
    plt.show()

In [5]:
def plot_error_per_class(test, case=''):
    """Plots the MAE of every actual class and prints how far the per-class
    MAE values lie from the overall MAE
    Parameters
    --------
    test : dataframe
        A dataframe with an 'actual' and a 'prediction' column
        
    case: str, optional
        Title of the plot (Area and mosquito genus) (default= '')
    """
    labels = sorted(test.loc[:, 'actual'].unique().tolist())
    f = []
    length = []
    for k in labels:
        cc = test.loc[test['actual'] == k]
        length.append(len(cc))
        f.append(mean_absolute_error(cc.loc[:, 'actual'], cc.loc[:, 'prediction']))
    tick_labels = [str(int(e)) for e in labels]

    fig = plt.figure()
    ax = fig.add_axes([0,0,1,1])
    ax.bar(tick_labels, f)
    for i, v in enumerate(f):
        # MAE on top of the bar, sample count written vertically inside it
        ax.text(i, v, str('%.2f'%(v)), rotation=30)
        ax.text(i, v/2,'n = '+ str(length[i]), weight="bold", ha='center', rotation=90)
    plt.xlabel('class')
    plt.ylabel('MAE')
    plt.title('MAE per class ' + case)
    plt.show()

    # The reference value is the MAE over ALL rows. It must not be taken from
    # the loop variables, which only hold the rows of the last class.
    overall_mae = mean_absolute_error(test.loc[:, 'actual'], test.loc[:, 'prediction'])
    # f is a plain list: convert before subtracting so this also works when
    # mean_absolute_error returns a built-in float
    z = np.abs(np.asarray(f, dtype=float) - overall_mae)
    mean_z = z.mean()
    std_z = z.std()

    print('-----------|class error-MAE| difference-----------')
    print('mean:', mean_z)
    print('std:', std_z)
    # undefined when every class MAE equals the overall MAE (mean of z is 0)
    print('coefficient of variation (std/mean):', std_z / mean_z if mean_z else np.nan)
    
    print()
    
    print('----------normalized difference-------------')
    spread = z.max() - z.min()
    # min-max scaling; with a single class (or equal differences) there is no
    # spread to scale by, so report zeros instead of dividing by zero
    z = (z - z.min()) / spread if spread else np.zeros_like(z)
    print('mean:', z.mean())
    print('std:', z.std())

In [6]:
def plot_error_per_month(df, case=''):
    """Draws a bar chart with the mean absolute error of every calendar month
    Parameters
    --------
    df : dataframe
        A dataframe with a datetime column 'dt_prediction' and the columns
        'actual' and 'prediction'. The column 'abs(error)' is (re)written
        on this frame in place
    
    case: str, optional
        Title of the plot (Area and mosquito genus) (default= '')
    """
    figure = plt.figure()
    axes = figure.add_axes([0, 0, 1, 1])

    month_of_row = df['dt_prediction'].dt.month
    tick_labels = [str(month) for month in np.sort(month_of_row.unique())]

    # stored on the caller's frame, exactly like the grouping below expects it
    df['abs(error)'] = (df['actual'] - df['prediction']).abs()

    # groupby sorts its keys, so the aggregates line up with the sorted tick labels
    by_month = df.groupby(by=[month_of_row])
    month_mae = by_month['abs(error)'].mean().values
    month_count = by_month['dt_prediction'].count().values

    axes.bar(tick_labels, month_mae)
    for position, (mae, count) in enumerate(zip(month_mae, month_count)):
        # MAE on top of the bar, sample count written vertically inside it
        axes.text(position, mae, '%.2f' % mae, rotation=30)
        axes.text(position, mae / 2, 'n = ' + str(count), weight="bold", ha='center', rotation=90)
    plt.xlabel('Month')
    plt.ylabel('MAE')
    plt.title('Mean Absolute Error per month ' + case)
    plt.show()

In [10]:
def scatter_plot_error(actual, prediction, case=''):
    """Draws a scatter plot of the absolute error against the actual value
    Parameters
    --------
    actual : pd.Series
        The actual value of each prediction (x axis)

    prediction : pd.Series
        The predicted value of each prediction
    
    case: str, optional
        Title of the plot (Area and mosquito genus) (default= '')
    """
    # one point per sample: x = actual value, y = |actual - prediction|
    absolute_error = np.abs(actual - prediction)
    plt.scatter(actual, absolute_error)
    plt.xlabel('Mosquito bins')
    plt.ylabel('Error')
    plt.title('Scatterplot of error ' + case)
    plt.show()

In [9]:
def plot_error_per_group(actual,prediction,case=''):
    """Draws a bar chart with the MAE of every mosquito-count group
    Parameters
    --------
    actual : pd.Series
        A Series with the actual mosquito count of each prediction
        
    prediction : pd.Series
        A Series with the predicted mosquito count of each prediction
        
    case: str, optional
        Title of the plot (default= '')

    """
    group_edges = [-1, 100, 200, 300, 400, 500, np.inf]
    group_names = ['0-100', '101-200', '201-300', '301-400', '401-500', '500<']

    frame = pd.DataFrame({'mosq_now': actual, 'predictions': prediction})
    # the group of a sample is decided by its ACTUAL count
    frame['classes'] = pd.cut(x=frame['mosq_now'], bins=group_edges, labels=group_names)

    # only the groups that occur in the data are drawn, in sorted name order
    present_groups = sorted(frame['classes'].unique().tolist())
    group_mae = []
    group_size = []
    for group in present_groups:
        members = frame.loc[frame['classes'] == group]
        group_size.append(len(members))
        group_mae.append(mean_absolute_error(members.loc[:, 'mosq_now'],
                                             members.loc[:, 'predictions']))

    figure = plt.figure()
    axes = figure.add_axes([0, 0, 1, 1])
    axes.bar(present_groups, group_mae)
    for position, (mae, size) in enumerate(zip(group_mae, group_size)):
        # MAE on top of the bar, sample count written vertically inside it
        axes.text(position, mae, '%.2f' % mae, rotation=30)
        axes.text(position, mae / 2, 'n = ' + str(size), weight="bold", ha='center', rotation=90)
    plt.xlabel('Mosquito Group')
    plt.ylabel('MAE')
    plt.title('MAE per Mosquito group \n' + case)
    plt.show()

In [None]:
def error_cdf(actual,prediction, case=''):
    """Plots the empirical CDF of the absolute error, followed by a second
    figure with the sorted absolute errors against the number of samples
    Parameters
    --------
    actual : pd.Series or np.ndarray
        The actual value of each prediction
        
    prediction : pd.Series or np.ndarray
        The predicted value of each prediction
        
    case: str, optional
        Title of the second plot (default= '')

    """
    # the subtraction itself keeps pandas' index alignment for Series input
    error = np.asarray(np.abs(actual - prediction), dtype=float).ravel()
    sorted_error = np.sort(error)
    n_samples = len(sorted_error)

    # Figure 1: for every distinct error value, the share of samples whose
    # error is <= that value. searchsorted(side='right') on the sorted errors
    # is exactly that count, without comparing every value against every sample.
    distinct_error = np.unique(sorted_error)
    cdf = np.searchsorted(sorted_error, distinct_error, side='right') / n_samples
    plt.figure(figsize=(8,8))
    plt.plot(distinct_error, cdf)
    plt.grid()
    plt.xlabel('abs(error)',fontsize=18)
    plt.ylabel('CDF',fontsize=18)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.show() 
    
    # Figure 2: the k-th smallest error plotted at x = k
    sample_rank = np.arange(1, n_samples + 1)
    plt.plot(sample_rank, sorted_error)
    plt.grid()
    plt.xlabel('Number of samples',fontsize=18)
    plt.ylabel('Error',fontsize=18)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title('CDF of error \n' + case)
    plt.show()

In [8]:
def objective(x, A, B):
    """Straight line with slope A and intercept B, evaluated at x."""
    scaled = A * x
    return scaled + B

def plot_error_per_distance(df, case=''):
    """Draws the absolute error against the distance to the nearest train
    station, together with a fitted straight line, and prints the fitted line
    Parameters
    --------
    df : dataframe
        A dataframe with the columns 'neigh_distance' and 'abs(error)'
    
    case: str, optional
        Title of the plot (Area and mosquito genus) (default= '')
    """
    # only the two needed columns, ordered by distance so the fitted line is drawn left to right
    ordered = df[['neigh_distance', 'abs(error)']].sort_values(by=['neigh_distance'])
    distance = ordered['neigh_distance']
    abs_error = ordered['abs(error)']

    # least-squares fit of the straight line defined by objective()
    (slope, intercept), _ = curve_fit(objective, distance, abs_error)
    print('y = %.5f * x + %.5f' % (slope, intercept))

    plt.scatter(distance, abs_error)
    # the line is evaluated at the observed distances
    fitted_error = objective(distance, slope, intercept)
    plt.xlabel('Distance')
    plt.ylabel('MAE')
    plt.title('Mean Absolute Error per Distance ' + case)
    plt.plot(distance, fitted_error, '--', color='red',
             label='y={:.2f}x+{:.2f}'.format(slope, intercept))
    plt.legend()
    plt.show()

In [12]:
def metrics(train, test, threshold=3):
    """Prints the performance of the model on the train and on the test set
    Parameters
    --------
    train : Dataframe
        A Dataframe with the actual and the predicted values on the train set
        (columns 'actual' and 'prediction')
        
    test : Dataframe
        A Dataframe with the actual and the predicted values on the test set
        (columns 'actual' and 'prediction')
        
    threshold: int, optional
        A threshold to calculate percentage of error < threshold (default= 3)

    """
    def _print_prediction_range(frame):
        # smallest and largest predicted value of one set
        print('min prediction:', min(frame['prediction']))
        print('max prediction:', max(frame['prediction']))

    train_mae = mean_absolute_error(train['actual'], train['prediction'])
    print('MAE on train set: ', train_mae)
    _print_prediction_range(train)

    print()

    test_mae = mean_absolute_error(test['actual'], test['prediction'])
    print('MAE on test set: ', test_mae)
    # the half unit of slack makes an error that rounds to `threshold` count as a hit
    within_threshold = np.abs(test['actual'] - test['prediction']) < (threshold + 0.5)
    perc = within_threshold.mean() * 100
    print('Error <= ' + str(threshold) + ':', "%.2f" % perc, '%')
    _print_prediction_range(test)

In [13]:
def validation(train, test, model_type, case=''):
    """Prints the metrics and draws the plots describing the performance of the model
    
    Parameters
    --------        
    train : Dataframe
        A Dataframe with the actual and the predicted values on the train set

    test : Dataframe
        A Dataframe with the actual and the predicted values on the test set,
        plus the 'dt_prediction' column used by the per-month plot
    
    model_type : str
        Could be 'class_regression' or 'mosquito_regression' or 'classification'
        
    case: str, optional
        Title of the plot (default= '')
    """
    predicts_counts = model_type == 'mosquito_regression'

    if predicts_counts:
        # raw counts: wider error threshold, errors grouped by count range, error CDF
        metrics(train, test, threshold=30)
        plot_error_per_group(test['actual'], test['prediction'], case)
        error_cdf(test['actual'], test['prediction'], case)
    else:
        # class targets: default threshold and one bar per class
        metrics(train, test)
        plot_error_per_class(test, case)

    # plots shared by every model type
    scatter_plot_error(test['actual'], test['prediction'], case)
    plot_error_dist(test['actual'], test['prediction'], model_type, case)
    plot_hist(test['actual'], test['prediction'], model_type, case)
    plot_error_per_month(test, case)

In [14]:
def test_model_random_split(train, model, test=None, filepath = '', date_col = 'dt_placement', case='', export=False):
    """Trains a fresh copy of the model on randomly split data and validates it
    
    Parameters
    --------
    train : dataframe
        A dataframe containing the data; its last column is used as the target.
        When `test` is None, 20% of its rows are held out as the test set.
        The caller's frame is not modified
    
    model : object
        The model to evaluate. A new instance is built with
        ``from_config(get_config())``; the attributes ``model_type``,
        ``transformation_list`` and ``embedding_data`` are read from it

    test : dataframe, optional
        A ready-made test set (default = None). When given, the result columns
        ('dt_prediction', 'prediction', 'error', 'abs(error)') are added to
        this frame in place
        
    filepath : srt, optional
        The path of the file to export the results (default = '')
        
    date_col : str, optional
        The name of the date column (default = 'dt_placement')
    
    case : str, optional
        The title of case for the plot (default='')
    
    export : boolean, optional
        Export a csv with the test data and its predictions (default = False)
    """
    case = case + " random validation"
    
    if test is None:
        # Fixed seed: the same rows are held out on every run.
        np.random.seed(1)
        n_test = int((len(train)*20)/100)
        # Draw POSITIONS without replacement, so no row lands in the test set
        # twice and exactly n_test rows leave the train set.
        test_pos = np.random.choice(len(train), size=n_test, replace=False)
        train_pos = np.setdiff1d(np.arange(len(train)), test_pos)
        # both sides are selected by position, which is correct for any index
        test = train.iloc[test_pos,:].reset_index(drop=True)
        train = train.iloc[train_pos,:].reset_index(drop=True)
    
    mosq_col = train.columns[-1]
    
    # drop() returns a new frame: a caller-supplied train set keeps its date column
    train = train.drop(columns=[date_col])
    # the dates are not a model input; keep them aside and restore them afterwards
    date = test[date_col]
    del test[date_col]
    
    model_int = model.__class__.from_config(model.get_config())
    
    # 95th percentile of the train target, forwarded to train_nn as max_val
    max_val = round(np.percentile(train.iloc[:,-1], 95))
        
    train_X, train_y, test_X, test_y = transformations(train, test = test, model_type = model.model_type,
                                                       transformation_list = model.transformation_list,
                                                       embeddings=model.embedding_data)
    
    results_train, results_test, _ = train_nn(model_int, train_X, train_y, test_X, test_y, max_val=max_val)
    
    test[date_col] = date
    test['dt_prediction'] = test[date_col] + datetime.timedelta(days=15)
    test['prediction'] = results_test['prediction']
    test['error'] = test[mosq_col] - test['prediction']
    test['abs(error)'] = np.abs(test['error'])
    
    # the validation helpers expect the target under the name 'actual'
    test =  test.rename(columns={mosq_col : 'actual'})

    validation(results_train, test, model_type=model.model_type, case=case)  
    
    test =  test.rename(columns={'actual' : mosq_col})
    
    if export:
        csv = filepath + case + '.csv'
        test.to_csv(csv,index=False)

In [15]:
def train_model_KFold(data, model, cv=10, date_col='dt_placement', case=''):
    """Trains a fresh copy of the model on every KFold split and validates
    the pooled results of all folds
    
    Parameters
    --------
    data : dataframe
        A dataframe containing the data; its last column (after the date
        column is removed) is used as the target
    
    model : object
        The model to evaluate. A new instance is built per fold with
        ``from_config(get_config())``; the attributes ``model_type``,
        ``transformation_list`` and ``embedding_data`` are read from it

    cv : int, optional
        The number of folds (default = 10)
        
    date_col : str, optional
        The name of the date column (default = 'dt_placement')

    case : str, optional
        The title of case for the plot (default='')
    """

    case = case + ' ' +str(cv) +" fold validation"
    kf = KFold(n_splits=cv)

    # Per-fold results are collected and concatenated once after the loop.
    # Growing a frame with concat inside the loop copies it on every fold and
    # starts from an empty DataFrame, which recent pandas versions warn about.
    train_results = []
    test_results = []
    
    for train_index, test_index in kf.split(data):
        
        model_int = model.__class__.from_config(model.get_config())
        
        train = data.iloc[train_index,:].reset_index(drop=True)        
        test = data.iloc[test_index,:].reset_index(drop=True)
        
        # the dates are not a model input; keep those of the test fold for the per-month plot
        del train[date_col]
        date = test[date_col]
        del test[date_col]
        
        # 95th percentile of the train target, forwarded to train_nn as max_val
        max_val = round(np.percentile(train.iloc[:,-1], 95))
        
        train_X, train_y, test_X, test_y = transformations(train, test = test, model_type = model.model_type,
                                                           transformation_list = model.transformation_list,
                                                           embeddings = model.embedding_data)
    
        results_train, results_test, _ = train_nn(model_int, train_X, train_y, test_X, test_y, max_val=max_val)
                
        results_test['dt_prediction'] = date + datetime.timedelta(days=15)
        
        train_results.append(results_train)
        test_results.append(results_test)

    # every fold restarts its index at 0, so build a fresh one for the pooled frames
    df_train = pd.concat(train_results, ignore_index=True)
    df_test = pd.concat(test_results, ignore_index=True)
        
    validation(df_train, df_test, model_type=model.model_type, case=case)

In [16]:
def operational_validation(data, model, date, filepath ='', date_col='dt_placement', case='', export=False):
    """Trains a model on data of the previous months and evaluates on data of the next month iteratively.
    
    Parameters
    --------
    data : dataframe
        A dataframe containing the data; its last column is used as the target
    
    model : object
        The model to evaluate. A new instance is built per month with
        ``from_config(get_config())``; the attributes ``model_type``,
        ``transformation_list`` and ``embedding_data`` are read from it
    
    date: str
        The date to start the testing process from (format: YYYY-MM-DD).
        Every calendar month that has data after this date is tested
    
    filepath : srt
        The path of the file to export results (default ='')
        
    date_col : str, optional
        The name of the date column (default = 'dt_placement')
    
    case : str, optional
        The title of case for the plot (default ='')
    
    export : boolean, optional
        Export a csv with the test data and its predictions (default=False)
        
    Raise
    --------------
    ValueError
        If date >= maximum date in the dataset (there is nothing left to test on),
        or if no tested month has any earlier data to train on
        
    """
    start_date = pd.to_datetime(date)
    last_date = data[date_col].max()
    # a start date equal to the last date leaves no month to test either
    if start_date >= last_date:
        raise ValueError('date argument given must be before than '+ str(last_date))
        
    case = case + " operational validation"
    mosq_col = data.columns[-1]

    # calendar months that contain data after the start date, oldest first
    periods = data.loc[data[date_col] > start_date, date_col].dt.to_period('M').unique()

    train_results = []
    test_results = []
    
    for period in sorted(periods):
        # Period arithmetic handles the December -> January roll-over
        month_start = period.start_time
        next_month_start = (period + 1).start_time

        # train on everything before the month, test on the month itself
        train = data.loc[data[date_col] < month_start].reset_index(drop=True)
        in_month = (data[date_col] >= month_start) & (data[date_col] < next_month_start)
        test = data.loc[in_month].reset_index(drop=True)

        if train.empty:
            # no history before this month yet, so there is nothing to fit
            continue

        model_int = model.__class__.from_config(model.get_config())
        
        # 95th percentile of the train target, forwarded to train_nn as max_val
        max_val = round(np.percentile(train.iloc[:,-1], 95))

        # the dates are not a model input; keep those of the test month aside
        del train[date_col]
        test_dates = test[date_col]
        del test[date_col]
        
        train_X, train_y, test_X, test_y = transformations(train, test = test, model_type = model.model_type,
                                                           transformation_list = model.transformation_list,
                                                           embeddings=model.embedding_data)

        results_train, results_test, _ = train_nn(model_int, train_X, train_y, test_X, test_y, max_val=max_val)
        
        test[date_col] = test_dates
        test['dt_prediction'] = test[date_col] + datetime.timedelta(days=15)
        test['prediction'] = results_test['prediction']
        # predictions below zero are clipped to zero
        test.loc[test['prediction']<0,'prediction'] = 0
        # NOTE(review): the error is taken against test_y as returned by
        # transformations(); test_model_random_split uses the raw target column
        # instead - confirm both hold the same values.
        test['error'] = test_y - test['prediction']
        test['abs(error)'] = np.abs(test_y- test['prediction'])
        
        train_results.append(results_train)
        test_results.append(test)

    if not test_results:
        raise ValueError('no month after ' + str(start_date) + ' has earlier data to train on')

    # every month restarts its index at 0, so build a fresh one for the pooled frames
    df_train = pd.concat(train_results, ignore_index=True)
    df_test = pd.concat(test_results, ignore_index=True)
        
    # the validation helpers expect the target under the name 'actual'
    df_test = df_test.rename(columns={mosq_col:'actual'})
    
    validation(df_train, df_test, model_type=model.model_type, case=case)
    
    df_test = df_test.rename(columns={'actual':mosq_col})
    
    if export:
        csv = filepath + case + '.csv'
        df_test.to_csv(csv,index=False)

In [17]:
def off_trap_validation(data, model, model_type, step, transformation_list, embeddings=None, filepath ='',
                        date_col='dt_placement', case='',export=False):
    """Trains a model on a random subset of the stations and tests it on the
    stations that were held out
    
    Parameters
    --------
    data : dataframe
        A dataframe containing the data. Stations are identified by their
        ('x', 'y') coordinates; the target column is 'mosq_bins(t+1)'
    
    model : object
        The model to evaluate. A new instance is built with
        ``from_config(get_config())``
    
    model_type : str
        Could be 'class_regression' or 'mosquito_regression' or 'classification'
        
    step : int
        The number of days for prediction

    transformation_list : object
        Forwarded unchanged to transformations()

    embeddings : object, optional
        Forwarded unchanged to transformations() (default = None)
        
    filepath : srt, optional
        The path of the file to export the results (default='')
        
    date_col : str, optional
        The name of the date column (default = 'dt_placement')
    
    case : str, optional
        The title of case for the plot (default='')
    
    export : boolean, optional
        Export a csv with the test data and its predictions (default=False)
    """
    case = case + " off trap validation"
    
    model_int = model.__class__.from_config(model.get_config())
    
    stations = data[['x','y']].drop_duplicates().reset_index(drop=True)
    # Every station is a candidate. Stopping the range at len(stations)-1
    # would keep the last station out of both the train and the test set.
    station_ids = range(len(stations))
    # a quarter of the stations is held out for testing
    test_ids = random.sample(station_ids, round(len(stations)/4))
    held_out = set(test_ids)
    train_ids = [s for s in station_ids if s not in held_out]
    test = pd.merge(data, stations.iloc[test_ids,:], on=['x','y'], how='inner')
    train = pd.merge(data, stations.iloc[train_ids,:], on=['x','y'], how='inner')
    
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    
    # 95th percentile of the train target, forwarded to train_nn as max_val
    max_val = round(np.percentile(train.iloc[:,-1], 95))
    
    # the dates are not a model input; keep those of the test set aside
    date = test[date_col]
    train = train.drop([date_col],axis=1)
    test = test.drop([date_col],axis=1)
    
    # map of the split: blue = train stations, red = test stations
    plot_df = pd.concat([train[['x','y']].assign(cluster=0),
                         test[['x','y']].assign(cluster=1)],axis=0)
    colors = ['r' if flag==1 else 'b' for flag in plot_df.cluster]
    plt.scatter(plot_df.loc[:,'x'], plot_df.loc[:,'y'], c=colors)
    red_patch = mpatches.Patch(color='red', label='test stations')
    blue_patch = mpatches.Patch(color='blue', label='train stations')
    plt.legend(handles=[red_patch, blue_patch])
    plt.show()   
    
    # distance of every test row to the closest train station
    neigh = NearestNeighbors(n_neighbors=1)
    neigh.fit(train[['x','y']].drop_duplicates(subset=['x','y']))
    dist,_ = neigh.kneighbors(test[['x','y']])

    train_X, train_y, test_X, test_y = transformations(train, test = test, model_type = model_type,
                                                       transformation_list = transformation_list,
                                                       embeddings=embeddings)

    results_train, results_test, _ = train_nn(model_int, train_X, train_y, test_X, test_y, max_val=max_val) 
    
    test[date_col] = date
    test['dt_prediction'] = test[date_col] + datetime.timedelta(days=step)
    test['prediction'] = results_test['prediction']
    # predictions below zero are clipped to zero
    test.loc[test['prediction']<0,'prediction'] = 0
    test['error'] = test['mosq_bins(t+1)'] - test['prediction']
    test['abs(error)'] = np.abs(test['mosq_bins(t+1)'] - test['prediction'])
    # kneighbors returns one column per neighbour; flatten the single column
    test['neigh_distance'] = dist.ravel()

    # the validation helpers expect the target under the name 'actual'
    test =  test.rename(columns={'mosq_bins(t+1)':'actual'})

    # validation() prints the metrics (threshold 30 for 'mosquito_regression',
    # the default otherwise) and draws the standard plots
    validation(results_train, test, model_type=model_type, case=case)
    # the error against the distance computed above
    plot_error_per_distance(test, case)
    
    if export:
        csv = filepath + case + '.csv'
        test.to_csv(csv,index=False)

In [18]:
def clustered_off_trap_validation(data, model, model_type, step, transformation_list, embeddings=None,
                                  filepath='', station_col='station_id', date_col='dt_placement',
                                  case='', export=False):
    """Trains a model on data of certain area and tests on another area

    The test area is made of the 20 stations closest to the station with the
    largest 'x' coordinate; the remaining stations form the train area.
    
    Parameters
    --------
    data : dataframe
        A dataframe containing the data. Stations are identified by their
        ('x', 'y') coordinates; the target column is 'mosq_bins(t+1)'
    
    model : object
        The model to evaluate. A new instance is built with
        ``from_config(get_config())``
    
    model_type : str
        Could be 'class_regression' or 'mosquito_regression' or 'classification'
        
    step : int
        The number of days for prediction

    transformation_list : object
        Forwarded unchanged to transformations()

    embeddings : object, optional
        Forwarded unchanged to transformations() (default = None)
        
    filepath : srt, optional
        The path of the file to export the results

    station_col : str, optional
        Not used by this function (default = 'station_id')
        
    date_col : str, optional
        The name of the date column (default = 'dt_placement')
    
    case : str, optional
        The title of case for the plot (default='')
    
    export : boolean, optional
        Export a csv with the test data and its predictions (default=False)
    """
    
    case = case + ' clustered off trap validation'
    
    model_int = model.__class__.from_config(model.get_config())
    
    stations = data[['x','y']].drop_duplicates().reset_index(drop=True)

    # the station with the largest x is the centre of the test area
    max_index =  stations['x'].idxmax()
    # kept as a one-row frame so the query carries the same column names as the fitted data
    centre_station = stations.loc[[max_index], ['x','y']]
    # NOTE(review): the centre station is removed here and never added back,
    # so its rows end up in neither the train nor the test set - confirm this is intended.
    stations = stations.drop([max_index])
    stations = stations.reset_index(drop=True)
    
    neigh = NearestNeighbors(n_neighbors=20)
    neigh.fit(stations[['x','y']])
    _,test_stations = neigh.kneighbors(centre_station)
    
    test_ids = test_stations.tolist()[0]
    held_out = set(test_ids)
    # Every remaining station that is not a test station is a train station.
    # Stopping the range at len(stations)-1 would leave the last station out of both sets.
    train_ids = [s for s in range(len(stations)) if s not in held_out]
    test = pd.merge(data, stations.iloc[test_ids,:], on=['x','y'], how='inner')
    train = pd.merge(data, stations.iloc[train_ids,:], on=['x','y'], how='inner')
    
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    
    # the dates are not a model input; keep those of the test set aside
    date = test[date_col]
    train = train.drop([date_col],axis=1)
    test = test.drop([date_col],axis=1)
    
    # 95th percentile of the train target, forwarded to train_nn as max_val
    max_val = round(np.percentile(train.iloc[:,-1], 95))    
    
    # map of the split: blue = train stations, red = test stations
    plot_df = pd.concat([train[['x','y']].assign(cluster=0),
                         test[['x','y']].assign(cluster=1)],axis=0)
    colors = ['r' if flag==1 else 'b' for flag in plot_df.cluster]
    plt.scatter(plot_df.loc[:,'x'], plot_df.loc[:,'y'], c=colors)
    red_patch = mpatches.Patch(color='red', label='test stations')
    blue_patch = mpatches.Patch(color='blue', label='train stations')
    plt.legend(handles=[red_patch, blue_patch])
    plt.show()   

    # distance of every test row to the closest train station
    neigh = NearestNeighbors(n_neighbors=1)
    neigh.fit(train[['x','y']].drop_duplicates(subset=['x','y']))
    dist,_ = neigh.kneighbors(test[['x','y']])
    
    train_X, train_y, test_X, test_y = transformations(train, test = test, model_type = model_type,
                                                       transformation_list = transformation_list,
                                                       embeddings=embeddings)

    results_train, results_test, _ = train_nn(model_int, train_X, train_y, test_X, test_y, max_val=max_val) 
    
    test[date_col] = date
    test['dt_prediction'] = test[date_col] + datetime.timedelta(days=step)
    test['prediction'] = results_test['prediction']
    # predictions below zero are clipped to zero
    test.loc[test['prediction']<0,'prediction'] = 0
    test['error'] = test['mosq_bins(t+1)'] - test['prediction']
    test['abs(error)'] = np.abs(test['mosq_bins(t+1)'] - test['prediction'])
    # kneighbors returns one column per neighbour; flatten the single column
    test['neigh_distance'] = dist.ravel()

    # the validation helpers expect the target under the name 'actual'
    test =  test.rename(columns={'mosq_bins(t+1)':'actual'})

    # validation() prints the metrics (threshold 30 for 'mosquito_regression',
    # the default otherwise) and draws the standard plots
    validation(results_train, test, model_type=model_type, case=case)
    # the error against the distance computed above
    plot_error_per_distance(test, case)
    
    if export:
        csv = filepath + case + '.csv'
        test.to_csv(csv,index=False)