# Applying ML models on the dataset

In [1]:
#import libraries
import warnings

def function_that_warns():
    """Emit one ``UserWarning`` carrying a fixed message."""
    warnings.warn(message="This is a warning message", category=UserWarning)

# Scoped suppression: inside this context every warning is ignored; the
# previous filter configuration is restored when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    function_that_warns()  # the warning is raised but silenced by the filter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import lime
import lime.lime_tabular
import shap
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor


# Functions

In [2]:
def draw_histograms(arr, variables, n_rows, n_cols):
    """Show a grid of 10-bin histograms, one per entry of ``variables``.

    Args:
        arr: 2-D data passed to ``pd.DataFrame``; its columns are labelled
            with ``variables``.
        variables: column names, also used for the subplot titles.
        n_rows: number of subplot rows in the figure.
        n_cols: number of subplot columns in the figure.
    """
    frame = pd.DataFrame(arr, columns=variables)
    figure = plt.figure(figsize=(20, 20))
    # Subplot positions are 1-based, hence start=1.
    for position, column in enumerate(variables, start=1):
        axis = figure.add_subplot(n_rows, n_cols, position)
        frame[column].hist(bins=10, ax=axis)
        axis.set_title(column + " Distribution")
    figure.tight_layout()  # keeps titles and tick labels from overlapping
    plt.show()

In [3]:
# Normalise the inputs with the scaler passed as `norm`; if `norm` is None, no normalisation is applied
def normalisation(X_train,X_test, norm ):
    """Scale the train and test features with the scaler ``norm``.

    The scaler is fitted on ``X_train`` only and then applied to ``X_test``,
    so no statistics of the test set are used for fitting.

    Args:
        X_train: training features.
        X_test: test features.
        norm: object exposing ``fit_transform`` and ``transform`` (for
            example a ``StandardScaler``), or ``None`` to skip scaling.

    Returns:
        Tuple ``(X_train, X_test)``; the inputs are returned unchanged when
        ``norm`` is ``None``.
    """
    # Identity check: ``!= None`` would go through the object's own
    # comparison operator, which may be overridden.
    if norm is None:
        return X_train, X_test
    X_train = norm.fit_transform(X_train)
    X_test = norm.transform(X_test)
    return X_train, X_test

In [4]:
def remove_outliers_zscore(X,y, threshold=3):
    """Drop the rows of ``X`` (and matching entries of ``y``) that contain an outlier.

    A row is kept when the absolute column-wise z-score of every one of its
    values is below ``threshold``. Columns whose z-score is NaN (for example
    zero-variance columns) carry no outlier information and are ignored;
    without this, a single constant column would cause every row to be dropped.

    Args:
        X: 2-D feature array.
        y: targets aligned with the rows of ``X``.
        threshold: z-score limit; defaults to 3.

    Returns:
        Tuple ``(X, y)`` restricted to the kept rows. The shape of the
        filtered ``X`` is printed as a side effect.
    """
    # A constant column divides by a zero standard deviation; silence the
    # resulting floating-point warnings and handle the NaNs explicitly.
    with np.errstate(divide="ignore", invalid="ignore"):
        z_scores = np.abs(stats.zscore(X))
    within_limit = np.isnan(z_scores) | (z_scores < threshold)
    mask = within_limit.all(axis=1)
    X = X[mask]
    y = y[mask]
    print(X.shape)
    return  X,y



In [5]:
# calculate different performance metric for the model clf
def clf_score(clf, X_test, y_test):
    """Score a fitted regressor on ``(X_test, y_test)``.

    Args:
        clf: fitted model exposing ``predict``.
        X_test: features to predict on.
        y_test: true targets for ``X_test``.

    Returns:
        List ``[mean absolute error, mean squared error, R2 score]``.
    """
    predictions = clf.predict(X_test)
    metric_functions = (mean_absolute_error, mean_squared_error, r2_score)
    return [metric(y_test, predictions) for metric in metric_functions]

In [6]:
# calculate mean and std for kfold results
def show_statics(arr,type):
    """Summarise per-fold metric triples as mean and standard deviation.

    Args:
        arr: sequence of ``[mae, mse, r2]`` entries, one per fold.
        type: label appended to the result (e.g. ``'train'``).

    Returns:
        List ``[(mae_mean, mae_std), (mse_mean, mse_std), (r2_mean, r2_std), type]``.
    """
    def _mean_and_std(position):
        # Gather one metric across all folds before aggregating.
        values = [fold[position] for fold in arr]
        return np.mean(values), np.std(values)

    return [_mean_and_std(0), _mean_and_std(1), _mean_and_std(2), type]
    
# convert input array into a string 
def make_sring(a):
    """Format a ``show_statics``-style summary as one line of text.

    Args:
        a: ``[(mae_mean, mae_std), (mse_mean, mse_std), (r2_mean, r2_std), label]``.

    Returns:
        ``"<label> <mae> +-<std> <mse> +-<std> <r2> +-<std> "`` with MAE
        rounded to 2 decimals, MSE to 0 decimals and R2 multiplied by 100
        and rounded to 2 decimals. The trailing space is part of the result.
    """
    mae, mse, r2, label = a[0], a[1], a[2], a[3]

    def _pair(mean, std, digits):
        return str(round(mean, digits)) + ' +-' + str(round(std, digits))

    fields = [
        _pair(mae[0], mae[1], 2),
        _pair(mse[0], mse[1], 0),
        _pair(r2[0] * 100, r2[1] * 100, 2),
    ]
    return label + ' ' + ' '.join(fields) + ' '
 

In [7]:
# train a model on each k-fold split and score it on the train, validation and test sets
def train_clf(clf, X_tr,y_tr,X_test, y_test,   print_ind = False, to_string = False):
    """Fit ``clf`` on every fold of the module-level ``kf`` splitter and score it.

    For each fold the model is refitted on the fold's training part and then
    scored with ``clf_score`` on that training part, on the fold's held-out
    part and on the fixed test set.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        X_tr: feature array that is split into folds (indexed by integer arrays).
        y_tr: targets aligned with ``X_tr``; flattened with ``ravel`` for fitting.
        X_test: test features, scored after every fold.
        y_test: test targets.
        print_ind: when true, print the metrics of every fold.
        to_string: when true, return formatted strings instead of raw summaries.

    Returns:
        Tuple ``(train, valid, test)`` of ``show_statics`` summaries, or of
        their ``make_sring`` renderings when ``to_string`` is true.
    """
    fold_scores = {'train': [], 'valid': [], 'test': []}

    for fit_idx, held_out_idx in kf.split(X_tr):
        X_fit, y_fit = X_tr[fit_idx], y_tr[fit_idx]
        X_held_out, y_held_out = X_tr[held_out_idx], y_tr[held_out_idx]

        clf.fit(X_fit, y_fit.ravel())

        evaluations = (
            ('Train: ', 'train', X_fit, y_fit),
            ('Valid: ', 'valid', X_held_out, y_held_out),
            ('Test: ', 'test', X_test, y_test),
        )
        for banner, split, features, targets in evaluations:
            scores = clf_score(clf, features, targets)
            if print_ind:
                print(banner, scores)
            fold_scores[split].append(scores)

    summaries = [show_statics(fold_scores[split], split)
                 for split in ('train', 'valid', 'test')]
    if to_string:
        return tuple(make_sring(summary) for summary in summaries)
    return tuple(summaries)


In [8]:
def apply_model(model,X_train,y_train,X_test, y_test, to_string = True ):
    """Cross-validate ``model`` through ``train_clf`` with per-fold printing off.

    Args:
        model: estimator forwarded to ``train_clf``.
        X_train: training features.
        y_train: training targets; reshaped to a single column before the call.
        X_test: test features.
        y_test: test targets (forwarded unchanged).
        to_string: forwarded to ``train_clf``; selects formatted strings
            (default) or raw summaries.

    Returns:
        Tuple ``(train, valid, test)`` exactly as returned by ``train_clf``.
    """
    column_targets = y_train.reshape(-1, 1)
    train_summary, valid_summary, test_summary = train_clf(
        model, X_train, column_targets, X_test, y_test, False, to_string)
    return train_summary, valid_summary, test_summary

In [9]:
def read_csv(path, target , remove_outliers = True, log = True  ):
    """Load a CSV file and return a standardised train/test split of it.

    Steps: integer-encode the ``Region``, ``Industry`` and ``Employee _size``
    columns, separate ``target`` from the features, optionally drop outlier
    rows, optionally log-transform the feature columns from index 5 onward,
    split 80/20 with a fixed seed, and standardise the features with a
    ``StandardScaler`` fitted on the training part.

    Args:
        path: CSV file to read.
        target: name of the target column.
        remove_outliers: when true, filter rows with ``remove_outliers_zscore``.
        log: when true, apply ``log1p`` to feature columns 5 and up; a column
            with a negative minimum is first shifted to be strictly positive.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, X, y)`` where ``X`` and
        ``y`` are the full arrays before splitting and scaling.
    """
    df = pd.read_csv(path)
    # Replace each category by an integer code.
    for column in ('Region', 'Industry', 'Employee _size'):
        df[column] = pd.factorize(df[column])[0]

    y = df[target].to_numpy()
    X = df.drop(target, axis=1).to_numpy()

    if remove_outliers:
        X, y = remove_outliers_zscore(X, y)
    if log:
        # The transform is written back in place; on an integer array the
        # logarithms would be silently truncated, so work on floats.
        X = X.astype(float)
        for i in range(5, X.shape[1]):
            min_value = np.min(X[:, i])
            if min_value < 0:
                X[:, i] = X[:, i] + abs(min_value) + 0.0000001
            X[:, i] = np.log1p(X[:, i])

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=10,
                                                        test_size=0.2,
                                                        shuffle=True)
    X_train, X_test = normalisation(X_train, X_test, StandardScaler())

    return X_train, X_test, y_train, y_test, X, y


## Preparation

### Parameters

In [10]:
# Paths of the cleaned dataset, one file per imputation variant.
imputed_file_knn = "Cleaned_data_knn_imputed_df.csv"
imputed_file_em = "Cleaned_data_em_imputed_df.csv"
imputed_file_ct = "Cleaned_data_imputed_df.csv"

# Column the models are trained to predict.
df_target = "Turnover"

# Feature names: every column of the KNN-imputed file except the target.
df_features = pd.read_csv(imputed_file_knn).columns.to_list()
df_features.remove(df_target)

# Shared 10-fold splitter (seeded, so folds are reproducible) and scaler.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
norm = StandardScaler()

### Reading files

In [11]:
# Load every imputed dataset as-is: no outlier removal, no log transform.
(X_train_knn, X_test_knn, y_train_knn, y_test_knn, X_knn, y_knn) = read_csv(
    imputed_file_knn, df_target, remove_outliers=False, log=False)
(X_train_em, X_test_em, y_train_em, y_test_em, X_em, y_em) = read_csv(
    imputed_file_em, df_target, remove_outliers=False, log=False)
(X_train_ct, X_test_ct, y_train_ct, y_test_ct, X_ct, y_ct) = read_csv(
    imputed_file_ct, df_target, remove_outliers=False, log=False)

# Applying machine learning models


## Multicollinearity Assessment


In [12]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant


In [13]:
def compute_vif(df,considered_features):
    """Compute the variance inflation factor (VIF) of the selected columns.

    Args:
        df: DataFrame holding the features; it is not modified.
        considered_features: list of column names to evaluate.

    Returns:
        DataFrame with columns ``Variable`` and ``VIF``, one row per
        considered feature (the helper intercept column is excluded).
    """
    # The VIF regression needs a constant term. ``assign`` returns a new
    # frame, so the column is never written into a slice of the caller's
    # data (the original chained assignment triggered SettingWithCopyWarning).
    X = df[considered_features].assign(intercept=1)
    values = X.values  # extracted once instead of once per column

    vif = pd.DataFrame({
        "Variable": X.columns,
        "VIF": [variance_inflation_factor(values, i) for i in range(X.shape[1])],
    })
    return vif[vif['Variable'] != 'intercept']

In [14]:

def Multi_assessment(X_df, y_df,df_features, model, imputing_title, model_title,
                     n_drops=42, plot=False, save=False):
    """Drop the most collinear feature step by step and re-evaluate ``model``.

    At every step the feature with the highest VIF is removed, the remaining
    data is split 80/20 with a fixed seed, standardised, and the model is
    cross-validated through ``apply_model``. One line per step is printed.

    Args:
        X_df: 2-D feature data whose columns correspond to ``df_features``.
        y_df: target array aligned with the rows of ``X_df``.
        df_features: column names for ``X_df``.
        model: estimator passed to ``apply_model``.
        imputing_title: label of the imputation method (printing and plot titles).
        model_title: label of the model (printing, plot titles, file names).
        n_drops: maximum number of features to remove. The effective count
            is capped so that at least one feature always remains.
        plot: when true, draw MAE, MSE and R2 curves over the dropped features.
        save: when true, write each metric list to
            ``<model_title>_acc_<split>_<k>.txt``.

    Returns:
        Nine lists of per-step mean metrics, in the order
        ``train, test, valid`` for MAE (suffix 0), then for MSE (suffix 1),
        then for R2 (suffix 2).
    """
    df = pd.DataFrame(X_df, columns=df_features)
    # Use the targets passed in; the original read the global ``y_knn`` here,
    # so every imputation variant was scored against the KNN targets.
    y = y_df

    splits = ('train', 'valid', 'test')
    # history[split][k] holds the per-step mean of metric k (0 MAE, 1 MSE, 2 R2).
    history = {split: ([], [], []) for split in splits}
    data_list = []  # names of the dropped features, in drop order

    for _ in range(min(n_drops, df.shape[1] - 1)):
        vif_table = compute_vif(df, df.columns.to_list()).sort_values('VIF', ascending=False)
        var = vif_table['Variable'].to_list()[0]
        data_list.append(var)
        df = df.drop(var, axis=1)

        X_train, X_test, y_train, y_test = train_test_split(df.to_numpy(), y,
                                                            random_state=10,
                                                            test_size=0.2,
                                                            shuffle=True)
        X_train, X_test = normalisation(X_train, X_test, StandardScaler())
        b = apply_model(model, X_train, y_train.reshape(-1, 1),
                        X_test, y_test.reshape(-1, 1), to_string=False)
        print(imputing_title, X_train.shape[1], model_title, b)

        # b is (train, valid, test); each summary starts with three
        # (mean, std) pairs, of which only the mean is recorded.
        for split, summary in zip(splits, b):
            for k in range(3):
                history[split][k].append(summary[k][0])

    if plot:
        _plot_multi_assessment(data_list, history, imputing_title, model_title)
    if save:
        _save_multi_assessment(history, model_title)

    return (history['train'][0], history['test'][0], history['valid'][0],
            history['train'][1], history['test'][1], history['valid'][1],
            history['train'][2], history['test'][2], history['valid'][2])


def _plot_multi_assessment(data_list, history, imputing_title, model_title):
    """Draw one figure per metric with train/valid/test curves, then show them."""
    title = ("Feature reducion with Multicollinearity - " + imputing_title
             + " imputing method - " + model_title + " model")
    for k, metric_name in enumerate(("MAE", "MSE", "R2")):
        plt.figure(figsize=(12, 6))
        plt.title(title)
        plt.xticks(rotation=90)
        plt.plot(data_list, history['train'][k], label="Train " + metric_name)
        plt.plot(data_list, history['valid'][k], label="Valid " + metric_name)
        plt.plot(data_list, history['test'][k], label="Test " + metric_name)
        plt.legend()
    plt.show()


def _save_multi_assessment(history, model_title):
    """Write every metric list to ``<model_title>_acc_<split>_<k>.txt``."""
    # NOTE(review): the file name does not include the imputation label, so
    # runs for different imputations overwrite each other - confirm intent.
    for split in ('train', 'valid', 'test'):
        for k in range(3):
            np.savetxt(model_title + '_acc_' + split + '_' + str(k) + '.txt',
                       history[split][k])



In [None]:
 models = [
          LinearRegression(),
          GradientBoostingRegressor(),
          RandomForestRegressor(), 
          SVR(), 
          MLPRegressor( alpha=1e-5, max_iter=5000,hidden_layer_sizes=(32, 32), random_state=1),
         ]
models_name = [
         'Linear Regression',
          'Gradient Boosting Regressor',
          'Random Forest Regressor', 
           'SVR', 
           'MLP Regressor'  
]

for model, model_title in zip( models,models_name):
        print(model)
        Multi_assessment(X_knn,y_knn,df_features, model, 'KNN', model_title)
        Multi_assessment(X_em,y_em,df_features, model, 'EM', model_title)
        Multi_assessment(X_ct,y_ct,df_features, model, 'CT', model_title)

 

LinearRegression()
KNN 42 Linear Regression ([(2890.3242703851647, 49.3724868844038), (63712161.28694139, 1939962.047524835), (0.8478721193991948, 0.007069427153916134), 'train'], [(2910.9210256577344, 222.2873502150475), (65449583.45168023, 17568529.69324636), (0.812814860798907, 0.08791571686156459), 'valid'], [(2620.0461878978285, 43.0775528679847), (38349594.47323994, 829080.7288915061), (0.9012848253355312, 0.0021341255386827926), 'test'])
KNN 41 Linear Regression ([(2889.870084662584, 49.203879535237455), (63722712.391827784, 1943770.5739593648), (0.8478467615396845, 0.007079681896303957), 'train'], [(2910.7437037409954, 223.97988458156803), (65430670.70811603, 17597125.857435368), (0.8128586653328099, 0.08796378310696158), 'valid'], [(2616.164465730306, 41.70155876092836), (38363761.72087201, 831139.277549937), (0.9012483576141973, 0.0021394244210611794), 'test'])
KNN 40 Linear Regression ([(2890.358629043163, 49.64924268690159), (63738204.65022925, 1945916.5423223772), (0.84780



KNN 7 MLP Regressor ([(3719.909074577623, 142.60063449068983), (122882100.38924606, 12108573.424981184), (0.7065806213535427, 0.031348831246910354), 'train'], [(3883.8719943730684, 337.89428037697826), (140494232.4220938, 53492416.11578716), (0.6380988432090204, 0.09235641018022485), 'valid'], [(3822.7244926514154, 175.72272939225755), (116991293.89508367, 15530578.8846321), (0.6988542859525132, 0.03997705395050679), 'test'])




KNN 6 MLP Regressor ([(4572.351984824129, 158.48900205732238), (173831607.2660995, 8718481.08891189), (0.5850824616929101, 0.022811233544185915), 'train'], [(4752.720179679222, 375.8181789786632), (195843885.5627884, 61723623.80691597), (0.49996750352216235, 0.10401891894768066), 'valid'], [(4666.087867072591, 142.27596263672612), (152817062.68567377, 6675357.527837169), (0.6066356569883984, 0.01718294791273583), 'test'])




KNN 5 MLP Regressor ([(4138.673655152322, 80.79723275129791), (185855871.8218004, 6866336.171310597), (0.5566515740127475, 0.011233202795888323), 'train'], [(4273.198522925833, 363.61365320597446), (207898782.77426448, 55507172.26415021), (0.4406426232004105, 0.19375394316858835), 'valid'], [(3976.1550119730464, 75.96794489105635), (147146665.1735113, 3716807.429622767), (0.6212317508589784, 0.00956738394888194), 'test'])




KNN 4 MLP Regressor ([(4122.365906464895, 207.4732353862567), (201076603.62871608, 32641335.92553089), (0.5208841812922842, 0.07187923556401231), 'train'], [(4206.040187029308, 404.1374581747428), (209310634.90474313, 66931362.02024526), (0.43876434590778784, 0.19753435346679704), 'valid'], [(3947.7983786075847, 223.9087758089579), (176171584.02773133, 30134888.765462097), (0.5465190981263484, 0.07756981133275621), 'test'])
KNN 3 MLP Regressor ([(5138.400784014948, 263.30688175541854), (345470892.7011936, 15376771.109376835), (0.17507258004319676, 0.04948390966057083), 'train'], [(5207.182856693533, 838.8077781787687), (360510932.2297436, 174743715.14132097), (0.1454615271199105, 0.07059632467595013), 'valid'], [(5031.77756816619, 311.3328494767603), (321310939.1559416, 19924822.412345458), (0.1729178387397011, 0.051288217036184865), 'test'])
KNN 2 MLP Regressor ([(6211.653075667332, 121.1971215638117), (409314715.60445756, 20187338.75235712), (0.024454143036627674, 0.00207035963540668



EM 7 MLP Regressor ([(4010.874460334157, 236.48151363630598), (128480230.28216735, 17361045.401069958), (0.6919459362968498, 0.05414518867801871), 'train'], [(4205.7352566165955, 476.10210976375345), (148481675.71311435, 56163258.650437325), (0.6154888033054647, 0.10390558335315628), 'valid'], [(4149.111072074114, 274.80008187608195), (127538517.69571443, 22449855.54582283), (0.6717048192109281, 0.05778787081945294), 'test'])


