# Applying ML models on the dataset

In [1]:
#import libraries
import warnings

def function_that_warns():
    warnings.warn("This is a warning message", UserWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    function_that_warns()  # This will not show a warning
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import lime
import lime.lime_tabular
import shap
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor


# Functions

In [2]:
def draw_histograms(arr, variables, n_rows, n_cols):
    df = pd.DataFrame(arr, columns =variables)
    fig=plt.figure(figsize=(20, 20))
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

In [3]:
# normalsie the input base on the norm parametrs, if norm is none, no normalisation will happen
def normalisation(X_train,X_test, norm ):
    if norm != None:
        scaler1 = norm
        X_train  =  scaler1.fit_transform(X_train)
        X_test  =  scaler1.transform(X_test)
    return X_train, X_test

In [4]:
def remove_outliers_zscore(X,y, threshold=3):
    z_scores = np.abs(stats.zscore(X))
    mask = (z_scores < threshold).all(axis=1)
    X = X[mask]
    y = y[mask]
    print(X.shape)
    return  X,y
def read_csv(path, target , remove_outliers = True, log = True  ):
    df = pd.read_csv(path)
    df['Region'] = pd.factorize( df['Region'] )[0]
    df['Industry'] = pd.factorize( df['Industry'] )[0]
    df['Employee _size'] = pd.factorize( df['Employee _size'] )[0]

    
    y = df[target].to_numpy()
    X = df.drop(target, axis=1).to_numpy()
    if remove_outliers:
        X , y = remove_outliers_zscore (X,y)
    if log:
      
        X[:,5:] = np.log1p(X[:,5:])
    
    X_train, X_test,y_train, y_test = train_test_split(X,y , 
                                   random_state=10,  
                                   test_size=0.2,  
                                   shuffle=True)
    X_train,X_test = normalisation(X_train,X_test, StandardScaler())
       
    return X_train, X_test,y_train, y_test, X,y




In [5]:
# calculate different performance metric for the model clf
def clf_score(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    ma = mean_absolute_error(y_test,y_pred)
    ms = mean_squared_error(y_test,y_pred)
    r2 = r2_score(y_test,y_pred)   
    return [ma,ms,r2]

In [6]:
# calculate mean and std for kfold results
def show_statics(arr,type):  
   
    ma = np.mean([ x[0] for x in arr]), np.std([ x[0] for x in arr])
    ms = np.mean([ x[1] for x in arr]), np.std([ x[1] for x in arr])
    r2 = np.mean([ x[2] for x in arr]), np.std([ x[2] for x in arr])
    
    return [ma,ms,r2,type ] 
    
# convert input array into a string 
def make_sring(a):
    return  a[3]+' '+str(round(a[0][0],2))+' +-' + str(round(a[0][1],2))+ ' '+ str(round(a[1][0],0))+' +-' + str(round(a[1][1],0))+ ' '+str(round(a[2][0]*100,2))+' +-' + str(round(a[2][1]*100,2))+ ' '
 

In [7]:
# train a model and 
def train_clf(clf, X_tr,y_tr,X_test, y_test,   print_ind = False, to_string = True):
    metrics_train = []
    metrics_valid = []
    metrics_test = []
    for i, (train_index, valid_index) in enumerate(kf.split(X_tr)):
             
        X_train = X_tr[train_index]
        y_train = y_tr[train_index]
        X_valid = X_tr[valid_index]
        y_valid = y_tr[valid_index]
        
        
       
        
        clf.fit(X_train, y_train.ravel())

        res = clf_score(clf,X_train,y_train)
        if (print_ind): print('Train: ', res)
        metrics_train.append(res)
        
        res = clf_score(clf,X_valid,y_valid)
        if (print_ind): print('Valid: ', res)
        metrics_valid.append(res)
        
        res = clf_score(clf,X_test,y_test)
        if (print_ind): print('Test: ', res)
        metrics_test.append(res)
    #print(show_statics( Accs))
    if to_string :
        return make_sring(show_statics( metrics_train,'train')), make_sring(show_statics( metrics_valid,'valid'))  , make_sring(show_statics( metrics_test,'test'))  
    else:
        return (show_statics( metrics_train,'train')), (show_statics( metrics_valid,'valid'))  , (show_statics( metrics_test,'test'))  


In [8]:
def apply_model(model,X_train,y_train,X_test, y_test, to_string = True ):
       
    ma_t,ms_t,r2_t = train_clf(model,X_train,y_train.reshape(-1, 1),X_test, y_test,False, to_string )
     
    return ma_t,ms_t,r2_t

In [9]:
def read_csv(path, target , remove_outliers = True, log = True  ):
    df = pd.read_csv(path)
    df.head()
    df['Region'] = pd.factorize( df['Region'] )[0]
    df['Industry'] = pd.factorize( df['Industry'] )[0]
    df['Employee _size'] = pd.factorize( df['Employee _size'] )[0]

    
    y = df[target].to_numpy()
    X = df.drop(target, axis=1).to_numpy()
    if remove_outliers:
        X , y = remove_outliers_zscore (X,y)
    if log:
      for i in range(5,X.shape[1],1):
        min_value = np.min(X[:,i],0) 
        if (min_value< 0):
            X[:,i] = X[:,i] + abs(min_value) + 0.0000001
        
        X[:,i] = np.log1p(X[:,i])
    
    
    X_train, X_test,y_train, y_test = train_test_split(X,y , 
                                   random_state=10,  
                                   test_size=0.2,  
                                   shuffle=True)
    #X_train,X_test = normalisation(X_train,X_test, StandardScaler())
       
    return X_train, X_test,y_train, y_test, X,y


## Prepration

### Parameters

In [10]:
#reading files 
imputed_file_knn = "Cleaned_data_knn_imputed_df.csv"
imputed_file_em = "Cleaned_data_em_imputed_df.csv"
imputed_file_ct = "Cleaned_data_imputed_df.csv" 
     
df_features =  pd.read_csv(imputed_file_knn).columns
df_target = "Turnover"
kf = KFold(n_splits=10, random_state=0, shuffle = True)
norm =  StandardScaler()

### Reading files

In [11]:
X_train_knn ,X_test_knn , y_train_knn ,y_test_knn,X_knn,y_knn  = read_csv(imputed_file_knn,df_target, remove_outliers = False, log= False)
X_train_em ,X_test_em , y_train_em ,y_test_em,X_em,y_em = read_csv(imputed_file_em,df_target, remove_outliers = False, log= False)
X_train_ct ,X_test_ct , y_train_ct ,y_test_ct,X_ct,y_ct= read_csv(imputed_file_ct,df_target, remove_outliers = False, log= False)

# Aplying Machine learning model 


## PCA

In [12]:
acc = []
for n_components in range(2,43,1):
    pca = PCA(n_components=n_components)



    
    pca_X_train_knn = pca.fit_transform(X_train_knn)
    pca_X_test_knn = pca.transform(X_test_knn)
    eigenvalues_knn = pca.explained_variance_
    
    pca_X_train_em = pca.fit_transform(X_train_em)
    pca_X_test_em = pca.transform(X_test_em)
    eigenvalues_em = pca.explained_variance_
    
    
    model = LinearRegression(positive=False)
    #print(n_components , model)
    knn_LR =  apply_model(model, pca_X_train_knn, y_train_knn  ,pca_X_test_knn  ,y_test_knn , to_string = False)
    EM_LR = apply_model(model, pca_X_train_em, y_train_em ,pca_X_test_em  ,y_test_em, to_string = False )
    CT_LR = apply_model(model, X_train_ct, y_train_ct ,X_test_ct  ,y_test_ct, to_string = False )
    #print('-----')
    acc.append([knn_LR,EM_LR])
    

In [13]:

def PCA_assessment(X_train,y_train, X_test, y_test,  model, imputing_title, model_title):
   
 
    
    acc_train_0 = []
    acc_test_0 = []
    acc_valid_0 = []
    
    acc_train_1 = []
    acc_test_1 = []
    acc_valid_1 = []
    
    acc_train_2 = []
    acc_test_2 = []
    acc_valid_2 = []
    
    data_list = []
    for i in range(42,1,-1):


        pca = PCA(n_components=n_components)
       
        pca_X_train = pca.fit_transform(X_train)
        pca_X_test = pca.transform(X_test)
        eigenvalues_ = pca.explained_variance_

        
    
 
 
        b = apply_model(model, X_train,y_train.reshape(-1, 1), X_test, y_test.reshape(-1, 1), to_string = True)
        print(model_title , '-', imputing_title ,i, b)
        acc_train_0.append( b[0][0][0])
        acc_valid_0.append( b[1][0][0])
        acc_test_0.append( b[2][0][0])
        acc_train_1.append( b[0][1][0])
        acc_valid_1.append( b[1][1][0])
        acc_test_1.append( b[2][1][0])
        acc_train_2.append( b[0][2][0])
        acc_valid_2.append( b[1][2][0])
        acc_test_2.append( b[2][2][0])
        #'''
    
    '''
    plt.figure(figsize=(12, 6)) 
    plt.title("Feature reducion with Multicollinearity - "+ imputing_title+" imputing method - "+ model_title+" model")
    plt.xticks(rotation=90)
    # Adding legends  
    plt.plot(data_list, acc_train_0, label="Train MAE")
    plt.plot(data_list, acc_valid_0, label="Valid MAE")
    plt.plot(data_list, acc_test_0, label="Test MAE")
    # Adding a title
    plt.legend()
    # Display the plot
    plt.show()


    plt.figure(figsize=(12, 6)) 
    plt.title("Feature reducion with Multicollinearity - "+ imputing_title+" imputing method - "+ model_title+" model")
    plt.xticks(rotation=90)
    # Adding legends  
    plt.plot(data_list, acc_train_1, label="Train MSE")
    plt.plot(data_list, acc_valid_1, label="Valid MSE")
    plt.plot(data_list, acc_test_1, label="Test MSE")
    # Adding a title
    plt.legend()
    # Display the plot
    plt.show()


    plt.figure(figsize=(12, 6)) 
    plt.title("Feature reducion with Multicollinearity - "+ imputing_title+" imputing method - "+ model_title+" model")
    plt.xticks(rotation=90)
    # Adding legends  
    plt.plot(data_list, acc_train_2, label="Train R2")
    plt.plot(data_list, acc_valid_2, label="Valid R2")
    plt.plot(data_list, acc_test_2, label="Test R2")
    # Adding a title
    plt.legend()
    # Display the plot
    plt.show()

    np.savetxt(model_title+'_acc_train_0.txt',acc_train_0)
    np.savetxt(model_title+'_acc_train_1.txt',acc_train_1)
    np.savetxt(model_title+'_acc_train_2.txt',acc_train_2)
    np.savetxt(model_title+'_acc_valid_0.txt',acc_valid_0)
    np.savetxt(model_title+'_acc_valid_1.txt',acc_valid_1)
    np.savetxt(model_title+'_acc_valid_2.txt',acc_valid_2)
    np.savetxt(model_title+'_acc_test_0.txt',acc_test_0)
    np.savetxt(model_title+'_acc_test_1.txt',acc_test_1)
    np.savetxt(model_title+'_acc_test_2.txt',acc_test_2)
    '''
    return  acc_train_0 ,acc_test_0,acc_valid_0,acc_train_1,acc_test_1,acc_valid_1,acc_train_2,acc_test_2,acc_valid_2



In [None]:
 models = [
          LinearRegression(),
          GradientBoostingRegressor(),
          RandomForestRegressor(), 
          SVR(), 
          MLPRegressor( alpha=1e-5, max_iter=5000,hidden_layer_sizes=(32, 32), random_state=1),
         ]
models_name = [
         'Linear Regression',
          'Gradient Boosting Regressor',
          'Random Forest Regressor', 
          'SVR', 
          'MLP Regressor'  
]

for model, model_title in zip( models,models_name):
        print(model_title)
        PCA_assessment(X_train_knn ,y_train_knn, X_test_knn , y_test_knn, model, 'KNN', model_title)
        PCA_assessment(X_train_em , y_train_em,X_test_em  ,y_test_knn, model, 'EM', model_title)
        PCA_assessment(X_train_ct , y_train_ct,  X_test_ct ,y_test_knn, model, 'CT', model_title)

 

Linear Regression
Linear Regression - KNN 42 ('train 2890.81 +-49.64 63707439.0 +-1938650.0 84.79 +-0.71 ', 'valid 2911.63 +-222.27 65453751.0 +-17555662.0 81.28 +-8.79 ', 'test 2622.44 +-43.23 38350510.0 +-829790.0 90.13 +-0.21 ')
Linear Regression - KNN 41 ('train 2890.81 +-49.64 63707439.0 +-1938650.0 84.79 +-0.71 ', 'valid 2911.63 +-222.27 65453751.0 +-17555662.0 81.28 +-8.79 ', 'test 2622.44 +-43.23 38350510.0 +-829790.0 90.13 +-0.21 ')
Linear Regression - KNN 40 ('train 2890.81 +-49.64 63707439.0 +-1938650.0 84.79 +-0.71 ', 'valid 2911.63 +-222.27 65453751.0 +-17555662.0 81.28 +-8.79 ', 'test 2622.44 +-43.23 38350510.0 +-829790.0 90.13 +-0.21 ')
Linear Regression - KNN 39 ('train 2890.81 +-49.64 63707439.0 +-1938650.0 84.79 +-0.71 ', 'valid 2911.63 +-222.27 65453751.0 +-17555662.0 81.28 +-8.79 ', 'test 2622.44 +-43.23 38350510.0 +-829790.0 90.13 +-0.21 ')
Linear Regression - KNN 38 ('train 2890.81 +-49.64 63707439.0 +-1938650.0 84.79 +-0.71 ', 'valid 2911.63 +-222.27 65453751.0 +