# Applying ML models on the dataset

In [1]:
#import libraries
import warnings

def function_that_warns():
    """Emit a single demonstration ``UserWarning`` via the ``warnings`` module."""
    message = "This is a warning message"
    warnings.warn(message, category=UserWarning)

# Demonstration of warning suppression: within this context the "ignore"
# filter is active, so the UserWarning raised by the call below is not shown.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    function_that_warns()  # This will not show a warning
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import lime
import lime.lime_tabular
import shap
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor


# Functions

In [2]:
def draw_histograms(arr, variables, n_rows, n_cols):
    """Draw one 10-bin histogram per variable on a shared 20x20 figure.

    Parameters
    ----------
    arr : array-like
        Data passed to ``pd.DataFrame``; its columns are labelled with
        ``variables``.
    variables : sequence of str
        Column names; one subplot titled "<name> Distribution" is drawn for
        each, in order.
    n_rows, n_cols : int
        Shape of the subplot grid handed to ``add_subplot``.
    """
    frame = pd.DataFrame(arr, columns=variables)
    figure = plt.figure(figsize=(20, 20))
    # Subplot positions are 1-based, hence start=1.
    for position, name in enumerate(variables, start=1):
        axis = figure.add_subplot(n_rows, n_cols, position)
        frame[name].hist(bins=10, ax=axis)
        axis.set_title(name + " Distribution")
    figure.tight_layout()  # Improves appearance a bit.
    plt.show()

In [3]:
def normalisation(X_train, X_test, norm):
    """Scale the train and test inputs with the scaler ``norm``.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test inputs.
    norm : object or None
        Scaler exposing ``fit_transform`` and ``transform``. When it is
        ``None`` no normalisation happens and the inputs are returned as-is.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` after scaling. The scaler is fitted on the
        training data only and then applied to the test data.
    """
    # Identity check: ``!= None`` would dispatch to a user-defined __ne__.
    if norm is None:
        return X_train, X_test
    X_train = norm.fit_transform(X_train)
    X_test = norm.transform(X_test)
    return X_train, X_test

In [4]:
def remove_outliers_zscore(X, y, threshold=3):
    """Drop every row of ``X`` (and the matching entry of ``y``) that holds an outlier.

    A row is an outlier when the absolute column-wise z-score of any of its
    values is greater than or equal to ``threshold``.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix; z-scores are computed per column.
    y : numpy.ndarray
        Targets, indexed with the same boolean row mask as ``X``.
    threshold : float, default 3
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    tuple
        ``(X, y)`` restricted to the retained rows. The shape of the filtered
        ``X`` is printed as a side effect.
    """
    z_scores = np.abs(stats.zscore(X))
    # A zero-variance column produces NaN z-scores. NaN compares False here, so
    # such a column never flags a row (testing ``z < threshold`` instead would
    # discard every row of the dataset).
    with np.errstate(invalid="ignore"):
        has_outlier = (z_scores >= threshold).any(axis=1)
    keep = ~has_outlier
    X = X[keep]
    y = y[keep]
    print(X.shape)
    return X, y



In [5]:
def clf_score(clf, X_test, y_test):
    """Score the fitted model ``clf`` on ``(X_test, y_test)``.

    Returns a list ``[mae, mse, r2]``: mean absolute error, mean squared
    error and R^2 of ``clf.predict(X_test)`` against ``y_test``.
    """
    predictions = clf.predict(X_test)
    metric_functions = (mean_absolute_error, mean_squared_error, r2_score)
    return [metric(y_test, predictions) for metric in metric_functions]

In [6]:
def show_statics(arr, type):
    """Summarise per-fold metrics as (mean, std) pairs.

    ``arr`` is a sequence of rows whose first three entries are read
    (positions 0, 1 and 2 - the order produced by ``clf_score`` is
    ``[mae, mse, r2]``). ``type`` is a label appended unchanged.

    Returns ``[(mean0, std0), (mean1, std1), (mean2, std2), type]``.
    """

    def _mean_and_std(position):
        # Collect one metric across all folds, then aggregate it.
        column = [row[position] for row in arr]
        return np.mean(column), np.std(column)

    return [_mean_and_std(0), _mean_and_std(1), _mean_and_std(2), type]
    
def make_sring(a):
    """Format a ``show_statics`` result as one line of text.

    ``a`` is ``[(mean, std), (mean, std), (mean, std), label]``. The first
    pair is rounded to 2 decimals, the second to 0 decimals, and the third
    is multiplied by 100 and rounded to 2 decimals. Each pair is rendered as
    ``"<mean> +-<std>"``; the pieces are joined by single spaces after the
    label and the string ends with a trailing space.
    """

    def _pair(mean, std, digits):
        return str(round(mean, digits)) + ' +-' + str(round(std, digits))

    first = _pair(a[0][0], a[0][1], 2)
    second = _pair(a[1][0], a[1][1], 0)
    third = _pair(a[2][0] * 100, a[2][1] * 100, 2)
    return a[3] + ' ' + first + ' ' + second + ' ' + third + ' '
 

In [7]:
def train_clf(clf, X_tr, y_tr, X_test, y_test, print_ind=False, to_string=True, cv=None):
    """Cross-validate ``clf`` and summarise train / validation / test metrics.

    For every split the model is refitted on the training fold and scored
    with ``clf_score`` on the training fold, the validation fold and the
    fixed ``(X_test, y_test)`` set. Per-fold scores are then aggregated with
    ``show_statics``.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit`` and ``predict``.
    X_tr, y_tr : numpy.ndarray
        Data to split; both are indexed with the integer index arrays
        yielded by the splitter. ``y_tr`` is flattened with ``ravel`` before
        fitting.
    X_test, y_test : array-like
        Hold-out set, scored once per fold.
    print_ind : bool, default False
        When true, print the individual scores of every fold.
    to_string : bool, default True
        When true, return each summary formatted by ``make_sring``;
        otherwise return the raw ``show_statics`` lists.
    cv : splitter or None, default None
        Object exposing ``split(X)``. When ``None`` the module-level ``kf``
        is used, which must already be defined when this function is called.

    Returns
    -------
    tuple
        ``(train_summary, valid_summary, test_summary)``.
    """
    # Fall back to the notebook-level splitter so existing callers keep working.
    splitter = kf if cv is None else cv
    metrics = {'train': [], 'valid': [], 'test': []}

    for train_index, valid_index in splitter.split(X_tr):
        X_train, y_train = X_tr[train_index], y_tr[train_index]
        X_valid, y_valid = X_tr[valid_index], y_tr[valid_index]

        clf.fit(X_train, y_train.ravel())

        evaluation_sets = (
            ('train', 'Train: ', X_train, y_train),
            ('valid', 'Valid: ', X_valid, y_valid),
            ('test', 'Test: ', X_test, y_test),
        )
        for key, label, X_part, y_part in evaluation_sets:
            res = clf_score(clf, X_part, y_part)
            if print_ind:
                print(label, res)
            metrics[key].append(res)

    summaries = tuple(
        show_statics(metrics[key], key) for key in ('train', 'valid', 'test')
    )
    if to_string:
        return tuple(make_sring(summary) for summary in summaries)
    return summaries


In [8]:
def apply_model(model, X_train, y_train, X_test, y_test, to_string=True):
    """Cross-validate ``model`` through ``train_clf`` without per-fold printing.

    ``y_train`` is reshaped to a single column before being handed over.
    Returns the three values produced by ``train_clf`` - the train,
    validation and test summaries - as a tuple.
    """
    y_column = y_train.reshape(-1, 1)
    train_summary, valid_summary, test_summary = train_clf(
        model, X_train, y_column, X_test, y_test, False, to_string
    )
    return train_summary, valid_summary, test_summary

In [9]:
def read_csv(path, target, remove_outliers=True, log=True):
    """Load a CSV file and prepare standardised train/test splits.

    Steps, in order:

    1. Read ``path`` and integer-encode the ``Region``, ``Industry`` and
       ``Employee _size`` columns with ``pd.factorize``.
    2. Separate the ``target`` column (``y``) from the remaining columns (``X``).
    3. Optionally drop outlier rows with ``remove_outliers_zscore``.
    4. Optionally apply ``log1p`` to every column of ``X`` from position 5
       onward; a column with a negative minimum is first shifted so that its
       minimum becomes slightly positive.
    5. Split 80/20 (shuffled, ``random_state=10``) and standardise with a
       ``StandardScaler`` fitted on the training part only.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    target : str
        Name of the target column.
    remove_outliers : bool, default True
        Whether to run step 3.
    log : bool, default True
        Whether to run step 4.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X, y)`` where ``X`` and ``y`` are
        the full, unscaled arrays as they were just before the split.
    """
    df = pd.read_csv(path)
    for column in ('Region', 'Industry', 'Employee _size'):
        df[column] = pd.factorize(df[column])[0]

    y = df[target].to_numpy()
    X = df.drop(target, axis=1).to_numpy()

    if remove_outliers:
        X, y = remove_outliers_zscore(X, y)
    if log:
        # Work on floats: assigning log1p results into an integer array would
        # silently truncate them.
        X = X.astype(float)
        for i in range(5, X.shape[1]):
            min_value = np.min(X[:, i])
            if min_value < 0:
                X[:, i] = X[:, i] + abs(min_value) + 0.0000001
            X[:, i] = np.log1p(X[:, i])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=10, test_size=0.2, shuffle=True
    )
    X_train, X_test = normalisation(X_train, X_test, StandardScaler())

    return X_train, X_test, y_train, y_test, X, y


## Preparation

### Parameters

In [10]:
# Input CSV files, one per imputed variant of the cleaned dataset
# (presumably KNN-, EM- and a third "ct" imputation - confirm with the
# data-preparation notebook).
imputed_file_knn = "Cleaned_data_knn_imputed_df.csv"
imputed_file_em = "Cleaned_data_em_imputed_df.csv"
imputed_file_ct = "Cleaned_data_imputed_df.csv"

# Column labels of the KNN file and the name of the regression target.
df_features = pd.read_csv(imputed_file_knn).columns
df_target = "Turnover"

# Shared 10-fold shuffled splitter (fixed seed) and a standard scaler.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
norm = StandardScaler()

### Reading files

In [11]:
# Load each imputed dataset without outlier removal or log transform.
# Every call yields the scaled train/test split plus the full unscaled X and y.
(X_train_knn, X_test_knn, y_train_knn, y_test_knn, X_knn, y_knn) = read_csv(
    imputed_file_knn, df_target, remove_outliers=False, log=False
)
(X_train_em, X_test_em, y_train_em, y_test_em, X_em, y_em) = read_csv(
    imputed_file_em, df_target, remove_outliers=False, log=False
)
(X_train_ct, X_test_ct, y_train_ct, y_test_ct, X_ct, y_ct) = read_csv(
    imputed_file_ct, df_target, remove_outliers=False, log=False
)

# Applying machine learning models


## Domain Knowledge

In [12]:

# Column labels with "Turnover" moved to the end, matching the layout of the
# arrays assembled below (features first, target as the last column).
df_features = pd.read_csv(imputed_file_knn).columns.to_list()
df_features.remove("Turnover")
df_features.append("Turnover")

# Rebuild one labelled DataFrame per dataset from the unscaled X and y.
df_knn = pd.DataFrame(
    np.hstack((X_knn, y_knn.reshape(-1, 1))), columns=df_features
)
df_em = pd.DataFrame(
    np.hstack((X_em, y_em.reshape(-1, 1))), columns=df_features
)
df_ct = pd.DataFrame(
    np.hstack((X_ct, y_ct.reshape(-1, 1))), columns=df_features
)


In [13]:
def domain_knowledge(numerical_df, threshold=0.8):
    """List the columns that are highly correlated with a later column.

    The correlation matrix of ``numerical_df`` is computed with
    ``DataFrame.corr`` and every pair of distinct columns whose correlation
    is strictly greater than ``threshold`` is inspected. For each such pair
    the column that appears first in the matrix is marked for removal.
    Only positive correlations exceed the threshold; strongly negative ones
    are not reported.

    Parameters
    ----------
    numerical_df : pandas.DataFrame
        Data whose columns are compared.
    threshold : float, default 0.8
        Correlation above which a pair counts as highly correlated.

    Returns
    -------
    list
        Unique column labels to drop, in order of first appearance in the
        correlation matrix (deterministic across runs).
    """
    correlation_matrix = numerical_df.corr()

    # Only the strict upper triangle is needed: it visits each unordered pair
    # once and skips the diagonal of self-correlations.
    above_threshold = np.triu(correlation_matrix.to_numpy() > threshold, k=1)
    first_positions, _ = np.where(above_threshold)

    # np.where reports hits in row-major order; dict.fromkeys removes
    # duplicates while keeping that order.
    return list(
        dict.fromkeys(correlation_matrix.index[pos] for pos in first_positions)
    )



In [16]:
def remove_columns(df, X_train, X_test, threshold=0.8):
    """Remove highly correlated feature columns from the train and test inputs.

    The columns to remove are selected by ``domain_knowledge(df, threshold)``.
    Each selected name is converted to a column position through the
    module-level ``df_features`` list, so the columns of ``X_train`` and
    ``X_test`` are assumed to follow the order of ``df_features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled data used to compute the correlations.
    X_train, X_test : array-like
        2-D inputs from which the same column positions are removed.
    threshold : float, default 0.8
        Correlation threshold forwarded to ``domain_knowledge``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(new_X_train, new_X_test)`` without the selected columns.
    """
    # Forward the caller's threshold rather than a hard-coded value.
    drop_names = domain_knowledge(df, threshold=threshold)
    drop_positions = np.asarray(
        [df_features.index(name) for name in drop_names], dtype=int
    )

    new_X_train = np.delete(np.asarray(X_train), drop_positions, axis=1)
    new_X_test = np.delete(np.asarray(X_test), drop_positions, axis=1)
    return new_X_train, new_X_test






# Drop the highly correlated columns (threshold 0.8) from every dataset.
new_X_train_knn, new_X_test_knn = remove_columns(
    df_knn, X_train_knn, X_test_knn, threshold=0.8
)
new_X_train_em, new_X_test_em = remove_columns(
    df_em, X_train_em, X_test_em, threshold=0.8
)
new_X_train_ct, new_X_test_ct = remove_columns(
    df_ct, X_train_ct, X_test_ct, threshold=0.8
)

In [15]:
# Regressors to compare; each one is cross-validated on all three datasets.
models = [
    LinearRegression(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    SVR(),
    MLPRegressor(
        alpha=1e-5,
        max_iter=5000,
        hidden_layer_sizes=(32, 32),
        random_state=1,
    ),
]

# (printed label, train inputs, train targets, test inputs, test targets)
datasets = (
    ('KNN : ', new_X_train_knn, y_train_knn, new_X_test_knn, y_test_knn),
    ('EM  : ', new_X_train_em, y_train_em, new_X_test_em, y_test_em),
    ('CT  : ', new_X_train_ct, y_train_ct, new_X_test_ct, y_test_ct),
)

for model in models:
    print(model)
    for label, X_tr_part, y_tr_part, X_te_part, y_te_part in datasets:
        print(label, apply_model(model, X_tr_part, y_tr_part, X_te_part, y_te_part))

 

LinearRegression()
KNN :  ('train 6226.67 +-135.08 388360546.0 +-19438550.0 7.44 +-0.27 ', 'valid 6235.17 +-552.87 389860988.0 +-175293359.0 8.0 +-3.22 ', 'test 6117.95 +-67.63 361698524.0 +-127812.0 6.9 +-0.03 ')
EM  :  ('train 6224.65 +-133.71 389488893.0 +-19417232.0 7.17 +-0.26 ', 'valid 6232.88 +-571.24 391093178.0 +-175137966.0 7.65 +-3.34 ', 'test 6139.93 +-60.92 362552967.0 +-230331.0 6.68 +-0.06 ')
CT  :  ('train 6224.65 +-133.71 389488893.0 +-19417232.0 7.17 +-0.26 ', 'valid 6232.88 +-571.24 391093178.0 +-175137966.0 7.65 +-3.34 ', 'test 6139.93 +-60.92 362552967.0 +-230331.0 6.68 +-0.06 ')
GradientBoostingRegressor()
KNN :  ('train 2301.06 +-60.68 43082502.0 +-1970272.0 89.71 +-0.67 ', 'valid 2440.53 +-216.21 53987104.0 +-17434614.0 84.88 +-6.73 ', 'test 2277.99 +-50.53 38273017.0 +-1484430.0 90.15 +-0.38 ')
EM  :  ('train 2259.9 +-26.47 40070888.0 +-1215588.0 90.43 +-0.51 ', 'valid 2409.84 +-233.59 53075624.0 +-19919291.0 85.56 +-6.08 ', 'test 2261.17 +-28.46 37006367.0 +-2



KNN :  ('train 3333.8 +-155.22 92202532.0 +-15237141.0 77.86 +-4.47 ', 'valid 3734.13 +-395.49 132704947.0 +-56657500.0 65.49 +-13.24 ', 'test 3592.04 +-149.41 107636958.0 +-20019205.0 72.29 +-5.15 ')
EM  :  ('train 3562.91 +-785.63 132486361.0 +-84931769.0 68.27 +-20.51 ', 'valid 3863.7 +-707.46 167999814.0 +-97888145.0 57.0 +-21.06 ', 'test 3875.64 +-653.66 163052802.0 +-66739396.0 58.03 +-17.18 ')
CT  :  ('train 3562.44 +-785.23 133018570.0 +-84689135.0 68.13 +-20.46 ', 'valid 3848.04 +-713.7 164907343.0 +-98590969.0 57.65 +-21.59 ', 'test 3877.46 +-653.0 163873177.0 +-66482200.0 57.82 +-17.11 ')
