In [1]:
!pip install ReliefF
!pip install category_encoders



In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

from ReliefF import ReliefF

import category_encoders as ce

from scipy.stats import chi2_contingency

np.random.seed(1)

  import pandas.util.testing as tm


In [4]:
#https://www.kaggle.com/klmsathishkumar/predict-your-bmi-here
def ordinal_encoding(df,col,mapping):
    ordinal_encoder = ce.OrdinalEncoder(cols = [col],return_df = True,mapping = [{'col':col,'mapping':mapping}])
    df_final = ordinal_encoder.fit_transform(df)
    return df_final

# Import Data

In [5]:
data_employ = pd.read_csv('Datasets/employee/Employee.csv')
data_employ

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1
...,...,...,...,...,...,...,...,...,...
4648,Bachelors,2013,Bangalore,3,26,Female,No,4,0
4649,Masters,2013,Pune,2,37,Male,No,2,1
4650,Masters,2018,New Delhi,3,27,Male,No,5,1
4651,Bachelors,2012,Bangalore,3,30,Male,Yes,2,0


In [6]:
mapping_edu = {"Bachelors":0,"Masters":1, "PHD":2}
mapping_city = {"Bangalore":0,"Pune":1, "New Delhi":2}
mapping_gender = {"Male":0,"Female":1}
mapping_bench = {"No":0,"Yes":1}
data_employ["Education"] = ordinal_encoding(data_employ['Education'],"Education",mapping_edu)
data_employ["City"] = ordinal_encoding(data_employ['City'],"City",mapping_city)
data_employ["Gender"] = ordinal_encoding(data_employ['Gender'],"Gender",mapping_gender)
data_employ["EverBenched"] = ordinal_encoding(data_employ['EverBenched'],"EverBenched",mapping_bench)

In [7]:
data_employ

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,0,2017,0,3,34,0,0,0,0
1,0,2013,1,1,28,1,0,3,1
2,0,2014,2,3,38,1,0,2,0
3,1,2016,0,3,27,0,0,5,1
4,1,2017,1,3,24,0,1,2,1
...,...,...,...,...,...,...,...,...,...
4648,0,2013,0,3,26,1,0,4,0
4649,1,2013,1,2,37,0,0,2,1
4650,1,2018,2,3,27,0,0,5,1
4651,0,2012,0,3,30,0,1,2,0


##CHI2

In [8]:
#Aqui solo aparece la ultima comparacion, pero esta se hizo con todas las columnas de la tabla
chi_employ = data_employ
ct = pd.crosstab(data_employ.Education,data_employ.LeaveOrNot, margins=True)
ct

LeaveOrNot,0,1,All
Education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2472,1129,3601
1,447,426,873
2,134,45,179
All,3053,1600,4653


In [9]:
obs = np.array([ct.iloc[0][0:5].values,
                  ct.iloc[1][0:5].values])
obs

array([[2472, 1129, 3601],
       [ 447,  426,  873]])

In [10]:
#Si el 2do valor es menor que 0.05, se puede quedar la columna
chi2_contingency(obs)[0:3]

(94.29817185031831, 3.3374299265503668e-21, 2)

In [11]:
chi_employ = chi_employ.drop(columns = ['LeaveOrNot'])
chi_employ = chi_employ.drop(columns = ['City'])
chi_employ = chi_employ.drop(columns = ['Age'])
chi_employ = chi_employ.drop(columns = ['ExperienceInCurrentDomain'])
chi_employ

Unnamed: 0,Education,JoiningYear,PaymentTier,Gender,EverBenched
0,0,2017,3,0,0
1,0,2013,1,1,0
2,0,2014,3,1,0
3,1,2016,3,0,0
4,1,2017,3,0,1
...,...,...,...,...,...
4648,0,2013,3,1,0
4649,1,2013,2,0,0
4650,1,2018,3,0,0
4651,0,2012,3,0,1


##RELIEF

In [12]:
y = np.array(data_employ['LeaveOrNot'])

data_employ =  np.array(data_employ.drop(columns = ['LeaveOrNot']))


fs = ReliefF(n_neighbors=5, n_features_to_keep=5)

relief_employ = fs.fit_transform(data_employ, y)
print(relief_employ)

[[0 3 0 0 0]
 [0 1 1 0 1]
 [0 3 1 0 2]
 ...
 [1 3 0 0 2]
 [0 3 0 1 0]
 [0 3 0 1 0]]


In [13]:
relief_employ = pd.DataFrame(relief_employ)
relief_employ

Unnamed: 0,0,1,2,3,4
0,0,3,0,0,0
1,0,1,1,0,1
2,0,3,1,0,2
3,1,3,0,0,0
4,1,3,0,1,1
...,...,...,...,...,...
4648,0,3,1,0,0
4649,1,2,0,0,1
4650,1,3,0,0,2
4651,0,3,0,1,0


# Preprocess

In [14]:
scaler = StandardScaler()
relief_employ = scaler.fit_transform(relief_employ)
chi_employ = scaler.fit_transform(chi_employ)

#Split Data

In [15]:
X_train_relief, X_test_relief, y_train_relief, y_test_relief = train_test_split(relief_employ, y, test_size=0.33, random_state=0)
X_train_chi, X_test_chi, y_train_chi, y_test_chi = train_test_split(chi_employ, y, test_size=0.33, random_state=0)

# Models

In [16]:
discriminative = LogisticRegression(random_state = 0)
generative = GaussianNB()
    
discriminative.fit(X_train_relief, y_train_relief)
generative.fit(X_train_relief, y_train_relief)
    
dis_relief = discriminative.score(X_test_relief, y_test_relief)
gen_relief = generative.score(X_test_relief, y_test_relief)

In [17]:
discriminative = LogisticRegression(random_state = 0)
generative = GaussianNB()
    
discriminative.fit(X_train_chi, y_train_chi)
generative.fit(X_train_chi, y_train_chi)
    
dis_chi = discriminative.score(X_test_chi, y_test_chi)
gen_chi = generative.score(X_test_chi, y_test_chi)

In [18]:
print (f'Dis_relief: {dis_relief}\nDis_chi: {dis_chi}\nGen_relief: {gen_relief}\nGen_chi: {gen_chi}\n')

Dis_relief: 0.70703125
Dis_chi: 0.6979166666666666
Gen_relief: 0.6712239583333334
Gen_chi: 0.6653645833333334

