In [116]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [117]:
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder

In [118]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, univariate_selection, RFE
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, RepeatedStratifiedKFold,\
StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix 
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.over_sampling import ADASYN

In [119]:
df = pd.read_csv('ibm.csv')
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [120]:
df['Attrition']=pd.Series(np.where(df.Attrition.values == 'Yes', 1, 0))

In [121]:
df = df.drop(['EmployeeCount','StandardHours'], axis=1).copy()
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,...,3,1,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,2,3,...,4,4,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,4,4,...,3,2,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,5,4,...,3,3,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,7,1,...,3,4,1,6,3,3,2,2,2,2


In [122]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [123]:
df_coba=df[['Attrition','OverTime','TotalWorkingYears','JobLevel','MonthlyIncome','MaritalStatus','YearsWithCurrManager','StockOptionLevel','BusinessTravel','JobRole','JobInvolvement','JobSatisfaction','EnvironmentSatisfaction']].copy()

# Data Splitting

In [124]:
X = df.drop('Attrition', axis=1)
y = df['Attrition']

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=.2, stratify=y, random_state=2021) #check random_state

# Modelling

In [125]:
knn = KNeighborsClassifier(n_neighbors=3)
sm = ADASYN()

In [126]:
one_hot_cols=['BusinessTravel','Department','EducationField','Gender','JobRole','OverTime']

In [127]:
knn_pipe_pca1 = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=1))
])

knn_pipe_pca2 = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2))
])

knn_pipe_num_im = Pipeline([
    ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('scaler', StandardScaler())
])

knn_pipe_cat_im = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('onehot', OneHotEncoder(drop='first'))
])

knn_pipe_cat = Pipeline([
    ('onehot', OneHotEncoder(drop='first')),
])

#transformer
knn_transformer = ColumnTransformer([
    ##('pipe_num_im', knn_pipe_num_im, ['DailyRate', 'HourlyRate']),
    ##('pipe_cat_im', knn_pipe_cat_im, ['MaritalStatus']),
    ('pipe_cat', knn_pipe_cat, one_hot_cols),
    ##('pipe_pca1',knn_pipe_pca1,['MonthlyIncome','JobLevel']),
    ##('pipe_pca2',knn_pipe_pca1,['YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager'])
])

#dtc
knn_pipe_combine = Pipeline([
    ('transformer', knn_transformer),
    ('resampling', sm),
    ('knn', knn)
])

# CV Score

In [128]:
skfold = StratifiedKFold(n_splits = 5, random_state=2021,shuffle=True)

In [129]:
knn_bench_cv = cross_val_score(knn_pipe_combine, X_train_val, y_train_val, cv = skfold, scoring='recall')

In [130]:
knn_bench_cv

array([0.26315789, 0.31578947, 0.34210526, 0.21052632, 0.34210526])

# Feature Selection

In [131]:
df_coba.head()

Unnamed: 0,Attrition,OverTime,TotalWorkingYears,JobLevel,MonthlyIncome,MaritalStatus,YearsWithCurrManager,StockOptionLevel,BusinessTravel,JobRole,JobInvolvement,JobSatisfaction,EnvironmentSatisfaction
0,1,Yes,8,2,5993,Single,5,0,Travel_Rarely,Sales Executive,3,4,2
1,0,No,10,2,5130,Married,7,1,Travel_Frequently,Research Scientist,2,2,3
2,1,Yes,7,1,2090,Single,0,0,Travel_Rarely,Laboratory Technician,2,3,4
3,0,Yes,8,1,2909,Married,0,0,Travel_Frequently,Research Scientist,3,3,4
4,0,No,6,1,3468,Married,2,1,Travel_Rarely,Laboratory Technician,3,2,1


In [132]:
X_coba = df_coba.drop('Attrition', axis=1)
y_coba = df_coba['Attrition']

X_train_val, X_test, y_train_val, y_test = train_test_split(X_coba, y_coba, test_size=.2, stratify=y, random_state=2021) #check random_state

# KNN

In [133]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
labelencoder = LabelEncoder()
ordinalencoder = OrdinalEncoder()

In [134]:
knn_pipe_pca1 = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=1))
])

knn_pipe_pca2 = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2))
])

knn_pipe_num_im = Pipeline([
    ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('scaler', StandardScaler())
])

knn_pipe_cat_im = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('onehot', OneHotEncoder(drop='first'))
])

knn1_pipe_cat = Pipeline([
    ('ordinalencoder', ordinalencoder),
])

#transformer
knn1_transformer = ColumnTransformer([
    ##('pipe_num_im', knn_pipe_num_im, ['DailyRate', 'HourlyRate']),
    ##('pipe_cat_im', knn_pipe_cat_im, ['MaritalStatus']),
    ('pipe_cat', knn1_pipe_cat, ['OverTime','MaritalStatus','BusinessTravel','JobRole']),
    ##('pipe_pca1',knn_pipe_pca1,['MonthlyIncome','JobLevel']),
    ##('pipe_pca2',knn_pipe_pca1,['YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager'])
])

#dtc
knn1_pipe_combine = Pipeline([
    ('transformer', knn1_transformer),
    ('resampling', sm),
    ('knn', knn)
])

In [135]:
enc = OneHotEncoder(handle_unknown='ignore')
# passing bridge-types-cat column (label encoded values of bridge_types)
enc_df = pd.DataFrame(enc.fit_transform(df_coba[['OverTime','MaritalStatus','BusinessTravel','JobRole']]).toarray())
# merge with main df bridge_df on key values
df_coba1 = df_coba.join(enc_df)
df_coba1

Unnamed: 0,Attrition,OverTime,TotalWorkingYears,JobLevel,MonthlyIncome,MaritalStatus,YearsWithCurrManager,StockOptionLevel,BusinessTravel,JobRole,...,7,8,9,10,11,12,13,14,15,16
0,1,Yes,8,2,5993,Single,5,0,Travel_Rarely,Sales Executive,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,No,10,2,5130,Married,7,1,Travel_Frequently,Research Scientist,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,Yes,7,1,2090,Single,0,0,Travel_Rarely,Laboratory Technician,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,Yes,8,1,2909,Married,0,0,Travel_Frequently,Research Scientist,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,No,6,1,3468,Married,2,1,Travel_Rarely,Laboratory Technician,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,0,No,17,2,2571,Married,3,1,Travel_Frequently,Laboratory Technician,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1466,0,No,9,3,9991,Married,7,1,Travel_Rarely,Healthcare Representative,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1467,0,Yes,6,2,6142,Married,3,1,Travel_Rarely,Manufacturing Director,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1468,0,No,17,2,5390,Married,8,0,Travel_Frequently,Sales Executive,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [136]:
def makeOverSamplesADASYN(X,y):
 #input DataFrame
 #X →Independent Variable in DataFrame\
 #y →dependent Variable in Pandas DataFrame format
 from imblearn.over_sampling import ADASYN 
 sm = ADASYN()
 X1, y1 = sm.fit_sample(X, y)
 return(X1,y1)

In [137]:
knn_bench_cv_coba = cross_val_score(knn1_pipe_combine, X_train_val, y_train_val, cv = skfold, scoring='recall')

In [138]:
knn_bench_cv_coba

array([0.39473684, 0.42105263, 0.31578947, 0.18421053, 0.44736842])

In [139]:
df_coba['Attrition'].value_counts()

0    1233
1     237
Name: Attrition, dtype: int64

Ordinal Encoder

In [140]:
enc = OrdinalEncoder()
enc.fit(df_coba[['OverTime','MaritalStatus','BusinessTravel','JobRole']])
df_coba[['OverTime','MaritalStatus','BusinessTravel','JobRole']] = enc.transform(df[['OverTime','MaritalStatus','BusinessTravel','JobRole']])

In [141]:
df_coba

Unnamed: 0,Attrition,OverTime,TotalWorkingYears,JobLevel,MonthlyIncome,MaritalStatus,YearsWithCurrManager,StockOptionLevel,BusinessTravel,JobRole,JobInvolvement,JobSatisfaction,EnvironmentSatisfaction
0,1,1.0,8,2,5993,2.0,5,0,2.0,7.0,3,4,2
1,0,0.0,10,2,5130,1.0,7,1,1.0,6.0,2,2,3
2,1,1.0,7,1,2090,2.0,0,0,2.0,2.0,2,3,4
3,0,1.0,8,1,2909,1.0,0,0,1.0,6.0,3,3,4
4,0,0.0,6,1,3468,1.0,2,1,2.0,2.0,3,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,0,0.0,17,2,2571,1.0,3,1,1.0,2.0,4,4,3
1466,0,0.0,9,3,9991,1.0,7,1,2.0,0.0,2,1,4
1467,0,1.0,6,2,6142,1.0,3,1,2.0,4.0,4,2,2
1468,0,0.0,17,2,5390,1.0,8,0,1.0,7.0,2,2,4


In [142]:
X_coba = df_coba.drop('Attrition', axis=1)
y_coba = df_coba['Attrition']

X_train_val, X_test, y_train_val, y_test = train_test_split(X_coba, y_coba, test_size=.2, stratify=y, random_state=2021) #check random_state

In [143]:
XA,YA=makeOverSamplesADASYN(X_train_val,y_train_val)

In [144]:
XA

Unnamed: 0,OverTime,TotalWorkingYears,JobLevel,MonthlyIncome,MaritalStatus,YearsWithCurrManager,StockOptionLevel,BusinessTravel,JobRole,JobInvolvement,JobSatisfaction,EnvironmentSatisfaction
0,0.000000,10,1,2258,1.000000,7,1,2.000000,2.000000,2,3,4
1,0.000000,17,2,5237,1.000000,7,2,2.000000,0.000000,3,3,3
2,1.000000,21,4,17123,0.000000,2,2,2.000000,3.000000,2,2,3
3,0.000000,10,2,5228,1.000000,5,1,2.000000,4.000000,3,2,3
4,0.000000,10,1,3423,2.000000,7,0,2.000000,1.000000,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...
1935,0.000000,8,2,4239,0.118168,1,0,1.881832,7.000000,3,1,1
1936,0.116115,6,2,4692,1.116115,2,0,0.116115,7.000000,3,3,4
1937,1.000000,7,2,4602,1.337233,3,0,1.000000,7.000000,3,3,4
1938,0.666045,13,2,4556,2.000000,7,0,1.000000,6.666045,2,3,2


In [145]:
YA.value_counts()

0    986
1    954
Name: Attrition, dtype: int64

# Mulai Ulang

In [171]:
knn = KNeighborsClassifier(n_neighbors=5)

In [157]:
df_coba=df[['Attrition','OverTime','TotalWorkingYears','JobLevel','MonthlyIncome','MaritalStatus','YearsWithCurrManager','StockOptionLevel','BusinessTravel','JobRole','JobInvolvement','JobSatisfaction','EnvironmentSatisfaction']].copy()

In [158]:
df_coba.head()

Unnamed: 0,Attrition,OverTime,TotalWorkingYears,JobLevel,MonthlyIncome,MaritalStatus,YearsWithCurrManager,StockOptionLevel,BusinessTravel,JobRole,JobInvolvement,JobSatisfaction,EnvironmentSatisfaction
0,1,Yes,8,2,5993,Single,5,0,Travel_Rarely,Sales Executive,3,4,2
1,0,No,10,2,5130,Married,7,1,Travel_Frequently,Research Scientist,2,2,3
2,1,Yes,7,1,2090,Single,0,0,Travel_Rarely,Laboratory Technician,2,3,4
3,0,Yes,8,1,2909,Married,0,0,Travel_Frequently,Research Scientist,3,3,4
4,0,No,6,1,3468,Married,2,1,Travel_Rarely,Laboratory Technician,3,2,1


In [172]:
knn2_pipe_cat = Pipeline([
    ('ordinalencoder', ordinalencoder),
])

#transformer
knn2_transformer = ColumnTransformer([
    ('pipe_cat2', knn2_pipe_cat, ['OverTime','MaritalStatus','BusinessTravel','JobRole'])
])

#dtc
knn2_pipe_combine = Pipeline([
    ('transformer', knn2_transformer),
    ('resampling', sm),
    ('knn', knn)
])

In [173]:
X_coba1 = df_coba.drop('Attrition', axis=1)
y_coba1 = df_coba['Attrition']

X_train_val, X_test, y_train_val, y_test = train_test_split(X_coba1, y_coba1, test_size=.2, stratify=y, random_state=2021) #check random_state

In [174]:
knn_bench_cv_coba = cross_val_score(knn2_pipe_combine, X_train_val, y_train_val, cv = skfold, scoring='recall')

In [175]:
knn_bench_cv_coba

array([0.28947368, 0.39473684, 0.36842105, 0.34210526, 0.57894737])

In [176]:
knn_bench_cv_coba.mean()

0.39473684210526316

In [177]:
model_knn_default=knn2_pipe_combine.fit(X_train_val, y_train_val)
y_pred_knn_default= model_knn_default.predict(X_test) 

In [178]:
print(classification_report(y_test, y_pred_knn_default))

              precision    recall  f1-score   support

           0       0.89      0.70      0.78       247
           1       0.26      0.55      0.35        47

    accuracy                           0.68       294
   macro avg       0.58      0.63      0.57       294
weighted avg       0.79      0.68      0.72       294



# Coba Logit

In [166]:
logit = LogisticRegression(random_state=2021)
smote = SMOTE()

In [167]:
# combine all pipeline
logit_pipe_combine = Pipeline([
    ('transformer', knn2_transformer),
    ('logit', logit)
])

In [168]:
logit_bench_cv = cross_val_score(logit_pipe_combine, X_train_val, y_train_val, cv = skfold, scoring='recall')

In [169]:
logit_bench_cv

array([0.        , 0.02631579, 0.        , 0.        , 0.        ])

In [170]:
df_coba['Attrition'].value_counts()

0    1233
1     237
Name: Attrition, dtype: int64