In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

import pickle


In [2]:
class FeatureEncoder(BaseEstimator, TransformerMixin):
    """Integer-encode every object-dtype column of a DataFrame.

    The category-to-code mapping is learned once in ``fit`` (one
    ``LabelEncoder`` per object-dtype column) and only *applied* in
    ``transform``. Frames transformed after fitting therefore receive the
    same codes as the frame the encoder was fitted on, instead of a fresh
    mapping derived from whatever categories happen to be present.
    """

    def fit(self, X, y=None):
        """Learn one label encoder per object-dtype column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose object-dtype columns define the categories.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        FeatureEncoder
            This instance, with the learned encoders in ``encoders_``.
        """
        self.encoders_ = self._learn_encoders(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the encoded columns replaced by integer codes.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to encode; it is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which every column learned during ``fit`` holds
            integer codes. Other columns are passed through unchanged.

        Raises
        ------
        ValueError
            Raised by ``LabelEncoder.transform`` when a column contains a
            label that was not present during ``fit``.
        """
        encoders = getattr(self, "encoders_", None)
        if encoders is None:
            # Instance was never fitted (or was unpickled from an older
            # version without ``encoders_``): keep the previous behaviour of
            # deriving the mapping from the frame being transformed.
            encoders = self._learn_encoders(X)

        label_encoded_df = X.copy()
        for col, encoder in encoders.items():
            label_encoded_df[col] = encoder.transform(label_encoded_df[col])
        return label_encoded_df

    @staticmethod
    def _learn_encoders(X):
        """Fit and return a ``{column: LabelEncoder}`` dict for the object columns of ``X``."""
        object_columns = X.select_dtypes(include=['object']).columns
        return {
            col: preprocessing.LabelEncoder().fit(X[col])
            for col in object_columns
        }
            
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns from a DataFrame."""

    def fit(self, X, y=None):
        """Nothing is learned; return the instance so it can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` without 'StandardHours', 'EmployeeCount' and 'Over18'.

        A new frame is returned; ``X`` itself is not modified. A ``KeyError``
        is raised by pandas if any of the three columns is missing.
        """
        # NOTE(review): presumably these columns carry no information for
        # the model; confirm against the raw data.
        return X.drop(columns=['StandardHours', 'EmployeeCount', 'Over18'])

class shuffle(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns the rows of a DataFrame in a fixed random order."""

    def fit(self, X, y=None):
        """Nothing is learned; return the instance so it can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return all rows of ``X`` shuffled with a fixed seed and a fresh 0..n-1 index.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to shuffle; it is not modified.

        Returns
        -------
        pandas.DataFrame
            The same rows and columns as ``X`` in shuffled order.
        """
        shuffled = X.sample(frac=1, random_state=12345)
        # drop=True discards the old row labels. Without it, reset_index()
        # inserts them as a new 'index' column, which would then travel
        # downstream as if it were a genuine feature.
        return shuffled.reset_index(drop=True)
    
# Preprocessing chain applied to the raw frame, in order:
# encode object columns -> drop unused columns -> shuffle the rows.
pipe = Pipeline(
    steps=[
        ("encoder", FeatureEncoder()),
        ("selector", FeatureSelector()),
        ("shuffler", shuffle()),
    ]
)

class data_preparing:
    """Bundle a raw DataFrame with the pipeline that prepares it for modelling.

    Parameters
    ----------
    x : pandas.DataFrame
        Raw data; must contain an 'Attrition' column after the pipeline ran.
    pipe : object
        Any object exposing ``fit_transform(frame)`` that returns a DataFrame
        (for example a scikit-learn ``Pipeline``).
    """

    def __init__(self, x, pipe):
        self.x = x
        self.pipe = pipe

    def encode_select(self):
        """Run the stored pipeline on the stored frame and split off the target.

        Returns
        -------
        tuple
            ``(x, y)`` where ``y`` is the 'Attrition' column of the
            transformed frame and ``x`` is every other column.
        """
        # Use the pipeline handed to the constructor. Relying on a
        # module-level ``pipe`` silently ignored the injected one and raised
        # NameError when no such global existed.
        prepared = self.pipe.fit_transform(self.x)
        y = prepared.loc[:, 'Attrition']
        x = prepared.drop('Attrition', axis=1)
        return x, y

    def scale(self, x):
        """Standardise ``x`` with a freshly fitted ``StandardScaler``.

        The scaler is fitted on exactly the data passed in and is not kept,
        so the statistics it used cannot be re-applied to other data later.

        Returns
        -------
        numpy.ndarray
            The scaled values.
        """
        scaler = preprocessing.StandardScaler()
        return scaler.fit_transform(x)
            
     

In [3]:
# Single seed shared by the stochastic steps of this notebook.
SEED = 12345

In [4]:
# Load the preprocessed data object saved earlier; the cells below read its
# `.x` (features) and `.y` (target) attributes.
# SECURITY: pickle.load can execute arbitrary code while deserialising, so
# only open a pickle file that you created yourself or otherwise trust.
# NOTE(review): presumably the custom classes defined above must exist under
# the same names for this unpickling to succeed; confirm.
with open('preprocessing.pickle', 'rb') as f:
    data=pickle.load(f)

In [5]:
# Target proportions of the full data set: 80% train, 10% validation, 10% test.
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1

# First split: keep 80% for training and hold out the remaining 20%.
# random_state=SEED makes the partition identical on every run; without it
# each execution of this cell produced a different split.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    data.x,
    data.y,
    test_size=1 - train_ratio,
    random_state=SEED,
)

# Second split: divide the hold-out part so that validation and test each
# end up with 10% of the full data set.
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout,
    y_holdout,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=SEED,
)

print(len(x_train), len(x_val), len(x_test))

1176 147 147


In [6]:
# Alternative classifier, currently disabled: model_svm = svm.SVC(kernel='linear')

In [7]:
# Random forest with scikit-learn's default hyper-parameters. The forest is
# itself randomised, so it is seeded with the notebook-wide SEED; otherwise
# the cross-validation scores below change on every run even though the
# fold assignment is fixed.
model_rf = RandomForestClassifier(random_state=SEED)

In [8]:
# Five-fold splitter; rows are shuffled before folding and the fixed seed
# keeps the fold membership the same between runs.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=SEED,
)

In [9]:
# Stack the training and validation partitions row-wise into one set for
# cross-validation; the test partition is not included.
x = np.append(arr=x_train, values=x_val, axis=0)
y = np.append(arr=y_train, values=y_val, axis=0)

# Sanity check: the combined row count should equal train + validation.
print(x.shape, y.shape)
print(x_train.shape)
print(x_val.shape)

(1323, 32) (1323,)
(1176, 32)
(147, 32)


In [10]:
# Cross-validate the random forest on the combined train+validation data,
# collecting four metrics per fold.
# NOTE(review): "f1", "precision" and "recall" presumably score the class
# encoded as 1; confirm the label encoding of the target.
scores = cross_validate(
    model_rf,
    x,
    y,
    scoring=["accuracy", "f1", "precision", "recall"],
    cv=cv,
)

In [11]:
# Show the raw result dict: per-fold fit/score times plus one
# 'test_<metric>' array for each requested scoring metric.
print(scores)

{'fit_time': array([0.12798691, 0.11199284, 0.11563802, 0.11419106, 0.11332798]), 'score_time': array([0.00539589, 0.00515819, 0.00538182, 0.00513577, 0.00514913]), 'test_accuracy': array([0.84150943, 0.86792453, 0.86037736, 0.8219697 , 0.85606061]), 'test_f1': array([0.16      , 0.18604651, 0.2745098 , 0.1754386 , 0.24      ]), 'test_precision': array([0.57142857, 1.        , 0.875     , 0.55555556, 0.54545455]), 'test_recall': array([0.09302326, 0.1025641 , 0.1627907 , 0.10416667, 0.15384615])}
