In [98]:
from sklearn.base import BaseEstimator, TransformerMixin

In [99]:
class remove_one_valued_col(BaseEstimator,TransformerMixin):
    """Drop columns that contain exactly one distinct value.

    The columns to drop are learned in ``fit`` and re-applied in
    ``transform``, so every frame passed through a fitted instance loses
    the same columns. ``transform`` never modifies the frame it is given.
    """

    def fit(self, X, y=None):
        """Record which columns of ``X`` hold a single distinct value.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame used to decide which columns are constant.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.constant_columns_ = self._single_valued(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the single-valued columns.

        When the instance has not been fitted, the columns are derived
        from ``X`` itself, which matches the previous stateless behaviour.

        Parameters
        ----------
        X : pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
            A new frame; ``X`` is left untouched.
        """
        to_drop = getattr(self, "constant_columns_", None)
        if to_drop is None:
            to_drop = self._single_valued(X)
        # errors="ignore": a learned column may legitimately be absent
        # from a later frame; that must not abort the transform.
        return X.drop(columns=to_drop, errors="ignore")

    @staticmethod
    def _single_valued(X):
        """List the column labels of ``X`` whose ``nunique()`` equals 1."""
        distinct = X.nunique()
        return distinct[distinct == 1].index.tolist()

class mapping(BaseEstimator,TransformerMixin):
    """Encode the Yes/No and marital-status columns and drop two columns.

    ``Attrition`` and ``OverTime`` become 1/0, ``MaritalStatus`` becomes
    0 / 0.5 / 1, and ``EmployeeNumber`` and ``JobLevel`` are removed.
    The input frame is never modified, so the same raw frame can be
    passed through the transformer more than once.
    """

    _YES_NO = {'Yes': 1, 'No': 0}
    _MARITAL_STATUS = {'Single': 0, 'Married': 1, 'Divorced': 0.5}
    _DROPPED = ["EmployeeNumber", "JobLevel"]

    def fit(self, X, y=None):
        """Do nothing; the encoding is fixed. Returns ``self``."""
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``Attrition``, ``OverTime``, ``MaritalStatus``,
            ``EmployeeNumber`` and ``JobLevel``.

        Returns
        -------
        pandas.DataFrame
            A new frame with the three columns encoded in their original
            positions and the two dropped columns removed.

        Raises
        ------
        KeyError
            If any of the required columns is missing.

        Notes
        -----
        Values that are not keys of the encoding tables become NaN, as
        ``Series.map`` does for unmapped values.
        """
        # Non-inplace drop gives a fresh frame, so the caller's data is
        # never touched by the assignments below.
        trimmed = X.drop(columns=self._DROPPED)
        return trimmed.assign(
            Attrition=X['Attrition'].map(self._YES_NO),
            OverTime=X['OverTime'].map(self._YES_NO),
            MaritalStatus=X['MaritalStatus'].map(self._MARITAL_STATUS),
        )

class one_hot_encoder(BaseEstimator,TransformerMixin):
    """Stateless wrapper that one-hot encodes a frame via pandas."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` for pipeline chaining."""
        return self

    def transform(self, X):
        """Return ``pandas.get_dummies(X)`` called with default arguments."""
        # Imported here so the class does not depend on a module-level
        # pandas import being present.
        from pandas import get_dummies
        return get_dummies(X)

class normalization(BaseEstimator,TransformerMixin):
    """Min-max scale every column of a DataFrame, keeping labels and index.

    The scaler is fitted in ``fit`` and reused by ``transform``, so later
    frames are scaled with the ranges learned at fit time.
    """

    def fit(self, X, y=None):
        """Fit a ``MinMaxScaler`` on the values of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose values can all be converted to numbers.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        from sklearn.preprocessing import MinMaxScaler
        self.scaler_ = MinMaxScaler().fit(X.values)
        return self

    def transform(self, X):
        """Return a scaled copy of ``X`` with the same columns and index.

        When the instance has not been fitted, a fresh scaler is fitted on
        ``X`` itself, which matches the previous stateless behaviour.

        Parameters
        ----------
        X : pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
        """
        # Local import: the previous version relied on a module-level
        # `pd`, which raises NameError wherever that global is absent.
        import pandas as pd
        from sklearn.preprocessing import MinMaxScaler

        fitted = getattr(self, "scaler_", None)
        if fitted is None:
            scaled = MinMaxScaler().fit_transform(X.values)
        else:
            scaled = fitted.transform(X.values)
        return pd.DataFrame(scaled, columns=X.columns, index=X.index)

class train_test_valid_split(BaseEstimator,TransformerMixin):
    """Final pipeline step: split a frame into train / test / validation.

    Unlike a regular transformer, ``transform`` returns a tuple of six
    objects rather than a single frame.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` for pipeline chaining."""
        return self

    def transform(self, X):
        """Split ``X`` into features and the ``Attrition`` target, twice.

        Returns
        -------
        tuple
            ``(X_train, y_train, X_test, y_test, X_validation,
            y_validation)`` in exactly that order.
        """
        from sklearn.model_selection import train_test_split

        seed = 12345
        features = X.drop('Attrition', axis=1)
        target = X["Attrition"]

        # First cut: hold out 15% of all rows as the test set.
        X_rest, X_test, y_rest, y_test = train_test_split(
            features, target, test_size=0.15, random_state=seed
        )
        # Second cut: 17.6% of the remaining 85% is roughly 15% of the
        # full data, which becomes the validation set.
        X_train, X_validation, y_train, y_validation = train_test_split(
            X_rest, y_rest, test_size=0.176, random_state=seed
        )
        return X_train, y_train, X_test, y_test, X_validation, y_validation

                

In [105]:
from sklearn.pipeline import Pipeline

# Preprocessing chain, executed in the order listed. The last step returns
# a tuple of data splits instead of a single frame, so it has to stay last.
pipe = Pipeline(steps=[
    ("one valued column remover", remove_one_valued_col()),
    ("mapper", mapping()),
    ("one hot encoder", one_hot_encoder()),
    ("scaler", normalization()),
    ("Train-Test-Validation Splitter", train_test_valid_split()),
])

In [106]:
import pickle

# Persist the pipeline object. pickle stores custom classes by reference
# (module and class name), so whoever loads this file must be able to
# import or define the same transformer classes.
file_name = "preprocessing.pickle"
with open(file_name, 'wb') as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(pipe)

In [107]:
# Read the pipeline back from the file written above. Only unpickle files
# you produced yourself: unpickling can execute arbitrary code.
with open(file_name, 'rb') as handle:
    pipelinee = pickle.Unpickler(handle).load()

In [117]:
# Load the raw dataset; the relative path is resolved against the current
# working directory.
# NOTE(review): `pd` is not imported in the cells shown here. It is
# presumably pandas, imported in an earlier cell; confirm.
data = pd.read_csv("data.csv")

In [118]:
# Run the reloaded pipeline end to end on the raw frame. Its final step
# returns six objects, unpacked here in the order they are returned:
# train, test, then validation, each as a (features, target) pair.
X_train,y_train,X_test,y_test,X_validation,y_validation = pipelinee.fit_transform(data)