# Supervised Learning

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [51]:
# Load the raw customer table. The path is relative, so the notebook has to be
# run from the directory that contains Travel.csv.
data = pd.read_csv('Travel.csv')
# Preview the first rows; as the last expression of the cell it is rendered
# with the rich DataFrame display.
data.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


## Splitting Data

In [52]:
from sklearn.model_selection import train_test_split
# Separate the target column (ProdTaken) from the predictor columns.
X = data.drop('ProdTaken', axis=1)
y = data['ProdTaken']
# Hold out 20% of the rows. stratify=y keeps the ProdTaken class proportions
# equal in both parts; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=42)

In [53]:
# Re-attach the target to each split so that features and label travel through
# the preprocessing steps as a single frame.
train = pd.concat([X_train, y_train.to_frame()], axis=1)
test = pd.concat([X_test, y_test.to_frame()], axis=1)

# Displayed as the cell output: (rows, columns) of both frames.
train.shape, test.shape

((3910, 20), (978, 20))

In [56]:
import pickle
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import SMOTE

class CustomeridDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the ``CustomerID`` column."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer so calls can be chained."""
        return self

    def transform(self, X):
        """Return a new frame equal to ``X`` without ``CustomerID``.

        ``DataFrame.drop`` is used without ``inplace``, so the input frame is
        left untouched. A frame lacking the column raises ``KeyError``.
        """
        without_id = X.drop(columns=['CustomerID'])
        return without_id

class DuplicatedDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes fully duplicated rows."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` with repeated rows removed, keeping the first occurrence.

        All columns take part in the comparison. The result is a new frame;
        the surviving rows keep their original index labels.
        """
        unique_rows = X.drop_duplicates(keep='first')
        return unique_rows

class MissingImputer(BaseEstimator, TransformerMixin):
    """Fill missing values column by column.

    Numeric columns are filled with their median, object/category columns with
    their most frequent value. The fill values are learned in ``fit`` and
    reused in ``transform``, so data transformed later (e.g. a held-out set)
    is filled with the statistics of the data the transformer was fitted on
    instead of its own.
    """

    @staticmethod
    def _fit_imputers(X):
        """Return ``{column: fitted SimpleImputer}`` for every imputable column of ``X``."""
        imputers = {}
        for col in X.select_dtypes(include='number').columns:
            imputers[col] = SimpleImputer(strategy='median').fit(X[[col]])
        for col in X.select_dtypes(include=['object', 'category']).columns:
            imputers[col] = SimpleImputer(strategy='most_frequent').fit(X[[col]])
        return imputers

    def fit(self, X, y=None):
        """Learn one imputer per numeric / object / category column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose column statistics are learned.
        y : ignored

        Returns
        -------
        self
        """
        self.imputers_ = self._fit_imputers(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values filled.

        Columns that were not seen during ``fit`` are left as they are. If
        ``transform`` is called on an instance that was never fitted, the
        imputers are learned from ``X`` itself, which matches how this class
        behaved before it stored any state.
        """
        imputers = getattr(self, 'imputers_', None)
        if imputers is None:
            # Backward-compatible path for un-fitted instances.
            imputers = self._fit_imputers(X)

        # Work on a copy so the caller's frame is not modified.
        X = X.copy()
        for col, imputer in imputers.items():
            if col in X.columns:
                # SimpleImputer returns an (n, 1) array; flatten it to a column.
                X[col] = imputer.transform(X[[col]]).ravel()
        return X

class ExtractAgeStructure(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives an ``AgeStructure`` band from ``Age``."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer so calls can be chained."""
        return self

    def transform(self, X):
        """Add the categorical ``AgeStructure`` column to ``X`` and return ``X``.

        The column is written onto the frame that was passed in. With
        ``pd.cut``'s default right-closed bins the bands are (15, 24],
        (24, 54] and (54, 64]; ages outside them become missing.
        """
        age_edges = [15, 24, 54, 64]
        age_labels = ['Early working age', 'Prime working age', 'Mature working age']
        X['AgeStructure'] = pd.cut(X['Age'], age_edges, labels=age_labels)
        return X

class DeleteOutliers(BaseEstimator, TransformerMixin):
    """Remove rows whose values fall outside the 1.5 * IQR fences.

    The fences are computed from the frame passed to ``transform``; nothing is
    learned in ``fit``.
    """

    def fit(self, X, y=None):
        """Learn nothing; return the transformer so calls can be chained."""
        return self

    def transform(self, X):
        """Return a new frame without the outlier rows.

        The columns are processed in order, and the quartiles of each column
        are computed on the rows that survived the previous columns. Missing
        values never compare as outliers, so rows containing them are kept.
        The input frame is not modified.
        """
        nums = ['DurationOfPitch','NumberOfPersonVisiting','NumberOfFollowups','NumberOfTrips','MonthlyIncome']
        kept = X
        for col in nums:
            q1 = kept[col].quantile(0.25)
            q3 = kept[col].quantile(0.75)
            iqr = q3 - q1
            low_lim = q1 - 1.5*iqr
            up_lim = q3 + 1.5*iqr
            # One vectorised mask per column instead of rescanning and
            # dropping from the frame once for every outlying value.
            is_outlier = (kept[col] > up_lim) | (kept[col] < low_lim)
            kept = kept.loc[~is_outlier]
        # Return an independent frame so later steps can add columns safely.
        return kept.copy()

class NumericTransformation(BaseEstimator, TransformerMixin):
    """Standardise the numeric feature columns (zero mean, unit variance).

    The scaling statistics are learned in ``fit`` and reused in ``transform``,
    so data transformed later is scaled with the mean and standard deviation
    of the data the transformer was fitted on rather than its own.
    """

    # Columns that are standardised; each one is scaled independently.
    _nums = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups','NumberOfTrips', 'NumberOfChildrenVisiting', 'MonthlyIncome']

    def fit(self, X, y=None):
        """Learn per-column mean and standard deviation from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column listed in ``_nums``.
        y : ignored

        Returns
        -------
        self
        """
        self.scaler_ = StandardScaler().fit(X[self._nums].to_numpy())
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the numeric columns standardised.

        If the instance was never fitted, the statistics are learned from
        ``X`` itself, which matches how this class behaved before it stored
        any state.
        """
        scaler = getattr(self, 'scaler_', None)
        if scaler is None:
            # Backward-compatible path for un-fitted instances.
            scaler = StandardScaler().fit(X[self._nums].to_numpy())

        # Work on a copy so the caller's frame is not modified.
        X = X.copy()
        scaled = scaler.transform(X[self._nums].to_numpy())
        for position, col in enumerate(self._nums):
            # Replace column by column so integer columns become float.
            X[col] = scaled[:, position]
        return X

class FeatureEncoder(BaseEstimator, TransformerMixin):
    """Encode the categorical columns as numbers.

    * ``TypeofContact`` and ``Gender`` are mapped to 0/1 codes.
    * ``Gender`` value 'Fe Male' is merged into 'Female' and ``MaritalStatus``
      value 'Unmarried' into 'Single' before encoding.
    * ``Occupation``, ``ProductPitched``, ``MaritalStatus``, ``Designation``
      and ``AgeStructure`` are one-hot encoded. Each indicator column is named
      after its category and the source column is removed.

    The one-hot encoders are learned in ``fit``, so every frame transformed
    afterwards receives the same indicator columns in the same order.
    """

    _TYPEOFCONTACT_CODES = {'Self Enquiry' : 0, 'Company Invited' : 1}
    _GENDER_CODES = {'Male' : 0, 'Female' : 1}
    _ONE_HOT_COLUMNS = ['Occupation', 'ProductPitched', 'MaritalStatus','Designation','AgeStructure']

    @staticmethod
    def _harmonise_labels(X):
        """Return a copy of ``X`` with the known label variants merged."""
        X = X.copy()
        X.loc[X['Gender'] == 'Fe Male', 'Gender'] = 'Female'
        X.loc[X['MaritalStatus'] == 'Unmarried', 'MaritalStatus'] = 'Single'
        return X

    def _fit_encoders(self, X):
        """Return ``{column: fitted OneHotEncoder}`` for the one-hot columns of ``X``."""
        return {
            col: OneHotEncoder(handle_unknown='ignore').fit(X[[col]])
            for col in self._ONE_HOT_COLUMNS
        }

    def fit(self, X, y=None):
        """Learn the category set of every one-hot encoded column.

        The label variants are merged first so that the learned categories
        match what ``transform`` will encode.
        """
        self.encoders_ = self._fit_encoders(self._harmonise_labels(X))
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``; the input frame is not modified.

        Raises
        ------
        KeyError
            If ``TypeofContact`` or ``Gender`` holds a value (including a
            missing one) that has no code.
        """
        X = self._harmonise_labels(X)
        X['TypeofContact'] = [self._TYPEOFCONTACT_CODES[value] for value in X['TypeofContact']]
        X['Gender'] = [self._GENDER_CODES[value] for value in X['Gender']]

        encoders = getattr(self, 'encoders_', None)
        if encoders is None:
            # Backward-compatible path for un-fitted instances.
            encoders = self._fit_encoders(X)

        for col in self._ONE_HOT_COLUMNS:
            encoder = encoders[col]
            matrix = encoder.transform(X[[col]]).toarray()
            # Column j of the matrix belongs to categories_[0][j]. Naming the
            # columns from X[col].unique() would follow order of appearance
            # instead and attach the wrong label to each indicator.
            for position, category in enumerate(encoder.categories_[0].tolist()):
                X[category] = matrix[:, position]
            X = X.drop(columns=[col])

        return X

class BalancingClass(BaseEstimator, TransformerMixin):
    """Oversample with SMOTE until both ``ProdTaken`` classes have equal size."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer so calls can be chained."""
        return self

    def transform(self, X):
        """Return a resampled frame with ``ProdTaken`` as the first column.

        The result has a fresh 0..n-1 index and contains the rows returned by
        SMOTE. Every feature column must be numeric and free of missing values
        by the time this runs.

        NOTE(review): the resampling happens on every ``transform`` call, so
        if this step is part of a pipeline that is also applied to held-out
        data, that data gets synthetic rows too -- confirm this is intended.
        """
        features = X.drop(['ProdTaken'], axis=1)
        feature_names = features.columns.tolist()
        target = X['ProdTaken'].values

        sampler = SMOTE(sampling_strategy=1,random_state = 42)
        features_res, target_res = sampler.fit_resample(features.values, target)

        target_frame = pd.DataFrame(target_res, columns=['ProdTaken'])
        feature_frame = pd.DataFrame(features_res, columns=feature_names)
        return pd.concat([target_frame, feature_frame], axis=1)

class ColumnsDropper(BaseEstimator, TransformerMixin):
    """Keep only the columns named in a fixed list of selected features."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer so calls can be chained."""
        return self

    def transform(self, X):
        """Drop every column of ``X`` that is not a selected feature.

        The columns are removed from the frame that was passed in, and that
        same frame is returned. Selected features absent from ``X`` are
        silently skipped, and the remaining columns keep their current order.
        """
        significant_features = [
            'Passport', 'Basic', 'Manager', 'Divorced', 'MonthlyIncome', 'Deluxe', 'AVP',
            'Prime working age', 'Age', 'Mature working age', 'King', 'Executive', 'Single',
            'Married', 'PreferredPropertyStar', 'NumberOfFollowups', 'CityTier', 'DurationOfPitch',
            'PitchSatisfactionScore', 'Small Business', 'Gender', 'Standard', 'Senior Manager',
            'Large Business', 'Salaried', 'NumberOfPersonVisiting',
        ]
        unwanted = []
        for name in X.columns.tolist():
            if name not in significant_features:
                unwanted.append(name)
        X.drop(columns=unwanted, inplace=True)
        return X

# Restore the preprocessing pipeline object from disk.
# NOTE(review): pickle.load can execute arbitrary code while loading, so only
# open a preprocessing_pipe.pkl that comes from a trusted source.
# Presumably the pickle refers to the transformer classes defined above by
# name, which is why they are declared before this point -- confirm.
with open('preprocessing_pipe.pkl', 'rb') as f:
    preprocessing_pipe = pickle.load(f)

In [57]:
# Fit the pipeline on the training frame and transform it, then apply the
# pipeline to the test frame.
# NOTE(review): both names are rebound to the processed frames, so this cell is
# not idempotent -- re-run the split cells before running it a second time.
train = preprocessing_pipe.fit_transform(train)
test = preprocessing_pipe.transform(test)

In [2]:
# NOTE(review): looks like a leftover scratch cell (its execution count is out
# of sequence with the cells above); it does not take part in the analysis.
print("Hello world 1")

Hello world 1
