In [22]:
# Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
np.random.seed(42)

#### Get the raw data

In [18]:
# Load the raw marketing CSV (path is relative to the notebook's working directory).
# The income header appears to carry surrounding spaces (' Income '); IncomeTransformer
# renames it to 'Income' before cleaning.
raw_data = pd.read_csv('../data/raw/marketing_data.csv')
raw_data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response,Complain,Country
0,1826,1970,Graduation,Divorced,"$84,835.00",0,0,6/16/14,0,189,...,6,1,0,0,0,0,0,1,0,SP
1,1,1961,Graduation,Single,"$57,091.00",0,0,6/15/14,0,464,...,7,5,0,0,0,0,1,1,0,CA
2,10476,1958,Graduation,Married,"$67,267.00",0,1,5/13/14,0,134,...,5,2,0,0,0,0,0,0,0,US
3,1386,1967,Graduation,Together,"$32,474.00",1,1,5/11/14,0,10,...,2,7,0,0,0,0,0,0,0,AUS
4,5371,1989,Graduation,Single,"$21,474.00",1,0,4/8/14,0,6,...,2,7,1,0,0,0,0,1,0,SP


In [3]:
# Inspect dtypes and non-null counts. In the recorded output the income column is
# object dtype with 2216 of 2240 values present, so it needs parsing and has gaps.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   2240 non-null   int64 
 1   Year_Birth           2240 non-null   int64 
 2   Education            2240 non-null   object
 3   Marital_Status       2240 non-null   object
 4    Income              2216 non-null   object
 5   Kidhome              2240 non-null   int64 
 6   Teenhome             2240 non-null   int64 
 7   Dt_Customer          2240 non-null   object
 8   Recency              2240 non-null   int64 
 9   MntWines             2240 non-null   int64 
 10  MntFruits            2240 non-null   int64 
 11  MntMeatProducts      2240 non-null   int64 
 12  MntFishProducts      2240 non-null   int64 
 13  MntSweetProducts     2240 non-null   int64 
 14  MntGoldProds         2240 non-null   int64 
 15  NumDealsPurchases    2240 non-null   int64 
 16  NumWeb

In [4]:
class IncomeTransformer(BaseEstimator, TransformerMixin):
    """Parse the raw income column to float and drop rows whose income is an IQR outlier.

    The transformer is stateless: ``fit`` learns nothing, and the outlier
    bounds are recomputed from whatever frame is handed to ``transform``.
    """

    def __init__(self):
        print('in the IncomeTransformer init method')

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a cleaned copy of ``X`` restricted to non-outlier incomes.

        Parameters
        ----------
        X : DataFrame-like
            Must contain the income column, named either ``' Income '`` or
            ``'Income'``, holding currency strings such as ``'$84,835.00'``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with a float ``Income`` column, keeping only rows
            where Q1 - 1.5*IQR <= Income <= Q3 + 1.5*IQR. Rows with a missing
            income fail both comparisons and are therefore dropped as well.
        """
        # Replacing column's name and cleaning its values
        X = pd.DataFrame(X).copy()
        X = X.rename(columns={' Income ': 'Income'})
        # regex=True is spelled out: since pandas 2.0 ``Series.str.replace``
        # treats the pattern as a literal by default, which would leave '$' and
        # ',' in place and make the float conversion fail.
        # Plain column assignment (instead of ``.loc[:, col] = ...``) replaces
        # the column outright, so the result is float rather than an object
        # column that was written into in place.
        X['Income'] = X['Income'].str.replace(r'[$,]', '', regex=True).astype(float)

        # Removing outliers (Tukey fences at 1.5 * IQR)
        q1 = X['Income'].quantile(0.25)
        q3 = X['Income'].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)

        return X[(X['Income'] >= lower_bound) & (X['Income'] <= upper_bound)]

In [5]:
class YearBirthToAgeTransformer(BaseEstimator, TransformerMixin):
    """Derive an ``Age`` column from ``Year_Birth`` and drop rows whose age is an IQR outlier."""

    def __init__(self):
        print('in the YearBirthToAgeTransformer init method')

    def fit(self, X, y=None):
        """Stateless step: nothing is learned."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``Age`` added, limited to ages inside the 1.5*IQR fences."""
        frame = pd.DataFrame(X).copy()

        # Age is measured against the calendar year of the fixed reference date.
        reference_year = pd.to_datetime('2020-12-19').year
        frame['Age'] = reference_year - frame['Year_Birth']

        # Tukey fences computed on the frame being transformed.
        ages = frame['Age']
        first_quartile = ages.quantile(0.25)
        third_quartile = ages.quantile(0.75)
        spread = third_quartile - first_quartile
        within_fences = ages.between(
            first_quartile - (1.5 * spread),
            third_quartile + (1.5 * spread),
        )

        return frame[within_fences]

In [6]:
class EducationTransformer(BaseEstimator, TransformerMixin):
    """Normalise the labels in the ``Education`` column.

    ``'Graduation'`` becomes ``'Bachelor'`` and ``'2n Cycle'`` becomes
    ``'Master'``; every other value is left untouched.
    """

    def __init__(self):
        print('in the EducationTransformer init method')

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (the step is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the relabelled ``Education`` column.

        Parameters
        ----------
        X : DataFrame-like
            Must contain an ``Education`` column.
        y : ignored

        Returns
        -------
        pandas.DataFrame
        """
        X = pd.DataFrame(X).copy()
        # Assign the result back rather than calling replace(..., inplace=True)
        # on the selected column: under pandas Copy-on-Write the selected column
        # is a separate object, so an in-place replace would not reach ``X``.
        X['Education'] = X['Education'].replace({'Graduation': 'Bachelor', '2n Cycle': 'Master'})

        return X

In [7]:
class MaritalStatusTransformer(BaseEstimator, TransformerMixin):
    """Fold the ``'YOLO'``, ``'Alone'`` and ``'Absurd'`` marital statuses into ``'Single'``."""

    def __init__(self):
        print('in the MaritalStatusTransformer init method')

    def fit(self, X, y=None):
        """Stateless step: nothing is learned."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``Marital_Status`` column has the three labels merged into ``'Single'``."""
        frame = pd.DataFrame(X).copy()

        relabel = {'YOLO': 'Single', 'Alone': 'Single', 'Absurd': 'Single'}
        frame['Marital_Status'] = frame['Marital_Status'].replace(relabel)

        return frame

In [8]:
class KidTeenTransformer(BaseEstimator, TransformerMixin):
    """Add 0/1 flags ``HasKid`` and ``HasTeen`` derived from ``Kidhome`` and ``Teenhome``."""

    def __init__(self):
        print('in the KidTeenTransformer init method')

    def fit(self, X, y=None):
        """Stateless step: nothing is learned."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the two indicator columns appended (1 when the count is above zero)."""
        frame = pd.DataFrame(X).copy()

        # (count column, flag column) pairs, added in this order.
        flag_specs = (('Kidhome', 'HasKid'), ('Teenhome', 'HasTeen'))
        for count_col, flag_col in flag_specs:
            frame[flag_col] = np.where(frame[count_col] > 0, 1, 0)

        return frame

In [9]:
class DaysSinceEnrolledTransformer(BaseEstimator, TransformerMixin):
    """Turn ``Dt_Customer`` into ``Days_Since_Enrolled`` and drop rows that are IQR outliers on it."""

    def __init__(self):
        print('in the DaysSinceEnrolledTransformer init method')

    def fit(self, X, y=None):
        """Stateless step: nothing is learned."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` where the enrolment date is replaced by whole days up to the reference date.

        The column keeps its position but is renamed to
        ``Days_Since_Enrolled``; rows outside the 1.5*IQR fences are removed.
        """
        frame = pd.DataFrame(X).copy()

        # Parse the dates, then measure the gap to the fixed reference date in days.
        enrolled_on = pd.to_datetime(frame['Dt_Customer'])
        frame['Dt_Customer'] = (pd.to_datetime('2020-12-19') - enrolled_on).dt.days
        frame = frame.rename(columns={'Dt_Customer': 'Days_Since_Enrolled'})

        # Tukey fences computed on the frame being transformed.
        tenure = frame['Days_Since_Enrolled']
        first_quartile = tenure.quantile(0.25)
        third_quartile = tenure.quantile(0.75)
        spread = third_quartile - first_quartile
        within_fences = tenure.between(
            first_quartile - (1.5 * spread),
            third_quartile + (1.5 * spread),
        )

        return frame[within_fences]

In [10]:
class RecencyTransformer(BaseEstimator, TransformerMixin):
    """Drop rows whose ``Recency`` value falls outside the 1.5*IQR fences."""

    def __init__(self):
        print('in the RecencyTransformer init method')

    def fit(self, X, y=None):
        """Stateless step: nothing is learned."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` limited to rows with a non-outlier ``Recency``."""
        frame = pd.DataFrame(X).copy()

        # Tukey fences computed on the frame being transformed.
        recency = frame['Recency']
        first_quartile = recency.quantile(0.25)
        third_quartile = recency.quantile(0.75)
        spread = third_quartile - first_quartile
        within_fences = recency.between(
            first_quartile - (1.5 * spread),
            third_quartile + (1.5 * spread),
        )

        return frame[within_fences]

In [11]:
class CountryTransformer(BaseEstimator, TransformerMixin):
    """Rewrite three-letter codes in the ``Country`` column as two-letter codes.

    ``'AUS'`` -> ``'AU'``, ``'GER'`` -> ``'DE'``, ``'IND'`` -> ``'IN'``; every
    other value is left untouched.
    """

    def __init__(self):
        print('in the CountryTransformer init method')

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (the step is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the recoded ``Country`` column.

        Parameters
        ----------
        X : DataFrame-like
            Must contain a ``Country`` column.
        y : ignored

        Returns
        -------
        pandas.DataFrame
        """
        X = pd.DataFrame(X).copy()
        # Assign the result back rather than calling replace(..., inplace=True)
        # on the selected column: under pandas Copy-on-Write the selected column
        # is a separate object, so an in-place replace would not reach ``X``.
        X['Country'] = X['Country'].replace({'AUS': 'AU', 'GER': 'DE', 'IND': 'IN'})

        return X

In [20]:
class DropColumnTransformer(BaseEstimator, TransformerMixin):
    """Drop a configurable set of columns from a DataFrame.

    Parameters
    ----------
    columns : label or list of labels, optional
        Columns to remove. ``None`` (the default) removes nothing.
    """

    def __init__(self, columns=None):
        # Stored unmodified; the None case is handled in ``transform``.
        self.columns = columns
        print('in the DropColumnTransformer init method')

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (the step is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the configured columns.

        Parameters
        ----------
        X : DataFrame-like
        y : ignored

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        KeyError
            Propagated from ``DataFrame.drop`` when a configured column is
            absent from ``X``.
        """
        X = pd.DataFrame(X).copy()
        # ``DataFrame.drop(None, axis=1)`` raises ValueError, so the default
        # ``columns=None`` has to be treated explicitly as "nothing to drop".
        if self.columns is None:
            return X

        return X.drop(columns=self.columns)

In [36]:
# Year_Birth, Kidhome and Teenhome are removed once Age, HasKid and HasTeen have
# been derived from them; ID is removed as well.
cols_to_drop = ['ID', 'Year_Birth', 'Kidhome', 'Teenhome']

# Cleaning / feature-engineering stage. The drop step must stay last because
# earlier steps still read the columns listed in cols_to_drop.
middle_pipe = Pipeline(steps=[
    ('income_transformer', IncomeTransformer()),
    ('year_birth_to_age_transformer', YearBirthToAgeTransformer()),
    ('education_transformer', EducationTransformer()),
    ('marital_status_transformer', MaritalStatusTransformer()),
    ('kid_teen_transformer', KidTeenTransformer()),
    ('days_since_enrolled_transformer', DaysSinceEnrolledTransformer()),
    ('recency_transformer', RecencyTransformer()),
    ('country_transformer', CountryTransformer()),
    ('drop_column_transformer', DropColumnTransformer(columns=cols_to_drop)),
])

# Several steps filter rows, so the result can have fewer rows than raw_data.
temp_data = middle_pipe.fit_transform(raw_data)

in the IncomeTransformer init method
in the YearBirthToAgeTransformer init method
in the EducationTransformer init method
in the MaritalStatusTransformer init method
in the KidTeenTransformer init method
in the DaysSinceEnrolledTransformer init method
in the RecencyTransformer init method
in the CountryTransformer init method
in the DropColumnTransformer init method


In [29]:
def get_feature_names(column_transformer):
    """Get feature names from all transformers.

    Walks a fitted ColumnTransformer (or a Pipeline, recursively) and collects
    the output feature names of each contained transformer. Transformers that
    do not expose ``get_feature_names`` trigger a warning and fall back to
    their input column names instead of raising.

    NOTE(review): this relies on private scikit-learn members (``_iter``,
    ``_df_columns``, ``_n_features``) and on the transformer method
    ``get_feature_names``; their availability depends on the installed
    scikit-learn release — confirm against the pinned version.

    Parameters
    ----------
    column_transformer : ColumnTransformer or Pipeline
        Fitted object to inspect (fitted state is not checked here).

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # The internal helper check_is_fitted(column_transformer) is deliberately
    # not called here.

    # Lookup turned into a function for easier handling of pipelines below.
    # name / trans / column are passed in explicitly rather than being picked
    # up from the enclosing loop's variables.
    def get_names(name, trans, column):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
            # >>> Change: return input column names if no method is available.
            # Turn the error into a warning.
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_feature_names method, use the
            # input names to the column transformer.
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]

    ### Start of processing
    feature_names = []

    # Allow transformers to be pipelines. Pipeline steps are named differently,
    # so they are reshaped into (name, trans, column, _) tuples with no columns.
    if isinstance(column_transformer, sklearn.pipeline.Pipeline):
        l_transformers = [(name, trans, None, None)
                          for _, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))

    for name, trans, column, _ in l_transformers:
        if isinstance(trans, sklearn.pipeline.Pipeline):
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # If the pipeline has no transformer that returns names, fall back
            # to the input columns. Steps of an outer Pipeline carry no column
            # information (column is None), so there is nothing to fall back to
            # and the list stays empty instead of raising a TypeError.
            if not _names and column is not None:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(name, trans, column))

    return feature_names

In [37]:
# Columns that are one-hot encoded; every other column is passed through unchanged.
cat_cols = ['Education', 'Marital_Status', 'Country']

cat_pipe = ColumnTransformer(
    transformers=[('ohe_encoder', OneHotEncoder(drop='first'), cat_cols)],
    remainder='passthrough',
)

# NOTE(review): the scaler runs on everything cat_pipe emits. The recorded output
# shows the one-hot columns and Response standardized too — confirm that is
# intended, in particular if Response is the modelling target.
end_pipe = Pipeline(steps=[
    ('categorical', cat_pipe),
    ('numerical', StandardScaler()),
])

# The pipeline is fitted first (arguments are evaluated left to right), so
# cat_pipe is already fitted when its feature names are read for the header.
preprocessed_data = pd.DataFrame(
    end_pipe.fit_transform(temp_data),
    columns=get_feature_names(cat_pipe),
)

preprocessed_data.head()

Unnamed: 0,ohe_encoder__x0_Basic,ohe_encoder__x0_Master,ohe_encoder__x0_PhD,ohe_encoder__x1_Married,ohe_encoder__x1_Single,ohe_encoder__x1_Together,ohe_encoder__x1_Widow,ohe_encoder__x2_CA,ohe_encoder__x2_DE,ohe_encoder__x2_IN,...,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response,Complain,Age,HasKid,HasTeen
0,-0.158444,-0.584857,-0.524694,-0.795062,-0.525397,-0.589047,-0.188938,-0.369592,-0.235646,-0.265308,...,-0.282531,-0.283466,-0.280655,-0.262358,-0.117444,2.370996,-0.095673,-0.093624,-0.853262,-0.967424
1,-0.158444,-0.584857,-0.524694,-0.795062,1.903324,-0.589047,-0.188938,2.705689,-0.235646,-0.265308,...,-0.282531,-0.283466,-0.280655,-0.262358,8.514693,2.370996,-0.095673,0.6754,-0.853262,-0.967424
2,-0.158444,-0.584857,-0.524694,1.257763,-0.525397,-0.589047,-0.188938,-0.369592,-0.235646,-0.265308,...,-0.282531,-0.283466,-0.280655,-0.262358,-0.117444,-0.421764,-0.095673,0.931742,-0.853262,1.033673
3,-0.158444,-0.584857,-0.524694,-0.795062,-0.525397,1.697658,-0.188938,-0.369592,-0.235646,-0.265308,...,-0.282531,-0.283466,-0.280655,-0.262358,-0.117444,-0.421764,-0.095673,0.162718,1.171973,1.033673
4,-0.158444,-0.584857,-0.524694,-0.795062,1.903324,-0.589047,-0.188938,-0.369592,-0.235646,-0.265308,...,3.539436,-0.283466,-0.280655,-0.262358,-0.117444,2.370996,-0.095673,-1.717119,1.171973,-0.967424


In [39]:
preprocessed_data.to_csv('../data/processed/data_for_modeling.csv', index=False)