In [3]:
"""
Summary:     A collections of functions to generate features.
Description:
Author:      Kunyu He, CAPP'20
"""

import os
import logging
import time
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler


#----------------------------------------------------------------------------#
INPUT_DIR = "./data/"
OUTPUT_DIR = "./processed_data/"
LOG_DIR = "./logs/featureEngineering/"

TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"

SCALERS = [StandardScaler, MinMaxScaler]

# logging
logger= logging.getLogger('featureEngineering')
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
logger.addHandler(ch)

pd.set_option('mode.chained_assignment', None)


#----------------------------------------------------------------------------#
def read_data(file_name, drop_na=False):
    """
    Read credit data in the .csv file and data types from the .json file.

    Inputs:
        - data_file (string): name of the data file.
        - drop_na (bool): whether to drop rows with any missing values

    Returns:
        (DataFrame) clean data set with correct data types

    """
    data = pd.read_csv(INPUT_DIR + file_name)

    if drop_na:
        data.dropna(axis=0, inplace=True)

    return data


def ask():
    """
    Ask user for choice of an imputer and a scaler.

    """
    scaler_index = int(input(("Up till now we support:\n"
                              "\t1. StandardScaler\n"
                              "\t2. MinMaxScaler\n"
                              "Please input a scaler index (1 or 2):\n")))

    return scaler_index


class FeaturePipeLine:
    """
    Preprocess pipeline for a data set from CSV file. Modify the class
    variables to fill in missing values, combine multinomial variables to ones
    with less levels and binaries, and apply one-hot-encoding. Then split data
    into features and traget, drop rows with missing labels and some columns.
    At last, apply scaling.

    """
    TO_DESCRETIZE = {'Age': 5}
    RIGHT_INCLUSIVE = {'Age': True}

    TO_FILL_NA = {'Cabin': "None",
                  'Embarked': "Unknown"}

    TO_COMBINE = {}
    TO_BINARIES = {'Sex': 'auto',
                   'Cabin': 'auto'}
    TO_ONE_HOT = {'PClass', 'Embarked', 'Age'}

    TARGET = 'Survived'
    TO_DROP = ['PassengerId', 'Ticket', 'Name']

    SCALERS = [StandardScaler, MinMaxScaler]
    SCALER_NAMES = ["Standard Scaler", "MinMax Scaler"]

    def __init__(self, file_name, ask_user=True, verbose=True, drop_na=False):
        """
        Construct a preprocessing pipeline given name of the data file.

        Inputs:
            - file_name (string): name of the data file
            - verbose (bool): whether to make extended printing in
                preprocessing
            - drop_na (bool): whether to drop rows with missing values

        """
        logger.info(("**-----------------------------------------------**\n"))
        logger.info("Creating the preprocessing pipeline for '{}'.".format(\
            file_name))
        self.data = read_data(file_name, drop_na)
        self.verbose = verbose
        logger.info("Finished reading cleaned data.")

        if ask_user:
            self.scaler_index = ask()
        else:
            self.scaler_index = 1
        logger.info("Pipeline using scaler {}".\
                    format(self.SCALER_NAMES[self.scaler_index - 1]))

        self.X = None
        self.y = None

    def discretize(self):
        """
        Discretizes continuous variables into multinomials.

        """
        logger.info(("\n\n**-------------------------------------------**\n"))
        logger.info("Start to discretizes continuous variables:")

        for var, n in self.TO_DESCRETIZE.items():
            self.data[var] = pd.cut(self.data[var], n,
                                    right=self.RIGHT_INCLUSIVE[var]).cat.codes

            if self.verbose:
                if not self.data[var].isnull().sum():
                    logger.info(("\tThere are missing values in '{}', "
                        "discretized it into {} bins, where '-1' indicates "
                        "that the value is missing.".format(var, n + 1)))
                else:
                    logger.info("\tDiscretized '{}' into {} bins.".\
                                format(var, n))

        return self

    def fill_na(self):
        """
        Fill in missing data with desired entry.

        """
        logger.info(("\n\n**-------------------------------------------**\n"))
        logger.info("Start to fill in missing values:")

        for var, fill in self.TO_FILL_NA.items():
            self.data[var].fillna(value=fill, inplace=True)

            if self.verbose:
                logger.info("\tFilled missing values in '{}' with '{}'.".\
                      format(var, fill))

            if fill == "None":
                to_combine = [col for col in list(self.data[var].unique())
                              if col != "None"]
                self.TO_COMBINE[var] = {"Yes": to_combine}
                logger.info("\t\t'{}' added to 'TO_COMBINE'".format(var))

        return self

    def to_combine(self):
        """
        Combine some unecessary levels of multinomials.

        """
        logger.info(("\n\n**-------------------------------------------**\n"))
        logger.info("Start to combine unnecessary levels of multinomials.")

        for var, dict_combine in self.TO_COMBINE.items():
            for combined, lst_combine in dict_combine.items():
                self.data.loc[self.data[var].isin(lst_combine), var] = combined

            if self.verbose:
                logger.info("\tCombinations of levels on '{}'.".format(var))

        return self

    def to_binary(self):
        """
        Trasform variables to binaries.

        """
        logger.info(("\n\n**-------------------------------------------**\n"))
        logger.info(("Start to transform the following variables: {} to "
                     "Binaries.").format(list(self.TO_BINARIES.keys())))

        for var, cats in self.TO_BINARIES.items():
            enc = OrdinalEncoder(categories=cats)
            self.data[var] = enc.fit_transform(np.array(self.data[var]).\
                                               reshape(-1, 1))

        return self

    def one_hot(self):
        """
        Ccreates binary/dummy variables from multinomials, drops the original
        and inserts the dummies back.

        """
        logger.info(("\n\n**-------------------------------------------**\n"))
        logger.info(("Start to apply one-hot-encoding to the following "
                     "categorical variables: {}\n").format(self.TO_ONE_HOT))

        for var in self.TO_ONE_HOT:
            dummies = pd.get_dummies(self.data[var], prefix=var)
            self.data.drop(var, axis=1, inplace=True)
            self.data = pd.concat([self.data, dummies], axis=1)

        return self


In [10]:
pipe = FeaturePipeLine(TRAIN_FILE, ask_user=False)
pipe.discretize().fill_na().to_combine().to_binary().one_hot()

**-----------------------------------------------**

Creating the preprocessing pipeline for 'train.csv'.
Finished reading cleaned data.


Up till now we support:
	1. StandardScaler
	2. MinMaxScaler
Please input a scaler index (1 or 2):
1


Pipeline using scaler Standard Scaler


**-------------------------------------------**

Start to discretizes continuous variables:
	There are missing values in 'Age', discretized it into 6 bins, where '-1' indicates that the value is missing.


**-------------------------------------------**

Start to fill in missing values:
	Filled missing values in 'Cabin' with 'None'.
		'Cabin' added to 'TO_COMBINE'
	Filled missing values in 'Embarked' with 'Unknown'.


**-------------------------------------------**

Start to combine unnecessary levels of multinomials.
	Combinations of levels on 'Cabin'.


**-------------------------------------------**

Start to transform the following variables: ['Sex', 'Cabin'] to Binaries.


**-------------------------------------------**

Start to apply one-hot-encoding to the following categorical variables: {'PClass', 'Age', 'Embarked'}



KeyError: 'PClass'

In [11]:
pipe.data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1.0,1,1,0,A/5 21171,7.25,0.0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,2,1,0,PC 17599,71.2833,1.0,C
2,3,1,3,"Heikkinen, Miss. Laina",0.0,1,0,0,STON/O2. 3101282,7.925,0.0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,2,1,0,113803,53.1,1.0,S
4,5,0,3,"Allen, Mr. William Henry",1.0,2,0,0,373450,8.05,0.0,S


In [8]:
pipe.data.Age.value_counts()

 1    346
 2    188
-1    177
 0    100
 3     69
 4     11
Name: Age, dtype: int64

In [9]:
pipe.data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64