In [1]:
import util

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.linear_model import LogisticRegression

In [2]:
class ImputeMode(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns with each column's mode.

    Parameters
    ----------
    columns : list-like of column labels
        Columns to impute. Required; ``None`` raises ``ValueError`` at
        construction time.

    Attributes
    ----------
    mode : list
        Set by ``fit``. One fill value per entry of ``columns``, in the same
        order.
    """

    def __init__(self, columns=None):
        self.columns = columns

        if self.columns is None:
            raise ValueError('Columns perameter empty, requires a list of column names')

    def fit(self, X, y=None):
        """Record the most frequent value of every configured column of ``X``.

        ``Series.mode()`` can return several values on ties; the first one is
        kept. Assumes each column holds at least one non-null value.
        """
        self.mode = [X[col].mode()[0] for col in self.columns]
        return self

    def transform(self, X):
        """Return a deep copy of ``X`` with NaNs in the configured columns filled."""
        X = X.copy(deep=True)
        for col, fill_value in zip(self.columns, self.mode):
            # Assign the filled column back instead of calling
            # ``X[col].fillna(..., inplace=True)``: that chained in-place call
            # operates on a temporary under pandas Copy-on-Write and would
            # leave ``X`` unchanged.
            X[col] = X[col].fillna(fill_value)
        return X
    
    
class ImputeAvg(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns with the column mean or median.

    Parameters
    ----------
    columns : list-like of column labels
        Columns to impute. Required; ``None`` raises ``ValueError`` at
        construction time.
    strategy : {'mean', 'median'}, default 'mean'
        Statistic learned in ``fit``. Any other value raises ``ValueError``
        when ``fit`` is called.

    Attributes
    ----------
    avg_value : list
        Set by ``fit``. One fill value per entry of ``columns``, in the same
        order.
    """

    def __init__(self, columns=None, strategy='mean'):
        self.columns = columns
        self.strategy = strategy

        if self.columns is None:
            raise ValueError('Columns perameter empty, requires a list of column names')

    def fit(self, X, y=None):
        """Learn the per-column statistic selected by ``strategy`` from ``X``."""
        if self.strategy == 'mean':
            self.avg_value = [X[col].mean() for col in self.columns]
        elif self.strategy == 'median':
            self.avg_value = [X[col].median() for col in self.columns]
        else:
            raise ValueError(f'Unknown strategy parameter [{self.strategy}]')
        return self

    def transform(self, X):
        """Return a deep copy of ``X`` with NaNs in the configured columns filled."""
        X = X.copy(deep=True)
        for col, fill_value in zip(self.columns, self.avg_value):
            # Assign the filled column back instead of calling
            # ``X[col].fillna(..., inplace=True)``: that chained in-place call
            # operates on a temporary under pandas Copy-on-Write and would
            # leave ``X`` unchanged.
            X[col] = X[col].fillna(fill_value)
        return X
    
    
class RoundFloats(BaseEstimator, TransformerMixin):
    """Stateless transformer that rounds the given columns to whole numbers.

    ``columns`` is required; constructing the transformer without it raises
    ``ValueError``.
    """

    def __init__(self, columns=None):
        if columns is None:
            raise ValueError('Columns perameter not set')
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the estimator API."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns have been rounded."""
        rounded = X.copy()
        for name in self.columns:
            rounded[name] = rounded[name].round()
        return rounded

    
def impute_avg_1feature(df_fit=pd.DataFrame(), df_transform=pd.DataFrame(), feature=None, target=None, agg='median'):
    """Impute missing ``target`` values with a per-group statistic.

    The statistic is computed on ``df_fit`` for every distinct value of
    ``feature``, rounded to one decimal, and used to fill the missing
    ``target`` entries of the ``df_transform`` rows in the same group.

    Parameters
    ----------
    df_fit : pandas.DataFrame
        Frame the group statistics are learned from.
    df_transform : pandas.DataFrame
        Frame whose ``target`` column is imputed. When it is left at its
        default (a frame without any columns) ``df_fit`` is imputed instead,
        matching ``impute_avg_2features``.
    feature : column label
        Grouping column; must exist in both frames.
    target : column label
        Column to impute.
    agg : str, default 'median'
        Name of the aggregation to apply per group (e.g. 'median', 'mean').

    Returns
    -------
    pandas.Series
        A new Series (named ``target``, indexed like ``df_transform``). Rows
        whose group is absent from ``df_fit``, or whose group has no observed
        ``target`` value, stay missing. Neither input frame is modified.
    """
    if len(df_transform.columns) == 0:
        df_transform = df_fit

    # Fit: one aggregated value per group. The aggregation is dispatched by
    # name through pandas rather than by eval()-ing a formatted string.
    group_values = df_fit.groupby(feature)[target].agg(agg).round(1)

    # Transform: fill by position, so rows that merely share an index label
    # with a missing row keep their observed value.
    imputed = df_transform[target].copy(deep=True)
    missing = imputed.isna().to_numpy()
    if missing.any():
        replacements = df_transform[feature].map(group_values).to_numpy()
        imputed.iloc[missing] = replacements[missing]
    return imputed


def impute_avg_2features(df_fit=pd.DataFrame(), df_transform=pd.DataFrame(), features=[], target=None, agg='median'):
    """Impute missing ``target`` values with a statistic per pair of feature values.

    The statistic is computed on ``df_fit`` for every observed combination of
    the two grouping columns, rounded to one decimal, and used to fill the
    missing ``target`` entries of the ``df_transform`` rows in the same group.

    Parameters
    ----------
    df_fit : pandas.DataFrame
        Frame the group statistics are learned from.
    df_transform : pandas.DataFrame
        Frame whose ``target`` column is imputed. When it is empty (the
        default) ``df_fit`` is imputed instead.
    features : sequence of two column labels
        Grouping columns. Only the first two entries are used; a shorter
        sequence raises ``IndexError``.
    target : column label
        Column to impute.
    agg : str, default 'median'
        Name of the aggregation to apply per group (e.g. 'median', 'mean').

    Returns
    -------
    pandas.Series
        A new Series (named ``target``, indexed like ``df_transform``). Rows
        whose group is absent from ``df_fit``, or whose group has no observed
        ``target`` value, stay missing. Neither input frame is modified.
    """
    if df_transform.empty:
        df_transform = df_fit

    keys = [features[0], features[1]]

    # Fit: one aggregated value per observed (feature_1, feature_2) pair. The
    # aggregation is dispatched by name through pandas rather than by
    # eval()-ing a formatted string.
    group_values = df_fit.groupby(keys)[target].agg(agg).round(1)

    # Transform: look up each row's group value with a left join, then fill by
    # position so rows that merely share an index label with a missing row
    # keep their observed value.
    imputed = df_transform[target].copy(deep=True)
    missing = imputed.isna().to_numpy()
    if missing.any():
        lookup = df_transform[keys].join(group_values.rename('_impute_value'), on=keys)
        imputed.iloc[missing] = lookup['_impute_value'].to_numpy()[missing]
    return imputed


def drop_name_feature(df):
    """Return ``df`` without its 'Name' column; the input frame is not modified."""
    return df.drop(columns='Name')


def generate_dummies(df1, df2, columns):
    """One-hot encode ``columns`` consistently across two frames.

    The frames are stacked before encoding so that both results share the same
    dummy columns even when a category occurs in only one of them, then split
    back into their original row blocks.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to encode; neither is modified.
    columns : list of column labels
        Columns to replace with dummy/indicator columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(encoded_df1, encoded_df2)`` with the rows of ``df1`` and ``df2``
        respectively, in their original order.
    """
    # Capture the split point up front and slice by position: ``frame[:n]`` is
    # label-based on some index types in older pandas releases, ``.iloc`` never is.
    split_at = len(df1)
    encoded = pd.get_dummies(pd.concat([df1, df2]), columns=columns)
    # Copies make the two halves independent frames instead of slices sharing
    # one parent.
    return encoded.iloc[:split_at].copy(), encoded.iloc[split_at:].copy()

In [3]:
# Load both CSV files, using their first column as the index.
df_train = pd.read_csv('train.csv', index_col=[0])
df_test = pd.read_csv('test.csv', index_col=[0])

# Fill missing ages with the per-(Pclass, Sex) value learned from the training
# frame. The training frame is imputed first, so the test frame is filled from
# the already-imputed training data.
df_train['Age'] = impute_avg_2features(
    df_fit=df_train,
    df_transform=df_train,
    features=['Pclass', 'Sex'],
    target='Age',
)
df_test['Age'] = impute_avg_2features(
    df_fit=df_train,
    df_transform=df_test,
    features=['Pclass', 'Sex'],
    target='Age',
)

In [4]:
# Predictors and label: Ticket and Cabin are left out of the feature frame.
X = df_train.drop(columns=['Survived', 'Ticket', 'Cabin'])
y = df_train['Survived']

# Hold out 20% of the rows; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=32
)

# NOTE(review): extract_title, encode_sex and expand_age are not defined or
# imported by name in the cells visible here -- confirm where they come from.
catagorical_pipe = make_pipeline(
    ImputeMode(['Embarked']),
    FunctionTransformer(extract_title),
    FunctionTransformer(drop_name_feature),
    FunctionTransformer(encode_sex),
)

# The numeric columns are picked from the raw training split, i.e. before the
# pipeline above has run on it.
numeric_col = X_train.select_dtypes('number').columns
numeric_pipe = make_pipeline(
    ImputeAvg(numeric_col, 'mean'),
    RoundFloats(['Age']),
    FunctionTransformer(expand_age),
)

custom_pipeline = make_pipeline(catagorical_pipe, numeric_pipe)

# Fit on the training split only, then apply the fitted steps to the hold-out split.
X_train = custom_pipeline.fit_transform(X_train)
X_test = custom_pipeline.transform(X_test)

# Encode both splits together so they end up with identical dummy columns.
X_train, X_test = generate_dummies(X_train, X_test, ['Embarked', 'Title'])
X_train

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Infant,Child,Teen,YoungAdult,Adult,Age40+,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_uncommon
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
294,3,0,24.0,0,0,8.8500,0,0,0,1,0,0,0,0,1,0,1,0,0,0
618,3,0,26.0,1,0,16.1000,0,0,0,1,0,0,0,0,1,0,0,0,1,0
425,3,1,18.0,1,1,20.2125,0,0,1,0,0,0,0,0,1,0,0,1,0,0
889,3,0,22.0,1,2,23.4500,0,0,0,1,0,0,0,0,1,0,1,0,0,0
231,1,0,35.0,1,0,83.4750,0,0,0,0,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,3,1,28.0,1,0,15.8500,0,0,0,1,0,0,0,0,1,0,0,1,0,0
89,1,0,23.0,3,2,263.0000,0,0,0,1,0,0,0,0,1,0,1,0,0,0
311,1,0,24.0,0,0,83.1583,0,0,0,1,0,0,1,0,0,0,1,0,0,0
556,1,1,62.0,0,0,26.5500,0,0,0,0,0,1,0,0,1,0,0,1,0,0
