In [3]:
import pandas as pd
import numpy as np
import sklearn.impute
import sklearn.model_selection
import sklearn.preprocessing

import acquire

In [None]:
# Load the raw Titanic data through the project-local acquire module
df = acquire.get_titanic_data()
# df.shape is a (rows, columns) tuple, which fills the two %d placeholders
print('%d rows and %d columns' % df.shape)
# Preview the first few rows
df.head()

- are class and pclass the same?
- are embarked and embark_town the same?
- are the 1s and 0s in survived booleans? what about alone?
- dataframe index vs passenger_id?
- what does the distribution of fare look like?
- in deck, is None a null / missing value?

These are questions we'll save for exploration

- what's the relationship between sibsp and alone?
- what's the relationship between survived and alone?

In [None]:
# Boolean mask with the same shape as df: True wherever a value is missing
df.isna()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Fraction of missing values in each column (mean of the boolean mask)
df.isna().mean()

In [None]:
# drop deck: almost 80% of its values are missing
# NOTE: not idempotent -- re-running this cell raises KeyError once deck is gone

df = df.drop(columns='deck')

In [None]:
# check on class and pclass: cross-tabulate them to see how their values line up
# (df['class'] needs bracket access because `class` is a Python keyword)

pd.crosstab(df.pclass, df['class'])

In [None]:
# takeaway: class and pclass are the same thing, so let's just use one
# let's keep the numbers (pclass) and drop class

df = df.drop(columns='class')

In [None]:
# Preview only the first rows instead of dumping the whole frame
df.head()

In [None]:
# check on embark_town and embarked the same way: cross-tabulate their values
pd.crosstab(df.embark_town, df.embarked)

In [None]:
# embarked duplicates embark_town (see the crosstab), so keep only embark_town
df = df.drop(columns='embarked')

In [None]:
# Preview the frame after dropping deck, class and embarked
df.head()

Remaining data issues:

- age has some missing values
- pclass is the passenger class, already encoded as a number
- embark_town has a couple of missing values
- embark_town is a string, how do we represent this?

In [None]:
# Split before imputing/encoding so those steps can be fit on train only.
# 80% of the rows go to train; random_state pins the shuffle for reproducibility.
train, test = sklearn.model_selection.train_test_split(df, random_state=123, train_size=.8)

In [None]:
# Preview the training split
train.head()

In [None]:
# count the rows in train with a missing age
# just dropping 148 rows on a data set this size might be problematic at this point

train.age.isna().sum()

**impute** - to fill in missing values

Strategies for imputing:

- fill with 0
- fill with the average
- fill with the median
- fill with subgroup mean
    - fill with the average or median or whatever by another column
- build a model to predict missing values

In [None]:
# How many values (rows) are in each pclass / sex / embark_town subgroup?

train.groupby(['pclass', 'sex', 'embark_town']).size()

In [None]:
# applying a custom aggregation function to count the missing values
# in every remaining column, per subgroup

train.groupby(['pclass', 'sex', 'embark_town']).agg(lambda s: s.isna().sum())

In [None]:
# fill with 0

# train.age.fillna(0)

For filling with the overall average, there are two steps:

1. Find the average (from the training data)
1. Fill the missing values in train and test

Two ways to make that happen:

1. "manually" with pandas
1. scikit-learn

In [None]:
# manually with pandas

# avg_age = train.age.mean()
# train.age = train.age.fillna(avg_age)
# test.age = test.age.fillna(avg_age)

In [None]:
# This is the model when using sklearn

# 1. make the thing
imputer = sklearn.impute.SimpleImputer(strategy='mean')

# 2. fit the thing (on train only, so the mean comes from the training data)
imputer.fit(train[['age']])

# 3. use the thing
# transform returns an (n, 1) array; ravel() flattens it to one value per row.
# assign() builds a new frame instead of writing into the slice returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
train = train.assign(age=imputer.transform(train[['age']]).ravel())
test = test.assign(age=imputer.transform(test[['age']]).ravel())

In [None]:
# Number of missing embark_town values in train
train.embark_town.isna().sum()

In [None]:
# Frequency of each embark_town value in train, most common first
train.embark_town.value_counts()

In [None]:
# Fill the missing embark_town values with 'Southampton' in both splits.
# assign() returns a new frame rather than writing into the slice produced by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
train = train.assign(embark_town=train.embark_town.fillna('Southampton'))
test = test.assign(embark_town=test.embark_town.fillna('Southampton'))

**encoding** - turning a string into a number

two strategies:
    
- associate each unique value with a number -- label encoding
- one-hot encoding: turn each unique value into a separate column with either 1 or 0
    - curse of dimensionality -- new column per unique value could be problematic

When to use one or the other?

- use the label encoder when the categories have an inherent order
- use one-hot encoding when there is no order

In [None]:
# make the thing
encoder = sklearn.preprocessing.OneHotEncoder()

# fit the thing
encoder.fit(train[['embark_town']])

# transform the thing
# .toarray() converts the sparse result to a plain old 2d numpy array
# (.todense() would return the legacy np.matrix type instead)
m = encoder.transform(train[['embark_town']]).toarray()
m

In [None]:
# Categories learned during fit: a list with one array per encoded column
encoder.categories_

In [None]:
# Put the original embark_town next to its one-hot columns to eyeball the mapping.
# index=train.index keeps the encoded rows aligned with train's (shuffled) index.
pd.concat([
    train.embark_town,
    pd.DataFrame(m, columns=encoder.categories_[0], index=train.index)
    
], axis=1)

In [None]:
# sanity check: every row of the one-hot output should sum to exactly 1,
# i.e. each passenger has a single 1 across the produced columns

pd.DataFrame(m, columns=encoder.categories_[0]).sum(axis=1).eq(1).all()

In [None]:
# bringing it all together, we'll one-hot encode embark_town,
# and then add those one-hot encoded columns back to our training
# and test dataframes (replacing the original string column)

# make the thing
encoder = sklearn.preprocessing.OneHotEncoder()

# fit the thing (on train only)
encoder.fit(train[['embark_town']])

# nice new column names, one per category the encoder learned
cols = ['embark_town_' + c for c in encoder.categories_[0]]

# transform the thing
# .toarray() converts the sparse result to a plain old 2d numpy array
# (.todense() would return the legacy np.matrix type instead)
m = encoder.transform(train[['embark_town']]).toarray()

# index=train.index keeps the encoded rows aligned with train's index
train = pd.concat([
    train,
    pd.DataFrame(m, columns=cols, index=train.index)
], axis=1).drop(columns='embark_town')

# same encoder (fit on train) applied to test
m = encoder.transform(test[['embark_town']]).toarray()

test = pd.concat([
    test,
    pd.DataFrame(m, columns=cols, index=test.index)
], axis=1).drop(columns='embark_town')

In [None]:
# Preview train with the one-hot embark_town columns in place
train.head()

In [None]:
def drop_columns(df):
    """Return ``df`` without the columns we decided not to use.

    The input frame is left untouched; ``DataFrame.drop`` returns a new frame.
    Raises ``KeyError`` if any of the listed columns is not present.
    """
    unwanted = [
        'deck',      # too many missing values
        'class',     # same information as pclass
        'embarked',  # same information as embark_town
    ]
    return df.drop(columns=unwanted)

def impute_age(train, test):
    """Fill missing ``age`` values with the mean age of the training split.

    The imputer is fit on ``train`` only and then applied to both splits, so
    the fill value is learned from the training data alone.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain an ``age`` column.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        New frames with ``age`` imputed; the frames passed in are not modified.
    """
    imputer = sklearn.impute.SimpleImputer(strategy='mean')
    imputer.fit(train[['age']])
    # transform returns an (n, 1) array; ravel() flattens it to one value per row.
    # assign() builds new frames instead of writing into the caller's objects
    # (which are typically slices from train_test_split and would trigger
    # pandas' SettingWithCopyWarning on in-place assignment).
    train = train.assign(age=imputer.transform(train[['age']]).ravel())
    test = test.assign(age=imputer.transform(test[['age']]).ravel())
    return train, test

def impute_embark_town(train, test, fill_value='Southampton'):
    """Fill missing ``embark_town`` values in both splits with one constant.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain an ``embark_town`` column.
    fill_value : str, default 'Southampton'
        Value used to replace missing towns.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        New frames with ``embark_town`` filled; the frames passed in are not
        modified.
    """
    # assign() returns new frames, so the caller's frames are never mutated
    train = train.assign(embark_town=train.embark_town.fillna(fill_value))
    test = test.assign(embark_town=test.embark_town.fillna(fill_value))
    return train, test

def encode_embark_town(train, test):
    """One-hot encode ``embark_town`` in both splits.

    The encoder is fit on ``train`` only. In each returned frame the original
    ``embark_town`` column is replaced by one ``embark_town_<category>`` column
    per category learned from ``train``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain an ``embark_town`` column of strings.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        New frames with the encoded columns appended and ``embark_town`` dropped.
    """
    encoder = sklearn.preprocessing.OneHotEncoder()
    encoder.fit(train[['embark_town']])
    # nice new column names
    cols = ['embark_town_' + c for c in encoder.categories_[0]]

    def _with_encoded_columns(df):
        # Append the one-hot columns to df and drop the original column.
        # .toarray() yields a plain 2d ndarray (.todense() returns np.matrix).
        encoded = encoder.transform(df[['embark_town']]).toarray()
        # index=df.index keeps the encoded rows aligned with df's own index
        encoded_df = pd.DataFrame(encoded, columns=cols, index=df.index)
        return pd.concat([df, encoded_df], axis=1).drop(columns='embark_town')

    return _with_encoded_columns(train), _with_encoded_columns(test)


def prep_titanic_data(df, train_size=.8, random_state=123):
    """Run the full preparation pipeline and return the train/test splits.

    Steps, in order: drop unused columns, split into train and test, impute
    ``age``, impute ``embark_town``, one-hot encode ``embark_town``. The split
    happens before imputing/encoding so those steps are fit on train only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw Titanic data as returned by ``acquire.get_titanic_data()``.
    train_size : float, default 0.8
        Passed to ``train_test_split``.
    random_state : int, default 123
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    df = drop_columns(df)
    train, test = sklearn.model_selection.train_test_split(
        df, train_size=train_size, random_state=random_state
    )
    train, test = impute_age(train, test)
    train, test = impute_embark_town(train, test)
    train, test = encode_embark_town(train, test)

    return train, test

In [None]:
# Start again from the raw data and run the whole pipeline in one call
df = acquire.get_titanic_data()

train, test = prep_titanic_data(df)

In [None]:
# Preview the prepared training split
train.head()