In [1]:
import os
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Step up one directory level and remember the resulting working directory.
# NOTE: this cell is not idempotent; every re-run moves one level further up.
os.chdir(os.pardir)
PATH = os.getcwd()

## Import Data

In [3]:
# `src` is a project-local package; presumably it is importable only because
# of the working directory set earlier in the notebook -- TODO confirm.
from src.data.obtain import get_raw_data
# Load the raw dataset; the rest of the notebook uses `df` as a pandas DataFrame.
df = get_raw_data()

# Scrub Data

### Column Names

In [4]:
print("Fixing column names. (Removing special characters, converting to lowercase. Renaming long columns)")

# Lower-case every column name and strip the characters . / ( ) & and space.
# The two-argument form str.translate(None, chars) only exists for Python 2
# byte strings; it raises TypeError on Python 3 and on Python 2 unicode names.
# Filtering character by character behaves the same on every interpreter, and
# building a list (instead of a lazy map object) gives pandas a concrete
# sequence of labels.
df.columns = [
    ''.join(ch for ch in name.lower() if ch not in './()& ')
    for name in df.columns.tolist()
]

# Shorten the one long name that results from the cleanup above.
df.rename(columns={'siblingsspousesaboard': 'sibsp'}, inplace=True)

Fixing column names. (Removing special characters, converting to lowercase. Renaming long columns)


### Missing Values

In [5]:
# Names of the columns containing at least one null value, in column order.
have_missing = df.columns[df.isnull().any().values].tolist()

print("The following columns have missing data: \n{}".format(have_missing))

The following columns have missing data: 
['age', 'fare', 'cabinnumber', 'embarked']


In [9]:
def json_save(x, PATH):
    """
    Persist a Python dict to disk as pretty-printed JSON.

    Keys are written in sorted order with a 4-space indent. An existing
    file at the destination is overwritten.

    Parameters
    ----------
    x: dict
        The Python dict to be persisted
    PATH: string
        Path of the destination file (it is passed straight to ``open``)

    Returns
    -------
    None
    """
    import json
    with open(PATH, 'w') as handle:
        handle.write(json.dumps(x, sort_keys=True, indent=4))

In [10]:
# Persist the list of columns with missing data; the path is relative to the
# current working directory.
json_save(x={'have_missing': have_missing}, PATH='./src/data/have_missing.json')

In [None]:
# Per-column overview of the missing data: dtype, share of nulls (rounded to
# two decimals) and absolute number of nulls.
df[have_missing].pipe(
    lambda subset: pd.DataFrame({
        'type': subset.dtypes,
        'perc_missing': subset.isnull().mean().round(2),
        'num_missing': subset.isnull().sum()
    })
)

In [None]:
# Add a 0/1 indicator column named "<column>__is_null" for every column that
# has missing data.
for column in have_missing:
    df[column + '__is_null'] = df[column].isnull().astype(int)

In [None]:
# impute missings
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

df['age'].plot.hist(bins=20, ax=ax[0], title='Age')
df['fare'].plot.hist(bins=20, ax=ax[1], title='Fare')
;

In [None]:
print("Age is approximately normally distributed, but Fare is skewed.")
print("Using the mean for Age and Median for Fare to impute missing data.")
print("Cabin Number has over 70% values missing. Dropping this variable.")
print("Embarked has only 2 values missing. Imputing with Mode.")

# Fill values: mean for age, median for fare, most frequent value for embarked.
# cabinnumber is not imputed in this cell.
age_fillna = df['age'].mean()
fare_fillna = df['fare'].median()
embarked_fillna = df['embarked'].describe()['top']

# Assign the filled Series back to the frame. Calling
# df[col].fillna(..., inplace=True) works on an intermediate object: recent
# pandas versions warn about it, and with Copy-on-Write enabled `df` is left
# unchanged.
df['age'] = df['age'].fillna(value=age_fillna)
df['fare'] = df['fare'].fillna(value=fare_fillna)
df['embarked'] = df['embarked'].fillna(value=embarked_fillna)

### Create new columns

In [None]:
print("Creating a column for Gender")

# 1 when the passenger name contains "Miss" or "Mrs", otherwise 0.
df['gender'] = df['name'].str.contains('Miss|Mrs', regex=True).astype(int)

In [None]:
print("Creating Dummies for Embarked and Passenger Class. \nDone. Now dropping these.")

# One-hot encode both columns in a single call. The encoded source columns are
# removed and the indicator columns (prefixed "embarked_" and "pclass_") are
# appended after the remaining columns.
df = pd.get_dummies(
    df,
    columns=['embarked', 'passengerclass'],
    prefix=['embarked', 'pclass'],
)

### Remove High-Cardinality Columns

In [None]:
# For every object-dtype column: number of distinct values divided by the
# number of non-null values, rounded to two decimals.
perc_uniqs_catg = (
    df.describe(include=['O'])
    .T
    .pipe(lambda summary: summary['unique'].div(summary['count']))
    .astype(float)
    .round(2)
    .rename('perc_uniq')
)

perc_uniqs_catg

In [None]:
print("Dropping cabinnumber, ticket and name as they have no predictive value. (Too many uniques)")

# Rebind `df` to a frame without the three columns.
high_cardinality_cols = ['cabinnumber', 'ticket', 'name']
df = df.drop(high_cardinality_cols, axis=1)

### Convert to numeric

In [None]:
print("Downcasting numerics to occupy less space.")

# Run pd.to_numeric over every column, asking for the smallest integer dtype
# that can hold the values.
df = df.apply(pd.to_numeric, downcast='integer')

## Backup

In [None]:
# Write the processed frame to disk; the path is relative to the current
# working directory. No `index` argument is passed, so pandas' default applies.
df.to_csv(path_or_buf="data/04-processed/titanic.csv")