In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

train = pd.read_csv('train.csv')
holdout = pd.read_csv('test.csv')

In [None]:
holdout.head()

In [None]:
# %load functions.py
def process_missing(df, fare_fill=None):
    """Fill missing values in the Fare and Embarked columns

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing "Fare" and "Embarked" columns. It is modified
        in place and also returned.
    fare_fill : float, optional
        Value written into missing "Fare" entries. When omitted, the mean
        fare of the notebook-level ``train`` frame is used, so every frame
        is imputed with the same training-set statistic.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Usage
    ------

    holdout = process_missing(holdout)
    """
    if fare_fill is None:
        # Default path relies on the global training frame; pass fare_fill
        # explicitly to use this function without that global.
        fare_fill = train["Fare"].mean()
    df["Fare"] = df["Fare"].fillna(fare_fill)
    # Missing ports of embarkation are filled with "S".
    df["Embarked"] = df["Embarked"].fillna("S")
    return df

def process_age(df):
    """Bucket the Age column into labelled ranges

    Missing ages are replaced by -0.5 so they land in the "Missing" bucket.
    The result is stored in a new "Age_categories" column; the frame is
    modified in place and returned.

    Usage
    ------

    train = process_age(train)
    """
    age_labels = ["Missing","Infant","Child","Teenager","Young Adult","Adult","Senior"]
    age_edges = [-1,0,5,12,18,35,60,100]
    # -0.5 falls inside the first interval (-1, 0], i.e. "Missing"
    filled_age = df["Age"].fillna(-0.5)
    df["Age"] = filled_age
    df["Age_categories"] = pd.cut(filled_age, bins=age_edges, labels=age_labels)
    return df

def process_fare(df):
    """Bucket the Fare column into labelled price ranges

    Adds a "Fare_categories" column to the frame (in place) and returns it.

    Usage
    ------

    train = process_fare(train)
    """
    fare_labels = ["0-12","12-50","50-100","100+"]
    fare_edges = [-1,12,50,100,1000]
    df["Fare_categories"] = pd.cut(df["Fare"], bins=fare_edges, labels=fare_labels)
    return df

def process_cabin(df):
    """Derive a Cabin_type column from the first character of Cabin

    Rows without a cabin get the type "Unknown". The "Cabin_type" column is
    written onto the frame that was passed in; the returned frame is a new
    one with the original "Cabin" column dropped.

    Usage
    ------

    train = process_cabin(train)
    """
    first_letter = df["Cabin"].str[0]
    df["Cabin_type"] = first_letter.fillna("Unknown")
    return df.drop(columns="Cabin")

def process_titles(df):
    """Extract and categorize the title from the name column

    The title is the first run of letters that is preceded by a space and
    followed by a period in "Name". It is mapped onto a reduced set of
    categories and stored in a new "Title" column; titles that are not in
    the mapping become missing values. The frame is modified in place and
    returned.

    Usage
    ------

    train = process_titles(train)
    """
    titles = {
        "Mr" :         "Mr",
        "Mme":         "Mrs",
        "Ms":          "Mrs",
        "Mrs" :        "Mrs",
        "Master" :     "Master",
        "Mlle":        "Miss",
        "Miss" :       "Miss",
        "Capt":        "Officer",
        "Col":         "Officer",
        "Major":       "Officer",
        "Dr":          "Officer",
        "Rev":         "Officer",
        "Jonkheer":    "Royalty",
        "Don":         "Royalty",
        "Sir" :        "Royalty",
        "Countess":    "Royalty",
        "Dona":        "Royalty",
        "Lady" :       "Royalty"
    }
    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    extracted_titles = df["Name"].str.extract(r' ([A-Za-z]+)\.',expand=False)
    df["Title"] = extracted_titles.map(titles)
    return df

def create_dummies(df,column_name):
    """Append one-hot encoded columns for a single column

    The new columns are named "<column_name>_<value>" and are appended to
    the right of the existing columns. The input frame is left untouched;
    a new frame is returned.

    Usage
    ------

    train = create_dummies(train,"Age")
    """
    encoded = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, encoded], axis=1)

In [None]:
def process_df(df):
    """Run the complete feature-preparation pipeline on a frame

    Applies, in order: missing-value filling, age and fare bucketing,
    title extraction and cabin-type extraction, then appends dummy columns
    for each of the resulting categorical columns and for "Sex".

    Usage
    ------

    train = process_df(train)
    """
    cleaning_steps = (
        process_missing,
        process_age,
        process_fare,
        process_titles,
        process_cabin,
    )
    for step in cleaning_steps:
        df = step(df)

    dummy_columns = ('Age_categories', 'Fare_categories', 'Title', 'Cabin_type', 'Sex')
    for column in dummy_columns:
        df = create_dummies(df, column)
    return df

In [None]:
train = process_df(train)
holdout = process_df(holdout)

In [None]:
holdout.head()

In [None]:
# data types
train.dtypes

In [None]:
# Histograms of the two family-count columns in the training set

train_hist_SibSp = plt.hist(train['SibSp'])
plt.gca().set(title='SibSp Histogram', xlabel='Num SibSp', ylabel='Total')
plt.show()

train_hist_Parch = plt.hist(train['Parch'])
plt.gca().set(title='Parch Histogram', xlabel='Num Parch', ylabel='Total')
plt.show()

In [None]:
# Pivot tables: mean of 'Survived' for each SibSp / Parch value.
# pivot_table aggregates with the mean by default, so each bar is the
# share of passengers who survived (a rate), not a count.

train_pivot_SibSp = train.pivot_table(index='SibSp', values='Survived')
train_pivot_SibSp.plot.bar()
plt.title('SibSp Pivot')
plt.xlabel('Num SibSp')
plt.ylabel('Survival rate')
plt.show()

train_pivot_Parch = train.pivot_table(index='Parch', values='Survived')
train_pivot_Parch.plot.bar()
plt.title('Parch Pivot')
plt.xlabel('Num Parch')
plt.ylabel('Survival rate')
plt.show()

In [None]:
# Combined family size: parents/children plus siblings/spouses aboard.
train['ParchSibSp'] = train['Parch'] + train['SibSp']
holdout['ParchSibSp'] = holdout['Parch'] + holdout['SibSp']

train_hist_ParchSibSp = plt.hist(train['ParchSibSp'])
plt.title('ParchSibSp Histogram')
plt.xlabel('Num ParchSibSp')
plt.ylabel('Total')
plt.show()

# pivot_table aggregates with the mean by default, so the bars show the
# survival rate per family size rather than a frequency.
train_pivot_ParchSibSp = train.pivot_table(index='ParchSibSp', values='Survived')
train_pivot_ParchSibSp.plot.bar()
plt.title('ParchSibSp Pivot')
plt.xlabel('Num ParchSibSp')
plt.ylabel('Survival rate')
plt.show()

# Summary
## Feature Preparation and Engineering
Train and holdout data sets were engineered as follows. Missing values for Fare and Embarked were filled in with the mean training-set fare and 'S', respectively. Ages were bucketed into seven (7) categories: Missing, Infant, Child, Teenager, Young Adult, Adult, and Senior. Fares were bucketed into four (4) categories: 0-12, 12-50, 50-100, and 100+. Cabin Type was taken from the first character of the Cabin value, and null values were replaced with 'Unknown'. Titles were extracted from the name column and categorized as: Mr, Mrs, Master, Miss, Officer, and Royalty. Dummy columns were created for the following fields: Age_categories, Fare_categories, Title, Cabin_type, and Sex. Finally, SibSp and Parch columns were summed to obtain a new column describing the total number of siblings, spouses, parents, and children per passenger.

## Data Exploration
### SibSp
Integer values with a geometric data distribution. Approximately 600 passengers had no siblings or spouses aboard. About 200 had one (1) sibling or spouse aboard. About 75 had two or more siblings or spouses aboard.

SibSp likely correlated with survival rates. Poisson distribution. If no siblings or spouses aboard, survival rate was about 35%. If one (1) or two (2) siblings or spouses were aboard, survival rate increased to 55% and 45%, respectively. If three (3) or more siblings or spouses were aboard, survival rates dropped below 25%.

### Parch
Integer values with a geometric data distribution. Approximately 675 passengers had no parents or children aboard. About 100 had one (1) parent or child aboard. About 75 had two or more siblings or spouses aboard. About 100 had two (2) parents and/or children aboard. Less than 50 had three (3) or more parents or children aboard.

Parch likely correlated with survival rates. Distribution unclear. If no parents or children were aboard, survival rate was about 35%. If one (1) to three (3) parents or children were aboard, survival rate increased to 55 - 60%. If four (4) or more parents or children were aboard, survival rates dropped below 20%.

### ParchSibSp
Integer values with a geometric data distribution. Approximately 550 passengers had no siblings, spouses, parents, or children aboard. About 150 had one (1) sibling, spouse, parent, or child aboard. About 100 had two (2) siblings, spouses, parents, or children aboard. About 100 had three (3) or more siblings, spouses, parents, or children aboard.

ParchSibSp is likely correlated with survival rates. Distribution unclear. If no parents, children, siblings, or spouses were aboard, the survival rate was about 30%. If one (1) to three (3) parents, children, siblings, or spouses were aboard, the survival rate increased to between 55% and 70%. If four (4) or more parents, children, siblings, or spouses were aboard, survival rates ranged between 15% and 30%.

In [None]:
def process_ParchSipSp(df):
    """Flag passengers travelling without family in an "isalone" column

    "isalone" is 1 when "ParchSibSp" equals 0 (no siblings, spouses,
    parents or children aboard) and 0 otherwise. The column is stored as a
    plain integer so that numeric dtype filters keep it. The frame is
    modified in place and returned.

    Usage
    ------

    train = process_ParchSipSp(train)
    """
    # A direct comparison keeps the flag aligned with its name (1 == alone)
    # and avoids the categorical dtype that pd.cut would produce.
    df["isalone"] = (df["ParchSibSp"] == 0).astype(int)
    return df

train = process_ParchSipSp(train)
holdout = process_ParchSipSp(holdout)

train.head()

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def select_features(df):
    """Pick the best-performing feature columns with recursive elimination

    Keeps the numeric and boolean columns of ``df``, removes "Survived"
    and "PassengerId" from the candidates, and runs RFECV (10-fold
    cross-validation) around a RandomForestClassifier with
    ``random_state=1``, using "Survived" as the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared training frame containing "Survived" and "PassengerId".

    Returns
    -------
    pandas.Index
        Names of the columns retained by the selector (also printed).

    Usage
    ------

    best_features = select_features(train)
    """
    # Boolean columns are included explicitly: np.number does not match the
    # bool dtype, so one-hot columns stored as bool would otherwise be
    # dropped before feature selection even starts.
    candidates = df.select_dtypes(include=[np.number, "bool"])
    all_X = candidates.drop(['Survived','PassengerId'],axis=1)
    all_y = candidates['Survived']
    clf = RandomForestClassifier(random_state=1)
    selector = RFECV(clf, cv=10)
    selector.fit(all_X, all_y)
    optimized_columns = all_X.columns[selector.support_]

    print(optimized_columns)
    return optimized_columns
    
best_features = select_features(train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

def select_model(df, feature_list):
    """Grid-search several classifiers and report the best setup of each

    For every candidate model a GridSearchCV with 10-fold cross-validation
    is fitted on ``df[feature_list]`` against ``df['Survived']``. The best
    parameters and best score are printed as the search progresses.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame containing the feature columns and "Survived".
    feature_list : list-like of str
        Column names to use as features.

    Returns
    -------
    list of dict
        One dict per model with the keys:
        "name" (str), "estimator" (the estimator refitted with the best
        parameters found), "hyperparameters" (the best parameter dict) and
        "score" (the best cross-validated score).

    Usage
    ------

    models = select_model(train, best_features)
    """
    all_X = df.loc[:, feature_list]
    all_y = df['Survived']
    dict_list = [
    {
        "name": "LogisticRegression",
        "estimator": LogisticRegression(),
        "hyperparameters":
            {
                "solver": ["newton-cg", "lbfgs", "liblinear"]
            }
    },
    {
        "name": "KNeighborsClassifier",
        "estimator": KNeighborsClassifier(),
        "hyperparameters":
            {
                "n_neighbors": range(1,20,2),
                "weights": ["distance", "uniform"],
                "algorithm": ["ball_tree", "kd_tree", "brute"],
                "p": [1,2]
            }
    },
    {
        "name": "RandomForestClassifier",
        "estimator": RandomForestClassifier(random_state=1),
        "hyperparameters":
            {
                "n_estimators": [4, 6, 9],
                "criterion": ["entropy", "gini"],
                "max_depth": [2, 5, 10],
                "max_features": ["log2", "sqrt"],
                "min_samples_leaf": [1, 5, 8],
                "min_samples_split": [2, 3, 5]
            }
    }
    ]

    for model in dict_list:
        print(model['name'])
        grid = GridSearchCV(model['estimator'], param_grid = model['hyperparameters'], cv = 10)
        grid.fit(all_X, all_y)
        print(grid.best_params_)
        print(grid.best_score_)

        # Hand back the tuned estimator rather than the default-configured
        # one; otherwise the best hyperparameters found by the search are
        # never used when the model is fitted for prediction later.
        model['estimator'] = grid.best_estimator_
        model['hyperparameters'] = grid.best_params_
        model['score'] = grid.best_score_

    # Printed before returning so the message is actually reachable.
    print('Done')
    return dict_list

models = select_model(train, best_features)

In [None]:
dict_BestModel = max(models, key=lambda d: d['score'])
dict_BestModel

In [None]:
def save_submission_file(df, feature_list, dict_BestModel, output_csvFileName, holdout_df=None):
    """Fit the chosen model, predict the holdout set and write a CSV

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame containing the feature columns and "Survived".
    feature_list : list-like of str
        Column names to use as features in both frames.
    dict_BestModel : dict
        Must hold an object with ``fit`` and ``predict`` methods under the
        key "estimator".
    output_csvFileName : str
        Path of the CSV file to write (without the index column).
    holdout_df : pandas.DataFrame, optional
        Frame to predict on; needs "PassengerId" and the feature columns.
        When omitted, the notebook-level ``holdout`` frame is used.

    Returns
    -------
    pandas.DataFrame
        The submission table with "PassengerId" and "Survived" columns,
        i.e. the data that was written to ``output_csvFileName``.

    Usage
    ------

    submission = save_submission_file(train, best_features, dict_BestModel, 'submission.csv')
    """
    if holdout_df is None:
        # Fall back to the global holdout frame used throughout the notebook.
        holdout_df = holdout

    all_X = df.loc[:, feature_list]
    all_y = df['Survived']

    estimator = dict_BestModel['estimator']
    estimator.fit(all_X, all_y)
    holdout_predictions = estimator.predict(holdout_df.loc[:, feature_list])

    submission = pd.DataFrame({"PassengerId": holdout_df["PassengerId"],
                               "Survived": holdout_predictions})

    print(submission[:10])

    # to_csv returns None when given a path, so write first and return the
    # frame itself to give the caller something usable.
    submission.to_csv(output_csvFileName, index=False)
    return submission

submission = save_submission_file(train, best_features, dict_BestModel, 'Titantic.csv')
