In [41]:
import os,sys,re

#data manipulation
import pandas as pd

#stats
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold \
                                    , cross_val_score ,cross_val_predict
from sklearn.base import clone
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score \
                            ,precision_recall_curve, roc_curve, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

# data
import sklearn.datasets
from sklearn.datasets import fetch_mldata #mldata.org is down, don't use
from tensorflow.examples.tutorials.mnist import input_data

#graphs 
import matplotlib
import matplotlib.pyplot as plt

#display
from IPython.display import HTML

#some variables
# One fixed seed for the notebook; it is used to seed NumPy's global
# random number generator so that NumPy-based randomness repeats across runs.
seed = 42 #random seed
np.random.seed(seed)

#magic
%matplotlib  inline

In [42]:
# Titanic training set, located relative to the current working directory.
train_csv_path = "{}/mldata/titanic/train.csv".format(os.getcwd())
titan_train = pd.read_csv(train_csv_path)

In [43]:
titan_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [44]:
titan_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [45]:
titan_train.shape

(891, 12)

# Create some dummy variables

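First, encode `Sex` as a single binary indicator column, `sex_bin`, which is 1 for male passengers and 0 otherwise.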

In [46]:
# Keep only the "male" indicator; the "female" column would be redundant.
sex_indicators = pd.get_dummies(titan_train["Sex"])
titan_train["sex_bin"] = sex_indicators.loc[:, "male"]

In [47]:
titan_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,sex_bin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


In [48]:
# "Sex" is now represented by sex_bin, so remove the original text column in place.
titan_train.drop(columns=["Sex"], inplace=True)

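Now one-hot encode the port of embarkation (`Embarked`), replacing the single-letter codes with readable port names first.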

In [49]:
def make_dummies(df,change_col, 
                 values, 
                 change_to, 
                 col_prefix = None, 
                 na = False
                ):
    """One-hot encode ``change_col`` after relabelling its category codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is NOT modified; a new frame is returned.
    change_col : str
        Name of the column to relabel and encode.
    values : sequence
        Codes to look for in ``change_col``.
    change_to : sequence
        Replacement labels, paired positionally with ``values``.
    col_prefix : str, optional
        Forwarded to ``pd.get_dummies`` as ``prefix``.
    na : bool, default False
        Forwarded to ``pd.get_dummies`` as ``dummy_na``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``change_col``, plus the indicator columns produced
        by ``pd.get_dummies`` for the relabelled values.
    """
    #sensible names
    # A single dict-based replace on a *new* Series: the caller's frame is
    # left untouched, and the result does not depend on chained in-place
    # assignment (which is a no-op under pandas copy-on-write). All codes are
    # matched against the original values, so overlapping codes/labels
    # (e.g. A->B together with B->C) do not cascade.
    mapping = dict(zip(values, change_to))
    relabelled = df[change_col]
    if mapping:
        relabelled = relabelled.replace(mapping)

    dummies = pd.get_dummies(relabelled,
                             prefix = col_prefix,
                             dummy_na = na)

    #merge dataframes (index-aligned; the left index must be unique)
    merged = pd.merge(left = df,
                      right = dummies,
                      left_index = True,
                      right_index = True,
                      how = "left",
                      validate = "1:m"
                      )

    #drop this col as now redundant
    return merged.drop(change_col, axis = 1)

In [50]:

# Relabel the port codes with their full names, then one-hot encode them.
titan_train = make_dummies(
    df=titan_train,
    change_col="Embarked",
    values=["S", "C", "Q"],
    change_to=["Southampton", "Cherbourg", "Queenstown"],
)

In [51]:
titan_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,sex_bin,_Cherbourg,_Queenstown,_Southampton
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,0,0,1


In [52]:
titan_train["Cabin"].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [53]:
#regex: deck letter(s) at the very start of the string, immediately
#followed by the room digits (e.g. "C85" -> "C", "85")
r = re.compile("([a-zA-Z]+)([0-9]+)")
deck = []
room_num = []

#separate out rooms and decks
#missing cabins are turned into the literal string "NaN" so every entry is a str
for room in titan_train["Cabin"].replace(np.nan, "NaN").tolist():
    matches = r.match(room)
    if matches is not None:
        deck.append(matches.group(1))
        room_num.append(matches.group(2))
    elif len(room) == 1:
        #a bare deck letter with no room number (e.g. "D", "T")
        deck.append(room)
        room_num.append("NaN")
    else:
        #missing cabin, or a value the anchored pattern cannot parse
        #NOTE(review): entries such as "F G73" do not match because of the
        #space after the first letter, so they end up here as "NaN" too.
        deck.append("NaN")
        room_num.append("NaN")

titan_train["deck"] = deck
titan_train["room"] = room_num

In [54]:
# One-hot encode the deck letter; each letter X is first relabelled "deck_X".
titan_train = make_dummies(
    df=titan_train,
    change_col="deck",
    values=list("ABCDEFGT"),
    change_to=["deck_" + letter for letter in "ABCDEFGT"],
)
                           

In [55]:
HTML(titan_train.head().to_html())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,sex_bin,_Cherbourg,_Queenstown,_Southampton,room,_NaN,_T,_deck_A,_deck_B,_deck_C,_deck_D,_deck_E,_deck_F,_deck_G
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,0,0,1,,1,0,0,0,0,0,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,1,0,0,85.0,0,0,0,0,1,0,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,0,1,,1,0,0,0,0,0,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,0,1,123.0,0,0,0,0,1,0,0,0,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,0,0,1,,1,0,0,0,0,0,0,0,0
