In [None]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import csv
import re

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PolynomialFeatures

def get_best_model_and_accuracy(model, params, X, y):
    """Grid-search ``params`` for ``model`` on ``(X, y)`` and print a short report.

    Parameters
    ----------
    model : estimator or Pipeline
        The model handed to ``GridSearchCV``.
    params : dict or list of dict
        The parameter grid to try.
    X, y : array-like
        Features and target the search is fitted on.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can reuse ``best_estimator_`` or
        inspect ``cv_results_`` instead of re-running the search. Callers that
        ignore the return value behave exactly as before.
    """
    grid = GridSearchCV(model,  # the model to grid search
                        params,  # the parameter set to try
                        error_score=0.)  # a parameter set that raises scores 0 instead of aborting the search
    grid.fit(X, y)  # fit the model and parameters

    # cv_results_ holds one mean time per parameter combination; average over all of them
    mean_fit_seconds = round(grid.cv_results_['mean_fit_time'].mean(), 3)
    mean_score_seconds = round(grid.cv_results_['mean_score_time'].mean(), 3)

    # best mean cross-validated score found by the search
    print ("Best Accuracy: {}".format(grid.best_score_))
    # the parameters that produced that score
    print ("Best Parameters: {}".format(grid.best_params_))
    # the average time it took a model to fit to the data (in seconds)
    print ("Average Time to Fit (s): {}".format(mean_fit_seconds))
    # the average time it took a model to score held-out data (in seconds);
    # gives insight into how the model will perform in real-time analysis
    print ("Average Time to Score (s): {}".format(mean_score_seconds))

    return grid

%matplotlib inline

root = '/Users/schwalmdaniel/github/kaggle/titanic'
#root = 'd:/dev/python/kaggle/titanic'

train=pd.read_csv(root + "/train.csv")
test=pd.read_csv(root + "/test.csv")

# have a look at the ds
train.describe()

In [None]:
def _ticket_prefix(ticket):
    """Return the normalised prefix of a ticket string, or None if it has none.

    The prefix is the text before the first space. When that first token is
    purely numeric the ticket has no prefix and None is returned; otherwise
    non-word characters are stripped and the result is lower-cased.
    """
    first_token = ticket.split(' ')[0]
    if first_token.isnumeric():
        return None
    return re.sub(r'\W','', first_token).lower()


# Same extraction for both frames so their categories are comparable.
train['ticket_prefix'] = train['Ticket'].apply(_ticket_prefix)
test['ticket_prefix'] = test['Ticket'].apply(_ticket_prefix)
test['ticket_prefix'].unique()

In [None]:
def _name_prefix(name):
    """Return the first word after the first comma of ``name``, lower-cased."""
    after_comma = name.lower().strip().split(',')[1]
    return after_comma.strip().split(' ')[0]


train['name_prefix'] = train['Name'].apply(_name_prefix)
test['name_prefix'] = test['Name'].apply(_name_prefix)


In [None]:
# Count the missing (null) values in each column of the training frame.
train.isnull().sum()

In [None]:
# Null accuracy: the relative frequency of each Survived class. The larger
# share is the accuracy obtained by always predicting the majority class.
train['Survived'].value_counts(normalize=True) 

In [None]:
# look at the heatmap of the correlation matrix of our dataset
# The frame still holds text columns at this point. Select the numeric columns
# explicitly: DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
# instead of silently skipping them.
sns.heatmap(train.select_dtypes(include='number').corr())

In [None]:
# numerical correlations of every numeric column with the response
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# the remaining text columns in pandas >= 2.0 instead of skipping them.
train.select_dtypes(include='number').corr()['Survived']

In [None]:
# Drop the raw free-text columns from both frames. Cabin is deliberately kept
# here (an earlier variant of this cell dropped it as well).
train, test = (frame.drop(['Name','Ticket'],axis=1) for frame in (train, test))

In [None]:
# Passengers without a recorded cabin get the placeholder 'Unknown'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained form is deprecated and no longer updates the
# frame once pandas copy-on-write is active, which would leave NaN values for
# the string handling below.
train['Cabin'] = train['Cabin'].fillna('Unknown')
test['Cabin'] = test['Cabin'].fillna('Unknown')


def _deck(cabin):
    """Return the text before the first space of ``cabin`` with digits removed."""
    first_cabin = cabin.split(' ')[0]
    return re.sub(r'[\d ]','', first_cabin)


def _has_multiple_cabins(cabin):
    """Return 1 when ``cabin`` contains a space (several entries), else 0."""
    return 1 if ' ' in cabin else 0


train['deck'] = train['Cabin'].apply(_deck)
test['deck'] = test['Cabin'].apply(_deck)

train['multicabin'] = train['Cabin'].apply(_has_multiple_cabins)
test['multicabin'] = test['Cabin'].apply(_has_multiple_cabins)

# Cabin is now represented by deck + multicabin.
train = train.drop(['Cabin'],axis=1)
test = test.drop(['Cabin'],axis=1)

train['deck'].unique()

In [None]:
# dummify features

DUMMY_COLUMNS = ['Pclass', 'Sex', 'Embarked','deck','ticket_prefix','name_prefix']  # which columns to dummify
DUMMY_SEP = '__'  # the separator between the prefix (column name) and cell value


def _dummify(frame):
    """One-hot encode DUMMY_COLUMNS of ``frame`` and drop ``Sex__male``."""
    encoded = pd.get_dummies(frame, columns=DUMMY_COLUMNS, prefix_sep=DUMMY_SEP)
    # drop because of dummy trap: Sex__female alone carries the information
    return encoded.drop(['Sex__male'],axis=1)


def _is_dummy(column):
    """Return True when ``column`` was generated from one of DUMMY_COLUMNS."""
    return any(column.startswith(source + DUMMY_SEP) for source in DUMMY_COLUMNS)


train = _dummify(train)
test = _dummify(test)

# A category seen in only one frame yields an indicator column the other frame
# lacks (e.g. deck__T exists only in train). Keep only the indicators both
# frames share so they end up with identical feature columns; this replaces
# the hard-coded deck__T drop, which raised KeyError when that column was absent.
train_dummies = {column for column in train.columns if _is_dummy(column)}
test_dummies = {column for column in test.columns if _is_dummy(column)}

train = train.drop(sorted(train_dummies - test_dummies), axis=1)
test = test.drop(sorted(test_dummies - train_dummies), axis=1)






In [None]:
# Columns that exist in only one of the two frames (symmetric difference).
print (set(test.columns.tolist()).symmetric_difference(train.columns.tolist()))

In [None]:
# Heatmap of the correlation matrix of the training frame in its current state.
sns.heatmap(train.corr())

In [None]:
# Impute nulls with the column mean (originally aimed at the null ages).
# TODO: check age survived
# The strategy applies to every column of each frame, not just Age.
# fit_transform returns a bare array, so each frame is rebuilt with its own
# column names. The imputer is re-fitted per frame, so test is filled with
# its own column means.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22 (replacement: sklearn.impute.SimpleImputer) - confirm the
# scikit-learn version this notebook is pinned to.
imputer = Imputer(strategy='mean')

train_columns = train.columns.tolist()
train = pd.DataFrame(imputer.fit_transform(train), columns=train_columns)

test_columns = test.columns.tolist()
test = pd.DataFrame(imputer.fit_transform(test), columns=test_columns)

In [None]:
# response variable
y = train['Survived']

# feature matrix: everything except the row identifier and the response
X = train.drop(['PassengerId','Survived'], axis=1)

print ("learning from {} rows".format(len(X)))

In [None]:
# Random-forest pipeline: mean-impute, add pairwise interaction terms, classify.
# NOTE(review): no random_state is set, so forest results vary between runs.
rf = RandomForestClassifier(n_estimators=50, max_depth=7)

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

pipeline = Pipeline([
    ('imputer', Imputer(strategy='mean')),
    ('poly_features', poly),
    ('classify', rf),
])


In [None]:
# KNN pipeline: mean-impute -> polynomial features -> min-max scale -> classify.
poly = PolynomialFeatures(degree=1, interaction_only=False, include_bias=False)
knn = KNeighborsClassifier(n_neighbors=7)

pipeline = Pipeline([
    ('imputer', Imputer(strategy='mean')),
    ('poly_features', poly),
    ('standardize', MinMaxScaler()),  # step is named 'standardize' but applies min-max scaling
    ('classify', knn),
])

# Parameter grid that is actually searched below.
pipe_params = {
    'poly_features__degree': [1],
    'poly_features__interaction_only': [True],
    'classify__n_neighbors': [3, 4, 5, 6],
}

# Wider neighbour grid; defined here but not passed to the search in this cell.
knn_params = {'classify__n_neighbors': [1, 2, 3, 4, 5, 6, 7]}

grid = GridSearchCV(pipeline, pipe_params)
grid.fit(X, y)

print (grid.best_score_, grid.best_params_)

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# The training labels under a second name; not used further in this cell.
y_test = train['Survived']
# Test features: same columns as X once the identifier is removed.
x_test = test.drop(['PassengerId'], axis=1)

# NOTE(review): `pipeline` is the object built before the grid search, not
# grid.best_estimator_; GridSearchCV fits clones, so the searched parameters
# are presumably not applied here - confirm this is intended.
pipeline.fit(X, y)
preds = pipeline.predict(x_test)

# accuracy on the data the pipeline was just fitted on
print (pipeline.score(X,y))

# Assemble the submission frame; both columns are cast to int before writing.
submission_columns = ['PassengerId', 'Survived']
predicted = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': preds})
predicted[submission_columns] = predicted[submission_columns].astype(int)
predicted.to_csv(root + '/submission.csv', index=False,quoting=csv.QUOTE_NONNUMERIC)

predicted.head()