In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
%matplotlib inline  

In [2]:
from sklearn.feature_selection import SelectFromModel
from sklearn import svm
def new_feature_selector():
    """Return a fresh, unfitted feature-selection step.

    Every call builds a new ``SelectFromModel`` around its own L1-penalised
    ``LinearSVC`` (``dual=False``, ``random_state=0``), so the selector can be
    dropped into several pipelines without sharing fitted state.
    """
    l1_svm = svm.LinearSVC(penalty='l1', dual=False, random_state=0)
    return SelectFromModel(l1_svm)

# feature_selector = SelectFromModel(svm.SVC())
# feature_selector = SelectFromModel(svm.SVC())

In [3]:
# Raw passenger table, indexed by the PassengerId column.
train = pd.read_csv('train.csv', index_col='PassengerId')
# X starts out empty and is filled column by column in the cells that follow;
# y is the Survived column of the raw table.
X, y = pd.DataFrame(), train['Survived']

In [4]:
def extract_title(name):
    """Return the honorific token (e.g. 'Mr.') found in a raw passenger name.

    For names containing a comma, the text after the first comma is split on
    single spaces and the first real token is returned; if the name contains
    ' the ', the token after that article is returned instead. Names without
    a comma fall back to their second space-separated token.
    """
    if ',' not in name:
        return name.split(' ')[1]
    tokens = name.split(',')[1].split(' ')
    return tokens[2] if ' the ' in name else tokens[1]


title = train['Name'].map(extract_title)

# Extracted tokens keep their trailing period, so every key must carry one too
# (the previous 'Mme' key, without the period, could never match).
# The French forms are folded into their English equivalents:
# 'Mme.' (Madame) -> Mrs, 'Mlle.' (Mademoiselle) -> Miss.
X['title_Mr'] = title.isin(['Mr.']).astype(int)
X['title_Mrs'] = title.isin(['Mrs.', 'Mme.']).astype(int)
X['title_Miss'] = title.isin(['Miss.', 'Mlle.']).astype(int)
X['title_Master'] = title.isin(['Master.']).astype(int)
# Catch-all indicator: set when none of the four indicators above fired.
X['title_Other'] = ((X['title_Mr'] + X['title_Mrs'] + X['title_Miss'] + X['title_Master']) == 0).astype(int)
# 0.787859777551 (recorded before the Mme./Mlle. mapping was corrected)

In [5]:
# Pclass is carried over from the raw table unchanged.
X['Pclass'] = train.loc[:, 'Pclass']
# 0.770006525933

In [6]:
# Binary indicator: 1 where Sex is exactly 'male', 0 otherwise.
is_male = train['Sex'] == 'male'
X['male'] = is_male.astype(int)
# 0.781255816593

In [7]:
# SibSp + Parch counts the relatives travelling along; the extra 1 is the passenger.
relatives_aboard = train['SibSp'] + train['Parch']
X['FamilySize'] = relatives_aboard + 1
# 0.829397060493

In [8]:
# 1 when the passenger is the only member of their family group, else 0.
travels_alone = X['FamilySize'] == 1
X['Alone'] = travels_alone.astype(int)
# 0.828273464987

In [9]:
# Raw SibSp column, exposed under a more descriptive feature name.
X['SiblingCount'] = train.loc[:, 'SibSp']
# 0.829409544887

In [10]:
# Raw Parch column, exposed under a more descriptive feature name.
X['ParentChildCount'] = train.loc[:, 'Parch']
# 0.830520655998

In [11]:
# Age imputation, variant 1: fill the gaps with the median of the observed ages.
median_age = train['Age'].median()
X['Age'] = train['Age'].fillna(median_age)
# 0.812667971853

In [12]:
# Age imputation, variant 2: fill the gaps with the mean of the observed ages.
# This overwrites whatever X['Age'] held before.
mean_age = train['Age'].mean()
X['Age'] = train['Age'].fillna(mean_age)
# 0.812667971853

In [13]:
# Age imputation, variant 3: fill each gap with an age drawn from the observed
# ones. The draw is seeded, so re-running the cell reproduces the same values.
missing = train['Age'].isnull().sum()
observed_ages = train.loc[train['Age'].notnull(), 'Age']
samples = observed_ages.sample(missing, random_state=0).values

# Start again from the raw column, then write the draws into the empty slots.
X['Age'] = train['Age']
needs_age = X['Age'].isnull()
X.loc[needs_age, 'Age'] = samples
# 0.810483486551

In [14]:
from sklearn import linear_model

# Age imputation, variant 4: regress Age on every other column currently in X
# and use the fitted model to fill the rows where Age is absent.
reg = linear_model.Lasso()
X['Age'] = train['Age']  # reset to the raw ages before imputing

age_known = X['Age'].notnull()
X_present = X.loc[age_known].drop('Age', axis=1)
y_present = X.loc[age_known, 'Age']
# NOTE: `missing` is reused here as a DataFrame of the rows that lack an age.
missing = X.loc[~age_known].drop('Age', axis=1)

reg.fit(X_present, y_present)
missing['Age'] = reg.predict(missing)

# fillna aligns on the index, so each prediction lands on its own passenger.
X['Age'] = X['Age'].fillna(missing['Age'])

# 0.817099931903

In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Fill missing Embarked values with the most frequent one before encoding.
most_common_port = train['Embarked'].mode()[0]
embarked_labels = train['Embarked'].fillna(most_common_port)

# Step 1: labels -> integer codes, reshaped to a single column for the next encoder.
enc = LabelEncoder()
embarked_categories = enc.fit_transform(embarked_labels).reshape(-1, 1)
classes = enc.classes_
# print embarked_categories

# Step 2: integer codes -> one indicator column per code.
enc = OneHotEncoder()
encoded_categories = enc.fit_transform(embarked_categories).toarray()

# Column i of the indicator matrix belongs to classes[i].
for i, port in enumerate(classes):
    X['Embarked_' + port] = encoded_categories[:, i].astype(int)
    
# df['PassengerId'] = X['PassengerId']
# df.index = 'PassengerId'
# X = X.concat(df)
# X.head(3)

# 0.813728577914

In [16]:
# Fare is carried over from the raw table as-is (no scaling applied here).
X['Fare'] = train.loc[:, 'Fare']
# 0.729670865963

In [17]:
# TODO:
# normalize inputs
# polynomial transform

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for the confusion-matrix checks; the fixed seed
# keeps the split repeatable between runs.
holdout_split = train_test_split(X, y, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test = holdout_split


# SVC

In [19]:
from sklearn import svm
clf = svm.SVC(random_state=0)

# Mean 10-fold CV score: first the bare classifier, then with the
# feature-selection step in front of it.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(cross_val_score(Pipeline([('classification', clf)]), X, y, cv=10).mean())
print(cross_val_score(Pipeline([('feature_selection', new_feature_selector()), ('classification', clf)]), X, y, cv=10).mean())
# 0.728546986721

from sklearn.metrics import confusion_matrix
# clf = svm.SVC(random_state=0)
# pipeline = Pipeline([('feature_selection', new_feature_selector()), ('classification', clf)])
# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))

# Keep the misclassified hold-out rows under names so they can be inspected;
# as bare expressions in the middle of a cell they were computed and discarded.
false_negatives = X_test[ y_pred < y_test ]  # predicted 0, actual 1
false_positives = X_test[ y_pred > y_test ]  # predicted 1, actual 0

# TODO grid search
# TODO other classifiers
True

0.747673646578
0.728546986721
[[152  32]
 [ 49  62]]


True

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 10-fold search over the SVC penalty C and the kernel type,
# run on all available cores (n_jobs=-1).
clf = svm.SVC(random_state=0)

C_values = [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.15, 1, 10, 100, 1000]
kernels = ['rbf', 'poly', 'sigmoid']
param_grid = [dict(C=C_values, kernel=kernels)]

grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10, n_jobs=-1)
grid.fit(X, y)
# Evaluated but not displayed: only the cell's last expression is shown.
np.array(grid.cv_results_['mean_test_score'])

grid.best_params_

# Linear SVC

In [None]:
from sklearn import svm
clf = svm.LinearSVC(C=0.03, random_state=0)

# Mean 10-fold CV score: first the bare classifier, then with the
# feature-selection step in front of it.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(cross_val_score(Pipeline([('classification', clf)]), X, y, cv=10).mean())
print(cross_val_score(Pipeline([('feature_selection', new_feature_selector()), ('classification', clf)]), X, y, cv=10).mean())
# 0.818185790489

from sklearn.metrics import confusion_matrix
# clf = svm.LinearSVC(random_state=0)
# pipeline = Pipeline([('feature_selection', new_feature_selector()), ('classification', clf)])
# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))

# Keep the misclassified hold-out rows under names so they can be inspected;
# as bare expressions in the middle of a cell they were computed and discarded.
false_negatives = X_test[ y_pred < y_test ]  # predicted 0, actual 1
false_positives = X_test[ y_pred > y_test ]  # predicted 1, actual 0

# TODO other classifiers
True

In [None]:
# from sklearn.model_selection import GridSearchCV

# param_grid = [
#   {'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.15, 1, 10, 100, 1000]}
# ]

# grid = GridSearchCV(svm.LinearSVC(random_state=0, penalty='l2'), param_grid, cv=10) # , n_jobs=-1
# grid.fit(X, y)
# np.array(grid.cv_results_['mean_test_score'])

# array([ 0.80695847,  0.81593715,  0.82042649,  0.82154882,  0.82491582,
#         0.82042649,  0.81705948,  0.78787879,  0.74747475,  0.68911336,
#         0.65432099])

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors=14 / weights='distance' match the best_params_ noted in the
# commented-out KNN grid search further down.
clf = KNeighborsClassifier(n_neighbors=14, weights='distance')

# Mean 10-fold CV score: first the bare classifier, then with the
# feature-selection step in front of it.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(cross_val_score(Pipeline([('classification', clf)]), X, y, cv=10).mean())
print(cross_val_score(Pipeline([('feature_selection', new_feature_selector()), ('classification', clf)]), X, y, cv=10).mean())
# 0.72958205652

from sklearn.metrics import confusion_matrix
# clf = svm.LinearSVC(random_state=0)
# pipeline = Pipeline([('feature_selection', new_feature_selector()), ('classification', clf)])
# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))

# Keep the misclassified hold-out rows under names so they can be inspected;
# as bare expressions in the middle of a cell they were computed and discarded.
false_negatives = X_test[ y_pred < y_test ]  # predicted 0, actual 1
false_positives = X_test[ y_pred > y_test ]  # predicted 1, actual 0

# TODO other classifiers
True

In [None]:
# from sklearn.model_selection import GridSearchCV

# param_grid = [
#   {'n_neighbors': np.arange(1,30), 'weights':['uniform', 'distance']}
# ]

# grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, n_jobs=-1)
# grid.fit(X, y)
# np.array(grid.cv_results_['mean_test_score'])

# grid.best_params_ # {'n_neighbors': 14, 'weights': 'distance'}