In [2]:
from __future__ import print_function
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib
import pandas_profiling
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format

In [54]:
fn = 'train.csv'
df = pd.read_csv(fn)

In [55]:
df.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [56]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


In [57]:
df[df['Cabin'].isnull()==False]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.28,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.10,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.00,0,0,17463,51.86,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.00,1,1,PP 9549,16.70,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.00,0,0,113783,26.55,C103,S
21,22,1,2,"Beesley, Mr. Lawrence",male,34.00,0,0,248698,13.00,D56,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.00,0,0,113788,35.50,A6,S
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.00,3,2,19950,263.00,C23 C25 C27,S
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.52,B78,C
52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.00,1,0,PC 17572,76.73,D33,C


In [107]:
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, MinMaxScaler, Imputer, LabelBinarizer, OneHotEncoder,StandardScaler
from sklearn.feature_extraction import DictVectorizer

def get_sex_col(df):
    return df[['Sex']]

def get_num_cols(df):
    return df[['Age', 'Fare']]

def get_pclass_col(df):
    return df[['Pclass']]

def get_port_col(df):
    le = LabelEncoder()
    return le.fit_transform(df['Embarked'].fillna('S').T).reshape(-1, 1).astype('float')


def get_cabin_col(df):
    le = LabelEncoder()
    return le.fit_transform(df['Cabin'].fillna('NaN').T).reshape(-1, 1).astype('float')
    
def get_rel_col(df):
    return np.sum(df[['SibSp','Parch']] , axis=1).values.reshape(-1, 1).astype('float')  

pipeline = make_union(*[
    make_pipeline(FunctionTransformer(get_num_cols, validate=False), Imputer(strategy='mean'), MinMaxScaler()),
    make_pipeline(FunctionTransformer(get_pclass_col, validate=False), OneHotEncoder(sparse=False)),
    make_pipeline(FunctionTransformer(get_sex_col, validate=False), LabelBinarizer()),
    make_pipeline(FunctionTransformer(get_port_col, validate=False), StandardScaler()),
    make_pipeline(FunctionTransformer(get_cabin_col, validate=False), StandardScaler()),
    make_pipeline(FunctionTransformer(get_rel_col, validate=False), StandardScaler())
])


In [108]:
df_train = df.copy()

In [109]:
x_train = pipeline.fit_transform(df_train)
x_train.shape

(891, 9)

In [110]:
y_train = df_train['Survived']
y_train.shape

(891,)

In [114]:
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV(cv=10)
lr.fit(x_train, y_train)
lr

LogisticRegressionCV(Cs=10, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [115]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train, lr.predict(x_train))

0.80808080808080807

In [146]:
from sklearn.linear_model import SGDClassifier, RidgeClassifierCV
sgd = SGDClassifier()
sgd.fit(x_train, y_train)
sgd

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [147]:
accuracy_score(y_train, sgd.predict(x_train))

0.73063973063973064

In [154]:
rdg = RidgeClassifierCV(alphas=(0.01, 0.1, 1.0))
rdg.fit(x_train, y_train)
rdg

RidgeClassifierCV(alphas=(0.01, 0.1, 1.0), class_weight=None, cv=None,
         fit_intercept=True, normalize=False, scoring=None)

In [155]:
accuracy_score(y_train, rdg.predict(x_train))

0.80359147025813693

In [156]:
df_test = pd.read_csv('test.csv')
x_test = pipeline.fit_transform(df_test) 
x_test.shape

(418, 9)

In [157]:
y_test = lr.predict(x_test)

In [158]:
df_predicted = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_test})

In [159]:
df_predicted.to_csv('sample_submission.csv', sep=',', index=False)