# Titanic: Machine Learning from Disaster

In [473]:
import pandas as pd

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [474]:
train = pd.read_csv('kg-data/train.csv')
train.shape

(891, 12)

In [475]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [487]:
def normalize(df):
#     return (df-df.mean())/df.std()
    return (df - df.min()) / (df.max() - df.min())

def extract_features(df):
    X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    X.loc[:, 'Sex'] = X['Sex'].map({'male': 1.0, 'female': 0.0})
    X.loc[:, 'Embarked'] = X['Embarked'].map({'S': 1.0, 'C': 2.0, 'Q': 3.0})
    
    X.loc[:, 'Age'] = normalize(X['Age'])
    X.loc[:, 'Fare'] = normalize(X['Fare'])

    X = X.fillna(0)
    
    if 'Survived' in df.columns:
        Y = df['Survived']
        return X.values, Y.values

    return X.values, None

X, Y = extract_features(train)

# train['Cabin'].unique()
# len(train['Cabin'].unique())
# labels, uniques = train['Embarked'].factorize()
# labels
# uniques

In [488]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)
print('train/test: %d/%d' % (len(X_train), len(X_test)))

train/test: 712/179


## Model

In [130]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [489]:
# model = SVC(kernel='linear', gamma='auto')
# model = SVC(kernel='poly', degree=6, gamma='auto')
model = SVC(kernel='rbf', gamma='auto')
# model = SVC(kernel='sigmoid', gamma='auto')
model.fit(X_train, Y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [490]:
Y_pred = model.predict(X_test)

In [491]:
print(confusion_matrix(Y_test, Y_pred))  
print(classification_report(Y_test, Y_pred))

[[105  10]
 [ 23  41]]
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       115
           1       0.80      0.64      0.71        64

    accuracy                           0.82       179
   macro avg       0.81      0.78      0.79       179
weighted avg       0.81      0.82      0.81       179



## Submission

In [459]:
sub_test = pd.read_csv('kg-data/test.csv', sep=',')
sub_test.shape

(418, 11)

In [460]:
sub_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [461]:
X_sub_test, _ = extract_features(sub_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [462]:
Y_sub_pred = model.predict(X_sub_test)

In [463]:
sub_pred_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
for i in range(len(sub_test)):
    pidx = sub_test['PassengerId'].loc[i]
    sub_pred_df.loc[i] = (pidx, Y_sub_pred[i])

In [464]:
sub_pred_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [465]:
sub_pred_df.to_csv('kg-data/submission.csv', sep=',', index=False)

## Kaggle

In [None]:
!kaggle competitions download -c titanic -p 'kg-data'

In [466]:
!kaggle competitions submit -c titanic -f 'kg-data/submission.csv' -m 'SVC, RBF kernel'

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 22.8kB/s]
Successfully submitted to Titanic: Machine Learning from Disaster