# Building Predictive Models - Part 1

In [1]:
import pandas as pd
import numpy as np
import os

### Import Data

In [2]:
# build the locations of the processed train/test CSVs under ../data/processed
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path, test_file_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [7]:
# load both processed splits (train first, then test), indexed by PassengerId
train_df, test_df = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_file_path, test_file_path)
)

In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 32 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
IsAdult               891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-

In [10]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 31 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
IsAdult               418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 n

### Data Preparation

In [16]:
# Feature matrix: every column from 'Age' onwards, cast to float. This relies on
# 'Survived' being the only column placed before 'Age' in the processed train file.
X = train_df.loc[:, 'Age':].values.astype('float')
# Target vector: `.values` already yields a 1-D array and matches the accessor used
# for X above; `Series.ravel()` is deprecated in recent pandas releases.
y = train_df['Survived'].values
print(X.shape, y.shape)

(891, 31) (891,)


In [17]:
# train test split
# Hold out 20% of the labelled rows (test_size = 0.2) for local evaluation; the
# random_state is fixed so the split can be reproduced.
# NOTE: X_test / y_test are this held-out slice of the *training* file. They are
# not the unlabelled Kaggle test set, which lives in test_df (and later test_X).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# show the shapes of the fitted-on and held-out arrays
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(712, 31) (712,)
(179, 31) (179,)


In [18]:
# average survival in train and test: the two rates should be close if the
# random split kept the class balance similar in both parts
print('mean survival in train: {0:0.3f}'.format(y_train.mean()))
print('mean survival in test: {0:0.3f}'.format(y_test.mean()))

mean survival in train: 0.383
mean survival in test: 0.385


#### Check Scikit-Learn Version

In [19]:
import sklearn
sklearn.__version__

'0.19.2'

Make sure you have Scikit-Learn v0.19 installed. Otherwise, update it and restart the kernel.

In [20]:
#!conda update -y scikit-learn

### Baseline Model

In [21]:
# import function
from sklearn.dummy import DummyClassifier

In [22]:
# create baseline model
model_dummy = DummyClassifier(strategy = 'most_frequent', random_state = 0)

In [23]:
# train model
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [24]:
# default score - accuracy
print('score for baseline model: {0:0.2f}'.format(model_dummy.score(X_test, y_test)))

score for baseline model: 0.61


In [25]:
# performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [26]:
# accuracy score
print('accuracy for baseline model: {0:0.2f}'.format(accuracy_score(y_test, model_dummy.predict(X_test))))

accuracy for baseline model: 0.61


In [27]:
# confusion matrix
print('confusion matrix for baseline model: \n {0}'.format(confusion_matrix(y_test, model_dummy.predict(X_test))))

confusion matrix for baseline model: 
 [[110   0]
 [ 69   0]]


In [28]:
# precision and recall scores
# NOTE: the baseline never predicts the positive class (the second column of the
# confusion matrix printed above is all zeros). With no predicted positives both
# scores come out as 0.00, and the warning fragment in this cell's output
# presumably is scikit-learn flagging that precision is undefined in that case.
print('precision for baseline model: {0:0.2f}'.format(precision_score(y_test, model_dummy.predict(X_test))))
print('recall for baseline model: {0:0.2f}'.format(recall_score(y_test, model_dummy.predict(X_test))))

precision for baseline model: 0.00
recall for baseline model: 0.00


  'precision', 'predicted', average, warn_for)


### First Kaggle Submission

In [29]:
# converting to matrix
test_X = test_df.values.astype('float')

In [30]:
# get predictions
predictions = model_dummy.predict(test_X)

In [31]:
df_submission = pd.DataFrame({ 'PassengerId': test_df.index, 'Survived': predictions })
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [32]:
submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
submission_file_path = os.path.join(submission_data_path, '01_dummy.csv')
print(submission_file_path)

..\data\external\01_dummy.csv


In [33]:
df_submission.to_csv(submission_file_path, index = False)

In [34]:
def get_submission_file(model, filename, df=None, output_dir=None):
    """Predict with ``model`` and write the result as a Kaggle submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    filename : str
        Name of the CSV file to create inside ``output_dir``.
    df : pandas.DataFrame, optional
        Feature frame to predict on; its index supplies the ``PassengerId``
        column. Defaults to the notebook-level ``test_df``.
    output_dir : str, optional
        Directory to write into; created if it does not exist yet.
        Defaults to ``../data/external``.

    Returns
    -------
    str
        Path of the written submission file.
    """
    # fall back to the notebook-level defaults so two-argument calls keep working
    if df is None:
        df = test_df
    if output_dir is None:
        output_dir = os.path.join(os.path.pardir, 'data', 'external')
    # converting to matrix
    features = df.values.astype('float')
    # make predictions
    predictions = model.predict(features)
    # submission dataframe; column order is pinned explicitly
    df_submission = pd.DataFrame(
        {'PassengerId': df.index, 'Survived': predictions},
        columns=['PassengerId', 'Survived'],
    )
    # create the target directory on first use instead of failing inside to_csv
    os.makedirs(output_dir, exist_ok=True)
    submission_file_path = os.path.join(output_dir, filename)
    # write to file
    df_submission.to_csv(submission_file_path, index=False)
    return submission_file_path

In [35]:
# get submission file
get_submission_file(model_dummy, '01_dummy.csv')

In [37]:
# submit using Kaggle API
# `$submission_file_path` is expanded by IPython from the notebook variable of that
# name, i.e. the '01_dummy.csv' path built in an earlier cell. That cell must have
# been run in this kernel session before submitting.
!kaggle competitions submit titanic -f $submission_file_path -m "Baseline Model Submission"

Successfully submitted to Titanic: Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|██████████| 3.18k/3.18k [00:00<00:00, 23.3kB/s]
