# Building Predictive Models

In [1]:
import pandas as pd
import os
import numpy as np


### import the pre-processed data


In [2]:
processed_data_path = os.path.join(os.path.pardir, 'data','processed')
train_file_path = os.path.join(processed_data_path, 'train.csv')
test_file_path = os.path.join(processed_data_path, 'test.csv')

In [3]:
train_df = pd.read_csv(train_file_path, index_col='PassengerId')
test_df = pd.read_csv(test_file_path, index_col='PassengerId')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-null int64
Title_Officer         418 n

### Data Preparation

In [6]:
# most machine learning algorithms expect numerical arrays, so let's set those up
# Please note we are ONLY working on the training data frame here.  
X = train_df.loc[:,'Age':].as_matrix().astype('float') #all rows and columns, except SURVIVED
y = train_df['Survived'].ravel() #ravel creates a flat 1-d array

  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
print X.shape, y.shape

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(X.shape, y.shape)? (<ipython-input-7-90a94319c11e>, line 1)

In [None]:
print(X.shape, y.shape)

In [None]:
# X this means 891 rows, and 32 colums/features, not including survived
# y this means 891 one dimensional array, one feature, just containing 0 or 1 for Survived


In [9]:
#Now split the training data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(712, 32) (712,)
(179, 32) (179,)


In [10]:
#We've just split the data into 80/20 
# Now let's see how many positive class values are in the train and test data
# Mean will give us the proportion of positives (i.e. 1 = SURVIVED)
print('Mean survival in the 80% training data: {0:.3f}'.format(np.mean(y_train)))
print('Mean survival in the 20% training test data: {0:.3f}'.format(np.mean(y_test)))

Mean survival in the 80% training data: 0.383
Mean survival in the 20% training test data: 0.385


In [11]:
# We have similar proportions in the 80% and 20% splits.  This is good - this is what we want, 
# an even distribution
#
# Only 39% of data has positive cases, so 61% are going to be negative.  That's a little biased
# in terms of our source data.  This is known as class imbalance.