In [14]:
%matplotlib inline

from __future__ import print_function
import constants
from datamanager import DataManager
import matplotlib.pyplot as plt

dataframes = DataManager.extract(constants.FILES_PATHS)
train_df = dataframes["train.csv"]
test_df = dataframes["test.csv"]

print("TRAIN SET")
train_df.info()

print("\nTEST SET")

TRAIN SET
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [15]:
def transform(dataframe):
    # Clean and complete
    dataframe.loc[dataframe['Age'].isnull(),'Age'] = dataframe['Age'].median()
    dataframe.loc[dataframe['Embarked'].isnull(),'Embarked'] = 'C' # Assumed from the fare payed (80, close to 59 which is the average price from 'C')

    # Feature Engineering
    dataframe['FamilySize'] = dataframe['SibSp'] + dataframe['Parch'] + 1
    
    dataframe.loc[dataframe['FamilySize'] == 1, 'FamilySize'] = 0
    dataframe.loc[dataframe['FamilySize'] == 2, 'FamilySize'] = 1
    dataframe.loc[dataframe['FamilySize'] > 3, 'FamilySize'] = 2
    
    dataframe['Title'] = dataframe['Name'].str.extract("(.*\.)", expand=False).str.split(",", expand=False).str.get(1).str.strip()
    dataframe.loc[dataframe['Title'] == 'Mlle.', 'Title'] = 'Miss.'
    dataframe.loc[dataframe['Title'] == 'Ms.', 'Title'] = 'Mrs.'
    dataframe.loc[dataframe['Title'] == 'Mme.', 'Title'] = 'Mrs.'
    dataframe.loc[dataframe['Title'] == 'Lady.', 'Title'] = 'Miss.'
    dataframe.loc[~dataframe['Title'].isin(constants.FREQUENT_TITLES), 'Title'] = 'Rare'
    
    dataframe['IsChild'] = dataframe['Age'] < 18
    dataframe['IsMother'] = (dataframe['Sex'] == 'female') & (~dataframe['IsChild']) & (dataframe['Parch'] > 0) & (dataframe['Title'] != 'Miss.')

    # Numerize
    dataframe['Sex'] = dataframe['Sex'].map( {'male': 0, 'female': 1} ).astype(int)
    dataframe = DataManager.convert_to_classes(dataframe,'Fare')

    for i in range(len(constants.POSSIBLE_TITLES)):
        dataframe.loc[dataframe['Title'] == constants.POSSIBLE_TITLES[i], 'Title'] = i

    dataframe['Embarked'] = dataframe['Embarked'].map( {'Q': 0, 'S': 1, 'C': 2} ).astype(int)

    # Conversions to int
    dataframe['Age'] = dataframe['Age'].astype(int)
    dataframe['FamilySize'] = dataframe['FamilySize'].astype(int)
    dataframe['Title'] = dataframe['Title'].astype(int)
    dataframe['IsChild'] = dataframe['IsChild'].astype(int)
    dataframe['IsMother'] = dataframe['IsMother'].astype(int)

    # Drop unused
    dataframe = dataframe.drop(['PassengerId','Parch','SibSp','Name','Ticket','Cabin','Age'], axis=1)
    
    return dataframe

new_train_df = transform(train_df)

print("NEW TRAIN SET")
new_train_df.info()

NEW TRAIN SET
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Sex           891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null int64
FamilySize    891 non-null int64
Title         891 non-null int64
IsChild       891 non-null int64
IsMother      891 non-null int64
dtypes: float64(1), int64(8)
memory usage: 62.7 KB


In [16]:
eps = 0
corr = new_train_df.corr()
#corr[corr['Survived'].abs() >= eps]['Survived']

corr['Survived']

Survived      1.000000
Pclass       -0.338481
Sex           0.543351
Fare          0.296834
Embarked      0.131042
FamilySize    0.174465
Title         0.440580
IsChild       0.122239
IsMother      0.175967
Name: Survived, dtype: float64