In [1]:
import numpy as np
import pandas as pd
import re
# import calendar

import seaborn as sns
import matplotlib.style as style
from pylab import rcParams


import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

import warnings
warnings.filterwarnings(action='ignore')

style.use('fivethirtyeight')

In [2]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier,  AdaBoostClassifier, ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, learning_curve, cross_validate, train_test_split, KFold
# from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

In [3]:
# These 4 functions are essentially the same code but the only difference is the `scoring` 
# parameter of the cross_validate func.
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring
def accuracy_compare(X, target, models, cv):
    # Step 1 : Create an empty DataFrame and a row_index counting variable
    model_df = pd.DataFrame()
    row_index_count = 0
    # Iterate through each Machine Learning Model
    for each_model in models:
        # With the row index, input the name of the current Model
        model_df.loc[row_index_count, 'Model Name'] = each_model.__class__.__name__
        # Step 2 : Create a Cross-Validated Object using `cross_validate` from Sklearn 
        cv_results = cross_validate(
            each_model,
            X,
            target,
            cv=cv,
            scoring='accuracy',
            return_train_score=True,
            n_jobs=-1
        )
        # Input the Accuracy of prediction of each column
        model_df.loc[row_index_count, 'Train Accuracy Mean'] = cv_results['train_score'].mean()
        model_df.loc[row_index_count, 'Test Accuracy Mean'] = cv_results['test_score'].mean()
        row_index_count = row_index_count + 1

    # Step 3 : sort the DataFrame values by the Test Accuracy Mean  
    model_df.sort_values(by=['Test Accuracy Mean'],
                            ascending=False,
                            inplace=True)

    return model_df.style.background_gradient(cmap='Greens')

def f1_compare(X, target, models, cv):
    # Step 1 : Create an empty DataFrame and a row_index counting variable
    model_df = pd.DataFrame()
    row_index_count = 0
    # Iterate through each Machine Learning Model
    for each_model in models:
        # With the row index, input the name of the current Model
        model_df.loc[row_index_count, 'Model Name'] = each_model.__class__.__name__
        # Step 2 : Create a Cross-Validated Object using `cross_validate` from Sklearn 
        cv_results = cross_validate(
            each_model,
            X,
            target,
            cv=cv,
            scoring='f1',
            return_train_score=True,
            n_jobs=-1
        )
        # Input the Accuracy of prediction of each column
        model_df.loc[row_index_count, 'Train F1 Score Mean'] = cv_results['train_score'].mean()
        model_df.loc[row_index_count, 'Test F1 Score Mean'] = cv_results['test_score'].mean()

        row_index_count = row_index_count + 1

    # Step 3 : sort the DataFrame values by the Test Accuracy Mean  
    model_df.sort_values(by=['Test F1 Score Mean'],
                            ascending=False,
                            inplace=True)

    return model_df.style.background_gradient(cmap='Oranges')
def precision_compare(X, target, models, cv):
    # Step 1 : Create an empty DataFrame and a row_index counting variable
    model_df = pd.DataFrame()
    row_index_count = 0
    # Iterate through each Machine Learning Model
    for each_model in models:
        # With the row index, input the name of the current Model
        model_df.loc[row_index_count, 'Model Name'] = each_model.__class__.__name__
        # Step 2 : Create a Cross-Validated Object using `cross_validate` from Sklearn 
        cv_results = cross_validate(
            each_model,
            X,
            target,
            cv=cv,
            scoring='precision',
            return_train_score=True,
            n_jobs=-1
        )
        # Input the Accuracy of prediction of each column
        model_df.loc[row_index_count, 'Train Precision Mean'] = cv_results['train_score'].mean()
        model_df.loc[row_index_count, 'Test Precision Mean'] = cv_results['test_score'].mean()

        row_index_count = row_index_count + 1

    # Step 3 : sort the DataFrame values by the Test Accuracy Mean  
    model_df.sort_values(by=['Test Precision Mean'],
                            ascending=False,
                            inplace=True)

    return model_df.style.background_gradient(cmap='Blues')

def recall_compare(X, target, models, cv):
    # Step 1 : Create an empty DataFrame and a row_index counting variable
    model_df = pd.DataFrame()
    row_index_count = 0
    # Iterate through each Machine Learning Model
    for each_model in models:
        # With the row index, input the name of the current Model
        model_df.loc[row_index_count, 'Model Name'] = each_model.__class__.__name__
        # Step 2 : Create a Cross-Validated Object using `cross_validate` from Sklearn 
        cv_results = cross_validate(
            each_model,
            X,
            target,
            cv=cv,
            scoring='recall',
            return_train_score=True,
            n_jobs=-1
        )
        # Input the Accuracy of prediction of each column
        model_df.loc[row_index_count, 'Train Recall Mean'] = cv_results['train_score'].mean()
        model_df.loc[row_index_count, 'Test Recall Mean'] = cv_results['test_score'].mean()

        row_index_count = row_index_count + 1

    # Step 3 : sort the DataFrame values by the Test Accuracy Mean  
    model_df.sort_values(by=['Test Recall Mean'],
                            ascending=False,
                            inplace=True)

    return model_df.style.background_gradient(cmap='Purples')


In [39]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
target = train_df['Survived']

train_lenght = len(train_df)
passengers_ID_test = test_df['PassengerId']

entire_dataset =  pd.concat(objs=[train_df, test_df], axis=0).reset_index(drop=True)
entire_dataset.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


(1309, 12)

In [40]:

# Adding a Family_Size feature
entire_dataset['Family_Size'] = 1
entire_dataset['Family_Size'] = entire_dataset['Parch']+entire_dataset['SibSp'] + 1

# Adding a Alone feature
entire_dataset['Alone'] = 0
alone_func = lambda x : 1 if x == 1 else 0
entire_dataset['Alone'] = entire_dataset['Family_Size'].apply(alone_func)

# Adding a Small_Family feature
entire_dataset['Small_Family'] = 0
small_fam_func = lambda x : 1 if 2 <= x <= 3 else 0
entire_dataset['Small_Family'] = entire_dataset['Family_Size'].apply(small_fam_func)

# Adding a Medium_Family feature
entire_dataset['Medium_Family'] = 0
medium_fam_func = lambda x : 1 if 4 <= x <= 6 else 0
entire_dataset['Medium_Family'] = entire_dataset['Family_Size'].apply(medium_fam_func)

# Adding a Large_Family feature
entire_dataset['Large_Family'] = 0
large_fam_func = lambda x : 1 if 7 <= x else 0
entire_dataset['Large_Family'] = entire_dataset['Family_Size'].apply(large_fam_func)

#Embarked
entire_dataset['Embarked'].fillna('S', inplace = True)

# Cabin_Multiple
cabin_func = lambda x : 0 if pd.isna(x) else len(x.split(' '))
entire_dataset['Cabin_Multiple'] = entire_dataset['Cabin'].apply(cabin_func)

# The feature with most missing values is Cabin with U for Unknown
entire_dataset['Cabin'].fillna('U', inplace=True)

# Cabin_Section
cabin_func_2 = lambda x : str(x)[0]
entire_dataset['Cabin_Section'] = entire_dataset['Cabin'].apply(cabin_func_2)

# Age
age_mean = entire_dataset['Age'].mean()
age_std = entire_dataset['Age'].std()
age_null_count = entire_dataset['Age'].isnull().sum()

# Return random integers from `low` (inclusive) to `high` (exclusive).
age_null_random = np.random.randint(age_mean - age_std, age_mean + age_std, size=age_null_count)
# Choose the rows that are null and fill them up
entire_dataset['Age'][np.isnan(entire_dataset['Age'])] = age_null_random
entire_dataset['Age'] = entire_dataset['Age'].astype(int)

# Name_Title
title_func = lambda x : x.split(',')[1].split('.')[0].strip()
entire_dataset['Name_Title'] = entire_dataset['Name'].apply(title_func)

#Social_Title
entire_dataset['Social_Title'] = entire_dataset['Name_Title']
entire_dataset['Social_Title'] = entire_dataset['Social_Title'].replace(['Sir', 'Jonkheer', 'Rev', 'Col', 'Lady', 'Major', 'Don', 'the Countess'], 'Royalty')
entire_dataset['Social_Title'] = entire_dataset['Social_Title'].replace(['Dr', 'Capt', 'Master'], 'Professional')
entire_dataset['Social_Title'] = entire_dataset['Social_Title'].replace(['Mme', 'Mrs', 'Dona'], 'Wife')
entire_dataset['Social_Title'] = entire_dataset['Social_Title'].replace(['Mr', 'Ms', 'Miss', 'Mlle'], 'Ordinary')

#Age_Categorical
entire_dataset['Age_Categorical'] = pd.cut(entire_dataset['Age'], 6)
entire_dataset['Age_Categorical'] = entire_dataset['Age_Categorical'].astype(object)

fare_mean = entire_dataset['Fare'].mean()
fare_std = entire_dataset['Fare'].std()
fare_null_count = entire_dataset['Fare'].isnull().sum()
# Return random integers from `low` (inclusive) to `high` (exclusive).
fare_null_random = np.random.randint(fare_mean - fare_std, fare_mean + fare_std, size=fare_null_count)
# Choose the rows that are null and fill them up
entire_dataset['Fare'][np.isnan(entire_dataset['Fare'])] = fare_null_random
entire_dataset['Fare'] = entire_dataset['Fare'].astype(float)

#Fare_Categorical
entire_dataset['Fare_Categorical'] = pd.qcut(entire_dataset['Fare'], 4)
entire_dataset['Fare_Categorical'] = entire_dataset['Fare_Categorical'].astype(object)

entire_dataset.drop(columns=['Survived', 'PassengerId', 'Name'], inplace= True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [41]:
x_train = entire_dataset
# Check for NON Categorical features
non_categorical_feature_mask = entire_dataset.dtypes!=object
# Filter non categorical columns using mask and turn it into a list
non_categorical_features = entire_dataset.columns[non_categorical_feature_mask].tolist()
display(len(non_categorical_features))
non_categorical_features

11

['Age',
 'Fare',
 'Parch',
 'Pclass',
 'SibSp',
 'Family_Size',
 'Alone',
 'Small_Family',
 'Medium_Family',
 'Large_Family',
 'Cabin_Multiple']

In [42]:
# Check for Categorical features
categorical_feature_mask = entire_dataset.dtypes==object
# filter categorical columns using mask and turn it into a list
categorical_cols = entire_dataset.columns[categorical_feature_mask].tolist()
display(len(categorical_cols))
categorical_cols

9

['Cabin',
 'Embarked',
 'Sex',
 'Ticket',
 'Cabin_Section',
 'Name_Title',
 'Social_Title',
 'Age_Categorical',
 'Fare_Categorical']

In [43]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
# Since not all features are categorical features,  `make_column_transformer` will only
# one hot encode the categorical feature and standarized the non-categorical features.
column_trans = make_column_transformer(
    (StandardScaler(), non_categorical_features),
    (OneHotEncoder(sparse=False), categorical_cols)
)

entire_dataset_custom = column_trans.fit_transform(entire_dataset)
entire_dataset_custom.shape

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


(1309, 1173)

In [44]:
x_train = entire_dataset_custom[:train_lenght]
x_test = entire_dataset_custom[train_lenght:]

In [45]:
target = train_df['Survived']
print(f'X_train shape: {x_train.shape}')
# print(f'y shape: {y.shape}')
print(f'X_test shape: {x_test.shape}')
print(f'y shape: {target.shape}')

X_train shape: (891, 1173)
X_test shape: (418, 1173)
y shape: (891,)


In [46]:
cv = StratifiedKFold(10, shuffle=True)

naive = GaussianNB()

knn = KNeighborsClassifier()

dtree = DecisionTreeClassifier()

rf = RandomForestClassifier()

svc = SVC(probability=True)

lr = LogisticRegression()

gradient = GradientBoostingClassifier()

ada = AdaBoostClassifier()

extra_dt = ExtraTreesClassifier()

models = [naive, knn,  dtree, rf, svc, lr, gradient, ada, extra_dt]

In [19]:
display(accuracy_compare(x_train, target, models, cv))
display(f1_compare(x_train, target, models, cv))
display(precision_compare(x_train, target, models, cv))
display(recall_compare(x_train, target, models, cv))

Unnamed: 0,Model Name,Train Accuracy Mean,Test Accuracy Mean
1,KNeighborsClassifier,0.854096,0.830381
5,LogisticRegression,0.89924,0.827023
6,GradientBoostingClassifier,0.869686,0.821531
2,DecisionTreeClassifier,1.0,0.82047
8,ExtraTreesClassifier,1.0,0.818211
7,AdaBoostClassifier,0.874674,0.809322
3,RandomForestClassifier,0.981668,0.804714
4,SVC,0.767301,0.763138
0,GaussianNB,0.949494,0.468165


Unnamed: 0,Model Name,Train F1 Score Mean,Test F1 Score Mean
6,GradientBoostingClassifier,0.820763,0.774035
5,LogisticRegression,0.86534,0.770524
2,DecisionTreeClassifier,1.0,0.753219
7,AdaBoostClassifier,0.838221,0.749037
3,RandomForestClassifier,0.974766,0.742901
1,KNeighborsClassifier,0.804503,0.736182
8,ExtraTreesClassifier,1.0,0.736097
4,SVC,0.630486,0.614728
0,GaussianNB,0.938294,0.560845


Unnamed: 0,Model Name,Train Precision Mean,Test Precision Mean
3,RandomForestClassifier,0.995278,0.810006
1,KNeighborsClassifier,0.832375,0.802158
8,ExtraTreesClassifier,1.0,0.79922
6,GradientBoostingClassifier,0.853946,0.798706
4,SVC,0.818293,0.79753
2,DecisionTreeClassifier,1.0,0.792173
7,AdaBoostClassifier,0.838132,0.78645
5,LogisticRegression,0.88644,0.779805
0,GaussianNB,0.882981,0.414398


Unnamed: 0,Model Name,Train Recall Mean,Test Recall Mean
0,GaussianNB,1.0,0.885882
5,LogisticRegression,0.845679,0.760756
7,AdaBoostClassifier,0.830753,0.742353
1,KNeighborsClassifier,0.775508,0.73084
6,GradientBoostingClassifier,0.788819,0.72521
2,DecisionTreeClassifier,1.0,0.71084
3,RandomForestClassifier,0.960688,0.698992
8,ExtraTreesClassifier,1.0,0.698992
4,SVC,0.511051,0.496807


In [20]:
voting_model = VotingClassifier(
    estimators=[('naive', naive),
                ("knn", knn),
                ('dtree', dtree),
                ('rf', rf),
                ('svc', svc),
                ('lr', lr), 
                ('gradient', gradient), 
                ('ada', ada), 
                ('extra_dt', extra_dt)]
    , voting='soft')

In [21]:
voting_model.fit(x_train, target)
submission = pd.DataFrame(index=passengers_ID_test)
submission['Survived'] = voting_model.predict(x_test)
submission.reset_index().to_csv('output/submission_no_tuning_soft.csv', index=False)



In [22]:
voting_model = VotingClassifier(
    estimators=[('naive', naive),
                ("knn", knn),
                ('dtree', dtree),
                ('rf', rf),
                ('svc', svc),
                ('lr', lr), 
                ('gradient', gradient), 
                ('ada', ada), 
                ('extra_dt', extra_dt)]
    , voting='hard')

In [23]:
voting_model.fit(x_train, target)
submission = pd.DataFrame(index=passengers_ID_test)
submission['Survived'] = voting_model.predict(x_test)
submission.reset_index().to_csv('output/submission_no_tuning_hard.csv', index=False)



----

In [47]:
def status(feature):
    print('Processing', feature, ': ok')

In [49]:
def get_combined_data():
    # reading train data
    train = pd.read_csv('./data/train.csv')
    
    # reading test data
    test = pd.read_csv('./data/test.csv')

    # extracting and then removing the targets from the training data 
    targets = train.Survived
    train.drop(['Survived'], 1, inplace=True)
    

    # merging train data and test data for future feature engineering
    # we'll also remove the PassengerID since this is not an informative feature
    combined = train.append(test)
    combined.reset_index(inplace=True)
    combined.drop(['index', 'PassengerId'], inplace=True, axis=1)
    
    return combined

combined = get_combined_data()

In [50]:
print(combined.shape)

(1309, 10)


In [52]:
data = pd.read_csv('./data/train.csv')
titles = set()
for name in data['Name']:
    titles.add(name.split(',')[1].split('.')[0].strip())

print(titles)
# set(['Sir', 'Major', 'the Countess', 'Don', 'Mlle', 'Capt', 'Dr', 'Lady', 'Rev', 'Mrs', 'Jonkheer', 'Master', 'Ms', 'Mr', 'Mme', 'Miss', 'Col'])

Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

def get_titles():
    # we extract the title from each name
    combined['Title'] = combined['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
    
    # a map of more aggregated title
    # we map each title
    combined['Title'] = combined.Title.map(Title_Dictionary)
    status('Title')
    return combined

{'Ms', 'Miss', 'Mrs', 'Capt', 'Dr', 'Mr', 'Master', 'Jonkheer', 'Don', 'Lady', 'Major', 'Mlle', 'Col', 'Rev', 'Mme', 'the Countess', 'Sir'}


In [53]:
combined = get_titles()
combined.head()

Processing Title : ok


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [54]:
combined[combined['Title'].isnull()]

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
1305,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,


In [55]:
print(combined.iloc[:891].Age.isnull().sum())


177


In [56]:
print(combined.iloc[891:].Age.isnull().sum())
# 86

grouped_train = combined.iloc[:891].groupby(['Sex','Pclass','Title'])
grouped_median_train = grouped_train.median()
grouped_median_train = grouped_median_train.reset_index()[['Sex', 'Pclass', 'Title', 'Age']]

grouped_median_train.head()

86


Unnamed: 0,Sex,Pclass,Title,Age
0,female,1,Miss,30.0
1,female,1,Mrs,40.0
2,female,1,Officer,49.0
3,female,1,Royalty,40.5
4,female,2,Miss,24.0


In [57]:
def fill_age(row):
    condition = (
        (grouped_median_train['Sex'] == row['Sex']) & 
        (grouped_median_train['Title'] == row['Title']) & 
        (grouped_median_train['Pclass'] == row['Pclass'])
    ) 
    return grouped_median_train[condition]['Age'].values[0]


def process_age():
    global combined
    # a function that fills the missing values of the Age variable
    combined['Age'] = combined.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis=1)
    status('age')
    return combined

combined = process_age()

Processing age : ok


In [58]:
def process_names():
    global combined
    # we clean the Name variable
    combined.drop('Name', axis=1, inplace=True)
    
    # encoding in dummy variable
    titles_dummies = pd.get_dummies(combined['Title'], prefix='Title')
    combined = pd.concat([combined, titles_dummies], axis=1)
    
    # removing the title variable
    combined.drop('Title', axis=1, inplace=True)
    
    status('names')
    return combined

In [59]:
combined = process_names()

combined.head()

Processing names : ok


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,3,male,22.0,1,0,A/5 21171,7.25,,S,0,0,1,0,0,0
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,0,0,0,1,0,0
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1,0,0,0,0
3,1,female,35.0,1,0,113803,53.1,C123,S,0,0,0,1,0,0
4,3,male,35.0,0,0,373450,8.05,,S,0,0,1,0,0,0


In [60]:
def process_fares():
    global combined
    # there's one missing fare value - replacing it with the mean.
    combined.Fare.fillna(combined.iloc[:891].Fare.mean(), inplace=True)
    status('fare')
    return combined

In [61]:
combined = process_fares()

Processing fare : ok


In [62]:
def process_embarked():
    global combined
    # two missing embarked values - filling them with the most frequent one in the train  set(S)
    combined.Embarked.fillna('S', inplace=True)
    # dummy encoding 
    embarked_dummies = pd.get_dummies(combined['Embarked'], prefix='Embarked')
    combined = pd.concat([combined, embarked_dummies], axis=1)
    combined.drop('Embarked', axis=1, inplace=True)
    status('embarked')
    return combined

In [63]:
combined = process_embarked()

combined.head()

Processing embarked : ok


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,Embarked_S
0,3,male,22.0,1,0,A/5 21171,7.25,,0,0,1,0,0,0,0,0,1
1,1,female,38.0,1,0,PC 17599,71.2833,C85,0,0,0,1,0,0,1,0,0
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,0,0,0,0,1
3,1,female,35.0,1,0,113803,53.1,C123,0,0,0,1,0,0,0,0,1
4,3,male,35.0,0,0,373450,8.05,,0,0,1,0,0,0,0,0,1


In [64]:
train_cabin, test_cabin = set(), set()

for c in combined.iloc[:891]['Cabin']:
    try:
        train_cabin.add(c[0])
    except:
        train_cabin.add('U')
        
for c in combined.iloc[891:]['Cabin']:
    try:
        test_cabin.add(c[0])
    except:
        test_cabin.add('U')

print(train_cabin)
# set(['A', 'C', 'B', 'E', 'D', 'G', 'F', 'U', 'T'])

print(test_cabin)


{'C', 'U', 'A', 'B', 'E', 'G', 'F', 'D', 'T'}
{'C', 'U', 'A', 'B', 'E', 'G', 'F', 'D'}


In [65]:
def process_cabin():
    global combined    
    # replacing missing cabins with U (for Uknown)
    combined.Cabin.fillna('U', inplace=True)
    
    # mapping each Cabin value with the cabin letter
    combined['Cabin'] = combined['Cabin'].map(lambda c: c[0])
    
    # dummy encoding ...
    cabin_dummies = pd.get_dummies(combined['Cabin'], prefix='Cabin')    
    combined = pd.concat([combined, cabin_dummies], axis=1)

    combined.drop('Cabin', axis=1, inplace=True)
    status('cabin')
    return combined

In [66]:
combined = process_cabin()

Processing cabin : ok


In [67]:
combined.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Title_Master,Title_Miss,Title_Mr,...,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,3,male,22.0,1,0,A/5 21171,7.25,0,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,female,38.0,1,0,PC 17599,71.2833,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,female,35.0,1,0,113803,53.1,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,3,male,35.0,0,0,373450,8.05,0,0,1,...,1,0,0,0,0,0,0,0,0,1


In [68]:
def process_sex():
    global combined
    # mapping string values to numerical one 
    combined['Sex'] = combined['Sex'].map({'male':1, 'female':0})
    status('Sex')
    return combined

In [69]:
combined = process_sex()

Processing Sex : ok


In [70]:
def process_pclass():
    
    global combined
    # encoding into 3 categories:
    pclass_dummies = pd.get_dummies(combined['Pclass'], prefix="Pclass")
    
    # adding dummy variable
    combined = pd.concat([combined, pclass_dummies],axis=1)
    
    # removing "Pclass"
    combined.drop('Pclass',axis=1,inplace=True)
    
    status('Pclass')
    return combined

In [71]:
combined = process_pclass()

Processing Pclass : ok


In [74]:
def cleanTicket(ticket):
    ticket = ticket.replace('.', '')
    ticket = ticket.replace('/', '')
    ticket = ticket.split()
    ticket = map(lambda t : t.strip(), ticket)
    ticket = list(filter(lambda t : not t.isdigit(), ticket))
    if len(ticket) > 0:
        return ticket[0]
    else: 
        return 'XXX'

tickets = set()
for t in combined['Ticket']:
    tickets.add(cleanTicket(t))

print(len(tickets))
#37


def process_ticket():
    
    global combined
    
    # a function that extracts each prefix of the ticket, returns 'XXX' if no prefix (i.e the ticket is a digit)
    def cleanTicket(ticket):
        ticket = ticket.replace('.','')
        ticket = ticket.replace('/','')
        ticket = ticket.split()
        ticket = map(lambda t : t.strip(), ticket)
        ticket = filter(lambda t : not t.isdigit(), ticket)
        if len(ticket) > 0:
            return ticket[0]
        else: 
            return 'XXX'
    

    # Extracting dummy variables from tickets:

    combined['Ticket'] = combined['Ticket'].map(cleanTicket)
    tickets_dummies = pd.get_dummies(combined['Ticket'], prefix='Ticket')
    combined = pd.concat([combined, tickets_dummies], axis=1)
    combined.drop('Ticket', inplace=True, axis=1)

    status('Ticket')
    return combined

combined = process_ticket()

37


TypeError: object of type 'filter' has no len()