In [212]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn import tree
import graphviz 
from sklearn.model_selection import RandomizedSearchCV

In [23]:
def extract_title(df):
    """
    Pull the title token out of every passenger name.

    The title is taken to be the first whitespace-separated token that
    follows the first comma of the name, e.g. "Braund, Mr. Owen Harris"
    yields "Mr.".

    :param df: data frame with a ``Name`` column of strings
    :return: Series of title tokens, indexed like ``df``
    """
    def first_token_after_comma(full_name):
        # partition keeps everything after the first comma in the last slot
        _, _, after_comma = full_name.partition(',')
        return after_comma.split()[0]

    return df['Name'].apply(first_token_after_comma)

In [24]:
def map_title(title):
    """
    Translate a raw title token into a coarse title category.

    :param title: title token as extracted from a name, e.g. "Mr." or "Mlle."
    :return: category label, or "Other" when the token is not listed
    """
    tokens_by_category = {
        "Mr": ("Mr.",),
        "F_Unwed": ("Miss.", "Mlle.", "Ms."),
        "F_Wed": ("Mrs.", "Mme."),
        "Master": ("Master.",),
        "Religon": ("Rev.",),
        "Professional": ("Dr.", "Col.", "Major.", "Capt."),
        # "the" carries no trailing dot; presumably it is the first token of a
        # multi-word title in the data -- confirm against the names.
        "Royalty": ("Lady.", "Sir.", "the", "Jonkheer.", "Don.", "Dona."),
    }

    # Flatten the grouped table into a token -> category lookup.
    category_of = {
        token: category
        for category, tokens in tokens_by_category.items()
        for token in tokens
    }

    return category_of.get(title, "Other")

In [77]:
def extract_titles_from_names(df):
    """
    Store each passenger's title token in a ``Title`` column of ``df``.

    The frame is modified in place.

    :param df: data frame with a ``Name`` column
    :return: the ``Title`` column that was just written
    """
    titles = extract_title(df)
    df['Title'] = titles
    return df['Title']

In [180]:
def map_titles_to_categories(df):
    """
    Replace the raw tokens in ``df['Title']`` with their title categories.

    The frame is modified in place; the mapping itself is done by map_title.

    :param df: data frame with a ``Title`` column of raw title tokens
    :return: the same data frame, with ``Title`` holding category labels
    """
    df['Title'] = df['Title'].apply(map_title)
    return df


In [137]:
def compute_age(row, df, threshold=5):
    """
    Estimate the age of one passenger from comparable passengers.

    The mean age of passengers sharing the row's title is used when more than
    ``threshold`` of them have a known age; otherwise the mean age of the
    passengers of the same sex is used.

    :param row: passenger record providing ``Title`` and ``Sex``
    :param df: data frame with ``Title``, ``Sex`` and ``Age`` columns
    :param threshold: number of same-titled passengers with a known age that
        must be exceeded for the title-based mean to be used
    :return: estimated age; NaN when no comparable passenger has a known age
    """
    known = df.loc[df['Age'].notnull(), ['Title', 'Sex', 'Age']]

    same_title_ages = known.loc[known['Title'] == row['Title'], 'Age']
    # Count passengers with len(): DataFrame.size is rows * columns, which
    # would let a single passenger exceed the threshold on a wide frame.
    if len(same_title_ages) > threshold:
        return same_title_ages.mean()

    # Too few same-titled passengers: fall back to the same-sex mean of the
    # Age column only (averaging the whole frame does not yield an age).
    return known.loc[known['Sex'] == row['Sex'], 'Age'].mean()

In [172]:
def impute_ages(df):
    """
    Fill the missing ``Age`` values of ``df`` with estimates from compute_age.

    The frame is modified in place and also returned.

    :param df: data frame with ``Age``, ``Title`` and ``Sex`` columns
    :return: the same data frame with the missing ages filled in
    """
    age_is_missing = df['Age'].isnull()
    estimates = df[age_is_missing].apply(
        lambda passenger: compute_age(passenger, df), axis=1
    )
    # update() aligns on the index, so only the rows that lacked an age change.
    df.update(pd.DataFrame(estimates, columns=['Age']))
    return df

In [209]:
def join_feature_name_with_importance_value(features, importances):
    """
    Pair every feature name with its importance score.

    :param features: data frame whose columns are the features used by the classifier
    :param importances: array of importance scores, one per column of ``features``
    :return: list of (feature, importance) tuples, highest importance first;
             an empty list when the number of columns and scores differ
    """
    names = features.columns
    if names.shape[0] != importances.shape[0]:
        return []

    pairs = list(zip(names, importances))
    # Stable sort: equally important features keep their column order.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

In [210]:
def display_important_features(classifier, features):
    """
    Print the classifier's feature importances paired with feature names.

    :param classifier: fitted estimator exposing ``feature_importances_``
    :param features: data frame whose columns are the features the classifier was fitted on
    """
    ranked = join_feature_name_with_importance_value(
        features, classifier.feature_importances_
    )
    print(ranked)


In [219]:
# Load the training data; the path is relative to the notebook's working directory.
train = pd.read_csv('../data/train.csv')
# The 'Survived' column is the target; it is passed as the label vector to train_test_split later on.
y_predict = train['Survived']


In [221]:
# Extract the raw title token of every name into train['Title'].
# extract_titles_from_names already writes that column in place, so the
# assignment stores the same values again.
train['Title'] = extract_titles_from_names(train)
# Fill in missing ages while 'Title' still holds the raw tokens: compute_age
# groups passengers by their 'Title' value, so doing this before the category
# mapping groups by raw title rather than by category.
train = impute_ages(train)
# Collapse the raw title tokens into the categories defined in map_title.
train = map_titles_to_categories(train)
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,F_Wed
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,F_Unwed
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,F_Wed
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr
5,6,0,3,"Moran, Mr. James",male,32.36809,0,0,330877,8.4583,,Q,Mr
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Master
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,F_Wed
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,F_Wed


In [222]:
# One-hot encode the title category, keeping every category column.
# `train` is overwritten here, so this cell cannot be re-run without first
# re-creating the 'Title' column.
train = pd.get_dummies(train, columns=['Title'], drop_first=False)
# Feature matrix: encode 'Sex' as a single indicator column and drop the
# identifier, the target and the columns the model does not use.
train_records = (
    pd.get_dummies(train, columns=['Sex'], drop_first=True)
    .drop(columns=['PassengerId', 'Survived', 'Fare', 'Parch', 'SibSp',
                   'Name', 'Cabin', 'Ticket', 'Embarked'])
)

In [223]:
# Hold out a validation split; random_state=0 makes the split repeatable.
# NOTE(review): no test_size is passed, so the library's default proportion applies -- confirm it is the intended one.
train_X, val_X, train_y, val_y = train_test_split(train_records, y_predict, random_state=0)
val_X.head()

Unnamed: 0,Pclass,Age,Title_F_Unwed,Title_F_Wed,Title_Master,Title_Mr,Title_Professional,Title_Religon,Title_Royalty,Sex_male
495,3,32.36809,0,0,0,1,0,0,0,1
648,3,32.36809,0,0,0,1,0,0,0,1
278,3,7.0,0,0,1,0,0,0,0,1
31,1,35.898148,0,1,0,0,0,0,0,0
255,3,29.0,0,1,0,0,0,0,0,0


In [224]:
# Forest size and tree-shape settings; they match the parameter set printed
# after the randomized search at the end of the notebook.
tree_count = 1800

rf_classifier: RandomForestClassifier = RandomForestClassifier(
    n_estimators=tree_count,
    max_depth=70,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=0,
)
rf_classifier.fit(train_X, train_y)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=70, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=1800, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [189]:
# Take the individual fitted trees out of the forest and pick one to visualise.
# NOTE(review): index 20 looks arbitrary -- presumably any valid index would serve.
trees = rf_classifier.estimators_
sample_tree = trees[20]


In [218]:
# Describe the sampled tree in Graphviz DOT format; out_file=None makes
# export_graphviz return the text instead of writing it to a file.
dot_data = tree.export_graphviz(
    sample_tree,
    out_file=None,
    feature_names=train_X.columns.values,
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = graphviz.Source(dot_data)
# graph  # uncomment to render the tree inline

In [197]:
# Save the DOT description of the sampled tree next to the notebook.
# The context manager closes the file even if the write fails.
with open("rf_follow.dot", "w") as f:
    f.write(dot_data)

In [225]:
# Predict labels for the held-out validation features.
predictions = rf_classifier.predict(val_X)

In [226]:
# Look up the validation rows in `train` by index label, so that columns dropped
# from the feature matrix (e.g. PassengerId, Name) are available for inspection.
validation_records = train.loc[val_X.index.values]

In [193]:
# Start from the descriptive columns of the validation passengers ...
prediction_results = validation_records[['PassengerId', 'Name', 'Age', 'Pclass']].copy()
# ... then attach the encoded sex, the true label and the predicted label.
prediction_results['Sex_male'] = val_X['Sex_male']
prediction_results['Survived'] = val_y
prediction_results['Prediction'] = predictions
# Flag the rows where the prediction disagrees with the true label.
prediction_results['Error'] = prediction_results['Survived'] != prediction_results['Prediction']
# Show only the misclassified passengers.
print(prediction_results[prediction_results['Error']])

     PassengerId                                               Name  \
278          279                                 Rice, Master. Eric   
255          256            Touma, Mrs. Darwis (Hanne Youssef Razi)   
298          299                              Saalfeld, Mr. Adolphe   
346          347                          Smith, Miss. Marion Elsie   
803          804                    Thomas, Master. Assad Alexander   
474          475                        Strandberg, Miss. Ida Sofia   
519          520                                Pavlovic, Mr. Stefo   
55            56                                  Woolner, Mr. Hugh   
632          633                          Stahelin-Maeglin, Dr. Max   
587          588                   Frolicher-Stehli, Mr. Maxmillian   
740          741                        Hawksford, Mr. Walter James   
97            98                    Greenfield, Mr. William Bertram   
839          840                               Marechal, Mr. Pierre   
301   

In [227]:
# Shape of the prediction array, i.e. how many validation rows were predicted.
print(predictions.shape)
# Per-column count of non-null values among the rows whose Error flag is False.
print(prediction_results[prediction_results['Error']==False].count())

(223,)
PassengerId    179
Name           179
Age            179
Pclass         179
Sex_male       179
Survived       179
Prediction     179
Error          179
dtype: int64


In [228]:
# Validation accuracy: the share of validation rows whose prediction equals the
# true label. Computed from the data so it cannot go stale when the notebook is
# re-run (the previous literal 179/223 was copied by hand from the cell above).
float((val_y == predictions).mean())


0.8026905829596412

In [208]:
# Third-class passengers whose title category is Master.
# NOTE(review): the first mask is built from the full `train` frame but the
# second from train_X (the training split only), so the two boolean Series have
# different indexes; presumably rows absent from train_X end up excluded --
# confirm whether train['Title_Master'] was intended.
train[(train['Pclass']==3) & (train_X['Title_Master']==1)]


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_F_Unwed,Title_F_Wed,Title_Master,Title_Mr,Title_Professional,Title_Religon,Title_Royalty
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,0,0,1,0,0,0,0
16,17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q,0,0,1,0,0,0,0
59,60,0,3,"Goodwin, Master. William Frederick",male,11.0,5,2,CA 2144,46.9,,S,0,0,1,0,0,0,0
63,64,0,3,"Skoog, Master. Harald",male,4.0,3,2,347088,27.9,,S,0,0,1,0,0,0,0
65,66,1,3,"Moubarek, Master. Gerios",male,4.574167,1,1,2661,15.2458,,C,0,0,1,0,0,0,0
125,126,1,3,"Nicola-Yarred, Master. Elias",male,12.0,1,0,2651,11.2417,,C,0,0,1,0,0,0,0
159,160,0,3,"Sage, Master. Thomas Henry",male,4.574167,8,2,CA. 2343,69.55,,S,0,0,1,0,0,0,0
164,165,0,3,"Panula, Master. Eino Viljami",male,1.0,4,1,3101295,39.6875,,S,0,0,1,0,0,0,0
165,166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9.0,0,2,363291,20.525,,S,0,0,1,0,0,0,0
171,172,0,3,"Rice, Master. Arthur",male,4.0,4,1,382652,29.125,,Q,0,0,1,0,0,0,0


In [211]:
# Print the (feature, importance) pairs of the fitted forest, highest importance first.
display_important_features(rf_classifier, train_X)

[('Age', 0.40554811489230497), ('Title_Mr', 0.18464388372059834), ('Sex_male', 0.16404493609341503), ('Pclass', 0.14053860691975167), ('Title_F_Wed', 0.04613744160079058), ('Title_F_Unwed', 0.03347884977304932), ('Title_Master', 0.013370788325870681), ('Title_Religon', 0.00500575199330976), ('Title_Professional', 0.00467503614978711), ('Title_Royalty', 0.0025565905311227127)]


In [213]:
# Candidate forest sizes: 10 evenly spaced values from 200 to 2000 trees
n_estimators = np.linspace(200, 2000, num=10).astype(int).tolist()
# Candidate depth limits: 11 evenly spaced values from 10 to 110, plus None
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
# Candidate minimum sample counts required to split a node
min_samples_split = [2, 5, 10]
# Candidate minimum sample counts required at a leaf node
min_samples_leaf = [1, 2, 4]

In [215]:
# Collect the candidate values into the parameter space sampled by the search
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}


In [216]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune: it is seeded so that every candidate forest, and therefore
# the outcome of the search, is reproducible (the forest fitted earlier in the
# notebook uses the same seed).
rf = RandomForestClassifier(random_state=0)
# Random search over 100 sampled parameter combinations with 3-fold cross
# validation, using all available cores; random_state=42 fixes which
# combinations are sampled.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, cv=3, verbose=2, random_state=42,
                               n_jobs=-1)
# Fit the random search model on the training split
rf_random.fit(train_X, train_y)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   34.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.1min finished


{'n_estimators': 1800,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_depth': 70}