In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression

In [3]:
from dm_utils import modeling
from dm_utils import model_preparation 
from dm_utils.model_wrappers import scikit_model 

In [4]:
import seaborn as sns

In [5]:
# Load seaborn's example "titanic" dataset into a DataFrame.
df = sns.load_dataset("titanic")
# Preview the first 10 rows to see which columns are available.
df.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [6]:
# Keep only the passengers whose embarkation town and age are both known.
# .copy() makes the result an independent frame: later cells add and overwrite
# columns on `df`, and doing that on a boolean-mask slice of another frame is
# the chained-assignment pattern pandas warns about (SettingWithCopyWarning).
df = df[df["embark_town"].notnull() & df["age"].notnull()].copy()

In [7]:
# Expand the categorical "embark_town" column into one indicator column per town.
# The helper hands back the names of the indicator columns; they are looked up
# on `df` right below and reused later when the feature list is assembled.
embark_town_columns = model_preparation.onehot_column(df, "embark_town")

# Show the original column side by side with its one-hot indicators.
df.loc[:, ["embark_town"] + embark_town_columns].head()

Unnamed: 0,embark_town,embark_town_-_ONEHOT_-_Southampton,embark_town_-_ONEHOT_-_Cherbourg,embark_town_-_ONEHOT_-_Queenstown
0,Southampton,1,0,0
1,Cherbourg,0,1,0
2,Southampton,1,0,0
3,Southampton,1,0,0
4,Southampton,1,0,0


In [8]:
# Integer-encode the two remaining string-valued features used as model inputs.
# An explicit lookup followed by astype(int) pins the resulting dtype instead of
# relying on Series.replace to infer it, and raises if a value cannot be
# converted to an integer rather than leaving a stray string in the column.
WHO_CODES = {"child": 2, "woman": 1, "man": 0}
SEX_CODES = {"female": 1, "male": 0}

# .get(label, label) passes already-encoded values through unchanged, so the
# cell can be re-run safely. assign() builds a new frame instead of writing
# columns into a filtered slice.
df = df.assign(
    who=df["who"].map(lambda label: WHO_CODES.get(label, label)).astype(int),
    sex=df["sex"].map(lambda label: SEX_CODES.get(label, label)).astype(int),
)

In [9]:
# Model inputs: the one-hot embarkation columns first, followed by the numeric
# and integer-encoded passenger attributes.
feature_columns = embark_town_columns + ["pclass", "age", "sibsp", "sex", "who"]

# Modeling

In [10]:
# Name of the label column; every model below is fitted against this column.
TARGET = "survived"

In [11]:
# Seed NumPy's global RNG so a Restart & Run All reproduces the same numbers.
# scikit-learn estimators created without `random_state` draw from this global
# RNG. NOTE(review): the split helper's source is not visible here - this only
# fixes the split itself if the helper also uses NumPy's global RNG; confirm.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Random split of the prepared frame into train and test parts.
# 0.7 is presumably the fraction of rows assigned to the training part - confirm.
train_df, test_df = model_preparation.get_random_train_test(df, 0.7, TARGET)

### Decision Tree

In [12]:
# Baseline: an unconstrained decision tree evaluated on the single split above;
# the helper prints train and test AUC.
# The result is bound to a descriptive name rather than `_`: in IPython `_` is
# the last-output shortcut, and assigning to it shadows that variable. Binding
# the return value still keeps it from being echoed as the cell output.
decision_tree_run = modeling.run_model(train_df, test_df, feature_columns, TARGET, DecisionTreeClassifier())

('train auc:', 0.9917325065839917)
('test  auc:', 0.8157388756637979)


In [13]:
# Repeat the evaluation over the full prepared frame to see how much the AUC
# moves between runs; the helper prints AUC figures and their deviation.
modeling.run_model_n_times(
    df,
    feature_columns,
    TARGET,
    0.7,  # same ratio value that was passed to the single split above
    DecisionTreeClassifier(),
    50,  # presumably the number of repetitions - confirm against the helper
)

('train auc:', 0.9923417341734172)
('test  auc:', 0.7662424464383814)
deviation:
('train:', 0.0018498580356035683)
('test: ', 0.029416967029085583)


In [14]:
# Fit a shallow tree (at most three levels of splits) so its rules stay readable.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = DecisionTreeClassifier(max_depth=3).fit(train_df[feature_columns], train_df[TARGET])

# Print the fitted tree as nested if/else rules labelled with the feature names.
# The last argument is presumably the indentation prefix - confirm.
scikit_model.print_decision_tree_with_names(clf.tree_, feature_columns, "    ")

        if row["who"] <= 0.500000:
            if row["pclass"] <= 1.500000:
                if row["age"] <= 53.000000:
                    neg = 30.000000
                    pos = 24.000000
                else:
                    neg = 13.000000
                    pos = 2.000000
            else:
                if row["age"] <= 32.250000:
                    neg = 121.000000
                    pos = 21.000000
                else:
                    neg = 73.000000
                    pos = 5.000000
        else:
            if row["pclass"] <= 2.500000:
                if row["age"] <= 37.000000:
                    neg = 1.000000
                    pos = 75.000000
                else:
                    neg = 4.000000
                    pos = 35.000000
            else:
                if row["sibsp"] <= 2.500000:
                    neg = 35.000000
                    pos = 37.000000
                else:
                    neg = 20.000000
                    pos = 3.0

### Gradient Boosted Trees (GBT)

In [15]:
# Small boosted ensemble: 10 trees, each limited to depth 3.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = GradientBoostingClassifier(max_depth=3, n_estimators=10).fit(
    train_df[feature_columns], train_df[TARGET]
)

# Print the ensemble as generated scoring code with feature names filled in.
# k is presumably how many of the trees get printed - confirm.
scikit_model.print_gbt_rules(clf, feature_columns, k=2)

def score_gbt(eval_df, model_name):
    """Score segment by gbt rule"""
    def score_by_gbt_tree_rule(row):
    # GBT model generated by Scikit-Learn
        score = 0.0
        ### tree_1 ###
        if row["who"] <= 0.500000:
            if row["pclass"] <= 1.500000:
                if row["age"] <= 53.000000:
                    score += 0.164502
                else:
                    score += -1.126744
            else:
                if row["age"] <= 32.250000:
                    score += -1.066338
                else:
                    score += -1.414081
        else:
            if row["pclass"] <= 2.500000:
                if row["age"] <= 37.000000:
                    score += 2.415686
                else:
                    score += 2.044612
            else:
                if row["sibsp"] <= 2.500000:
                    score += 0.452726
                else:
                    score += -1.138774

        ### tree_2 ###
        if row["who"] <= 0.500000:
     

### Logistic Regression

In [16]:
# Logistic regression with scikit-learn's default settings on the same features.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = LogisticRegression().fit(train_df[feature_columns], train_df[TARGET])

# Tabulate the fitted intercept and per-feature coefficients by name.
scikit_model.get_lreg_coefficients(clf, feature_columns)

Unnamed: 0,name,value
0,(intercept),1.246548
1,pclass,-1.14305
2,sibsp,-0.513043
3,age,-0.019733
4,embark_town_-_ONEHOT_-_Queenstown,0.192953
5,embark_town_-_ONEHOT_-_Southampton,0.443067
6,embark_town_-_ONEHOT_-_Cherbourg,0.610528
7,who,1.121356
8,sex,1.586554
