# Overview

This notebook generates the tree classifier model used in the streamlit demo.

In [None]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the CSV data into a pandas DataFrame.
# NOTE(review): this is an absolute path (it looks like Colab's /content
# directory) - adjust it when running the notebook elsewhere.
df = pd.read_csv('/content/linenassesment.csv')

In [None]:
# Separate the target (y) from the candidate features (X).
# 'Passed' is the label to predict; it is removed from X together with
# 'UserId', 'Name', 'IdCard' and 'Passport', which are not used as features.
# Note: interesting features might still be engineered from the dropped columns.
dropped_columns = ['Passed', 'UserId', 'Name', 'IdCard', 'Passport']

y = df['Passed']
X = df.drop(columns=dropped_columns)  # columns=... drops by column (same as axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Education  891 non-null    int64  
 1   Sex        891 non-null    object 
 2   Age        891 non-null    float64
 3   Sibling    891 non-null    int64  
 4   Parent     891 non-null    int64  
 5   Placetest  891 non-null    int64  
 6   English    891 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 48.9+ KB


In [None]:
# Hold out 20% of the rows (test_size=0.2) as the test set; the remainder
# is the training set.
# random_state fixes the shuffle so the split is reproducible; any integer
# works, see 42: https://en.wikipedia.org/wiki/42_(number)
from sklearn.model_selection import train_test_split

split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Show (rows, columns) of the training and test feature sets
for part in (X_train, X_test):
    print(part.shape)

(712, 7)
(179, 7)


In [None]:
# Features used to train the decision tree classifier, grouped by the kind
# of preprocessing they receive:
#   numerical   -> 'Age', 'Sibling', 'Placetest'
#   categorical -> 'Sex', 'Education'
# The remaining columns of X ('Parent', 'English') appear in neither list.

num_features = [
    'Age',
    'Sibling',
    'Placetest',
]
cat_features = [
    'Sex',
    'Education',
]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Preprocessing pipeline for the numerical features.
# A Pipeline is an ordered list of (name, transformer) steps:
#   Pipeline(steps=[(name1, transform1), (name2, transform2), ...])
# The step names are arbitrary labels; they are reused later as prefixes
# when specifying hyperparameters for the grid search.
#
# Step 'num_imputer': fill missing values, if any (SimpleImputer defaults to
#                     the mean; other strategies are tuned later)
# Step 'scaler':      standardize each feature to zero mean and unit variance
# see standardization: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
num_steps = [
    ('num_imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Preprocessing pipeline for the categorical features.
# Step 'cat_imputer': fill missing values, if any, with the most frequent value
# Step 'onehot':      one-hot encode each category
#
# handle_unknown='ignore' makes the encoder output all zeros for a category
# it did not see while fitting (e.g. a value absent from one cross-validation
# fold, or a new value at prediction time) instead of raising an error.

cat_pipeline = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Combine the two pipelines into a single preprocessor: each pipeline is
# applied only to the feature columns listed alongside it.
from sklearn.compose import ColumnTransformer

column_routes = [
    ('num_pipeline', num_pipeline, num_features),
    ('cat_pipeline', cat_pipeline, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Specify the model to use, which is DecisionTreeClassifier, and build the
# full pipeline by chaining the preprocessor and the model.
#
# random_state is fixed because the tree permutes the features randomly at
# every split; without it, repeated runs can produce different trees and
# therefore different grid-search results.
from sklearn.tree import DecisionTreeClassifier

pipeline_dt = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('clf_dt', DecisionTreeClassifier(random_state=42)),
    ]
)

In [None]:
# Fine-tune the model with a grid search using K-fold cross-validation (K=10),
# scored by accuracy. Training scores are not recorded (return_train_score is
# left at its default).
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to evaluate.
# Each key is the "full path" to a parameter: the step names joined by two
# underscores, followed by the parameter name.
#
# 2 imputer strategies x 2 split criteria x 5 tree depths = 20 combinations
tree_search_space = {
    'preprocessor__num_pipeline__num_imputer__strategy': ['mean', 'median'],
    'clf_dt__criterion': ['gini', 'entropy'],
    'clf_dt__max_depth': [3, 4, 5, 6, 7],
}
param_grid_dt = [tree_search_space]

# Set up the grid search
grid_search_dt = GridSearchCV(
    estimator=pipeline_dt,
    param_grid=param_grid_dt,
    scoring='accuracy',
    cv=10,
)

In [None]:
# Run the grid search on the training set: every parameter combination is
# cross-validated using the full pipeline (preprocessing + classifier).
grid_search_dt.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num_pipeline',
                                                                         Pipeline(steps=[('num_imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['Age',
                                                                          'Sibling',
                                                                          'Placetest']),
                                                                        ('cat_pipeline',
                                                                   

In [None]:
# Show the parameter combination with the best cross-validation score
grid_search_dt.best_params_

{'clf_dt__criterion': 'entropy',
 'clf_dt__max_depth': 4,
 'preprocessor__num_pipeline__num_imputer__strategy': 'mean'}

In [None]:
# Mean cross-validated accuracy of the best decision tree configuration
# (a cross-validation score on the training data, not a score on the test set)
grid_search_dt.best_score_

0.7921361502347418

In [None]:
# Keep the best pipeline found by the grid search for evaluation and saving
clf_best = grid_search_dt.best_estimator_

In [None]:
# Final check on the held-out test set.
# Calling predict on the pipeline applies all preprocessing steps to X_test
# and then runs the classifier on the result.
from sklearn.metrics import accuracy_score

y_pred = clf_best.predict(X_test)

# y_test is the ground truth for the test set. A test accuracy similar to the
# cross-validation score is a good sign.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy Score : {test_accuracy}')

Accuracy Score : 0.7988826815642458


## Persist the Model
The following code shows how to save the trained model to a pickle file, which can later be loaded to make predictions.

In [None]:
# Save the best pipeline (preprocessing + classifier) to a file with joblib.
# joblib.dump returns the list of file names it wrote.
import joblib
joblib.dump(clf_best, "clf-linen-best.pickle")

['clf-linen-best.pickle']

In [None]:
# Load the pipeline back from the file saved above and display it.
# NOTE: loading a pickled file can execute arbitrary code - only load files
# that come from a trusted source.
saved_tree_clf = joblib.load("clf-linen-best.pickle")
saved_tree_clf

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_pipeline',
                                                  Pipeline(steps=[('num_imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Sibling',
                                                   'Placetest']),
                                                 ('cat_pipeline',
                                                  Pipeline(steps=[('cat_imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder())]),
             

In [None]:
# First sample user: a one-row frame holding the feature columns used by the model
userid1_row = {
    'Education': 3,
    'Sex': 'male',
    'Age': 23,
    'Sibling': 0,
    'Placetest': 70,
}
userid1 = pd.DataFrame([userid1_row])
userid1

Unnamed: 0,Education,Sex,Age,Sibling,Placetest
0,3,male,23,0,70


In [None]:
# Second sample user: a one-row frame holding the feature columns used by the model
userid2_row = {
    'Education': 1,
    'Sex': 'female',
    'Age': 21,
    'Sibling': 0,
    'Placetest': 80,
}
userid2 = pd.DataFrame([userid2_row])
userid2

Unnamed: 0,Education,Sex,Age,Sibling,Placetest
0,1,female,21,0,80


In [None]:
# Predict for userid1 with the reloaded pipeline; the recorded result is 0 (not passed)
pred1 = saved_tree_clf.predict(userid1)
pred1

array([0])

In [None]:
# Predict for userid2 with the reloaded pipeline; the recorded result is 1 (passed)
pred2 = saved_tree_clf.predict(userid2)
pred2

array([1])