- Analyze tips data from seaborn
    - Features = Total bill, sex, smoker, day, and time
    - Target = Tips
- Preprocess
    - One hot encoding: smoker, day, time, sex
    - no treatment: numerical
- Random state 2020, splitting 80:20
    - Model linear regression and tree (criterion mse, max_depth 5) compute R2 uing 5 fold CV
    - Model decision tree (criterion mse, max_depth 5) compute mse in test set
    - Do hyperparameter tuning (Randomized search) for decision tree (optimize criterion, min sample leaf, max depth) optimized by sme and using cv 5 fold
    - Compare the result before and after in test set

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, classification_report, f1_score
from sklearn.metrics import roc_curve, roc_auc_score, plot_roc_curve, precision_recall_curve, average_precision_score, plot_precision_recall_curve

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
x = df[['total_bill', 'sex', 'smoker', 'day', 'time']]
x = pd.get_dummies(x, drop_first= True)
y = df['tip']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 2020
)

# MODEL

In [14]:
tree = DecisionTreeRegressor(criterion='mse', max_depth=5, random_state=2020)
linreg = LinearRegression()

In [18]:
tree_cv = cross_val_score(tree, x_train, y_train, scoring = 'r2')
linreg_cv = cross_val_score(linreg, x_train, y_train, scoring = 'r2')

In [20]:
print('Tree')
print('Hasil Cross Validasi', tree_cv)
print('Hasil Mean Cross Validasi', tree_cv.mean())
print('Hasil STD Cross Validasi', tree_cv.std())

Tree
Hasil Cross Validasi [0.11865437 0.09663697 0.13388799 0.60844872 0.41722998]
Hasil Mean Cross Validasi 0.27497160666626597
Hasil STD Cross Validasi 0.2037583555823742


In [21]:
print('Linreg')
print('Hasil Cross Validasi', linreg_cv)
print('Hasil Mean Cross Validasi', linreg_cv.mean())
print('Hasil STD Cross Validasi', linreg_cv.std())

Linreg
Hasil Cross Validasi [0.27243371 0.18984475 0.31816682 0.46456813 0.44769515]
Hasil Mean Cross Validasi 0.33854171153933316
Hasil STD Cross Validasi 0.10458807015801638


# Tree test score

In [22]:
tree = DecisionTreeRegressor(criterion='mse', max_depth=5, random_state=2020)
tree.fit(x_train,y_train)

DecisionTreeRegressor(max_depth=5, random_state=2020)

In [23]:
y_pred = tree.predict(x_test)
mean_squared_error(y_test, y_pred)

1.1723927020084088

# Hyperparameter Tuning untuk Tree

In [29]:
hyperparam = {
    'min_samples_leaf': [1,5,10,15,20,50,100], # 6
    'max_depth': [2,3,4,5,6,7], # 6
    'criterion': ['mse', 'mae'] # 2
}

# 72 kombinasi yang akan dibandingkan
# pilih 20 dari 72
# random state 2020 ada 10 =  1 3 4 10

tree = DecisionTreeRegressor(criterion='mse', max_depth=5, random_state=2020)

randomized_search = RandomizedSearchCV(
    tree,
    param_distributions = hyperparam,
    n_iter = 20,
    cv = 5,
    scoring = 'r2',
    n_jobs = -1
)


In [30]:
randomized_search.fit(x_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=DecisionTreeRegressor(max_depth=5,
                                                   random_state=2020),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'criterion': ['mse', 'mae'],
                                        'max_depth': [2, 3, 4, 5, 6, 7],
                                        'min_samples_leaf': [1, 5, 10, 15, 20,
                                                             50, 100]},
                   scoring='r2')

In [31]:
randomized_search.best_score_

0.42723761679129024

In [32]:
randomized_search.best_params_

{'min_samples_leaf': 1, 'max_depth': 3, 'criterion': 'mse'}

In [35]:
hasil_cv = pd.DataFrame(randomized_search.cv_results_)
hasil_cv[hasil_cv['param_min_samples_leaf'] == 1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,0.006209,0.001171,0.004598,0.002344,1,3,mae,"{'min_samples_leaf': 1, 'max_depth': 3, 'crite...",0.27918,0.187959,0.276184,0.671596,0.508687,0.384721,0.178493,2
7,0.006296,0.002044,0.003405,0.001012,1,2,mae,"{'min_samples_leaf': 1, 'max_depth': 2, 'crite...",0.349978,0.182995,0.162519,0.507202,0.331705,0.30688,0.125531,5
13,0.004373,0.000471,0.004787,0.002034,1,3,mse,"{'min_samples_leaf': 1, 'max_depth': 3, 'crite...",0.401605,0.344567,0.172655,0.660094,0.557267,0.427238,0.169371,1
