In [None]:
# Uncomment and run once if the ISLP package (imported below for `load_data`)
# is not yet installed in the kernel's environment.
#%pip install ISLP

In [None]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
from ISLP import load_data , confusion_table
from sklearn.ensemble import (RandomForestClassifier as RFC,
                              RandomForestRegressor as RF,
                              GradientBoostingRegressor as GBR)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)

In [None]:
# Load the "Wage" data set through ISLP's loader and preview the first rows.
wage_raw = load_data("Wage")
wage_raw.head(n=5)

In [None]:
# One-hot encode the categorical predictors into indicator columns.
# `health` is deliberately left out of the list: it is used un-encoded as the
# response of the classification tree.
cat_vars = [
    'maritl', 'race', 'education',
    'region', 'jobclass', 'health_ins',
]
wage_df = wage_raw.pipe(pd.get_dummies, columns=cat_vars)
wage_df.head()

# Classification

* response variable: `health`
* explanatory variables: all except `logwage`

In [None]:
# Predictors are every column except the response (`health`) and `logwage`.
X = wage_df.drop(columns=['health', 'logwage'])
y = wage_df['health']

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=301,
)

In [None]:
# Fit a shallow (depth-3) classification tree using the entropy criterion.
wage_tree = DTC(criterion='entropy', max_depth=3, random_state=301)
wage_tree.fit(X_train, y_train)

# Draw the fitted tree on a large axes so the node labels stay legible.
fig, ax = subplots(figsize=(12, 12))
# Feature names are passed as a plain list: a list of str is the documented
# form in every scikit-learn release, whereas a pandas Index is not accepted by
# all of them (NOTE(review): confirm against the installed version).
# The trailing semicolon keeps the list of annotation objects returned by
# plot_tree out of the cell output, leaving only the figure.
plot_tree(wage_tree,
          feature_names=list(X_train.columns),
          ax=ax);

## Bagging

**Bagging** is the special case of a *random forest* in which every split may consider all of the explanatory variables (i.e. `max_features` equals the number of predictors).

In [None]:
# One-hot encode every categorical column, this time including `health`, so
# the response becomes a pair of indicator columns.
cat_vars = ['maritl', 'race', 'education', 'region', 'jobclass', 'health', 'health_ins']
wage_df = pd.get_dummies(wage_raw, columns = cat_vars)

# Response: the indicator for "health >= Very Good". Both health indicators are
# removed from the predictors (either one would reveal the response), and
# `logwage` is excluded as well.
X = wage_df.drop(['health_1. <=Good', 'health_2. >=Very Good', 'logwage'], axis = 1)
y = wage_df['health_2. >=Very Good']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 301)

# The response is a binary class label, so a classification forest is used
# rather than a regression forest: its predictions are class labels, which is
# what confusion_matrix and accuracy_score require.
bag_wage = RFC(max_features = X_train.shape[1], # i.e. use all of the predictor variables
               random_state = 301)
bag_wage.fit(X_train, y_train)

# Evaluate the bagged model on the held-out test set.
y_pred = bag_wage.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))

We can arrange the explanatory variables in order of *importance*, as measured by the fitted model's `feature_importances_`:

In [None]:
# Collect the bagged model's importance score for each predictor into a
# one-column table labelled by predictor name, then show the most important first.
feature_imp = pd.Series(
    bag_wage.feature_importances_,
    index=X_train.columns,
    name='importance',
).to_frame()
feature_imp.sort_values('importance', ascending=False)

# Regression Trees

* response variable: `logwage`
* explanatory variables: all except `wage`


In [None]:
# One-hot encode every categorical column (including `health`, which is a
# predictor in the regression setting).
cat_vars = ['maritl', 'race', 'education', 'region', 'jobclass', 'health', 'health_ins']
wage_df = pd.get_dummies(wage_raw, columns = cat_vars)

# Response is `logwage`. `wage` is excluded from the predictors as well
# (presumably because it determines `logwage` directly -- confirm).
X = wage_df.drop(columns = ['logwage', 'wage'])
y = wage_df['logwage']

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 301)

## Boosting

In [None]:
# Gradient-boosted regression trees: 500 trees of depth 4, learning rate 0.1.
boost_params = dict(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=4,
    random_state=301,
)
boost_wage = GBR(**boost_params).fit(X_train, y_train)

# Test-set mean squared error of the boosted model.
y_pred = boost_wage.predict(X_test)
residuals = y_test - y_pred
print("MSE:", np.mean(residuals**2))

In [None]:
# Collect the boosted model's importance score for each predictor into a
# one-column table labelled by predictor name, then show the most important first.
feature_imp = pd.Series(
    boost_wage.feature_importances_,
    index=X_train.columns,
    name='importance',
).to_frame()
feature_imp.sort_values('importance', ascending=False)