In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE: this silences *every* warning for the rest of the notebook, including
# deprecation warnings raised by numpy / scikit-learn / matplotlib.
warnings.filterwarnings("ignore")

# set defaults
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed, so use whichever name this install provides.
plot_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(plot_style)   # seaborn custom plot style
plt.rc('figure', dpi=100, figsize=(7, 5))   # set default size/resolution
plt.rc('font', size=12)   # font size

tips = sns.load_dataset('tips')

# Examples

Outline

* Grid Search / Classification.
* Regression with feature selection (multicollinearity)
* Text model (bag of words /  tfidf).

### Example: Predicting Diabetes

* Given health measurements, can you predict whether an individual will develop diabetes?
* Classification of health outcomes: 0=NO DIABETES and 1=DIABETES
* We will use 'accuracy' to evaluate the model (not necessarily the best choice!)

In [None]:
diabetes = pd.read_csv('data/diabetes.csv')
diabetes.head()

In [None]:
diabetes[['Pregnancies','Insulin', 'BMI', 'Glucose', 'Outcome']].describe()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [None]:
# features
X = diabetes.drop('Outcome', axis=1)
# outcome
y = diabetes.Outcome

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12) # 70% training and 30% test

In [None]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

In [None]:
# proportion for correct values
clf.score(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

In [None]:
# look at the importance of each feature in our tree model
# _: attributes created when model is fit. Contains info that it learned from the data
dict(zip(X.columns, clf.feature_importances_))

In [None]:
# maximum depth
clf.tree_.max_depth

In [None]:
# node count
clf.tree_.node_count

In [None]:
clf_small = DecisionTreeClassifier(max_depth=4)
clf_small.fit(X_train, y_train)
clf_small.score(X_train, y_train)

In [None]:
clf_small.score(X_test, y_test)

In [None]:
from util import tree_to_code

In [None]:
# Render the depth-capped tree (max_depth=4) with the `tree_to_code` helper
# imported from `util`; the helper's output format is not visible in this notebook.
tree_to_code(clf_small, X.columns)

In [None]:
# Same rendering for `clf`, the tree fitted with default settings.
tree_to_code(clf, X.columns)

## How to select model parameters?

* Each combination of model parameters requires: 
    - fitting a model; evaluating the model; comparing performance.
* This leads to training perhaps *thousands* of models!
    - How to do this in a computationally feasible way?
    - How to keep your code clean to keep track?

## How to select model parameters?

* How to select in a computationally feasible way?
    - Trying every combination ('grid search') works for smaller data/models.
    - Sampling possible combinations of parameters using probabilistic reasoning.
    
* How to keep your code clean to keep track?
    - The `sklearn.model_selection` package.

### Example: parameter search for diabetes prediction

* Decision tree has possible parameters:
    - `max_depth`, `min_samples_split`, `min_samples_leaf`
    - plus others we won't consider
    
* We will use `GridSearchCV` to explore parameter combinations using cross-validation

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
X.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
# How many combinations are there to search through?

parameters = {
    'max_depth': [2,3,4,5,7,10,13,15,18,None], 
    'min_samples_split':[2,3,5,7,10,15,20],
    'min_samples_leaf':[2,3,5,7,10,15,20]
}

In [None]:
[len(v) for v in parameters.values()]

In [None]:
clf = GridSearchCV(DecisionTreeClassifier(), parameters, cv = 5)

In [None]:
clf.fit(X_train, y_train)

In [None]:
clf#.best_params_

In [None]:
# number of competing models
len(clf.cv_results_['mean_test_score'])

In [None]:
# total number of models fit = (# of param combos) x (5 folds)
490 * 5

### Was the "best parameter" the only "best choice"?

* Plot the histogram of scores across all models
* Many high values implies many good model choices
    - Robust model
* Plot the histogram of standard deviations for the CVs
    - many small numbers imply most parameters give stable models.

In [None]:
# Histogram of mean cross-validation scores: one value per model,
# i.e. per choice of parameters.
mean_scores = clf.cv_results_['mean_test_score']
fig, ax = plt.subplots()
ax.hist(mean_scores, bins=12)
fig.suptitle('accuracies on validation set for CV');

In [None]:
# If many parameter choices give high accuracy, that is a good sign.
# A model whose accuracy depends heavily on the exact parameter choice is likely overfitting.
# When many parameter choices perform well, the model is likely learning real patterns in the data.

In [None]:
# Histogram of the standard deviation of the 5-fold CV scores: one value per
# model, i.e. per choice of parameters.
score_stds = clf.cv_results_['std_test_score']
fig, ax = plt.subplots()
ax.hist(score_stds)
fig.suptitle('std deviation of CV scores');

In [None]:
# A high standard deviation of accuracy means the model's performance varied a lot
# depending on how the data was split.
# A low standard deviation means the model performance is about the same on every split.
# ^^ This tells you how much fluctuation to expect in the accuracies plotted above.


In [None]:
# Keep a handle on the best estimator selected by the grid search.
clf_best = clf.best_estimator_

In [None]:
# Score of the fitted search object on the training rows.
# NOTE(review): presumably this scores with the best estimator found -- confirm
# against the GridSearchCV documentation.
clf.score(X_train, y_train)

In [None]:
# Score of the fitted search object on the held-out test rows.
clf.score(X_test, y_test)

In [None]:
# Render the selected tree with the `tree_to_code` helper imported from `util`.
tree_to_code(clf_best, X.columns)

# Multicollinearity

## Regression with Multicollinearity

* Linear regression with (perfectly) correlated features leads to high variance (unstable) models.
* When the dataset is ~1-dimensional in 3-dimensional space, fitting a plane is under-determined.
* Regression returns a plane of best fit, but other planes that look very different give nearly the same fitting error: the model is overfit.
* Use Principal Component Analysis to drop unneeded features.

<img src="imgs/multicollinearity.png" width="50%">

## Regression: predicting tips data

* Use quantitative feature "as is".
* Use all features: one-hot encode categorical features.
    - Many perfectly correlated features: e.g. `is_Lunch` and `is_Dinner`.

In [None]:
# Load seaborn's 'tips' example dataset.
tips = sns.load_dataset('tips')

In [None]:
# Preview the first rows.
tips.head()

## Determining Correlated features

How many columns can we keep without losing information?

In [None]:
# Columns treated as categorical in this section.
catcols = ['sex', 'smoker', 'day', 'time']
# get_dummies is used here only to illustrate the encoding -- NOT for model building!
tips_dummies = pd.get_dummies(tips)
tips_dummies.head()

## Creating a Regression Pipeline
* Drop correlated features using Principal Component Analysis (PCA)
* Categorical Variables: One-Hot Encoding => PCA (drop correlated features)
* Quantitative Variables: Pass-through
* Estimator: Linear Regression

In [None]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

In [None]:
# How much information PCA drops is controlled by its `n_components` argument;
# the default keeps every component, i.e. drops nothing.
# `svd_solver='full'` only selects the exact (full) SVD algorithm, which
# supports giving `n_components` as a fraction of variance to keep.
#
# OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so try the new spelling first and fall
# back to the old one on older installs.
try:
    dense_ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(sparse=False)

# Categorical columns: one-hot encode, then PCA.
cats = Pipeline([
    ('ohe', dense_ohe),
    ('pca', PCA(svd_solver='full'))
])
catcols = ['sex', 'smoker', 'day', 'time']

# Quantitative columns: passed through unchanged (identity function).
nums = FunctionTransformer(lambda x:x)
numcols = ['total_bill', 'size']

ct = ColumnTransformer([('cat', cats, catcols), ('num', nums, numcols)])

# Full model: feature transformations followed by a linear regression.
pl = Pipeline([('feats', ct), ('lr', LinearRegression())])

### Use Parameter Grid-Search with Cross-Validation

* Inspect the parameters of the pipeline with `get_params`: use to get keys for parameter-grid.
* Train models using a grid search and analyze the results!

In [None]:
help(PCA)

In [None]:
pl.get_params().keys()

In [None]:
# None: keep all variables
# 0.90 : keep 90% of the information
params = {'feats__cat__pca__n_components':[.8, .85, .9, .95, None]}
grids = GridSearchCV(pl, param_grid=params, cv=5)

In [None]:
X_tr, X_ts, y_tr, y_ts = train_test_split(tips, tips.tip)
grids.fit(X_tr, y_tr)

In [None]:
# Parameter setting selected by the search.
grids.best_params_

In [None]:
# The best_score_ is the average over your cross-validation fold scores of the best model
# (best in exactly that sense: scores highest on average over folds)
grids.best_score_

In [None]:
# Full dictionary of per-candidate cross-validation results.
grids.cv_results_

In [None]:
# Score of the selected model on the held-out rows, which the search never saw.
grids.best_estimator_.score(X_ts, y_ts)

## Housing Price Prediction: Sales Price from House Characteristics

* Create a 'baseline' model: a model pipeline without ingenuity.
* Try to improve upon the baseline with creativity and domain knowledge.

In [None]:
housing = pd.read_csv('data/housing.csv')
housing.head()

In [None]:
housing.info()

In [None]:
# drop id and SalesPrice from features

X = housing.drop(['Id', 'SalePrice'], axis=1)
y = housing['SalePrice']

## Baseline Model

* Do the 'minimum' to fit a model and check the performance.
    - Quantitative: impute with zero and pass-through (Why is zero not a *terrible* choice?)
    - Categorical: One-hot encoding (what assumption is this making about the data?)

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Split the feature columns into categorical (object dtype) and everything else.
# `np.object` was only an alias of the builtin `object` and was removed in
# NumPy 1.24; comparing against `object` gives the same split on every version.
types = X.dtypes
is_object = types == object
catcols = types.loc[is_object].index
numcols = types.loc[~is_object].index

In [None]:
# OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so try the new spelling first and fall
# back to the old one on older installs.
try:
    dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill missing values with the constant 'NULL', then
# one-hot encode (categories unseen during fit are ignored at predict time).
cats = Pipeline([
    ('imp', SimpleImputer(strategy='constant', fill_value='NULL')),
    ('ohe', dense_ohe),
#    ('pca', PCA(svd_solver='full', n_components=0.99))
])

# Quantitative columns: fill missing values with 0 and pass through.
ct = ColumnTransformer([
    ('catcols', cats, catcols),
    ('numcols', SimpleImputer(strategy='constant', fill_value=0), numcols)
])

# Baseline model: feature transformations followed by a linear regression.
pl = Pipeline([('feats', ct), ('reg', LinearRegression())])

In [None]:
X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.25)

In [None]:
pl.fit(X_tr, y_tr)
pl.score(X_ts, y_ts)

In [None]:
preds = pl.predict(X_ts)
np.sqrt(np.mean(preds - y_ts)**2)

In [None]:
out = []
for _ in range(100):
    X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.25)
    pl.fit(X_tr, y_tr)
    out.append(pl.score(X_ts, y_ts))

In [None]:
pd.Series(out).plot(kind='hist', title='scores in 100 model builds', bins=np.linspace(0, 1, 10));

## Improved Model

Question: What features and model improvements can we use?

In [None]:
# Look at the first rows again when brainstorming features for the improved model.
housing.head()

## Improved Model

* If using linear regression: PCA to drop extra information.
* Better techniques for categorical encoding? (Ordinal vs Nominal?)
* Better techniques for imputation?
* Other models to try? (Tree-based?)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so try the new spelling first and fall
# back to the old one on older installs.
try:
    dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill missing values with the constant 'NULL', one-hot
# encode, then PCA with `n_components=0.99`.
cats = Pipeline([
    ('imp', SimpleImputer(strategy='constant', fill_value='NULL')),
    ('ohe', dense_ohe),
    ('pca', PCA(svd_solver='full', n_components=0.99))
])

# Quantitative columns: fill missing values with 0 and pass through.
ct = ColumnTransformer([
    ('catcols', cats, catcols),
    ('numcols', SimpleImputer(strategy='constant', fill_value=0), numcols)
])

# Improved model: same feature layout, random forest instead of linear regression.
pl = Pipeline([('feats', ct), ('reg', RandomForestRegressor())])

In [None]:
X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.25)

In [None]:
pl.fit(X_tr, y_tr)
pl.score(X_ts, y_ts)

In [None]:
preds = pl.predict(X_ts)
np.sqrt(np.mean(preds - y_ts)**2)

## Predicting Reviews

In [None]:
reviews = pd.read_json(open('data/reviews.json'), lines=True)

In [None]:
reviews

In [None]:
# Feature: the review summary text.
X = reviews['summary']
# Label: 1 when the overall rating is above 3, else 0. Single brackets keep
# `y` a 1-D Series; a one-column DataFrame is a column vector, which
# scikit-learn classifiers warn about when a 1-D label is expected.
y = (reviews['overall'] > 3).astype(int)

X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.25)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [None]:
count_vec = CountVectorizer()

In [None]:
count_vec.fit(X_tr)

In [None]:
vocab = sorted(count_vec.vocabulary_, key=lambda x:x[1])

pd.DataFrame(
    data=count_vec.transform(X).toarray(),
    columns=vocab
).head()

In [None]:
pl = Pipeline([
    ('cv', CountVectorizer()), 
    ('clf', RandomForestClassifier(max_depth=8, n_estimators=7))
])

In [None]:
pl.fit(X_tr, y_tr)

In [None]:
pl.score(X_ts, y_ts)

In [None]:
clf = pl.named_steps['clf']

In [None]:
# What are the most important features?
pd.Series(
    data=clf.feature_importances_,
    index=vocab
).sort_values(ascending=False)

In [None]:
word = 'annoy'

reviews[reviews['summary'].str.lower().str.contains(word)]

In [None]:
reviews[reviews['summary'].str.lower().str.contains(word)].overall.plot(kind='hist')

In [None]:
pl.get_params().keys()

In [None]:
params = {
    'clf__max_depth': np.arange(2,500,20)
}

In [None]:
grids = GridSearchCV(pl, param_grid=params, cv=3, return_train_score=True)

In [None]:
grids.fit(X_tr, y_tr)

In [None]:
grids.cv_results_.keys()

In [None]:
index = grids.param_grid['clf__max_depth']
test = grids.cv_results_['mean_test_score']
train = grids.cv_results_['mean_train_score']

In [None]:
pd.DataFrame({'test': test, 'train': train}, index=index).plot()