In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

In [None]:
import numpy as np

# Trees

In [None]:
import pandas as pd

In [None]:
# Read the CCPP spreadsheet into a DataFrame (pandas' default sheet selection).
data = pd.read_excel(io='data/CCPP/Folds5x2_pp.xlsx')

In [None]:
# Average every column over rows that share the same V, then keep only the
# rows whose V exceeds 30.
smoothed = (
    data
    .groupby('V', as_index=False)
    .mean()
    .loc[lambda frame: frame['V'] > 30]
)

In [None]:
# Dot plot of the per-V averages: PE against V.
plt.plot(smoothed['V'], smoothed['PE'], '.')
plt.xlabel('Vacuum Pressure')
plt.ylabel('Power Output')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Add the squared V term used by the quadratic fit.
# `smoothed` was produced by boolean filtering, so writing a column into it in
# place can raise pandas' SettingWithCopyWarning. assign() returns a new frame
# instead, which also keeps this cell safe to re-run.
smoothed = smoothed.assign(Vsq=smoothed.V**2)

In [None]:
# Least-squares fit of PE on the two columns [V, Vsq], i.e. a quadratic in V.
lreg = LinearRegression().fit(
    X=smoothed[['V', 'Vsq']],
    y=smoothed['PE'],
)

In [None]:
# Data points plus the fitted quadratic, evaluated on a dense grid of V values.
xs = np.arange(35, 80, .1)
# Rows are (V, V^2), matching the column order the regression was fitted on.
grid_features = [(v, v**2) for v in xs]

plt.plot(smoothed['V'], smoothed['PE'], '.', alpha=0.5, label='data')
plt.plot(xs, lreg.predict(grid_features), label='linear regression')
plt.xlabel('Vacuum Pressure')
plt.ylabel('Power Output')
plt.legend()
plt.savefig('../../Sparebank/talks/img/lr.png', dpi=300)

In [None]:
# Depth-2 regression tree predicting PE from V alone.
tree = DecisionTreeRegressor(max_depth=2).fit(
    X=smoothed[['V']],
    y=smoothed['PE'],
)

In [None]:
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image

In [None]:
# Export the fitted tree as DOT source text (out_file=None returns a string).
# NOTE(review): class_names is passed although `tree` is a regressor; it
# presumably has no effect here - confirm against the sklearn docs.
dot = export_graphviz(
    tree,
    out_file=None,
    feature_names=['V'],
    class_names=['PE'],
    filled=True,
    rounded=True,
)

In [None]:
# Parse the DOT text into a pydotplus graph and show its PNG rendering inline.
graph = pydotplus.graph_from_dot_data(dot)
Image(data=graph.create_png())

In [None]:
# Write the rendered regression-tree graph to a PNG in the talks image folder.
graph.write_png('../../Sparebank/talks/img/reg_tree.png')

In [None]:
# Compare the quadratic regression and the depth-2 tree on a dense grid of V.
xs = np.arange(35, 80, .1)
# Rows are (V, V^2), matching the column order the regression was fitted on.
grid_features = [(v, v**2) for v in xs]
# The tree was fitted on a single column, so it needs an (n, 1) input.
grid_column = xs.reshape(-1, 1)

plt.plot(smoothed['V'], smoothed['PE'], '.', alpha=0.5, label='data')
plt.plot(xs, lreg.predict(grid_features), label='linear regression')
plt.plot(xs, tree.predict(grid_column), label='decision tree')
plt.xlabel('Vacuum Pressure')
plt.ylabel('Power Output')
plt.legend()
plt.savefig('../../Sparebank/talks/img/lr_vs_tree.png', dpi=300)

In [None]:
# Depth-5 regression tree on V -> PE (the "deep" model in the depth comparison).
deep_tree = DecisionTreeRegressor(max_depth=5).fit(
    X=smoothed[['V']],
    y=smoothed['PE'],
)

In [None]:
# Depth-1 regression tree on V -> PE (a single split).
shallow_tree = DecisionTreeRegressor(max_depth=1).fit(
    X=smoothed[['V']],
    y=smoothed['PE'],
)

In [None]:
# Overlay the deep and shallow tree predictions on the data.
xs = np.arange(35, 80, .1)
# Both trees were fitted on a single column, so they need an (n, 1) input.
grid_column = xs.reshape(-1, 1)

plt.plot(smoothed['V'], smoothed['PE'], '.', alpha=0.5, label='data')
plt.plot(xs, deep_tree.predict(grid_column), label='deep tree')
plt.plot(xs, shallow_tree.predict(grid_column), label='shallow tree')
plt.xlabel('Vacuum Pressure')
plt.ylabel('Power Output')
plt.legend()
plt.savefig('../../Sparebank/talks/img/tree_depth.png', dpi=300)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows and record train/test mean squared error for tree
# depths 1..10.
# The split is seeded: without random_state every run draws a different test
# set, so the error curves (and whichever depth looks best on them) would
# change from one "Run All" to the next.
train, test = train_test_split(smoothed, test_size=0.2, random_state=0)
MSE_train, MSE_test = [], []
ds = range(1, 11)
for d in ds:
    model_d = DecisionTreeRegressor(max_depth=d, random_state=0).fit(train[['V']], train.PE)
    # Mean squared residual on the rows the tree was fitted on ...
    MSE_train.append(np.mean((model_d.predict(train[['V']]) - train.PE)**2))
    # ... and on the held-out rows.
    MSE_test.append(np.mean((model_d.predict(test[['V']]) - test.PE)**2))

In [None]:
# Train and test error as a function of tree depth.
plt.plot(ds, MSE_test, label='test error')
plt.plot(ds, MSE_train, label='train error')
plt.xlabel('tree depth')
plt.ylabel('mean squared error')
plt.legend()
plt.savefig('../../Sparebank/talks/img/tree_depth_train_test.png', dpi=300)

In [None]:
# Fit a depth-3 tree on all of `smoothed` and draw its predictions over the data.
xs = np.arange(35, 80, .1)
ideal_tree = DecisionTreeRegressor(max_depth=3).fit(smoothed[['V']], smoothed['PE'])

plt.plot(smoothed['V'], smoothed['PE'], '.', alpha=0.5, label='data')
plt.plot(xs, ideal_tree.predict(xs.reshape(-1, 1)), label='ideal depth')
plt.xlabel('Vacuum Pressure')
plt.ylabel('Power Output')
plt.legend()

In [None]:
# Load the red-wine table; the file uses ';' as its field separator.
red_wine = pd.read_csv(
    filepath_or_buffer='data/winequality-red.csv',
    sep=';',
)

In [None]:
# Scatter of quality (y) against alcohol (x).
red_wine.plot.scatter(x='alcohol', y='quality')
plt.title('red wine quality vs. alcohol content')

In [None]:
# Bar chart of the number of rows per quality value.
red_wine.groupby('quality')['quality'].count().plot.bar()
plt.savefig('../../Sparebank/talks/img/winequals_bar.png')

In [None]:
# Pairwise correlation matrix of the columns (Pearson is pandas' default method).
red_wine.corr(method='pearson')

In [None]:
# Depth-3 regression tree predicting quality from alcohol and sulphates.
wine_tree = DecisionTreeRegressor(max_depth=3).fit(
    red_wine[['alcohol', 'sulphates']],
    red_wine['quality'],
)

# Evaluate the tree on a regular (alcohol, sulphates) grid; Z has the grid's
# 2-D shape.
alcohol_axis = np.arange(8, 15, .05)
sulphates_axis = np.arange(0.4, 2, 0.01)
xx, yy = np.meshgrid(alcohol_axis, sulphates_axis)
X, Y = xx.ravel(), yy.ravel()
Z = wine_tree.predict(np.column_stack((X, Y))).reshape(xx.shape)

In [None]:
from pylab import get_cmap

In [None]:
# Scatter alcohol vs. sulphates for the quality-5 and quality-7 wines only.
# The loop variable is deliberately NOT called `data`: that name holds the CCPP
# DataFrame loaded earlier, and rebinding it here would break any earlier cell
# that is re-run afterwards.
quality_labels = {5: 'low quality',
                  6: 'medium quality',
                  7: 'high quality'}
for qual, wines in red_wine.groupby('quality'):
    if qual in [5,7]:
        plt.plot(wines.alcohol, wines.sulphates, '.', alpha=0.5, label=quality_labels[qual])
#plt.contourf(xx, yy, Z, cmap=get_cmap('BuGn'), alpha=0.6)
plt.xlabel('alcohol content')
plt.ylabel('sulphates')
plt.legend()
plt.savefig('../../Sparebank/talks/img/winequal.png', dpi=300)

In [None]:
# DOT source for the two-feature wine tree (out_file=None returns a string).
wine_dot = export_graphviz(
    wine_tree,
    out_file=None,
    feature_names=['alcohol', 'sulphates'],
    filled=True,
    rounded=True,
)

In [None]:
# Parse the DOT text and show the wine tree's PNG rendering inline.
wine_graph = pydotplus.graph_from_dot_data(wine_dot)
Image(data=wine_graph.create_png())

In [None]:
# Column names of the wine table as a plain Python list.
wine_vars = red_wine.columns.tolist()

In [None]:
# Drop the target column from the feature list.
# Guarded so the cell can be re-run: a bare list.remove() raises ValueError the
# second time, once 'quality' is already gone.
if 'quality' in wine_vars:
    wine_vars.remove('quality')

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Train/test accuracy of a classifier for "quality > 4" at tree depths 1..14.
# Both the split and the trees are seeded so the curves are identical on every
# run; unseeded, each execution draws a different 30% test set.
Xtr, Xte, ytr, yte = train_test_split(red_wine[wine_vars], red_wine.quality > 4,
                                      test_size=0.3, random_state=0)
hr_train, hr_test = [], []
ds = range(1, 15)
for d in ds:
    model_d = DecisionTreeClassifier(max_depth=d, random_state=0).fit(Xtr, ytr)
    # Hit rate = fraction of predictions that equal the true label.
    hr_train.append(np.mean(model_d.predict(Xtr) == ytr))
    hr_test.append(np.mean(model_d.predict(Xte) == yte))
plt.plot(ds, hr_test, label='test accuracy')
plt.plot(ds, hr_train, label='train accuracy')
plt.legend()
plt.xlabel('tree depth')
plt.ylabel('accuracy')
# The target is the strict comparison quality > 4, i.e. quality 5 and up; the
# previous title ("4 or higher") described a different threshold.
plt.title('Quality 5 or higher?')
plt.savefig('../../Sparebank/talks/img/tree_depth_train_test_wine_bad.png', dpi=300)

In [None]:
# Train/test accuracy of a classifier for "quality > 6" at tree depths 1..14.
# Both the split and the trees are seeded so the curves - and the
# Xtr/Xte/ytr/yte that later cells reuse - are identical on every run.
Xtr, Xte, ytr, yte = train_test_split(red_wine[wine_vars], red_wine.quality > 6,
                                      test_size=0.33, random_state=0)
hr_train, hr_test = [], []
ds = range(1, 15)
for d in ds:
    model_d = DecisionTreeClassifier(max_depth=d, random_state=0).fit(Xtr, ytr)
    # Hit rate = fraction of predictions that equal the true label.
    hr_train.append(np.mean(model_d.predict(Xtr) == ytr))
    hr_test.append(np.mean(model_d.predict(Xte) == yte))
plt.plot(ds, hr_test, label='test accuracy')
plt.plot(ds, hr_train, label='train accuracy')
plt.legend()
plt.xlabel('tree depth')
plt.ylabel('accuracy')
# The target is the strict comparison quality > 6, i.e. quality 7 and up; the
# previous title ("6 or higher") described a different threshold.
plt.title('Quality 7 or higher?')
plt.savefig('../../Sparebank/talks/img/tree_depth_train_test_wine_great.png', dpi=300)

In [None]:
# Fit a depth-3 classifier on the current Xtr/ytr split and chart only the
# features whose importance is non-zero.
dt = DecisionTreeClassifier(max_depth=3).fit(Xtr, ytr)
relevant = dt.feature_importances_ > 0
labels = np.array(wine_vars)[relevant]
positions = range(len(labels))

plt.bar(positions, dt.feature_importances_[relevant])
plt.xticks(positions, labels, rotation='vertical')
plt.title('What makes a great wine?')
plt.ylabel('importance')
plt.savefig('../../Sparebank/talks/img/wine_importance.png', dpi=300, bbox_inches='tight')

In [None]:
# Show the feature names next to the full importance vector (zeros included).
(wine_vars, dt.feature_importances_)

In [None]:
# Depth-2 classifier on the current Xtr/ytr split.
dtm = DecisionTreeClassifier(max_depth=2).fit(X=Xtr, y=ytr)

In [None]:
# For every distinct test label print: the label, the hit rate of `dtm` on the
# test rows carrying that label, and the share of test rows with that label.
for qual in sorted(yte.unique()):
    in_class = yte == qual
    hr = np.mean(dtm.predict(Xte[in_class]) == yte[in_class])
    perc = in_class.sum() / float(len(yte))
    # A single pre-formatted string prints the same text under Python 2 and 3;
    # the former `print a, b` statement is a SyntaxError on Python 3.
    print("{} {:4.2f} {:4.2f}".format(qual, hr, perc))

In [None]:
# Refit `dtm` at depth 6, then for every distinct test label print: the label,
# the hit rate on the test rows carrying that label, and the share of test rows
# with that label.
dtm = DecisionTreeClassifier(max_depth=6).fit(Xtr, ytr)
for qual in sorted(yte.unique()):
    in_class = yte == qual
    hr = np.mean(dtm.predict(Xte[in_class]) == yte[in_class])
    perc = in_class.sum() / float(len(yte))
    # A single pre-formatted string prints the same text under Python 2 and 3;
    # the former `print a, b` statement is a SyntaxError on Python 3.
    print("{} {:4.2f} {:4.2f}".format(qual, hr, perc))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier().fit(Xtr, ytr > 5)

In [None]:
from sklearn.metrics import roc_curve

In [None]:
fpr, tpr, thresholds = roc_curve(yte > 5, rf.predict_proba(Xte)[:,1], drop_intermediate=False)

In [None]:
# ROC curve with the diagonal from (0, 0) to (1, 1) drawn for reference.
diagonal = [0, 1]
plt.plot(fpr, tpr)
plt.plot(diagonal, diagonal)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.savefig('../../Sparebank/talks/img/ROC.png', dpi=300)