In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Lecture 6

In [None]:
def e(p, r):
    """Return ``r`` raised to the power ``1/p``.

    ``r`` may be a scalar or a NumPy array; for an array the power is
    applied elementwise.
    """
    exponent = 1. / p
    return r ** exponent


# Values of r from 0 (inclusive) up to 1 (exclusive) in steps of 0.001.
rs = np.arange(0, 1, 0.001)
# One curve of e(p, rs) per value of p, labelled with that p.
for p in (1, 2, 4, 10):
    curve_label = r'$p = {}$'.format(p)
    edge_lengths = e(p, rs)
    plt.plot(rs, edge_lengths, label=curve_label)
plt.ylabel('edge length $e_p(r)$')
plt.xlabel('sampled fraction of observations $r$')
plt.legend()
# Write the finished figure to disk.
plt.savefig('../05-linreg/img/ep.png', dpi=300)

In [None]:
from scipy.stats import t, norm

In [None]:
# Density of the normal distribution with mean 0 and standard deviation 1,
# evaluated at 2 (shown as the cell output).
norm.pdf(2, 0, 1)

In [None]:
# Density of Student's t distribution with 100 degrees of freedom,
# evaluated at 1 (shown as the cell output).
t.pdf(1, 100)

In [None]:
dofs = (30, 50, 100)
zs = np.arange(3, 6, .01)
# Student-t densities on [3, 6), one curve per number of degrees of freedom.
for d in dofs:
    t_density = t.pdf(zs, d)
    plt.plot(zs, t_density, label="$t_{{{}}}$".format(d))
# Standard normal density on the same grid for comparison.
normal_density = norm.pdf(zs, 0, 1)
plt.plot(zs, normal_density, label="normal")
plt.legend()
plt.ylabel("P(Z)")
plt.xlabel("Z")
plt.savefig("../05-linreg/img/tails.png", dpi=300)

In [None]:
# Keep a reference to the figure size that is configured at this point.
fig_size = plt.rcParams['figure.figsize']

In [None]:
# Use a square 5 x 5 figure size for the figures created from here on.
plt.rcParams['figure.figsize'] = [5,5]

In [None]:
# Evaluate the ridge penalty theta_1^2 + theta_2^2 on a square grid over
# [-1.5, 1.5) x [-1.5, 1.5) and draw its labelled contour lines.
delta = 0.025
x = np.arange(-1.5, 1.5, delta)
y = np.arange(-1.5, 1.5, delta)
X, Y = np.meshgrid(x, y)
# Elementwise arithmetic on the grids yields one 2-D array directly, instead
# of a Python list of row arrays built by zipping over the rows of X and Y.
z_ridge = X**2 + Y**2
cs = plt.contour(X, Y, z_ridge)
plt.clabel(cs)
plt.xlabel(r'$\theta_1$')
plt.ylabel(r'$\theta_2$')
plt.title('Ridge penalty contour lines.')
plt.savefig('../05-linreg/img/ridge.png', dpi=300)

In [None]:
# Lasso penalty |theta_1| + |theta_2| on the grid X, Y built for the ridge plot.
# abs() on the arrays works elementwise, so the result is one 2-D array
# rather than a Python list of row arrays.
z_lasso = abs(X) + abs(Y)
cs = plt.contour(X, Y, z_lasso)
plt.clabel(cs)
plt.xlabel(r'$\theta_1$')
plt.ylabel(r'$\theta_2$')
plt.title('Lasso penalty contour lines.')
plt.savefig('../05-linreg/img/lasso.png', dpi=300)

In [None]:
# Elastic net penalty: alpha times the ridge term plus (1 - alpha) times the
# lasso term, evaluated on the grid X, Y built for the ridge plot.
alpha = 0.5
# Vectorised over the whole grid (one 2-D array instead of a list of rows).
z_elnet = alpha*(X**2 + Y**2) + (1-alpha)*(abs(X) + abs(Y))
cs = plt.contour(X, Y, z_elnet)
plt.clabel(cs)
plt.xlabel(r'$\theta_1$')
plt.ylabel(r'$\theta_2$')
# The title takes the value from `alpha`, so it cannot drift from the curve.
plt.title(r'Elastic net penalty contour lines, $\alpha = {}$.'.format(alpha))
plt.savefig('../05-linreg/img/elnet.png', dpi=300)

# Classification

In [None]:
# 2 x 2 matrix used as the covariance of every class below (scaled by 1.1),
# and one row of `means` per class.
_base_cov = np.array([[0.5, 0.2],
                      [0.2, 0.5]])
sigma = _base_cov * 1.1
means = np.array([
    [2, 1],
    [-1, 1],
    [0, -2],
])

In [None]:
from scipy.optimize import minimize_scalar

In [None]:
# Inverse of `sigma`, computed once and shared by every call to ne().
Sinv = np.linalg.inv(sigma)


def ne(X, mu):
    """Return the quadratic form (X - mu)^T Sinv (X - mu)."""
    diff = X - mu
    return diff.T.dot(Sinv).dot(diff)

In [None]:
def b(x, i, j):
    """Return the y where ne() is (numerically) equal for classes i and j.

    For the fixed first coordinate ``x``, the squared difference between
    ``ne`` evaluated against ``means[i]`` and against ``means[j]`` is
    minimised over the second coordinate with ``minimize_scalar``; the
    minimiser is returned.
    """
    mu_i, mu_j = means[i], means[j]

    def squared_gap(y):
        point = np.array([x, y])
        gap = ne(point, mu_i) - ne(point, mu_j)
        return gap**2

    return minimize_scalar(squared_gap).x

In [None]:
from itertools import combinations

In [None]:
N = 50
# Draw N samples per class and plot them with the class index as marker.
for n, mu in enumerate(means):
    x, y = np.random.multivariate_normal(mu, sigma, N).T
    plt.scatter(x, y, marker='${}$'.format(n))
# Each boundary segment: the x range it is drawn over and the pair of classes
# it separates. The segments are drawn in this order.
segments = (
    (np.arange(0.01, -3, -0.5), (1, 2)),
    (np.arange(0.05, 1.7, 0.2), (0, 1)),
    (np.arange(0.05, 3, 0.5), (0, 2)),
)
for xs, (cls_a, cls_b) in segments:
    plt.plot(xs, [b(x, cls_a, cls_b) for x in xs], color = 'black')
plt.title('class labels and decision boundaries')
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.savefig('../08-classification/img/linear_boundaries.png', dpi=300)

In [None]:
from scipy.stats import multivariate_normal

In [None]:
delta = 0.05
x = np.arange(-3, 4, delta)
y = np.arange(-4, 3, delta)
X, Y = np.meshgrid(x, y)
# pos[r, c] holds the point (X[r, c], Y[r, c]).
pos = np.empty(X.shape + (2,))
pos[:, :, 0] = X
pos[:, :, 1] = Y
# One set of labelled density contours per class mean (same sigma for all).
for i in range(3):
    p = multivariate_normal(means[i], sigma)
    z = p.pdf(np.array(pos))
    cs = plt.contour(X, Y, z)
    plt.clabel(cs)
plt.title('probability density contour lines')
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.savefig('../08-classification/img/countour_lines.png', dpi=300)

In [None]:
# New setup: three class means and a diagonal matrix with 0.05 on the
# diagonal as covariance.
sigma = np.array([[1, 0], [0, 1]])*0.05
means = [[-2, -1], [0, 0], [2, 1]]
N = 200
# X is a list with one (N, 2) sample array per entry of `means`.
X = []
for mean in means:
    X.append(np.random.multivariate_normal(mean, sigma, N))

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit one linear regression per class k on a 0/1 indicator target: the N rows
# that belong to class k get 1, all other rows get 0.
models = []
stacked_samples = np.vstack(X)
for k in range(len(means)):
    y = []
    for l in range(len(means)):
        indicator = 1 if l == k else 0
        y.extend([indicator] * N)
    models.append(LinearRegression().fit(stacked_samples, y))

In [None]:
def bl(x, i, j):
    """Return the y where models[i] and models[j] predict (numerically) the same value.

    For the fixed first coordinate ``x``, the squared difference of the two
    models' predictions at ``(x, y)`` is minimised over ``y`` with
    ``minimize_scalar``; the minimiser is returned.
    """
    model_i, model_j = models[i], models[j]

    def squared_gap(y):
        point = np.array([[x, y]])
        gap = model_i.predict(point) - model_j.predict(point)
        return gap**2

    return minimize_scalar(squared_gap).x

In [None]:
def classify(x):
    """Return the position in ``models`` of the model with the largest prediction for ``x``."""
    predictions = []
    for candidate in models:
        predictions.append(candidate.predict(x))
    return np.argmax(predictions)

In [None]:
from matplotlib.markers import MarkerStyle

In [None]:
# Training samples, drawn with their class index as marker.
for n, vals in enumerate(X):
    x, y = vals.T
    plt.scatter(x, y, marker='${}$'.format(n))
# Classify every point of an n x n grid and draw it, semi-transparent, with
# the predicted class index as marker.
n = 20
grid_x = np.linspace(-2, 2, n)
grid_y = np.linspace(-1.5, 1.5, n)
xs, ys = np.meshgrid(grid_x, grid_y)
xs, ys = xs.ravel(), ys.ravel()
labels = np.array([classify([[x ,y]]) for x,y in zip(xs, ys)])
for label in range(len(means)):
    chosen = labels == label
    plt.scatter(xs[chosen], ys[chosen], marker='${}$'.format(label), alpha=0.6)
#xs = np.arange(-0.35, 0.35, 0.01)
#plt.plot(xs, [bl(x, 0, 1) for x in xs], color='black')
#xs = np.arange(-1, 1.0, 0.01)
#plt.plot(xs, [bl(x, 1, 2) for x in xs], color='black')
#xs = np.arange(-0.65, 0.65, 0.01)
#plt.plot(xs, [bl(x, 0, 2) for x in xs], color='black')
plt.ylabel('$X_2$')
plt.xlabel('$X_1$')
plt.savefig('../08-classification/img/linear_model.png', dpi=300)

In [None]:
import pandas as pd

In [None]:
# Load the red-wine quality table; the file uses ';' as field separator.
wine = pd.read_csv('data/winequality-red.csv', sep=';')

In [None]:
# Scatter plot of quality (vertical axis) against alcohol (horizontal axis).
wine.plot.scatter(x='alcohol', y='quality')
plt.title('red wine quality vs. alcohol content')
plt.savefig('../08-classification/img/winequal.png', dpi=300)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

In [None]:
import numpy as np

In [None]:
# Two samples of ten standard-normal draws each; the second is shifted by +5.
X1 = np.random.standard_normal(10)
X2 = np.random.standard_normal(10) + 5

In [None]:
# Targets: ten zeros for the first sample, ten ones for the second.
y1 = np.zeros(10)
y2 = np.zeros(10) + 1

In [None]:
# Stack both samples into one single-feature column and fit a linear
# regression on the stacked 0/1 targets.
features = np.hstack((X1, X2)).reshape(-1,1)
targets = np.hstack((y1, y2))
lin_model = LinearRegression().fit(features, targets)

In [None]:
# Both samples as points, the fitted line on an integer grid, and the 0.5
# threshold as a horizontal line.
plt.plot(X1, y1, 'o')
plt.plot(X2, y2, 'o')
grid = np.arange(-2, 8, 1)
fitted = lin_model.predict(np.arange(-2, 8, 1).reshape(-1, 1))
plt.plot(grid, fitted, label=r'$f(x) = \theta_0 + \theta_1x$')
plt.plot([-2, 8], [0.5, 0.5], label='decision threshold')
plt.ylabel('$g$')
plt.xlabel('$X$')
plt.legend(loc='lower right')
plt.savefig('../08-classification/img/lin_reg_class.png', dpi=300)

In [None]:
# Overwrite the last element of X2 in place with 16. Every later cell that
# reads X2 sees this modified array.
X2[-1] = 16

In [None]:
# Refit the linear regression on the stacked samples (X2 was modified above).
features = np.hstack((X1, X2)).reshape(-1,1)
targets = np.hstack((y1, y2))
lin_model = LinearRegression().fit(features, targets)

In [None]:
# Same figure as before, drawn with the current X2 and the refitted model.
plt.plot(X1, y1, 'o')
plt.plot(X2, y2, 'o')
grid = np.arange(-2, 8, 1)
fitted = lin_model.predict(np.arange(-2, 8, 1).reshape(-1, 1))
plt.plot(grid, fitted, label=r'$f(x) = \theta_0 + \theta_1x$')
plt.plot([-2, 8], [0.5, 0.5], label='decision threshold')
plt.ylabel('$g$')
plt.xlabel('$X$')
plt.legend(loc='lower right')
plt.savefig('../08-classification/img/lin_reg_class_outlier.png', dpi=300)

In [None]:
# Fit a logistic regression on the same stacked single-feature data and
# 0/1 targets that the linear model was fitted on.
lr_model = LogisticRegression().fit(np.hstack((X1, X2)).reshape(-1,1), np.hstack((y1, y2)))

In [None]:
# Show the fitted coefficient(s) and intercept as the cell output.
lr_model.coef_, lr_model.intercept_

In [None]:
# Both samples as points, the fitted class-1 probability curve, and the 0.5
# decision threshold.
plt.plot(X1, y1, 'o')
plt.plot(X2, y2, 'o')
xs = np.arange(-2.5, 15, 0.02)
# Column 1 of predict_proba is the probability of class 1, i.e. the logistic
# function 1 / (1 + exp(-(theta_0 + theta_1 x))). The legend needs the minus
# sign in the exponent; without it the formula describes 1 - p instead of the
# curve that is drawn.
plt.plot(xs, lr_model.predict_proba(xs.reshape(-1, 1)).T[1],
        label=r'$f(x) = \frac{1}{1 + exp(-(\theta_0 + \theta_1 x))}$')
plt.plot([-2, 8], [0.5, 0.5], label='decision threshold')
plt.xlabel('$X$')
plt.ylabel('$g$')
plt.legend(loc='lower right')
plt.savefig('../08-classification/img/log_reg_class_outlier.png', dpi=300)

# Trees

In [None]:
import pandas as pd

In [None]:
# Load the spreadsheet into a DataFrame; later cells use its V and PE columns.
data = pd.read_excel('data/CCPP/Folds5x2_pp.xlsx')

In [None]:
# Average all columns over rows sharing the same V (V stays a regular
# column), then keep only the rows with V above 30.
v_means = data.groupby('V', as_index=False).mean()
smoothed = v_means[v_means.V > 30]

In [None]:
# Averaged PE against V, drawn as points.
plt.plot(smoothed.V, smoothed.PE, '.')
plt.ylabel('Power Output')
plt.xlabel('Vacuum Pressure')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Add the squared V as an extra feature column. `smoothed` was produced by
# boolean filtering, so assigning a column on it in place can trigger pandas'
# SettingWithCopyWarning; assign() returns a new frame instead and gives the
# same result when the cell is re-run.
smoothed = smoothed.assign(Vsq=smoothed.V**2)

In [None]:
# Linear regression of PE on the two features V and Vsq.
lreg = LinearRegression().fit(smoothed[['V', 'Vsq']], smoothed.PE)

In [None]:
# Data points plus the regression curve evaluated on a fine grid of V values.
plt.plot(smoothed.V, smoothed.PE, '.', label='data', alpha=0.5)
xs = np.arange(35,80,.1)
# The model was fitted on (V, V squared), so build the same pair per grid value.
quad_features = [(x, x**2) for x in xs]
plt.plot(xs, lreg.predict(quad_features), label='linear regression')
#plt.plot(xs, tree.predict(xs.reshape(-1,1)), label='decision tree')
plt.ylabel('Power Output')
plt.xlabel('Vacuum Pressure')
plt.legend()
plt.savefig('../../Sparebank/talks/img/lr.png', dpi=300)

In [None]:
# Regression tree of depth at most 2 predicting PE from the single feature V.
tree = DecisionTreeRegressor(max_depth=2).fit(smoothed[['V']], smoothed.PE)

In [None]:
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image

In [None]:
# Export the fitted tree as DOT source text (out_file=None returns a string).
# NOTE(review): class_names is passed although `tree` is a regressor —
# presumably it has no effect here; confirm and drop if so.
dot = export_graphviz(tree, out_file=None, feature_names=['V'], rounded=True, filled=True, class_names=['PE'])

In [None]:
# Parse the DOT text and show the rendered PNG as the cell output.
graph = pydotplus.graph_from_dot_data(dot)
Image(graph.create_png())

In [None]:
# Write the same rendering to disk.
graph.write_png('../../Sparebank/talks/img/reg_tree.png')

In [None]:
# Data points with both fitted models evaluated on the same grid of V values.
plt.plot(smoothed.V, smoothed.PE, '.', label='data', alpha=0.5)
xs = np.arange(35,80,.1)
quad_features = [(x, x**2) for x in xs]
plt.plot(xs, lreg.predict(quad_features), label='linear regression')
tree_input = xs.reshape(-1,1)
plt.plot(xs, tree.predict(tree_input), label='decision tree')
plt.ylabel('Power Output')
plt.xlabel('Vacuum Pressure')
plt.legend()
plt.savefig('../../Sparebank/talks/img/lr_vs_tree.png', dpi=300)

In [None]:
# Regression tree with depth limit 5, fitted on V -> PE.
deep_tree = DecisionTreeRegressor(max_depth=5).fit(smoothed[['V']], smoothed.PE)

In [None]:
# Regression tree with depth limit 1, fitted on the same data.
shallow_tree = DecisionTreeRegressor(max_depth=1).fit(smoothed[['V']], smoothed.PE)

In [None]:
# Data points with the predictions of the deep and the shallow tree.
plt.plot(smoothed.V, smoothed.PE, '.', label='data', alpha=0.5)
xs = np.arange(35,80,.1)
tree_input = xs.reshape(-1,1)
for fitted_tree, tree_label in ((deep_tree, 'deep tree'), (shallow_tree, 'shallow tree')):
    plt.plot(xs, fitted_tree.predict(tree_input), label=tree_label)
plt.ylabel('Power Output')
plt.xlabel('Vacuum Pressure')
plt.legend()
plt.savefig('../../Sparebank/talks/img/tree_depth.png', dpi=300)

In [None]:
from sklearn.model_selection import train_test_split


def _mse(model, frame):
    """Mean squared error of ``model`` predicting ``PE`` from ``V`` on ``frame``."""
    residuals = model.predict(frame[['V']]) - frame.PE
    return np.mean(residuals**2)


# Hold out 20% of the rows, then record train and test error for tree depths
# 1 through 10.
train, test = train_test_split(smoothed, test_size=0.2)
MSE_train, MSE_test = [], []
ds = range(1, 11)
for d in ds:
    model_d = DecisionTreeRegressor(max_depth=d).fit(train[['V']], train.PE)
    MSE_train.append(_mse(model_d, train))
    MSE_test.append(_mse(model_d, test))

In [None]:
# Test error first, then train error, against tree depth.
for errors, error_label in ((MSE_test, 'test error'), (MSE_train, 'train error')):
    plt.plot(ds, errors, label=error_label)
plt.legend()
plt.ylabel('mean squared error')
plt.xlabel('tree depth')
plt.savefig('../../Sparebank/talks/img/tree_depth_train_test.png', dpi=300)

In [None]:
# Data points with the prediction of a depth-3 tree fitted on all of `smoothed`.
plt.plot(smoothed.V, smoothed.PE, '.', label='data', alpha=0.5)
xs = np.arange(35,80,.1)
depth3_tree = DecisionTreeRegressor(max_depth=3).fit(smoothed[['V']], smoothed.PE)
plt.plot(xs, depth3_tree.predict(xs.reshape(-1,1)), label='ideal depth')
plt.ylabel('Power Output')
plt.xlabel('Vacuum Pressure')
plt.legend()
plt.savefig('../../Sparebank/talks/img/best_tree.png', dpi=300)

In [None]:
# Load the red-wine quality table (';'-separated). This reads the same file
# as the earlier `wine` frame.
red_wine = pd.read_csv('data/winequality-red.csv', sep=';')

In [None]:
# Bar chart of how many wines carry each quality score.
quality_counts = red_wine.groupby('quality').quality.count()
quality_counts.plot.bar()
plt.savefig('../../Sparebank/talks/img/winequals_bar.png')

In [None]:
# Pairwise correlation table of the wine columns (shown as the cell output).
red_wine.corr()

In [None]:
# Sulphates against alcohol for the wines of quality 5 and of quality 7 only.
# Note that the loop rebinds the notebook-level name `data`.
for qual, data in red_wine.groupby('quality'):
    if qual not in [5,7]:
        continue
    plt.plot(data.alcohol, data.sulphates, '.')

In [None]:
# Depth-3 regression tree predicting quality from alcohol and sulphates.
wine_features = red_wine[['alcohol','sulphates']]
wine_tree = DecisionTreeRegressor(max_depth=3).fit(wine_features, red_wine.quality)
# Evaluate the tree on a dense (alcohol, sulphates) grid; Z has the grid's shape.
alcohol_axis = np.arange(8,15,.05)
sulphates_axis = np.arange(0.4,2,0.01)
xx, yy = np.meshgrid(alcohol_axis, sulphates_axis)
X, Y = xx.ravel(), yy.ravel()
grid_points = np.c_[X,Y]
Z = wine_tree.predict(grid_points).reshape(xx.shape)

In [None]:
from pylab import get_cmap

In [None]:
# Legend text per quality score; only scores 5 and 7 are actually drawn.
quality_names = {5: 'low quality',
                 6: 'medium quality',
                 7: 'high quality'}
for qual, data in red_wine.groupby('quality'):
    if qual not in [5,7]:
        continue
    plt.plot(data.alcohol, data.sulphates, '.', alpha=0.5, label=quality_names[qual])
#plt.contourf(xx, yy, Z, cmap=get_cmap('BuGn'), alpha=0.6)
plt.ylabel('sulphates')
plt.xlabel('alcohol content')
plt.legend()
plt.savefig('../../Sparebank/talks/img/winequal.png', dpi=300)

In [None]:
# Export the wine tree as DOT source text (out_file=None returns a string).
wine_dot = export_graphviz(wine_tree, out_file=None, rounded=True, filled=True, feature_names=['alcohol', 'sulphates'])

In [None]:
# Parse the DOT text and show the rendered PNG as the cell output.
wine_graph = pydotplus.graph_from_dot_data(wine_dot)
Image(wine_graph.create_png())

In [None]:
# Write the same rendering to disk.
wine_graph.write_png('../../Sparebank/talks/img/wine_tree.png')

In [None]:
# Quality-5 and quality-7 wines as points, over a filled contour plot of the
# tree prediction Z on the (xx, yy) grid.
quality_names = {5: 'low quality', 7: 'high quality'}
for qual, data in red_wine.groupby('quality'):
    if qual not in [5,7]:
        continue
    plt.plot(data.alcohol, data.sulphates, '.', alpha=0.5, label=quality_names[qual])
plt.contourf(xx, yy, Z, cmap=get_cmap('BuGn'), alpha=0.6)
plt.ylabel('sulphates')
plt.xlabel('alcohol content')
plt.legend()
plt.savefig('../../Sparebank/talks/img/winequal_tree.png', dpi=300)

In [None]:
# All column names of the wine table, as a plain list.
wine_vars = list(red_wine.columns)

In [None]:
# Drop the target column from the feature list. Filtering (rather than
# list.remove) gives the same result and does not raise ValueError when this
# cell is run a second time.
wine_vars = [name for name in wine_vars if name != 'quality']

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Binary target: is the quality above 4? Hold out 30% of the rows and record
# train and test accuracy of a classification tree for depths 1 through 14.
Xtr, Xte, ytr, yte = train_test_split(red_wine[wine_vars], red_wine.quality > 4, test_size=0.3)
hr_train, hr_test = [], []
ds = range(1, 15)
for d in ds:
    model_d = DecisionTreeClassifier(max_depth=d).fit(Xtr, ytr)
    # Accuracy = fraction of rows whose predicted label equals the true one.
    hr_train.append(np.mean(model_d.predict(Xtr) == ytr))
    hr_test.append(np.mean(model_d.predict(Xte) == yte))
plt.plot(ds, hr_test, label='test accuracy')
plt.plot(ds, hr_train, label='train accuracy')
plt.legend()
plt.xlabel('tree depth')
plt.ylabel('accuracy')
# The target is `quality > 4`, which excludes 4 itself, so the title says
# "above 4" rather than "4 or higher".
plt.title('Quality above 4?')
plt.savefig('../../Sparebank/talks/img/tree_depth_train_test_wine_bad.png', dpi=300)

In [None]:
# Binary target: is the quality above 6? Hold out 33% of the rows and record
# train and test accuracy of a classification tree for depths 1 through 14.
Xtr, Xte, ytr, yte = train_test_split(red_wine[wine_vars], red_wine.quality > 6, test_size=0.33)
hr_train, hr_test = [], []
ds = range(1, 15)
for d in ds:
    model_d = DecisionTreeClassifier(max_depth=d).fit(Xtr, ytr)
    # Accuracy = fraction of rows whose predicted label equals the true one.
    hr_train.append(np.mean(model_d.predict(Xtr) == ytr))
    hr_test.append(np.mean(model_d.predict(Xte) == yte))
plt.plot(ds, hr_test, label='test accuracy')
plt.plot(ds, hr_train, label='train accuracy')
plt.legend()
plt.xlabel('tree depth')
plt.ylabel('accuracy')
# The target is `quality > 6`, which excludes 6 itself, so the title says
# "above 6" rather than "6 or higher".
plt.title('Quality above 6?')
plt.savefig('../../Sparebank/talks/img/tree_depth_train_test_wine_great.png', dpi=300)

In [None]:
# Depth-3 classifier on the current split; plot only the features whose
# importance is greater than zero.
dt = DecisionTreeClassifier(max_depth=3).fit(Xtr, ytr)
relevant = dt.feature_importances_ > 0
labels = np.array(wine_vars)[relevant]
bar_positions = range(len(labels))
bar_heights = dt.feature_importances_[relevant]
plt.bar(bar_positions, bar_heights)
plt.xticks(bar_positions, labels, rotation='vertical')
plt.title('What makes a great wine?')
plt.ylabel('importance')
plt.savefig('../../Sparebank/talks/img/wine_importance.png', dpi=300, bbox_inches='tight')

In [None]:
# Show the feature names next to all importances (including zeros).
wine_vars, dt.feature_importances_

In [None]:
# Depth-2 classifier on the current training split.
dtm = DecisionTreeClassifier(max_depth=2).fit(Xtr, ytr)

In [None]:
# For every distinct label in the test targets, print the label, the hit rate
# of `dtm` on the test rows carrying that label, and the share of those rows
# in the test set.
for qual in sorted(yte.unique()):
    hr = np.mean(dtm.predict(Xte[yte == qual]) == yte[yte==qual])
    perc = sum(yte==qual) / float(len(yte))
    # A single pre-formatted argument prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("{} {}".format(qual, " ".join("{:4.2f}".format(i) for i in (hr, perc))))

In [None]:
# Same per-label report as for the depth-2 tree, now for a depth-6 classifier.
dtm = DecisionTreeClassifier(max_depth=6).fit(Xtr, ytr)
for qual in sorted(yte.unique()):
    hr = np.mean(dtm.predict(Xte[yte == qual]) == yte[yte==qual])
    perc = sum(yte==qual) / float(len(yte))
    # A single pre-formatted argument prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("{} {}".format(qual, " ".join("{:4.2f}".format(i) for i in (hr, perc))))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier().fit(Xtr, ytr > 5)

In [None]:
from sklearn.metrics import roc_curve

In [None]:
fpr, tpr, thresholds = roc_curve(yte > 5, rf.predict_proba(Xte)[:,1], drop_intermediate=False)

In [None]:
# ROC curve, with the diagonal from (0, 0) to (1, 1) as reference line.
plt.plot(fpr, tpr)
diagonal = [0, 1]
plt.plot(diagonal, diagonal)
plt.title('ROC curve')
plt.ylabel('true positive rate')
plt.xlabel('false positive rate')
plt.savefig('../../Sparebank/talks/img/ROC.png', dpi=300)

In [None]:
# New 70/30 split with target `quality > 5`; record train and test hit rate of
# a 50-tree random forest for maximum depths 1 through 10.
Xtr, Xte, ytr, yte = train_test_split(red_wine[wine_vars], red_wine.quality > 5, test_size=0.3)
hr_train, hr_test = [], []
ds = range(1, 11)
for d in ds:
    model_d = RandomForestClassifier(max_depth=d,n_estimators=50).fit(Xtr, ytr)
    train_hits = model_d.predict(Xtr) == ytr
    hr_train.append(np.mean(train_hits))
    test_hits = model_d.predict(Xte) == yte
    hr_test.append(np.mean(test_hits))
plt.plot(ds, hr_test, label='test hit rate')
plt.plot(ds, hr_train, label='train hit rate')
plt.legend()
plt.ylabel('hit rate')
plt.xlabel('tree depth')
#plt.savefig('../../Sparebank/talks/img/forest_depth_train_test_wine.png', dpi=300)

# Anomalies

In [None]:
# Ten draws from the standard normal distribution.
xs = np.random.standard_normal(10)

In [None]:
# Overwrite the value at index 5 in place with 4.2.
xs[5] = 4.2

In [None]:
# The sample shifted by 2 and scaled by 10, plotted against its index.
noise_levels = (xs+2)*10
plt.plot(noise_levels, 'o')
plt.xlabel('time')
plt.ylabel('noise level')
plt.savefig('../../Sparebank/talks/img/noise_anomaly.png', dpi=300)

In [None]:
# The unscaled sample, plotted against its index.
plt.plot(xs, 'o')
plt.xlabel('time')
plt.ylabel('z')
plt.savefig('../../Sparebank/talks/img/noise_anomaly_z.png', dpi=300)

In [None]:
from scipy.stats import norm

In [None]:
# Density of the default (standard) normal distribution at 0.5.
norm().pdf(0.5)

In [None]:
# IPython help lookup for plt.xlabel — an interactive leftover; it defines
# nothing that later cells use.
?plt.xlabel

In [None]:
# Fill the area under the standard normal density in four unit-wide bands
# starting at 0, 1, 2 and 3.
for lower in range(4):
    xs = np.arange(lower, lower + 1, 0.05)
    plt.fill_between(xs, np.zeros_like(xs), norm().pdf(xs))
plt.xticks(range(5), [r'${}\sigma$'.format(i) for i in range(5)])
# Percentage text centred in each band; the first two are drawn in white.
band_texts = (
    (0.5, '34.1%', {'color': 'white'}),
    (1.5, '13.6%', {'color': 'white'}),
    (2.5, '2.1%', {}),
    (3.5, '0.1%', {}),
)
for centre, text, extra in band_texts:
    plt.text(centre, 0.05, text,  horizontalalignment='center', verticalalignment='center', **extra)
#plt.text(1,1, u'\u00A9 Dirk Hesse, iKnow Solutions',
#         size=8,
#         horizontalalignment='right',
#         verticalalignment='bottom',
#         transform=plt.gca().transAxes)
plt.xlabel('random variable')
plt.ylabel('probability density')
plt.savefig('../../Sparebank/talks/img/normal_dist.png', dpi=300)

# Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

In [None]:
# Hierarchical clustering of a random sample of 20 wines on the feature
# columns, followed by a histogram of the sample's quality scores.
data_sample = red_wine.sample(20)
sample_features = data_sample[wine_vars]
Z = linkage(sample_features)
data_sample.quality.hist()

In [None]:
# Dendrogram of the linkage Z computed on the wine sample.
dendrogram(Z)
plt.title('dendrogram')
plt.ylabel('distance')
plt.xlabel('cluster number')
plt.savefig('../../Sparebank/talks/img/dendrogram.png', dpi=300)

In [None]:
# Cut the hierarchy Z into flat clusters (criterion 'maxclust' with t = 4)
# and store the cluster label of each sampled wine in a new column.
data_sample['labels'] = fcluster(Z, 4, criterion='maxclust')

In [None]:
# Mean quality score per cluster label (shown as the cell output).
data_sample.groupby('labels').quality.mean()

In [None]:
from sklearn.cluster import KMeans

In [None]:
# K-means with 3 clusters on the wine feature columns.
model = KMeans(3).fit(red_wine[wine_vars])

In [None]:
# IPython help lookup for model.score — an interactive leftover; it defines
# nothing that later cells use.
?model.score

In [None]:
# Numbers of clusters to try. A real list is used because a later cell calls
# ks.insert(), which a Python 3 range object does not provide.
ks = list(range(2, 11))
# Negated KMeans.score on the fitting data, one value per k.
wine_features = red_wine[wine_vars]
ds = [-KMeans(k).fit(wine_features).score(wine_features) for k in ks]

In [None]:
# Put k = 1 in front of the list of cluster counts. Rebuilding the list works
# whether ks is a list or a range object, and re-running the cell does not add
# a second 1.
ks = [1] + [k for k in ks if k != 1]

In [None]:
# Wine feature columns as a plain NumPy array (rebinds the notebook-level X).
X = red_wine[wine_vars].values

In [None]:
# Column-wise mean of X, i.e. the centre of the single cluster for k = 1.
# The array itself is needed here: its `.shape` tuple would broadcast in
# `X - mu` and subtract the number of columns instead of the means.
mu = np.mean(X, axis=0)

In [None]:
# Put the sum of squared entries of (X - mu) in front of the list of costs.
# Not idempotent: every run of this cell adds one more entry to ds.
ds.insert(0, np.sum((X - mu)**2))

In [None]:
# Cost against number of clusters.
plt.plot(ks, ds)

In [None]:
from sklearn.datasets import load_iris

In [None]:
# Load the iris data set bundled with scikit-learn.
iris = load_iris()

In [None]:
# Take its 'data' entry as the matrix to cluster (rebinds the notebook-level X).
X = iris['data']

In [None]:
# Negated KMeans.score on the fitting data for k = 2 .. 9.
ks = range(2,10)
costs = []
for k in ks:
    fitted = KMeans(k).fit(X)
    costs.append(-fitted.score(X))

In [None]:
# List the entries available in the loaded iris object.
iris.keys()

In [None]:
# Clustering cost against the number of clusters k.
plt.plot(ks, costs)
plt.title('Iris data clustering')
plt.ylabel('within-cluster RSS')
plt.xlabel('$k$')
plt.savefig('../../Sparebank/talks/img/choosing_k.png', dpi=300)

In [None]:
# Cluster index assigned to every row of X by a 3-cluster k-means fit.
labels = KMeans(3).fit(X).predict(X)

In [None]:
# The iris object's 'target' entry, used as the reference labels.
actuals = iris['target']

In [None]:
# One row per sample: assigned cluster index next to the reference label.
pred = pd.DataFrame({'prediction': labels, 'actual': actuals})

In [None]:
# Number of samples per (prediction, actual) pair, in a column named 'count'.
# size() is used because `pred` has no column left to count() once both of its
# columns are grouping keys, so count() would not yield the 'count' column
# that the pivot in the next cell reads.
d = pred.groupby(['prediction', 'actual']).size().reset_index(name='count')

In [None]:
# Bar chart with one group per assigned cluster and one bar per reference
# label, showing the 'count' values.
counts_by_cluster = d.pivot_table(columns='actual', values='count', index='prediction')
counts_by_cluster.plot.bar()
plt.title('Iris data clustering')
plt.ylabel('count')
plt.savefig('../../Sparebank/talks/img/iris_bars.png', dpi=300)