In [2]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer

# Four-entry colour map; the scatter plots in this notebook pass it as `cmap`
# together with the class labels as `c`.
cmap_bold = ListedColormap(
    colors=['#FFFF00', '#00FF00', '#0000FF', '#000000'],
)
# Fruit measurements table, read from a path relative to the working directory.
fruits = pd.read_table(filepath_or_buffer='fruit_data_with_colors.txt')

In [3]:
# Column names for the four-feature design matrix and display names for the classes.
feature_names_fruits = ['height', 'width', 'mass', 'color_score']
target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']

# Full feature set; the target is the 'fruit_label' column.
X_fruits = fruits.loc[:, feature_names_fruits]
y_fruits = fruits.loc[:, 'fruit_label']

# Two-feature variant (height and width only) with the same target column.
X_fruits_2d = fruits.loc[:, ['height', 'width']]
y_fruits_2d = fruits.loc[:, 'fruit_label']

In [4]:
from sklearn.datasets import make_regression
# Seeded synthetic regression data: 100 samples, a single informative feature,
# a constant bias of 150 and noise level 30.
X_R1, y_R1 = make_regression(
    n_samples=100,
    n_features=1,
    n_informative=1,
    bias=150.0,
    noise=30,
    random_state=0,
)

plt.figure()
plt.title('Sample regression, 1 input var')
plt.scatter(X_R1, y_R1, s=50, marker='o')

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x127f395e0>

In [6]:
from sklearn.datasets import make_friedman1
# Seeded Friedman #1 data with 7 input features; only column 2 is plotted
# against the target.
X_F1, y_F1 = make_friedman1(n_samples=100, n_features=7, random_state=0)

plt.figure()
plt.title('Complex regression, 1 input var')
plt.scatter(X_F1[:, 2], y_F1, s=50, marker='o')

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x128973220>

In [10]:
# Seeded two-feature classification data: both features informative, none
# redundant, one cluster per class, flip_y=0.1 and class_sep=0.5.
X_C2, y_C2 = make_classification(
    n_samples=100,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    class_sep=0.5,
    flip_y=0.1,
    random_state=0,
)

plt.figure()
plt.title('sample binary, 2 informative feature')
# Points are coloured by class label through cmap_bold.
plt.scatter(X_C2[:, 0], X_C2[:, 1], c=y_C2, cmap=cmap_bold, s=50, marker='o')

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x128c898b0>

In [11]:
# Eight seeded blobs in two dimensions ...
X_D2, y_D2 = make_blobs(
    n_samples=100,
    n_features=2,
    centers=8,
    cluster_std=1.3,
    random_state=4,
)
# ... folded into a binary problem by taking each blob label modulo 2.
y_D2 = y_D2 % 2

plt.figure()
plt.title('sample binary, non linearly separable classes')
plt.scatter(X_D2[:, 0], X_D2[:, 1], c=y_D2, cmap=cmap_bold, s=50, marker='o')

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x128d64f40>

### Naive Bayes classifiers

In [16]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes on the two-feature classification data (X_C2, y_C2).
X_train, X_test, y_train, y_test = train_test_split(
    X_C2, y_C2, random_state=0
)
nbclf = GaussianNB()
nbclf.fit(X_train, y_train)
# Bare last expression: the cell displays the test-split score.
nbclf.score(X=X_test, y=y_test)

0.84

In [17]:
# Gaussian naive Bayes on the blob data (X_D2, y_D2).
X_train, X_test, y_train, y_test = train_test_split(
    X_D2, y_D2, random_state=0
)
nbclf = GaussianNB()
nbclf.fit(X_train, y_train)
# Bare last expression: the cell displays the test-split score.
nbclf.score(X=X_test, y=y_test)

0.72

In [18]:
# Load the breast cancer dataset twice: once as the default return object
# (`cancer`) and once as a plain (features, target) pair.
cancer = load_breast_cancer()
X_cancer, y_cancer = load_breast_cancer(return_X_y=True)

In [22]:
# Gaussian naive Bayes on the breast cancer data.
X_train, X_test, y_train, y_test = train_test_split(
    X_cancer, y_cancer, random_state=0
)
nbclf = GaussianNB()
nbclf.fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
nbclf.score(X=X_train, y=y_train)

0.9507042253521126

In [23]:
# Test-split score of the `nbclf` currently in the kernel.
nbclf.score(X=X_test, y=y_test)

0.9370629370629371

### Ensembles of Decision Trees

#### Random forests

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Random forest on the blob data (X_D2, y_D2).
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)
# Seed the forest as well as the split: without random_state the bootstrap
# samples and per-split feature subsets differ on every run, so the scores
# shown below would not be reproducible after Restart & Run All.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
clf.score(X_train, y_train)

1.0

In [37]:
# Test-split score of the `clf` currently in the kernel.
clf.score(X=X_test, y=y_test)

0.8

In [38]:
# Random forest on the four-feature fruit data.
X_train, X_test, y_train, y_test = train_test_split(X_fruits, y_fruits, random_state=0)
# Seed the forest as well as the split: without random_state the bootstrap
# samples and per-split feature subsets differ on every run, so the scores
# shown below would not be reproducible after Restart & Run All.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
clf.score(X_train, y_train)

1.0

In [39]:
# Test-split score of the `clf` currently in the kernel.
clf.score(X=X_test, y=y_test)

0.8666666666666667

In [40]:
# Seeded random forest on the breast cancer data, with max_features set to 8.
X_train, X_test, y_train, y_test = train_test_split(
    X_cancer, y_cancer, random_state=0
)
clf = RandomForestClassifier(random_state=0, max_features=8)
clf.fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
clf.score(X=X_train, y=y_train)

1.0

In [41]:
# Test-split score of the `clf` currently in the kernel.
clf.score(X=X_test, y=y_test)

0.972027972027972

### Gradient-boosted Decision Trees

In [43]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted trees on the blob data (X_D2, y_D2).
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)
# Seed the estimator as well as the split so the fitted trees, and therefore
# the scores shown below, are the same on every run (the cancer-data cells in
# this notebook already pass random_state=0).
clf = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
clf.score(X_train, y_train)

1.0

In [44]:
# Test-split score of the `clf` currently in the kernel.
clf.score(X=X_test, y=y_test)

0.76

#### Gradient-boosted Decision Trees on the fruit dataset

In [45]:
# Gradient-boosted trees on the four-feature fruit data.
X_train, X_test, y_train, y_test = train_test_split(X_fruits, y_fruits, random_state=0)
# Seed the estimator as well as the split so the fitted trees, and therefore
# the scores shown below, are the same on every run (the cancer-data cells in
# this notebook already pass random_state=0).
clf = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
clf.score(X_train, y_train)

1.0

In [46]:
# Test-split score of the `clf` currently in the kernel.
clf.score(X=X_test, y=y_test)

0.8

In [51]:
# Seeded gradient boosting on the breast cancer data with learning_rate=0.1
# and max_depth=3.
X_train, X_test, y_train, y_test = train_test_split(
    X_cancer, y_cancer, random_state=0
)
clf = GradientBoostingClassifier(
    max_depth=3,
    learning_rate=0.1,
    random_state=0,
)
clf.fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
clf.score(X=X_train, y=y_train)

1.0

In [52]:
# Test-split score of the `clf` currently in the kernel.
clf.score(X=X_test, y=y_test)

0.965034965034965

In [53]:
# Same data and seed, but with a smaller learning rate (0.01) and shallower
# trees (max_depth=2).
X_train, X_test, y_train, y_test = train_test_split(
    X_cancer, y_cancer, random_state=0
)
clf = GradientBoostingClassifier(
    max_depth=2,
    learning_rate=0.01,
    random_state=0,
)
clf.fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
clf.score(X=X_train, y=y_train)

0.9741784037558685

In [54]:
# Test-split score of the `clf` currently in the kernel.
clf.score(X=X_test, y=y_test)

0.965034965034965

### Neural Networks

#### Activation Functions

In [57]:
# 200 evenly spaced inputs on [-2, 2].
xrange = np.linspace(-2, 2, 200)

plt.figure(figsize=(7, 6))
# Draw the three curves in a fixed order so legend entries keep that order.
for curve_label, curve_values in (
    ('relu', np.maximum(xrange, 0)),
    ('tanh', np.tanh(xrange)),
    ('logistic', 1/(1+np.exp(-xrange))),
):
    plt.plot(xrange, curve_values, label=curve_label)
plt.legend()
plt.title('Neural network activation')
plt.xlabel('Input value (x)')
plt.ylabel('Activation function output')

plt.show()

<IPython.core.display.Javascript object>

#### Neural networks: classification

In [64]:
from sklearn.neural_network import MLPClassifier

# This split of the blob data is reused by the MLP cells that follow.
X_train, X_test, y_train, y_test = train_test_split(
    X_D2, y_D2, random_state=0
)

# One hidden layer with a single unit, lbfgs solver, fixed seed.
nnclf = MLPClassifier(hidden_layer_sizes=[1], solver='lbfgs', random_state=0)
nnclf.fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
nnclf.score(X=X_train, y=y_train)

0.6133333333333333

In [65]:
# Test-split score of the `nnclf` currently in the kernel.
nnclf.score(X=X_test, y=y_test)

0.64

In [66]:
# One hidden layer with 10 units, lbfgs solver, fixed seed.
nnclf = MLPClassifier(hidden_layer_sizes=[10], solver='lbfgs', random_state=0)
nnclf.fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
nnclf.score(X=X_train, y=y_train)

0.7733333333333333

In [67]:
# Test-split score of the `nnclf` currently in the kernel.
nnclf.score(X=X_test, y=y_test)

0.64

In [68]:
# One hidden layer with 100 units, lbfgs solver, fixed seed.
nnclf = MLPClassifier(hidden_layer_sizes=[100], solver='lbfgs', random_state=0)
nnclf.fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
nnclf.score(X=X_train, y=y_train)

0.92

In [69]:
# Test-split score of the `nnclf` currently in the kernel.
nnclf.score(X=X_test, y=y_test)

0.72

In [76]:
# Two hidden layers of 10 units each, lbfgs solver, fixed seed.
nnclf = MLPClassifier(hidden_layer_sizes=[10, 10], solver='lbfgs', random_state=0)
nnclf.fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
nnclf.score(X=X_train, y=y_train)

0.9333333333333333

In [77]:
# Test-split score of the `nnclf` currently in the kernel.
nnclf.score(X=X_test, y=y_test)

0.72

In [79]:
# Two hidden layers of 100 units each, lbfgs solver, fixed seed.
# NOTE(review): max_iter is left at its default, and the recorded output of
# this cell shows lbfgs stopping at the iteration limit rather than converging.
nnclf = MLPClassifier(hidden_layer_sizes=[100, 100], solver='lbfgs', random_state=0)
nnclf.fit(X_train, y_train)
# Bare last expression: the cell displays the training-split score.
nnclf.score(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.9466666666666667

In [80]:
# Test-split score of the `nnclf` currently in the kernel.
nnclf.score(X=X_test, y=y_test)

0.76

#### Regularization parameter: alpha

In [81]:
# Two hidden layers of 100 units, lbfgs solver, fixed seed; alpha=0.01.
nnclf = MLPClassifier(
    hidden_layer_sizes=[100, 100],
    alpha=0.01,
    solver='lbfgs',
    random_state=0,
)
nnclf.fit(X_train, y_train)
# Print the training-split score first, then the test-split score.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(nnclf.score(X_part, y_part))

0.9333333333333333
0.72


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [82]:
# Two hidden layers of 100 units, lbfgs solver, fixed seed; alpha=0.1.
nnclf = MLPClassifier(
    hidden_layer_sizes=[100, 100],
    alpha=0.1,
    solver='lbfgs',
    random_state=0,
)
nnclf.fit(X_train, y_train)
# Print the training-split score first, then the test-split score.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(nnclf.score(X_part, y_part))

0.9466666666666667
0.72


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [83]:
# Two hidden layers of 100 units, lbfgs solver, fixed seed; alpha=1.
nnclf = MLPClassifier(
    hidden_layer_sizes=[100, 100],
    alpha=1,
    solver='lbfgs',
    random_state=0,
)
nnclf.fit(X_train, y_train)
# Print the training-split score first, then the test-split score.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(nnclf.score(X_part, y_part))

0.8933333333333333
0.8


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [84]:
# Two hidden layers of 100 units, lbfgs solver, fixed seed; alpha=5.
nnclf = MLPClassifier(
    hidden_layer_sizes=[100, 100],
    alpha=5,
    solver='lbfgs',
    random_state=0,
)
nnclf.fit(X_train, y_train)
# Print the training-split score first, then the test-split score.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(nnclf.score(X_part, y_part))

0.8666666666666667
0.92


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [85]:
# Two hidden layers of 100 units, lbfgs solver, fixed seed; alpha=10.
nnclf = MLPClassifier(
    hidden_layer_sizes=[100, 100],
    alpha=10,
    solver='lbfgs',
    random_state=0,
)
nnclf.fit(X_train, y_train)
# Print the training-split score first, then the test-split score.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(nnclf.score(X_part, y_part))

0.7066666666666667
0.8


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


#### Effect of different activation functions

In [89]:
# Compare hidden-layer activation functions on the blob data. Solver, alpha,
# layer sizes and seed are held fixed, so only the activation varies.
X_train, X_test, y_train, y_test = train_test_split(
    X_D2, y_D2, random_state=0
)
for this_activation in ['logistic', 'tanh', 'relu']:
    nnclf = MLPClassifier(
        hidden_layer_sizes=[10, 10],
        activation=this_activation,
        alpha=0.1,
        solver='lbfgs',
        random_state=0,
    )
    nnclf.fit(X_train, y_train)
    # Output per activation: name, training score, test score, blank line.
    print(this_activation)
    for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
        print(nnclf.score(X_part, y_part))
    print()

logistic
0.9333333333333333
0.76

tanh
0.92
0.8

relu
0.8666666666666667
0.88



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


#### Neural networks: regression

In [97]:
from sklearn.neural_network import MLPRegressor

# 2x3 grid of panels: one row per activation function, one column per alpha.
fig, subaxes = plt.subplots(2, 3, figsize=(11, 8), dpi=70)
# 50 evenly spaced query points on [-3, 3], shaped as a single-feature column.
X_predict_input = np.linspace(-3, 3, 50).reshape(-1, 1)
# Every fifth sample of the 1-D regression data is split into train and test.
X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state=0)

for thisaxisrow, thisactivation in zip(subaxes, ['tanh', 'relu']):
    for thisalpha, thisaxis in zip([0.0001, 1.0, 100], thisaxisrow):
        # random_state fixes the weight initialisation. Without it each run of
        # this cell fits six different networks and draws different curves,
        # unlike the seeded MLPClassifier cells elsewhere in this notebook.
        mlpreg = MLPRegressor(hidden_layer_sizes=[100, 100], activation=thisactivation,
                              alpha=thisalpha, solver='lbfgs',
                              random_state=0).fit(X_train, y_train)
        y_predict_output = mlpreg.predict(X_predict_input)
        thisaxis.set_xlim([-2.5, 0.75])
        # Triangles are model predictions; circles are the training points.
        thisaxis.plot(X_predict_input, y_predict_output, '^', markersize=10)
        thisaxis.plot(X_train, y_train, 'o')
        thisaxis.set_xlabel('input feature')
        thisaxis.set_ylabel('target value')
        thisaxis.set_title('alpha={},activation={}'.format(thisalpha, thisactivation))

<IPython.core.display.Javascript object>

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [99]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(
    X_cancer, y_cancer, random_state=0
)

# The scaler is fitted on the training split only; the test split is
# transformed with those same fitted parameters.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Two hidden layers of 100 units, lbfgs solver, alpha=5.0, fixed seed.
nnclf = MLPClassifier(
    hidden_layer_sizes=[100, 100],
    alpha=5.0,
    solver='lbfgs',
    random_state=0,
)
nnclf.fit(X_train_scaled, y_train)
# Print the training-split score first, then the test-split score.
for X_part, y_part in ((X_train_scaled, y_train), (X_test_scaled, y_test)):
    print(nnclf.score(X_part, y_part))

0.9835680751173709
0.965034965034965
