In [34]:
%matplotlib notebook
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer
from adspy_shared_utilities import load_crime_dataset
cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])


# regression dataSet
from sklearn.datasets import make_regression
plt.figure()
plt.title('Sample Regression data')
X_R1, y_R1 = make_regression(n_samples=100,n_features=1,
                            n_informative=1,bias=150.0,
                            noise=30,random_state=0)
plt.scatter(X_R1,y_R1,s=50)


<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x9864754208>

In [35]:
# complex regression dataset
from sklearn.datasets import make_friedman1
plt.figure()
plt.title('Complex Regression dataset')
X_F1,y_F1 = make_friedman1(n_samples=100,n_features=7,
                          random_state=0)
plt.scatter(X_F1[:,2],y_F1, s=50)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x986352cb00>

In [36]:
# dataset for classification 
X_C2, y_C2 = make_classification(n_samples=100,n_features=2,
                                n_redundant=0,n_informative=2,
                                n_clusters_per_class=1,flip_y=0,
                                class_sep=0.5,random_state=0)
plt.figure()
plt.title('Binary Classification Dataset')
plt.scatter(X_C2[:,0],X_C2[:,1],c=y_C2,s=50,cmap=cmap_bold)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x986443a278>

In [37]:
# Complex Binary Classification
X_D2, y_D2 = make_blobs(n_samples = 100, n_features = 2, centers = 8,
                       cluster_std = 1.3, random_state = 4)
y_D2 = y_D2 % 2
plt.figure()
plt.title('Complex binary classification problem with non-linearly separable classes')
plt.scatter(X_D2[:,0], X_D2[:,1], c=y_D2,
           marker= 'o', s=50, cmap=cmap_bold)
plt.show()

<IPython.core.display.Javascript object>

In [38]:
# Breast Cancer dataset for classification
cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)

# Crime dataset
(X_crime, y_crime) = load_crime_dataset()

# K-Nearest Neighbours


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from adspy_shared_utilities import plot_two_class_knn

X_train,X_test,y_train,y_test = train_test_split(X_C2,y_C2,random_state=0)

plot_two_class_knn(X_train,y_train,1,'uniform',X_test,y_test)
plot_two_class_knn(X_train,y_train,3,'uniform',X_test,y_test)
plot_two_class_knn(X_train,y_train,11,'uniform',X_test,y_test)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# K-Neighbours Regression


In [40]:
from sklearn.neighbors import KNeighborsRegressor
X_train,X_test,y_train,y_test = train_test_split(X_R1,y_R1,random_state=0)
knnreg= KNeighborsRegressor(n_neighbors=5).fit(X_train,y_train)

print(knnreg.predict(X_test))
print('R-squared test score: {:.3f}'.format(knnreg.score(X_test,y_test)))


[ 231.70974697  148.35572605  150.58852659  150.58852659   72.14859259
  166.50590948  141.90634426  235.57098756  208.25897836  102.10462746
  191.31852674  134.50044902  228.32181403  148.35572605  159.16911306
  113.46875166  144.03646012  199.23189853  143.19242433  166.50590948
  231.70974697  208.25897836  128.01545355  123.14247619  141.90634426]
R-squared test score: 0.425


In [41]:
import numpy as np
fig,subaxes =plt.subplots(1,2,figsize=(8,4))
X_predict_input= np.linspace(-3,3,50).reshape(-1,1)
X_train,X_test,y_train,y_test = train_test_split(X_R1[0::5],
                                                 y_R1[0::5],
                                                 random_state=0)
for thisaxis , K in zip(subaxes,[1,3]):
    knnreg= KNeighborsRegressor(n_neighbors=K).fit(X_train,y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    thisaxis.set_xlim([-2.5,0.75])
    thisaxis.plot(X_predict_input,y_predict_output,'^',markersize=10,
                 label='Predicted',alpha=0.8)
    thisaxis.plot(X_train,y_train,'o',label='True Value',alpha=0.8)
    thisaxis.set_xlabel('Input Feature')
    thisaxis.set_ylabel('Target Value')
    thisaxis.set_title('KNN regression (k={})'.format(K))
    thisaxis.legend()
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [42]:
# plot k-NN regression on sample dataset for different values of K
fig, subaxes = plt.subplots(5, 1, figsize=(5,20))
X_predict_input = np.linspace(-3, 3, 500).reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1,
                                                   random_state = 0)

for thisaxis, K in zip(subaxes, [1, 3, 7, 15, 55]):
    knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    train_score = knnreg.score(X_train, y_train)
    test_score = knnreg.score(X_test, y_test)
    thisaxis.plot(X_predict_input, y_predict_output)
    thisaxis.plot(X_train, y_train, 'o', alpha=0.9, label='Train')
    thisaxis.plot(X_test, y_test, '^', alpha=0.9, label='Test')
    thisaxis.set_xlabel('Input feature')
    thisaxis.set_ylabel('Target value')
    thisaxis.set_title('KNN Regression (K={})\n\
Train $R^2 = {:.3f}$,  Test $R^2 = {:.3f}$'
                      .format(K, train_score, test_score))
    thisaxis.legend()
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
plt.show()

<IPython.core.display.Javascript object>

# Linear Regression

In [43]:
from sklearn.linear_model import LinearRegression
X_train,X_test,y_train,y_test = train_test_split(X_R1,y_R1,random_state=0)
linreg =LinearRegression().fit(X_train,y_train)
print('Slope {}'.format(linreg.coef_))
print('y intercept {}'.format(linreg.intercept_))
print(linreg.score(X_train,y_train))
print(linreg.score(X_test,y_test))

plt.figure(figsize=(5,5))
plt.scatter(X_R1,y_R1,marker='o',s=50,alpha=0.8)
plt.plot(X_R1,linreg.coef_*X_R1 + linreg.intercept_,'r-')
plt.xlabel('Feature Val')
plt.ylabel('Target Val')
plt.show()

Slope [ 45.70870465]
y intercept 148.44575345658873
0.678595077114
0.491596159349


<IPython.core.display.Javascript object>

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                   random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('slope(m) = {} '.format(linreg.coef_) )
print('c : {}'.format(linreg.intercept_))

print('R-squared test : {}'.format(linreg.score(X_test,y_test)))

slope(m) = [  1.61892346e-03  -9.43009110e+01   1.36067510e+01  -3.13380670e+01
  -8.15482714e-02  -1.69455128e+01  -2.42730375e-03   1.53013232e+00
  -1.39193248e-02  -7.72112833e+00   2.28112354e+01  -5.65708295e+00
   9.34751364e+00   2.06969566e-01  -7.43413626e+00   9.65856476e-03
   4.38030290e-03   4.79754625e-03  -4.46469212e+00  -1.60907140e+01
   8.82778012e+00  -5.06734503e-01  -1.42198055e+00   8.17551991e+00
  -3.87048268e+00  -3.54209213e+00   4.48758304e+00   9.30645715e+00
   1.73644996e+02   1.18220766e+01   1.51120836e+02  -3.29613007e+02
  -1.35343395e+02   6.95380108e-01  -2.38369008e+01   2.77038981e+00
   3.82248925e-01   4.38813358e+00  -1.06410851e+01  -4.92294176e-03
   4.14031827e+01  -1.16206866e-03   1.18568968e+00   1.75418465e+00
  -3.68283678e+00   1.59679443e+00  -8.42180230e+00  -3.79703897e+01
   4.74076990e+01  -2.50768374e+01  -2.88246410e-01  -3.65633234e+01
   1.89516080e+01  -4.53336736e+01   6.82698598e+02   1.04478671e+02
  -3.28575414e+02  -3.1

# Ridge Regression

In [45]:
from sklearn.linear_model import Ridge
X_train,X_test,y_train,y_test = train_test_split(X_crime,y_crime,
                                                random_state=0)
linridge= Ridge(alpha=20.0).fit(X_train,y_train)
print('Training score {}'.format(linridge.score(X_train,y_train)))
print('Test data score {}'.format(linridge.score(X_test,y_test)))

Training score 0.6709349944606453
Test data score 0.4940490145966777


# Ridge Regression with Feature Scaling

In [46]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train,X_test,y_train,y_test = train_test_split(X_crime,y_crime,
                                                random_state=0)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.fit_transform(X_test)

linridge= Ridge(alpha=20.0).fit(X_train_scaled,y_train)

print('Training score {}'.format(linridge.score(X_train_scaled,y_train)))
print('Test data score {}'.format(linridge.score(X_test_scaled,y_test)))

Training score 0.6146175955616784
Test data score 0.6203186557130429


# Polynomial Regression

In [47]:

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_F1_poly=poly.fit_transform(X_F1)

X_train, X_test, y_train, y_test = train_test_split(X_F1, y_F1,
                                                   random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('Training score {}'.format(linreg.score(X_train,y_train)))
print('Test data score {}'.format(linreg.score(X_test,y_test)))

Training score 0.7223750207373035
Test data score 0.7221339576925412


# Logistic Regression

In [66]:
import pandas as pd
fruits = pd.read_table('fruit_data_with_colors.txt')
feature_names_fruits = ['height','width','mass','color_score']
X_fruits= fruits[feature_names_fruits]
y_fruits=['fruit_label']

target_names_fruits = ['apple','mandarin','orange','lemon']
X_fruits_2d = fruits[ ['height','width'] ]
y_fruits_2d = fruits['fruit_label']


In [111]:
from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot
fig , subaxes = plt.subplots(1,1,figsize=(7,5))
y_fruits_apple = y_fruits_2d == 1
X_train,X_test,y_train,y_test = (train_test_split(X_fruits_2d.as_matrix(),
                                                y_fruits_apple.as_matrix(),
                                                random_state=0) )
clf = LogisticRegression(C=100).fit(X_train,y_train)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None,
                                         None, 'Logistic regression \
for binary classification\nFruit dataset: Apple vs others',
                                         subaxes)
plt.show()
h = 6
w = 8
print('A fruit with height {} and width {} is predicted to be: {}'
     .format(h,w, ['not an apple', 'an apple'][clf.predict([[h,w]])[0]]))
print(  "Train Score  {}".format(clf.score(X_train,y_train)) )
print("Test Score {}". format ( clf.score(X_test,y_test) )

SyntaxError: invalid syntax (<ipython-input-111-99a600b2c976>, line 19)