## Preamble and Review

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
np.set_printoptions(precision=2)

## Load data

In [4]:
fruits = pd.read_table('fruit_data_with_colors.txt')

## Define feature and target names 

In [5]:
feature_names_fruits = ['height', 'width', 'mass', 'color_score']
X_fruits = fruits[feature_names_fruits]
y_fruits = fruits['fruit_label']
target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']

In [6]:
X_fruits_2d = fruits[['height', 'width']]
y_fruits_2d = fruits['fruit_label']

## Train-test split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_fruits, y_fruits, random_state = 0)

## Feature normalization

In [9]:
from sklearn.preprocessing import MinMaxScaler

In [10]:
scaler = MinMaxScaler()

In [13]:
X_train_scaled = scaler.fit_transform(X_train)

In [14]:
X_test_scaled = scaler.transform(X_test)

## KNN classifier

In [16]:
knn = KNeighborsClassifier(n_neighbors = 5)

In [17]:
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [18]:
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test_scaled, y_test)))


Accuracy of K-NN classifier on training set: 0.95
Accuracy of K-NN classifier on test set: 1.00


In [19]:
# Prediction
example_fruit = [[5.5, 2.2, 10, 0.70]]
example_fruit_scaled = scaler.transform(example_fruit)
print('Predicted fruit type for ', example_fruit, ' is ', 
          target_names_fruits[knn.predict(example_fruit_scaled)[0]-1])

Predicted fruit type for  [[5.5, 2.2, 10, 0.7]]  is  mandarin


In [20]:
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

param_range = np.logspace(-3, 3, 4)
train_scores, test_scores = validation_curve(SVC(), X, y,
                                            param_name='gamma',
                                            param_range=param_range, cv=3)

NameError: name 'X' is not defined

In [152]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


np.random.seed(0)
n = 15
x = np.linspace(0,10,n) + np.random.randn(n)/5
y = np.sin(x)+x/6 + np.random.randn(n)/10


X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

# You can use this function to help you visualize the dataset by
# plotting a scatterplot of the data points
# in the training and test sets.
def part1_scatter():
    import matplotlib.pyplot as plt
    %matplotlib notebook
    plt.figure()
    plt.scatter(X_train, y_train, label='training data')
    plt.scatter(X_test, y_test, label='test data')
    plt.legend(loc=4);


In [118]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

result = np.zeros((4,100))
for i, degree in enumerate([1,3,6,9]):
    # Your code here
    poly = PolynomialFeatures(degree = degree)

    X_poly = poly.fit_transform(X_train.reshape(11,1))
    X_pred = poly.fit_transform(np.linspace(0,10,100).reshape(100,1))

    linreg = LinearRegression().fit(X_poly, y_train)

    predict = linreg.predict(X_pred)
    result[i,:] = predict 

# Question 2
Write a function that fits a polynomial LinearRegression model on the training data `X_train` for degrees 0 through 9. 
For each model compute the $R^2$ (coefficient of determination) 
regression score on the training data as well as the the test data, and return both of these arrays in a tuple.

*This function should return one tuple of numpy arrays `(r2_train, r2_test)`. Both arrays should have shape `(10,)`*

ValueError: cannot reshape array of size 8 into shape (11,1)

In [192]:
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics.regression import r2_score
    
    array = np.array([(0,0)])

    for count, degree in enumerate([0,1,2,3,4,5,6,7,8,9]):

        poly = PolynomialFeatures(degree = degree)

        X_poly = poly.fit_transform(X_train.reshape(11,1))

        X_train1, X_test1, y_train1, y_test1 = train_test_split(X_poly, y_train, random_state = 0)

        linreg = LinearRegression().fit(X_train1, y_train1)
        
        array = np.vstack((array, (linreg.score(X_train1, y_train1), linreg.score(X_test1, y_test1))))

    return np.delete(array, (0), axis = 0)

In [194]:
answer_two()

array([[ 0.  , -0.02],
       [ 0.5 ,  0.17],
       [ 0.51,  0.21],
       [ 0.64,  0.28],
       [ 0.95,  0.73],
       [ 1.  ,  0.63],
       [ 1.  ,  0.79],
       [ 1.  ,  0.83],
       [ 1.  ,  0.86],
       [ 1.  ,  0.85]])