In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np 
import time

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
C = 1
kernel = 'linear'

In [3]:
# TODO: Change to 200000 once you get to Question#2
iterations = 5000

# You can set this to false if you want to draw the full square matrix:
FAST_DRAW = True

In [4]:
def drawPlots(model, X_train, X_test, y_train, y_test, wintitle='Figure 1'):
    # You can use this to break any higher-dimensional space down,
    # And view cross sections of it.

    # If this line throws an error, use plt.style.use('ggplot') instead
    mpl.style.use('ggplot') # Look Pretty

    padding = 3
    resolution = 0.5
    max_2d_score = 0

    y_colors = ['#ff0000', '#00ff00', '#0000ff']
    my_cmap  = mpl.colors.ListedColormap(['#ffaaaa', '#aaffaa', '#aaaaff'])
    colors   = [y_colors[i] for i in y_train]
    num_columns = len(X_train.columns)

    fig = plt.figure()
    fig.canvas.set_window_title(wintitle)
    fig.set_tight_layout(True)
    
    cnt = 0
    for col in range(num_columns):
        for row in range(num_columns):
            
            # Easy out
            if FAST_DRAW and col > row:
                cnt += 1
                continue

            ax = plt.subplot(num_columns, num_columns, cnt + 1)
            plt.xticks(())
            plt.yticks(())

            # Intersection:
            if col == row:
                plt.text(0.5, 0.5, X_train.columns[row], verticalalignment='center', horizontalalignment='center', fontsize=12)
                cnt += 1
                continue


            # Only select two features to display, then train the model
            X_train_bag = X_train.ix[:, [row,col]]
            X_test_bag = X_test.ix[:, [row,col]]
            model.fit(X_train_bag, y_train)

            # Create a mesh to plot in
            x_min, x_max = X_train_bag.ix[:, 0].min() - padding, X_train_bag.ix[:, 0].max() + padding
            y_min, y_max = X_train_bag.ix[:, 1].min() - padding, X_train_bag.ix[:, 1].max() + padding
            xx, yy = np.meshgrid(np.arange(x_min, x_max, resolution),
                                 np.arange(y_min, y_max, resolution))

            # Plot Boundaries
            plt.xlim(xx.min(), xx.max())
            plt.ylim(yy.min(), yy.max())

            # Prepare the contour
            Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=my_cmap, alpha=0.8)
            plt.scatter(X_train_bag.ix[:, 0], X_train_bag.ix[:, 1], c=colors, alpha=0.5)


            score = round(model.score(X_test_bag, y_test) * 100, 3)
            plt.text(0.5, 0, "Score: {0}".format(score), transform = ax.transAxes, horizontalalignment='center', fontsize=8)
            max_2d_score = score if score > max_2d_score else max_2d_score

            cnt += 1

    print("Max 2D Score: ", max_2d_score)

In [16]:
def benchmark(model, X_train, X_test, y_train, y_test, wintitle='Figure 1'):
    print(wintitle + ' Results')
    s = time.time()
    
    for i in range(iterations):
        
        model.fit(X_train, y_train)
        
    print("{0} Iterations Training Time: ".format(iterations), time.time() - s)


    s = time.time()
    for i in range(iterations):
        
        prediction=model.predict(X_test)
        model.score(y_test,prediction)
    print("{0} Iterations Scoring Time: ".format(iterations), time.time() - s)
    print("High-Dimensionality Score: ", round((score*100), 3))

In [6]:
X=pd.read_csv('../DAT210x/Module6/Datasets/wheat.data', index_col=0)

In [7]:
X[pd.isnull(X).any(axis=1)]

Unnamed: 0_level_0,area,perimeter,compactness,length,width,asymmetry,groove,wheat_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7,14.11,14.1,0.8911,5.42,3.302,2.7,,canadian
35,16.12,15.0,,0.9,,5.709,3.485,canadian
60,11.42,12.86,0.8683,5.008,2.85,2.7,,canadian
135,15.38,14.66,0.899,5.477,3.465,3.6,,canadian
169,11.24,13.0,,0.8359,5.09,2.715,3.521,canadian
170,11.02,13.0,,0.8189,5.325,2.701,6.735,canadian
201,12.67,13.32,0.8977,4.984,3.135,2.3,,canadian


In [8]:
X.dropna(inplace=True)

In [9]:
y=X.wheat_type
X.drop('wheat_type', axis=1,inplace=True)

In [10]:
y=y.map({'canadian':0, 'kama':1, 'rosa':2})

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=7)

In [17]:
svc=SVC(C=C, kernel=kernel)

In [18]:
knn=KNeighborsClassifier(n_neighbors=5)

In [19]:
benchmark(knn, X_train, X_test, y_train, y_test, 'KNeighbors')
drawPlots(knn, X_train, X_test, y_train, y_test, 'KNeighbors')

KNeighbors Results
5000 Iterations Training Time:  2.2783453464508057


ValueError: Expected 2D array, got 1D array instead:
array=[2 2 1 1 0 1 2 2 1 1 2 2 0 0 0 0 1 2 0 1 0 0 0 2 0 0 2 0 2 0 1 2 0 2 2 2 2
 0 2 2 2 2 1 2 1 1 1 0 1 0 1 1 2 2 2 1 1 0 2 2 2].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [20]:
benchmark(svc, X_train, X_test, y_train, y_test, 'SVC')
drawPlots(svc, X_train, X_test, y_train, y_test, 'SVC')

SVC Results
5000 Iterations Training Time:  4.025995969772339


ValueError: Expected 2D array, got 1D array instead:
array=[2. 2. 1. 1. 0. 1. 2. 2. 1. 1. 2. 2. 0. 0. 0. 0. 1. 2. 0. 1. 0. 0. 0. 2.
 0. 0. 2. 0. 2. 0. 1. 2. 0. 2. 2. 2. 2. 0. 2. 2. 2. 2. 1. 2. 1. 1. 1. 0.
 1. 0. 1. 1. 2. 2. 2. 1. 1. 0. 2. 2. 2.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.