# More Linear Classifiers

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import softmax
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

### Load the olive oil data

In [None]:
# Read the olive oil data set from a CSV file in the current working directory.
oil = pd.read_csv('olive_oils.csv')
# Print (rows, columns) so the size can be checked against the 400-row training split taken later.
print(oil.shape)
# Last expression of the cell: renders the first rows of the frame as a table.
oil.head()

### Exploring the data

In [None]:
# Scatter-plot matrix of the first few feature columns, one colour per area.
# Only n_shown features are plotted against one another for the sake of space. We have 7 features,
# so if you have a large enough screen, feel free to change n_shown to 7 and see all pairs of features.
n_shown = 5

# Sorted so that every run assigns colours to areas in the same order;
# iterating a bare set gives no guaranteed order.
labels = sorted(set(oil['area_name'].values))
fig, axs = plt.subplots(n_shown, n_shown, figsize = (20, 20))

# Diagonal panels only carry the feature name: write it once, not once per label.
# The i+1 offset skips column 0 of `oil`, exactly as the off-diagonal panels do.
for i in range(n_shown):
    axs[i, i].text(0.5, 0.5, oil.columns[i+1], horizontalalignment='center',
                   verticalalignment='center', transform = axs[i, i].transAxes)

for label in labels:
    # Select this area's rows once instead of twice per panel.
    oil_label = oil.loc[oil['area_name'] == label]
    for i in range(n_shown):
        for j in range(n_shown):
            if i != j:
                # Panel (i, j): column i+1 on the x-axis against column j+1 on the y-axis.
                axs[i, j].plot(oil_label.iloc[:, i+1], oil_label.iloc[:, j+1], 'o', label = label)
plt.show()


In [None]:
# Close-up of one feature pair: palmitic (x) against linoleic (y), one colour per area.
plt.figure(figsize=(10,10))
# sorted() gives a stable colour / legend order however `labels` was built.
for label in sorted(labels):
    oil_label = oil.loc[oil['area_name'] == label]  # select this area's rows once
    plt.plot(oil_label['palmitic'], oil_label['linoleic'], 'o', label = label)
# Label the axes so the figure can be read on its own.
plt.xlabel('Palmitic')
plt.ylabel('Linoleic')
plt.legend()
plt.show()

**Question:** Which classes look like they will be relatively easy to identify? Which classes will be difficult?

### Set up training and testing data

We're going to use 400 instances from this data set as our training set, and the remaining 172 as a testing set.

In [None]:
# Hold out a random test set: N_TRAIN rows for training, all remaining rows for testing.
# The draw comes from a dedicated, seeded generator so that the split (and every score
# computed from it) is the same after a kernel restart, without touching NumPy's
# global random state. Change SPLIT_SEED to draw a different split.
SPLIT_SEED = 0
N_TRAIN = 400
split_rng = np.random.RandomState(SPLIT_SEED)
train_idx = split_rng.choice(oil.index, N_TRAIN, replace = False)  # index labels, no repeats
oil_train = oil.loc[train_idx,:]
oil_test = oil.drop(train_idx)  # everything not selected for training

For visualization, we'll use just two of the variables here.

In [None]:
# Two-column design matrix used for the decision-surface plots.
# As with many other models, these classifiers perform better with scaled data,
# so both columns are standardized (the scaler is fitted on the training rows).
two_features = oil_train[['palmitic', 'linoleic']]
scaler = StandardScaler()
X = scaler.fit_transform(two_features)

# Encode the area names as numeric class labels; the contour plots draw the
# classifier's predictions directly, so those predictions must be numbers.
area_names = oil_train['area_name']
enc = LabelEncoder()
y = enc.fit_transform(area_names)

In [None]:
# These are helper functions for plotting the decision regions.
# Code from StackOverflow user seralouk, comments my own

def make_meshgrid(x, y, h=0.02):
    """Build a rectangular grid of points covering the data with a margin.

    Parameters
    ----------
    x, y : array-like with ``.min()`` / ``.max()``
        Coordinates of the data along the horizontal and vertical axes.
    h : float, default 0.02
        Grid spacing along both axes. If you are using unscaled data, change
        h to about 2 to avoid out-of-memory errors.

    Returns
    -------
    (xx, yy) : tuple of 2-D arrays
        Coordinate matrices as produced by ``np.meshgrid``; the grid extends
        0.5 beyond the smallest and largest value on each axis.
    """
    pad = 0.5
    x_ticks = np.arange(x.min() - pad, x.max() + pad, h)
    y_ticks = np.arange(y.min() - pad, y.max() + pad, h)
    grid_x, grid_y = np.meshgrid(x_ticks, y_ticks)
    return grid_x, grid_y

def plot_contours(ax, clf, xx, yy, **params):
    """Draw a classifier's predictions over a grid as filled contours.

    Parameters
    ----------
    ax : object with a ``contourf`` method (e.g. a matplotlib Axes)
        Where the contours are drawn.
    clf : object with a ``predict`` method
        Called once with an (n_points, 2) array of grid coordinates.
    xx, yy : 2-D arrays
        Coordinate matrices, e.g. from ``make_meshgrid``.
    **params
        Passed through unchanged to ``ax.contourf``.

    Returns
    -------
    Whatever ``ax.contourf`` returns.
    """
    # One row per grid point: first column from xx, second from yy.
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    # Fold the flat prediction vector back into the grid's shape.
    predictions = clf.predict(grid_points).reshape(xx.shape)
    return ax.contourf(xx, yy, predictions, **params)

In [None]:
# Example: plotting the decision surface of a linear support vector classifier

# Fit on the two standardized features; gamma has no effect with kernel='linear'.
model = SVC(kernel='linear', gamma = 'scale')
clf = model.fit(X, y)

title = ('Decision surface of linear SVC')
X0, X1 = X[:,0], X[:,1]
# X is standardized, so the fine 0.02 step is affordable here.
# With unscaled data, use a much larger h (see make_meshgrid).
xx, yy = make_meshgrid(X0, X1, h = 0.02)

fig, ax = plt.subplots(figsize=(10,10))

# Background: predicted class at every grid point.
cf = plot_contours(ax, clf, xx, yy, cmap=plt.cm.Pastel2, alpha=0.8)

# Foreground: the training points, one scatter call (and legend entry) per area.
# sorted() keeps the colour / legend order stable and consistent with the
# logistic regression plot.
train_areas = oil_train['area_name'].values
for label in sorted(labels):
    in_area = train_areas == label
    ax.scatter(X0[in_area], X1[in_area],
               marker = 'o', edgecolor = 'black', label = label)

ax.set_ylabel('Linoleic')
ax.set_xlabel('Palmitic')
ax.set_title(title)
ax.legend()
plt.show()

#### Example

Now let's use the same code with a `LogisticRegression` model.

In [None]:
# Unpenalized one-vs-rest logistic regression on the two standardized features.
# `penalty=None` is the documented way to switch regularization off; the string
# 'none' is rejected by current scikit-learn releases.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases -- confirm
# against the installed version before relying on it.
clf = LogisticRegression(solver = 'lbfgs', penalty = None, multi_class='ovr')
clf.fit(X, y)
title = ('Decision surface of logistic regression')
X0, X1 = X[:,0], X[:,1]
# X is standardized, so the fine 0.02 grid step is affordable here.
xx, yy = make_meshgrid(X0, X1, h = 0.02)

fig, ax = plt.subplots(figsize=(10,10))

# Background: predicted class at every grid point.
plot_contours(ax, clf, xx, yy, cmap=plt.cm.Pastel2, alpha=0.8)

# Foreground: the training points, one scatter call (and legend entry) per area,
# in sorted order so colours are stable from run to run.
train_areas = oil_train['area_name'].values
for label in sorted(labels):
    in_area = train_areas == label
    ax.scatter(X0[in_area], X1[in_area],
               marker = 'o', edgecolor = 'black', label = label)

ax.set_ylabel('Linoleic')
ax.set_xlabel('Palmitic')
ax.set_title(title)
ax.legend()
plt.show()

**Question:** What qualitative differences do you notice between the decision boundaries for logistic regression and those for the linear SVM?

### Set up training and testing data for model validation

#### Exercise

Using the same train/test split as before, set up training and testing `X` and `y` (the cross-validation code below expects the training pair to be named `X_train` and `y_train`), but this time:
* use all predictors in `X` instead of just 2
* don't run `y` through the `LabelEncoder`

#### Example

The following code uses five-fold cross-validation to find a good value of `C` for a linear SVM by trying a range of values of `C`. It assumes your training data are called `X_train` and `y_train`.

In [None]:
# Grid search over C for a linear SVM: each candidate value is scored by the
# mean of its five cross-validation fold scores on (X_train, y_train).
n_trials = 200
min_C = 0.1
max_C = 20

# Evenly spaced candidates from min_C to max_C inclusive, and one score slot per candidate.
C_range = np.linspace(min_C, max_C, n_trials)
scores = np.zeros(n_trials)

for i, C in enumerate(C_range):
    model = SVC(kernel='linear', gamma = 'scale', C = C)
    score = cross_val_score(model, X_train, y_train, cv = 5)  # one score per fold
    scores[i] = score.mean()

plt.figure()
plt.plot(C_range, scores, '.')
plt.xlabel("Value of C")
plt.ylabel("5-CV accuracy score")
plt.show()

# argmax returns the first maximum, so ties resolve to the smallest C.
best_C_lin = C_range[scores.argmax()]
print("Best value of C found:", best_C_lin)

#### Exercise

Repeat the above process, but use a radial basis function kernel (set `kernel = 'rbf'`, `gamma = 'auto'` when initializing your `SVC` instance). Plot the accuracy against the value of `C` and report the best value of `C`. (You can copy most of the above code, but make the necessary modifications.)

#### Exercise

Use five-fold cross validation to estimate the accuracy of the logistic regression model. The logistic regression we're using doesn't have a hyperparameter for us to tune, so just run five-fold CV on it once. Does it outperform either of the support vector classifiers?

In [None]:
# Five-fold CV score of the unpenalized logistic regression. It is evaluated on the
# same training data (X_train, y_train) as the linear-SVC search above, so the two
# scores are directly comparable; the two-feature X, y are only for the plots.
# `penalty=None` is the documented way to switch regularization off; the string
# 'none' is rejected by current scikit-learn releases.
# NOTE(review): if X_train is unscaled, lbfgs may need a larger max_iter to converge -- confirm.
clf = LogisticRegression(solver = 'lbfgs', penalty = None, multi_class='ovr')
score = cross_val_score(clf, X_train, y_train, cv = 5)  # one score per fold
print(np.average(score))

#### Exercise

Finally: take the best model you found, according to cross-validation scores, and fit it to the training data. Then make predictions on the testing data and print a confusion matrix. What is the overall test accuracy of the best model?