
<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part Three: Training Models </h2>	

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sklearn
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV

In [2]:
def train_test_split(df, test_size=0.2, random_state=None):
    """Split the data into a training set and a held-out validation set.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    df : DataFrame (or any array-like accepted by scikit-learn)
        The rows to split.
    test_size : float or int, default 0.2
        Passed straight through to scikit-learn. The default keeps the
        original behaviour of holding out 20% of the rows.
    random_state : int or None, default None
        Seed for the shuffle. ``None`` (the default) keeps the original
        unseeded behaviour; pass an integer for a reproducible split.

    Returns
    -------
    (train_df, val_df)
        The training part and the held-out part, in that order.
    """
    train_df, val_df = sklearn.model_selection.train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    return train_df, val_df

def accuracy(pred, actual):
    """Return the fraction of predictions that equal the actual values.

    ``pred`` and ``actual`` are compared element-wise with ``==`` and the
    mean of the resulting matches is returned, so the result is a
    proportion (1.0 means every prediction matched), not a percentage.
    """
    is_match = (pred == actual)
    return np.mean(is_match)
def separate_features(df):
    """Split a frame positionally into its feature columns and label column.

    The column at position 1 is returned as the labels; every column from
    position 2 onward is returned as the features. The column at position 0
    is not part of either result.

    Returns
    -------
    (feature_df, labels_df)
    """
    labels_df = df.iloc[:, 1]
    feature_df = df.iloc[:, 2:]
    return feature_df, labels_df

In [3]:
# Read the training set from the HDF5 file 'training_data.h5'; the path is
# relative, so it is resolved against the notebook's working directory.
training_data = pd.read_hdf('training_data.h5')

In [4]:
# training_data.head()

<h3>  Train models using all of the methods below. Be sure to drop the actual image column and the encoding.</h3>	Take note of the differences in accuracy and in method.


In [5]:
# Hold out a validation set, then separate each part into features and labels.
train, val = train_test_split(training_data)

X_train, y_train = separate_features(train)
# reset_index returns a new object rather than modifying in place, so the
# result has to be assigned back; drop=True discards the old index instead
# of turning it into an 'index' column.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True).astype('int')

X_val, y_val = separate_features(val)
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True).astype('int')

In [6]:
import warnings
# No category or message filter is given, so this silences every warning
# raised from here on, for the rest of the notebook session.
warnings.filterwarnings("ignore")

def cross_val(model, k, feature_data, label_data):
    """Run k-fold cross-validation, refitting ``model`` on every fold.

    Parameters
    ----------
    model : estimator
        Must provide ``fit``, ``predict`` and ``get_params``.
    k : int
        Number of folds handed to ``KFold``.
    feature_data, label_data : pandas objects
        Rows are selected positionally with ``.iloc``, so the two must be
        aligned by position.

    Returns
    -------
    model_params : list of dict
        One entry per fold: ``model.get_params()`` taken right after that
        fold's fit.
        NOTE(review): for scikit-learn estimators this presumably reports
        constructor settings, so the entries are likely identical - confirm.
    accuracy_scores : numpy.ndarray of shape (k, 1)
        Accuracy on each fold's held-out rows.

    Side effects
    ------------
    Prints each fold's score, and leaves ``model`` fitted on the training
    rows of the last fold.
    """
    kf = sklearn.model_selection.KFold(n_splits=k)

    accuracy_scores = np.empty([k, 1])
    # One entry per fold. Previously this list was pre-sized by the number of
    # feature columns, which overflowed when k was larger and left empty
    # dicts at the end when k was smaller.
    model_params = []

    for i, (train_index, test_index) in enumerate(kf.split(feature_data)):
        model.fit(feature_data.iloc[train_index], label_data.iloc[train_index])
        label_predict = model.predict(feature_data.iloc[test_index])
        # Argument order follows accuracy(pred, actual).
        accuracy_scores[i] = accuracy(label_predict, label_data.iloc[test_index])
        model_params.append(model.get_params())
        print(accuracy_scores[i])

    return model_params, accuracy_scores

Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

# Values of C tried by the grid search.
params = {'C': [1e-1, 1, 10, 100]}

# `lreg` names the 5-fold grid search wrapped around the base classifier.
lreg = GridSearchCV(
    LogisticRegression(random_state=0, penalty='l2', C=1),
    params,
    cv=5,
)

# cross_val refits `lreg` once per outer fold and prints each fold's accuracy.
# After it returns, `lreg` holds the fit from the final fold; that is the
# object the model-comparison cell later calls predict on.
model_params, accuracy_scores = cross_val(lreg, 5, X_train, y_train)

[ 0.39166667]
[ 0.375]
[ 0.38333333]
[ 0.325]
[ 0.35833333]


K-nearest Neighbors

In [10]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier()

# Grid searched over: neighbour count crossed with the two tree algorithms.
params = {
    'n_neighbors': [1, 2, 4, 8],
    'algorithm': ['ball_tree', 'kd_tree'],
}

# Fit the 5-fold grid search on the whole training split; the trailing
# semicolon hides the fitted estimator's repr in the cell output.
grid_neigh = GridSearchCV(estimator=neigh, param_grid=params, cv=5)
grid_neigh.fit(X_train, y_train);

Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Grid searched over: forest size crossed with maximum tree depth.
params = {
    'n_estimators': [25, 75, 150],
    'max_depth': [10, 20, 80],
}

# Base forest handed to the grid search below.
rand_forest = RandomForestClassifier(
    n_estimators=25, max_depth=5, bootstrap=True, random_state=0
)

# cross_val refits the 5-fold grid search once per outer fold and prints
# each fold's accuracy.
grid_rand_forest = GridSearchCV(estimator=rand_forest, param_grid=params, cv=5)
model_params, accuracy_scores = cross_val(grid_rand_forest, 5, X_train, y_train)

[ 0.4125]
[ 0.35]
[ 0.43333333]
[ 0.32916667]
[ 0.35416667]


In [12]:
# Forest with fixed hyperparameters (no grid search in this cell).
rand_forest = RandomForestClassifier(
    n_estimators=100, max_depth=20, bootstrap=True, random_state=0
)

# Called for its printed per-fold scores; the returned tuple is discarded.
# It leaves `rand_forest` fitted on the last fold's training rows, which is
# the fit used for the validation prediction below.
cross_val(rand_forest, 5, X_train, y_train)

y_hat = rand_forest.predict(X_val)
accuracy(y_hat, y_val)

[ 0.4]
[ 0.33333333]
[ 0.425]
[ 0.32083333]
[ 0.3625]


0.31893687707641194

Support Vector Machine

In [15]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC

# Hyperparameter grid for the linear SVM search below.
params = {'C': [1e-1, 1, 10, 100], 'loss': ['hinge', 'squared_hinge']}

# C has to be a number; it was previously passed as the string '1'.
one_vs_one_svm = OneVsOneClassifier(LinearSVC(random_state=0, penalty='l2', C=1))
# grid_ovo_svm = GridSearchCV(one_vs_one_svm, params, cv=5)
# grid_ovo_svm.fit(X_train, y_train);

# `grid_svm` is read by the model-comparison cell, so it must be created by
# executed code: with this left commented out, a fresh-kernel run fails with
# a NameError there.
grid_svm = GridSearchCV(LinearSVC(), params, cv=5)
grid_svm.fit(X_train, y_train);

In [None]:
# from sklearn.multiclass import OneVsRestClassifier
# params = {'C': [1e-1, 1, 10, 100], 'loss': ['hinge', 'squared_hinge']}
# one_vs_rest_svm = OneVsRestClassifier(LinearSVC(random_state=0, penalty='l2'))
# grid_ovr_svm = GridSearchCV(one_vs_rest_svm, params, cv=5)
# grid_ovr_svm.fit(X_train, y_train);

In [16]:
# Score every fitted model on the held-out validation split.
models = [lreg, grid_neigh, rand_forest, grid_svm]
names = ['Logistic Regression', 'K-NN', 'Random Forest', 'SVM']
# Width of the longest display name, used to align the printed column.
max_name_length = len(max(names, key=len))

for model, name in zip(models, names):
    y_hat = model.predict(X_val)
    acc = accuracy(y_hat, y_val)
    # Pad to the computed width rather than a hard-coded 19 so the column
    # stays aligned if the list of names changes.
    print('{:{width}s} Accuracy: {}'.format(name, acc, width=max_name_length))

Logistic Regression Accuracy: 0.3289036544850498
K-NN                Accuracy: 0.20930232558139536
Random Forest       Accuracy: 0.31893687707641194
SVM                 Accuracy: 0.17275747508305647
