# Initial Setup

In [55]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the version numerically: as plain strings, "0.9" >= "0.20" would
# wrongly pass. The regex keeps only the leading "major.minor" digits so
# suffixes such as "rc1" or ".dev0" do not break the parse.
_sk_major, _sk_minor = map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
assert (_sk_major, _sk_minor) >= (0, 20), "Scikit-Learn ≥0.20 is required"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(57)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels, applied in one rcParams update.
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Where to save the figures: <PROJECT_ROOT_DIR>/model_outputs/<CHAPTER_ID>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "model_outputs", CHAPTER_ID)
# exist_ok=True lets this cell be re-run without failing once the folder exists
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    fig_id: base file name (no extension); also echoed in the progress message.
    tight_layout: when true, plt.tight_layout() is called before saving.
    fig_extension: file extension, also passed to savefig as the output format.
    resolution: passed to savefig as the dpi.
    """
    file_name = fig_id + "." + fig_extension
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

## Initial Data Imports

In [56]:
# All three datasets live in the same folder; name it once so it can be
# changed in a single place.
DATA_DIR = "./datasets-part1"
tictac_single = np.loadtxt(os.path.join(DATA_DIR, "tictac_single.txt"))
tictac_multi = np.loadtxt(os.path.join(DATA_DIR, "tictac_multi.txt"))
tictac_finals = np.loadtxt(os.path.join(DATA_DIR, "tictac_final.txt"))

In [58]:
# First nine columns are the input features (presumably the nine board cells —
# TODO confirm); everything from column 9 onward is the label.
xs, ys = tictac_single[:,:9], tictac_single[:,9:]
# flatten() collapses the 2D label slice into a 1D array (it returns a copy)
ys = ys.flatten()
# xs holds the input features and ys the output labels

In [59]:
xs.shape

(6551, 9)

In [60]:
ys.shape

(6551,)

In [61]:
print(len(tictac_single))

6551


In [62]:
print(len(tictac_multi))

6551


In [63]:
print(len(tictac_finals))

958


In [64]:
# 80 / 20 train / test split, taken in file order (rows are not shuffled).
# NOTE(review): if the file is ordered, a shuffled or stratified split
# (sklearn.model_selection.train_test_split) may be more representative — confirm.
TRAIN_FRACTION = 0.8
n_train = round(TRAIN_FRACTION * len(xs))  # 5241 for the 6551-row dataset
xs_train, xs_test = xs[:n_train], xs[n_train:]
ys_train, ys_test = ys[:n_train], ys[n_train:]

In [65]:
print(xs[0])

[ 1. -1.  0.  0.  0.  0.  0.  1.  0.]


In [96]:
ys_train[0:10]

array([6., 1., 2., 0., 3., 2., 0., 0., 3., 0.])

## Training and Running an Actual Model

I chose `SGDClassifier` because it performed well in earlier assignments, though it may struggle
here because the task is to predict the single optimal move (one of nine possible classes):

In [67]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent; random_state is
# fixed so repeated runs produce the same model.
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=57)
# Fit on the full training split.
sgd_clf.fit(xs_train, ys_train)

In [94]:
sgd_clf.predict(xs[0:10])

array([4., 0., 0., 0., 1., 4., 0., 0., 1., 4.])

In [69]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, xs_train, ys_train, cv=3, scoring="accuracy")

array([0.20549513, 0.19862622, 0.09845449])

## Multiclass Prediction

In [95]:
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto", random_state=57)
# Fit on only the first 1000 training rows — presumably to keep SVC training fast; TODO confirm.
svm_clf.fit(xs_train[:1000], ys_train[:1000]) 
# Spot-check predictions on the first ten rows (these rows are part of the fitted subset).
svm_clf.predict(xs[0:10])

array([4., 4., 0., 0., 3., 2., 0., 0., 4., 0.])

In [81]:
# Raw decision scores from the SVC for every row of xs (one row of scores per sample).
# NOTE(review): the name `some_digit_scores` looks like a leftover from a
# digit-classification example; these are scores for tic-tac-toe moves.
some_digit_scores = svm_clf.decision_function(xs)
some_digit_scores

array([[ 3.90434406,  3.05247512,  6.25263902, ...,  5.22448238,
        -0.29147954,  1.74857232],
       [ 5.22726045,  7.28441531,  2.9580736 , ...,  5.13372006,
         0.73574219,  5.20026198],
       [ 8.30232141,  5.24797743,  7.28317796, ...,  3.81251881,
         0.71006828,  1.76061653],
       ...,
       [ 6.28359477,  7.284516  ,  8.27746513, ...,  3.88421157,
         0.72448466,  1.72463219],
       [ 8.29599929,  6.25328062,  7.23063739, ...,  4.10747397,
         0.71363252,  1.76433015],
       [ 3.02971255,  5.18152559,  4.06201606, ...,  6.22338187,
        -0.28999071,  6.07472351]])

In [86]:
# For each of the first ten rows, find the column holding the highest decision score.
# That column index equals the predicted label only because svm_clf.classes_ is
# 0..8 here; in general, map the index back through svm_clf.classes_.
arr = np.argmax(some_digit_scores[:10], axis=1).tolist()
arr

[4, 4, 0, 0, 4, 2, 0, 0, 4, 0]

In [72]:
svm_clf.classes_

array([0., 1., 2., 3., 4., 5., 6., 7., 8.])

In [92]:
from sklearn.multiclass import OneVsRestClassifier
# Wrap SVC so that one binary classifier is trained per class (one-vs-rest).
ovr_clf = OneVsRestClassifier(SVC(gamma="auto", random_state=57))
# Fit on the first 1000 training rows only.
ovr_clf.fit(xs_train[:1000], ys_train[:1000])
# Predict a label for every row of xs (training and test rows alike).
ovr_clf.predict(xs)

array([4., 1., 0., ..., 2., 0., 4.])

In [97]:
cross_val_score(ovr_clf, xs_train, ys_train, cv=3, scoring="accuracy")

array([0.73554665, 0.78191185, 0.80881511])

As seen above, the one-vs-rest SVC classifier was much more accurate than the `SGDClassifier` (roughly 0.74–0.81 versus 0.10–0.21 cross-validated accuracy).