# Support Vector Machine

## Preparación de ambiente

#Para convertir imágenes en tamaño 28x28
convert - resize 28x28! '8.png' sample8_black_r.png

### Carga de módulos

In [1]:
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score

### Funciones relevantes

In [2]:
def get_data(number):
    img = Image.open('sample%d_black_r.png'%(number)).convert('L')
    img_arr = np.array(img)
    WIDTH, HEIGHT = img.size
    data = list(img.getdata())
    data = [data[offset:offset+WIDTH] for offset in range(0, WIDTH*HEIGHT, WIDTH)]
    return data

In [3]:
def print_data(data):
    for row in data:
        print(''.join('{:3}'.format(value) for value in row))

In [4]:
def flatten_data(data):
    return[np.reshape(data, (28*28,))]

In [5]:
get_data(3)

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  15,
  1,
  0,
  0,
  4,
  0,
  6,
  0,
  11,
  3,
  0,
  0,
  0,
  3,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  18,
  7,
  0,
  0,
  5,
  16,
  0,
  10,
  0,
  27,
  0,
  0,
  0,
  20,
  4,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  15,
  1,
  0,
  4,
  12,
  2,
  10,
  0,
  13,
  9,
  0,
  2,
  0,
  3,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  19,
  0,
  0,
  12,
  0,
  1,
  0,
  0,
  0,
  6,
  0,
  0,
  2,
  7,
  0,
  10,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  10,
  250,
  255,
  240,
  250,
  255,
  255,
  255,
  247,
  255,
  14,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  6,
  2,
  255,
  255,
  255,
  255,
  238,
  255,
  240,
  255,
  247,
  12,
  0,
  0,
  10,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  14,
  0,
  0,
  8,
  241,
  255,
  255,
  0

In [6]:
print_data(get_data(3))

  0  0  0  0  0  0  0  0  0  0 15  1  0  0  4  0  6  0 11  3  0  0  0  3  0  0  0  0
  0  0  0  0  0  0  0  0 18  7  0  0  5 16  0 10  0 27  0  0  0 20  4  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 15  1  0  4 12  2 10  0 13  9  0  2  0  3  0  0  0  0
  0  0  0  0  0  0  0  0 19  0  0 12  0  1  0  0  0  6  0  0  2  7  0 10  0  0  0  0
  0  0  0  0  0  0  0  0  0 10250255240250255255255247255 14  0  0  5  0  0  0  0  0
  0  0  0  0  0  0  0  0  6  2255255255255238255240255247 12  0  0 10  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  1  0 14  0  0  8241255255  0  0 16  0  0 14  0  0  0  0
  0  0  0  0  0  0  0  0 14  0  3  0 10  0255255244  0  2  0  7  0  8  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  5  2  0  0  0255255255  0 18  3  0  0  6  0  0  0  0  0
  0  0  0  0  0  0  0  0 11  0  4  0  1255240250  0 12  0  0  7  5 11  0  0  0  0  0
  0  0  0  0  0  0  0  0  3  0  9  4245255252255255 13  0  2  3  1  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0255245249255255237 14  5  0  

In [7]:
flatten_data(get_data(3))

[array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  15,   1,   0,
          0,   4,   0,   6,   0,  11,   3,   0,   0,   0,   3,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  18,   7,   0,
          0,   5,  16,   0,  10,   0,  27,   0,   0,   0,  20,   4,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,  15,   1,   0,   4,  12,   2,  10,   0,  13,   9,   0,   2,
          0,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,  19,   0,   0,  12,   0,   1,   0,   0,   0,   6,   0,   0,
          2,   7,   0,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,  10, 250, 255, 240, 250, 255, 255, 255, 247,
        255,  14,   0,   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   6,   2, 255, 255, 255, 255, 238, 255,
        240, 255, 247,  12,   0,   0,  10,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   1

## Carga de datos

### Carga de conjunto de entrenamiento

In [8]:
df = pd.read_csv("digits_train_sample.csv")

In [10]:
df.shape

(6000, 785)

In [9]:
df

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5996,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5997,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Análisis Exploratorio

In [11]:
df["label"].value_counts(True)

label
7    0.112833
1    0.111833
8    0.104667
0    0.100500
9    0.099000
6    0.098500
3    0.097167
2    0.094500
4    0.091000
5    0.090000
Name: proportion, dtype: float64

In [17]:
df.sample(10)

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
1371,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2663,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5304,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4977,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2871,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3873,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4261,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4115,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1073,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Preparación de datos

In [18]:
X = df[[x for x in df.columns if x != "label"]]
y = df[["label"]]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7)

In [20]:
sc_x = MinMaxScaler()

In [21]:
Xs = sc_x.fit_transform(X_train)

## Modelado

### LDA

#### Entrenamiento

In [22]:
lda = LinearDiscriminantAnalysis()
lda.fit(Xs, y_train.values.ravel())
ls_scores = cross_val_score(estimator=lda, scoring="accuracy", X=Xs, y=y_train, cv=4, n_jobs=-1)
np.mean(ls_scores), np.std(ls_scores)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(np.float64(0.8078571428571429), np.float64(0.012748660504978618))

### Support Vector Machine

#### Entrenamiento

In [23]:
svm = SVC()
svm.fit(X_train, y_train.values.ravel())
ls_scores = cross_val_score(estimator=svm, scoring="accuracy", X=X_train, y=y_train, cv=4, n_jobs=-1)
np.mean(ls_scores), np.std(ls_scores)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(np.float64(0.9421428571428571), np.float64(0.008712464030253327))

#### Hyperparametrización

In [24]:
svm?

[0;31mType:[0m        SVC
[0;31mString form:[0m SVC()
[0;31mFile:[0m        ~/.local/share/virtualenvs/clases-XAo6S3GO/lib/python3.10/site-packages/sklearn/svm/_classes.py
[0;31mDocstring:[0m  
C-Support Vector Classification.

The implementation is based on libsvm. The fit time scales at least
quadratically with the number of samples and may be impractical
beyond tens of thousands of samples. For large datasets
consider using :class:`~sklearn.svm.LinearSVC` or
:class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a
:class:`~sklearn.kernel_approximation.Nystroem` transformer or
other :ref:`kernel_approximation`.

The multiclass support is handled according to a one-vs-one scheme.

For details on the precise mathematical formulation of the provided
kernel functions and how `gamma`, `coef0` and `degree` affect each
other, see the corresponding section in the narrative documentation:
:ref:`svm_kernels`.

To learn how to tune SVC's hyperparameters, see the following 

#OVO
A,B,C

1 - A o B
2 - A o C
3 - B o A
4 - B o C
5 - C o A
6 - C o B

#OVR
A,B,C

1 - A o NoA (B,C)
2 - B o NoB (A,C)
3 - C o NoC (A,B)

In [25]:
param_grid = {"C": [x for x in range(10)],
              "kernel": ['linear', 'poly', 'rbf', 'sigmoid']}

In [26]:
n_hyper = np.prod([x for x in map(len, param_grid.values())])

In [27]:
n_hyper

np.int64(40)

In [28]:
%%time
model = SVC()
clf = GridSearchCV(model, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="accuracy", verbose=1)
clf.fit(X_train, y_train)

Fitting 4 folds for each of 40 candidates, totalling 160 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

CPU times: user 4.93 s, sys: 1.03 s, total: 5.96 s
Wall time: 4min 19s


In [29]:
print("Best score: " + str(clf.best_score_))
print("Best estimator: " + str(clf.best_estimator_))

Best score: 0.9497619047619048
Best estimator: SVC(C=4)


### Validación del modelo

#### Análisis de resultados

In [30]:
y_pred = svm.predict(X_train)

In [31]:
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[426,   0,   0,   0,   0,   0,   0,   0,   2,   0],
       [  0, 467,   2,   0,   0,   0,   0,   0,   1,   1],
       [  0,   1, 384,   0,   1,   0,   0,   4,   0,   0],
       [  0,   0,   4, 393,   0,   2,   0,   2,   3,   1],
       [  0,   0,   1,   0, 374,   0,   0,   0,   0,   2],
       [  0,   0,   1,   1,   0, 377,   1,   0,   3,   0],
       [  0,   1,   0,   0,   0,   0, 421,   0,   0,   0],
       [  0,   1,   0,   1,   1,   0,   0, 451,   0,   2],
       [  0,   3,   0,   3,   1,   2,   1,   0, 416,   2],
       [  1,   1,   0,   3,   4,   0,   0,   3,   1, 427]])

#### Pruebas del modelo en datos no vistos

In [32]:
df_val = pd.read_csv("digits_test_sample.csv")

In [33]:
X_val = df_val[[x for x in df_val.columns if x != "label"]]
y_val = df_val[["label"]]

In [None]:
#Xvs = sc_x.transform(X_val)

In [34]:
svm.score(X_val, y_val)

0.948

In [35]:
accuracy_score(y_pred=svm.predict(X_val), y_true=y_val)

0.948

#### Pruebas del modelo en datos reales

In [52]:
n=9

In [53]:
data = flatten_data(get_data(n))

In [54]:
#data

In [55]:
svm.predict(data)



array([5])

In [56]:
print_data(get_data(n))

  0  0  0  0  0  0  0  0  0  0  3  0  0 10  0  5  0 11  0  8  0  0 16  0  0  0  0  0
  0  0  0  0  0  0  0  0  5  0  6  5  0  0  9  0 19  0 13  0  6  4  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0 18  0  0 21  0  4 11  0  5  0  0  0  0  2  9  0  0  0  0
  0  0  0  0  0  0  0  0  6  4  0  5  1  6  0  4 12  0  2 10 22  0  8  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  8  0  0255255255248 15  0  0  0  5  0 15  0  0  0  0
  0  0  0  0  0  0  0  0  0 10  0255255253255237247255255  1 20  0  0 12  0  0  0  0
  0  0  0  0  0  0  0  0  4245252251243255 12  9255247255255  0 18  2  0  0  0  0  0
  0  0  0  0  0  0  0  0254255255248  8  5  0  0  0  6247255  6  0  0 13  0  0  0  0
  0  0  0  0  0  0  0  0250255  4  0  0  6  0  3  1 12255239 10  4  2  0  0  0  0  0
  0  0  0  0  0  0  0  0255255  0  7  1  0  1  0  0  0  0255  1  0  0 12  0  0  0  0
  0  0  0  0  0  0  0  0252232 14  0  0 14  0 22  0 16 15250  0  7  0  3  0  0  0  0
  0  0  0  0  0  0  0  0255255  0  0  4 16  0  0 15  0250237 20  