# KNN Digits
Implementing a simple KNN to classify digits.

In [1]:
import cv2 as cv
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

#### Data Prep

In [2]:
df_28x28 = pd.read_csv('numbers.csv')
df_38x28 = pd.read_csv('digits_only_numbers.csv')

df = pd.concat([df_28x28, df_38x28], ignore_index=True)
df

Unnamed: 0,numbers,file_path
0,0,media/kills_only/loop_00000.jpg
1,n,media/kills_only/loop_00001.jpg
2,2,media/kills_only/loop_00002.jpg
3,2,media/kills_only/loop_00003.jpg
4,2,media/kills_only/loop_00004.jpg
...,...,...
13133,45,media/digits_only/loop_07259pr.jpg
13134,7,media/digits_only/loop_07260k.jpg
13135,45,media/digits_only/loop_07260pr.jpg
13136,7,media/digits_only/loop_07261k.jpg


Standardive and/or Remove 'null' values. Both improve accuracy a few %.reset_index

In [3]:
# helps a few %
df.numbers.loc[df.numbers == 'n'] = ''
df.numbers.loc[df.numbers == 'e'] = ''
df.numbers.loc[df.numbers == 'b'] = ''

# # maybe helps a little less than making them all the same
# df = df.loc[df.numbers != 'n']
# df = df.loc[df.numbers != 'e']
# df = df.loc[df.numbers != 'b']

# df.reset_index()

# df.tail(3)

Limit the number of samples from each possible outcome.

In [4]:
# temp_df = pd.DataFrame(columns=df.columns)

# for u in df.numbers.unique():
#     sample = df.loc[df.numbers==u].copy()
#     if len(sample) > 50:
#         sample = sample.sample(50)
#     temp_df = pd.concat([temp_df, sample])

# df = temp_df

# df

#### What target values are in the dataset?

In [5]:
len(df.numbers.unique()),# df.numbers.unique()

(231,)

In [6]:
actual_numbers = []
for un in df.numbers.unique():
    try:
        actual_numbers.append(int(un))
    except:
        pass
    
len(actual_numbers), #sorted(actual_numbers)

(156,)

In [7]:
# ??weird numbers??
for i in actual_numbers:
    if i not in range(153):
        print(i)

158
159


In [8]:
for i in range(153):
    # currently have 0-152
    if i not in sorted(actual_numbers):
        print(i)

In [9]:
df.numbers.value_counts()[:10]

      909
0     819
11    581
10    533
1     527
5     474
4     432
2     398
9     394
12    394
Name: numbers, dtype: int64

In [10]:
df.numbers.value_counts()[10:30]

3     353
8     332
15    326
16    316
6     287
14    260
7     248
18    229
20    184
19    180
13    170
17    167
21    113
23     93
22     83
28     80
36     78
29     76
56     76
25     73
Name: numbers, dtype: int64

In [11]:
df.numbers.value_counts()[30:]

24      69
59      69
35      68
34      65
38      64
        ..
i107     1
6b       1
i106     1
i64      1
i80      1
Name: numbers, Length: 201, dtype: int64

For `X`: Make list of lists, each holding an array (image) and its file path. `.flatten()` the arrays so they're 1D.

For `y`: Target values are found in the `numbers columns`.

After train/test splitting, split the file paths from the arrays (images) so we have an array of file paths and an array of arrays (images) for training and for testing (4 arrays total).

The arrays of file paths (`train_file_paths`, `test_file_paths`) are of no use to our model, and are only recorded so that we can examine particular instances (e.g. to see an incorrectly predicted image).

In [12]:
X = [[cv.imread(fp).flatten(), fp] if Image.open(fp).size==(38, 28) else [np.array(Image.open(fp).crop((0-3, 0, 28+7, 28))).flatten(), fp] for fp in df.file_path.values]
y = df.numbers.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# keep file paths 
train_file_paths = np.array([fp for img, fp in X_train])
test_file_paths = np.array([fp for img, fp in X_test])

X_train = np.array([img for img, fp in X_train])
X_test = np.array([img for img, fp in X_test])

#### Create & Train Model
And output an array of predictions (just to see what they look like).

In [13]:
knn = KNeighborsClassifier(n_neighbors=1, n_jobs=8)

In [14]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_jobs=8, n_neighbors=1)

In [15]:
knn.predict(X_test)

array(['130', '2', '39', ..., '29', '32', '87'], dtype=object)

#### Score Model

In [16]:
%%time
preds = knn.predict(X_test)

n_correct = np.sum(preds==y_test)
n_possible = len(y_test)

print(f'n_correct:  {n_correct}\nn_possible: {n_possible}\n% correct:  {n_correct/n_possible*100}%')

n_correct:  3501
n_possible: 3942
% correct:  88.81278538812786%
Wall time: 12.8 s


#### Current 
- 4796 rows, k=1, test_size=0.3, % correct: 93.12022237665045%

#### Previous Scores
- Week of 31 August 2020
    - 1740 rows, k=1, test_size=0.2, % correct: 89.65517241379311%
    - 2114 rows, k=1, test_size=0.2, % correct: 88.88888888888889%
    - 2114 rows, k=2, test_size=0.2, % correct: 88.65248226950354%
    - 2114 rows, k=3, test_size=100, % correct: 94.0% (one off, more range variation than above, higher highs, lower lows)
    - 2114 rows, k=1, test_size=100, % correct: 91.0% (consistent, some variation ranging 84-93%)
    - 2114 rows, k=1, test_size=0.3, % correct (7 runs avg): 88.008998875%

#### Goal Score (18 September 2020)
- n rows, k=k, test_size=test_size, % correct: > 94%

#### Goal Score (30 September 2020)
- n rows, k=k, test_size=test_size, % correct: > 98.1%

#### Goal Deployed Score (31 October 2020)
- n rows, k=k, test_size=live_feed, % correct: > 95.1%+

## What's wrong? Predicted v Actual
Incorrect predictions on the left, actual values (labels) on the right. (Assumes labels are correct.)

In [17]:
comp_df = pd.DataFrame()

comp_df['predicted'] = preds
comp_df['actual'] = y_test
comp_df['reference_file'] = test_file_paths

comp_df.loc[comp_df.predicted != comp_df.actual]

Unnamed: 0,predicted,actual,reference_file
9,2,7,media/digits_only/loop_07191k.jpg
21,135,105,media/digits_only/loop_05924pr.jpg
23,35,114,media/digits_only/loop_03507pr.jpg
24,63,62,media/digits_only/loop_05602pr.jpg
27,48,49,media/digits_only/loop_07210pr.jpg
...,...,...,...
3885,5,8,media/digits_only/loop_05949k.jpg
3891,,45,media/digits_only/loop_04640pr.jpg
3894,142,143,media/digits_only/loop_05860pr.jpg
3898,49,48,media/digits_only/loop_07252pr.jpg


What are the top 7 targets we are missing?

In [18]:
comp_df.loc[comp_df.predicted != comp_df.actual].actual.value_counts()[:7]

      32
6     20
9     15
12    10
90     7
16     7
80     7
Name: actual, dtype: int64

What are the top 7 targets we are hitting?

In [19]:
comp_df.loc[comp_df.predicted == comp_df.actual].actual.value_counts()[:7]

      255
0     242
11    171
10    166
1     150
4     142
2     126
Name: actual, dtype: int64

For each number, print
- number
- n incorrect predictions
- n correct predictions

Note: this displays numbers from most incorrectly predicted (total) to least incorrectly predicted, not necessairly the same as least accurate to most accurate (i.e. less representation means less opportunities for error).

**Goal**: > 95% accuracy on each possible target (outcome).

In [20]:
n_numbers_possible_correct = list(comp_df.loc[comp_df.predicted == comp_df.actual].actual.value_counts())
numbers_possible_correct = comp_df.loc[comp_df.predicted == comp_df.actual].actual.value_counts().index

n_numbers_possible_incorrect = list(comp_df.loc[comp_df.predicted != comp_df.actual].actual.value_counts())
numbers_possible_incorrect = comp_df.loc[comp_df.predicted != comp_df.actual].actual.value_counts().index

correct_dct = {}
for i in range(len(numbers_possible_correct)):
    correct_dct.update({numbers_possible_correct[i]:n_numbers_possible_correct[i]})

incorrect_dct = {}
for i in range(len(numbers_possible_incorrect)):
    incorrect_dct.update({numbers_possible_incorrect[i]:n_numbers_possible_incorrect[i]})
    
seen = []
correct_keys = [k for k in correct_dct.keys()]
incorrect_keys = [k for k in incorrect_dct.keys()]

accepted_error = 0.5 * 100
# accepted_error = 0.35 * 100  # 12 Sept
# accepted_error = 0.2 * 100  # 19 Sept
# accepted_error = 0.08 * 100  # 26 Sept
# accepted_error = 0.04999995 * 100  # 3 Oct
need_more = []

In [21]:
for i in incorrect_keys:
    if i not in seen:
        print(i)
        print(f'incorrect: {incorrect_dct[i]}')
        try:
            print(f'# correct: {correct_dct[i]}')
            error_per = float(str(incorrect_dct[i]/(incorrect_dct[i]+correct_dct[i])*100)[:7])
            print(f'per error: {error_per}%')
            if error_per > accepted_error:
                try:
                    need_more.append(int(i))
                except:
                    need_more.append(i)
        except:
            try:
                need_more.append(int(i))
            except:
                need_more.append(i)
        print()
        seen.append(i)

for i in correct_keys:
    if i not in seen:
        print(i)
        print(f'correct: {correct_dct[i]}')
        try:
            print(f'incorrect: {incorrect_dct[i]}')
            error_per = float(str(incorrect_dct[i]/(incorrect_dct[i]+correct_dct[i])*100)[:7])
            print(f'per error: {error_per}%')
            if error_per > accepted_error:
                try:
                    need_more.append(int(i))
                except:
                    need_more.append(i)
        except:
            pass
        print()
        seen.append(i)

need_more_digits = []
need_more_others = []
for target in need_more:
    try:
        need_more_digits.append(int(target))
    except:
        need_more_others.append(target)


incorrect: 32
# correct: 255
per error: 11.1498%

6
incorrect: 20
# correct: 71
per error: 21.978%

9
incorrect: 15
# correct: 118
per error: 11.2781%

12
incorrect: 10
# correct: 103
per error: 8.84955%

90
incorrect: 7
# correct: 3
per error: 70.0%

16
incorrect: 7
# correct: 86
per error: 7.52688%

80
incorrect: 7
# correct: 10
per error: 41.1764%

72
incorrect: 7
# correct: 8
per error: 46.6666%

18
incorrect: 6
# correct: 58
per error: 9.375%

78
incorrect: 6
# correct: 5
per error: 54.5454%

50
incorrect: 6
# correct: 9
per error: 40.0%

92
incorrect: 6
# correct: 5
per error: 54.5454%

59
incorrect: 6
# correct: 20
per error: 23.0769%

76
incorrect: 6
# correct: 5
per error: 54.5454%

20
incorrect: 5
# correct: 48
per error: 9.43396%

64
incorrect: 5
# correct: 10
per error: 33.3333%

69
incorrect: 5
# correct: 6
per error: 45.4545%

71
incorrect: 5
# correct: 7
per error: 41.6666%

84
incorrect: 5
# correct: 5
per error: 50.0%

10
incorrect: 5
# correct: 166
per error: 2.92397

In [22]:
len(need_more_digits)

20

In [23]:
sorted(need_more_digits)

[52,
 53,
 55,
 76,
 78,
 81,
 83,
 88,
 90,
 92,
 94,
 95,
 97,
 98,
 111,
 125,
 135,
 136,
 143,
 152]

In [24]:
sorted(need_more_others)

['6b',
 'b2',
 'i133',
 'i136',
 'i144',
 'i150',
 'i37',
 'i38',
 'i59',
 'i62',
 'i74',
 'i79',
 'i83',
 'i85']