# KNN Digits
Implementing a simple KNN to classify digits.

In [1]:
import cv2 as cv
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

## Data Prep

#### Load Reference Files and Labels

In [2]:
# 2,114 (total) n_kills (28, 28) crops
df_28x28 = pd.read_csv('numbers.csv')

# 11,024 (total) n_kills & n_pr (38, 28) crops
df_38x28 = pd.read_csv('digits_only_numbers.csv')

# 7,717 (each) n_kills & n_pr crops (38, 28) & 150 (total) n_tr crops (38, 28)
df_38x28_s = pd.read_csv('labeled_screenshots.csv')
n_teams_numbers = df_38x28_s[['n_teams_remaining', 'tr_reference_file']].dropna()
n_players_numbers = df_38x28_s[['n_players_remaining', 'pr_reference_file']].dropna()
n_kills_numbers = df_38x28_s[['n_kills', 'k_reference_file']].dropna()
for numbers_group in [n_teams_numbers, n_players_numbers, n_kills_numbers]:
    numbers_group.columns = ['numbers', 'file_path']
df_38x28_s = pd.concat([n_teams_numbers, n_players_numbers, n_kills_numbers])

# combine into unified dataframe of numbers (labels) & file paths
df = pd.concat([df_28x28, df_38x28, df_38x28_s], ignore_index=True)
df

Unnamed: 0,numbers,file_path
0,0,media/kills_only/loop_00000.jpg
1,n,media/kills_only/loop_00001.jpg
2,2,media/kills_only/loop_00002.jpg
3,2,media/kills_only/loop_00003.jpg
4,2,media/kills_only/loop_00004.jpg
...,...,...
28717,5,media/stable_numbers/number_crops/loop_0007719...
28718,5,media/stable_numbers/number_crops/loop_0007720...
28719,5,media/stable_numbers/number_crops/loop_0007721...
28720,5,media/stable_numbers/number_crops/loop_0007722...


#### Clean Up Labels

In [3]:
max_val = 153
# max_val = 33

max_label_sample = False
# max_label_sample = 500

standard_nulls = True

fix_digits = True

In [4]:
# fix digits
if fix_digits:
    df.numbers.loc[df.numbers == '00'] = 0
    for _ in range(160):
        if _ <= 152:
            df.numbers.loc[df.numbers == f'{_}'] = _
            df.numbers.loc[df.numbers == f'{float(_)}'] = _
            # relabel: partially blurry > blurry
            for e in [f'b{_}', f'{_}b', f'{_}bb', f'b{float(_)}', f'{float(_)}b' f'{float(_)}bb']:
                df.numbers.loc[df.numbers == e] = 'b'
                if _ < 10:
                    df.numbers.loc[df.numbers == f'b0{_}'] = 'b'
            # remove all icon issue numbers
            for e in [f'i{int(_)}', f'{int(_)}i', f'i{float(_)}', f'{float(_)}i',
                      f'i{float(_)}b', f'b{float(_)}i', f'b{int(_)}i', f'i{int(_)}b', f'ie{int(_)}', f'ie{float(_)}',
                      f'i{int(_)}e', f'i{float(_)}e']:
                df = df.loc[df.numbers != e]
            # remove other error issue numbers
            for e in [f'e{_}', f'{_}e', f'e{float(_)}', f'{float(_)}e']:
                df = df.loc[df.numbers != e]
        else:
            # remove any numbers over 152
            for e in [f'{int(_)}', f'i{int(_)}', f'{int(_)}i', f'i{float(_)}', f'{float(_)}i', 
                      f'b{int(_)}', f'{int(_)}b',
                      f'e{int(_)}', f'e{float(_)}']:
                df = df.loc[df.numbers != e]

# fix nulls (standardize)
if standard_nulls:
    df.numbers.loc[df.numbers == 'b'] = ''
    df.numbers.loc[df.numbers == 'e'] = ''
    df.numbers.loc[df.numbers == 'r'] = ''
    df.numbers.loc[df.numbers == 'n'] = ''
    df.numbers.loc[df.numbers == 'bb'] = ''
    df.numbers.loc[df.numbers == 'ib'] = ''
    df.numbers.loc[df.numbers == 'ibb'] = ''
    df.numbers.loc[df.numbers == 'ie'] = ''
    df.numbers.loc[df.numbers == 'nn'] = ''
    df.numbers.loc[df.numbers == ''] = 153

# 0-9 only
if max_val:
    df = df.loc[df.numbers != '']
    df = df.loc[df.numbers <= max_val]
    
# limit number of each label
if max_label_sample:
    for value in df.numbers.unique():
        c = len(df.loc[df.numbers==value])
        if c > max_label_sample:
            temp_df = df.loc[df.numbers == value].sample(max_label_sample)
            df = df.loc[df.numbers != value]
            df = pd.concat([df, temp_df])
        print(f'{value} | {len(df.loc[df.numbers==value])}')

# convert numbers column to float
try:
    df.numbers = df.numbers.astype('float')
    print(f'df.numbers.dtype == {df.numbers.dtype}')
except:
    print(f'df.numbers.dtype == {df.numbers.dtype}')

# let's see how it looks
print(f'len(df) == {len(df)}')
df

df.numbers.dtype == float64
len(df) == 28115


Unnamed: 0,numbers,file_path
0,0.0,media/kills_only/loop_00000.jpg
1,153.0,media/kills_only/loop_00001.jpg
2,2.0,media/kills_only/loop_00002.jpg
3,2.0,media/kills_only/loop_00003.jpg
4,2.0,media/kills_only/loop_00004.jpg
...,...,...
28717,5.0,media/stable_numbers/number_crops/loop_0007719...
28718,5.0,media/stable_numbers/number_crops/loop_0007720...
28719,5.0,media/stable_numbers/number_crops/loop_0007721...
28720,5.0,media/stable_numbers/number_crops/loop_0007722...


#### What target values are in the dataset?

In [5]:
len(df.numbers.unique()), df.numbers.unique()

(154,
 array([  0., 153.,   2.,   3.,   4.,   1.,  21.,   5.,   6.,   8.,   7.,
          9.,  10.,  11.,  12.,  14.,  15.,  16.,  13.,  17.,  18.,  19.,
         24.,  27.,  20.,  22.,  23.,  25.,  26.,  61.,  59.,  60.,  56.,
         50.,  49.,  47.,  39.,  37.,  36.,  35.,  33.,  32.,  29.,  28.,
         30., 142., 139., 138., 133., 132., 130., 129., 128., 126., 122.,
        121., 120., 119., 118., 117., 116., 115., 114., 113., 112., 110.,
        106., 105., 104., 102., 101., 100.,  93.,  89.,  86.,  79.,  76.,
         73.,  71.,  65.,  64.,  55.,  45.,  43.,  42.,  40.,  38.,  34.,
         75.,  58.,  57.,  54.,  51.,  48.,  46.,  44.,  41.,  83.,  81.,
         80.,  74.,  72.,  68., 123.,  96.,  95.,  94.,  91.,  88.,  87.,
         85.,  84.,  82.,  78.,  77.,  67.,  66.,  63.,  62.,  53.,  52.,
        150., 148., 147.,  92.,  90., 144., 143., 140., 136., 135., 131.,
        127., 111., 109.,  97.,  70.,  69., 134., 125., 124., 108., 107.,
        103., 146., 145., 141., 

In [6]:
actual_numbers = []
for un in df.numbers.unique():
    try:
        actual_numbers.append(int(un))
    except:
        pass
    
len(actual_numbers)#, sorted(actual_numbers)

154

In [7]:
df.numbers.value_counts()

153.0    2104
4.0      1342
3.0      1067
0.0      1029
1.0      1020
         ... 
98.0       15
94.0       15
97.0       11
147.0       6
152.0       1
Name: numbers, Length: 154, dtype: int64

### Load in Data (Images)
For `X`: Make list of lists, each holding an array (image) and its file path. `.flatten()` the arrays so they're 1D.

For `y`: Target values are found in the `numbers columns`.

In [8]:
%%time
X = [[cv.imread(fp).flatten(), fp] if Image.open(fp).size==(38, 28) else [np.array(Image.open(fp).crop((0-3, 0, 28+7, 28))).flatten(), fp] for fp in df.file_path.values]
y = df.numbers.values

Wall time: 9.02 s


#### Train / Test Split
After train/test splitting, split the file paths from the arrays (images) so we have an array of file paths and an array of arrays (images) for training and for testing (4 arrays total).

The arrays of file paths (`train_file_paths`, `test_file_paths`) are of no use to our model, and are only recorded so that we can examine particular instances (e.g. to see an incorrectly predicted image).

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# keep file paths 
train_file_paths = np.array([fp for img, fp in X_train])
test_file_paths = np.array([fp for img, fp in X_test])

X_train = np.array([img for img, fp in X_train])
X_test = np.array([img for img, fp in X_test])

In [10]:
y_train#, pd.DataFrame(y_train).value_counts()

array([108., 132.,   5., ...,   4.,  78.,  20.])

## Create & Train Model
And output an array of predictions (just to see what they look like).

In [11]:
knn = KNeighborsClassifier(n_neighbors=1, n_jobs=8)

In [12]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_jobs=8, n_neighbors=1)

In [13]:
#### Make Predictions

In [14]:
%%time
preds = knn.predict(X_test)

Wall time: 57 s


In [15]:
preds

array([25., 10., 20., ...,  3., 16., 16.])

#### Score Model

In [16]:
n_correct = np.sum(preds==y_test)
n_possible = len(y_test)

print(f'n_correct:  {n_correct}\nn_possible: {n_possible}\n% correct:  {n_correct/n_possible*100}%')

n_correct:  7129
n_possible: 8435
% correct:  84.51689389448725%


#### Current 
- 4796 rows, k=1, test_size=0.3, % correct: 93.12022237665045%

#### Previous Scores
- Week of 31 August 2020
    - 1740 rows, k=1, test_size=0.2, % correct: 89.65517241379311%
    - 2114 rows, k=1, test_size=0.2, % correct: 88.88888888888889%
    - 2114 rows, k=2, test_size=0.2, % correct: 88.65248226950354%
    - 2114 rows, k=3, test_size=100, % correct: 94.0% (one off, more range variation than above, higher highs, lower lows)
    - 2114 rows, k=1, test_size=100, % correct: 91.0% (consistent, some variation ranging 84-93%)
    - 2114 rows, k=1, test_size=0.3, % correct (7 runs avg): 88.008998875%

#### Goal Score (18 September 2020)
- n rows, k=k, test_size=test_size, % correct: > 94%

#### Goal Score (30 September 2020)
- n rows, k=k, test_size=test_size, % correct: > 98.1%

#### Goal Deployed Score (31 October 2020)
- n rows, k=k, test_size=live_feed, % correct: > 95.1%+

## What's wrong? Predicted v Actual
Incorrect predictions on the left, actual values (labels) on the right. (Assumes labels are correct.)

In [17]:
comp_df = pd.DataFrame()

comp_df['predicted'] = preds
comp_df['actual'] = y_test
comp_df['reference_file'] = test_file_paths

comp_df.loc[comp_df.predicted != comp_df.actual]

Unnamed: 0,predicted,actual,reference_file
1,10.0,13.0,media/stable_numbers/number_crops/loop_0001681...
6,115.0,116.0,media/digits_only/loop_07106pr.jpg
11,18.0,10.0,media/stable_numbers/number_crops/loop_0000012...
20,133.0,136.0,media/stable_numbers/number_crops/loop_0000160...
35,20.0,28.0,media/stable_numbers/number_crops/loop_0002315...
...,...,...,...
8415,49.0,16.0,media/stable_numbers/number_crops/loop_0003765...
8426,61.0,6.0,media/digits_only/loop_01718pr.jpg
8428,87.0,37.0,media/digits_only/loop_06527pr.jpg
8430,8.0,153.0,media/stable_numbers/number_crops/loop_0007026...


What are the top 7 targets we are missing?

In [18]:
comp_df.loc[comp_df.predicted != comp_df.actual].actual.value_counts()[:7]

153.0    103
20.0      46
4.0       40
18.0      38
9.0       35
28.0      35
3.0       35
16.0      34
23.0      34
17.0      33
5.0       30
7.0       28
Name: actual, dtype: int64

What are the top 7 targets we are hitting?

In [19]:
comp_df.loc[comp_df.predicted == comp_df.actual].actual.value_counts()[:7]

153.0    526
4.0      379
1.0      304
3.0      292
0.0      288
11.0     274
10.0     237
16.0     218
2.0      214
20.0     199
18.0     192
5.0      185
8.0      175
7.0      170
Name: actual, dtype: int64

For each number, print
- number
- n incorrect predictions
- n correct predictions

Note: this displays numbers from most incorrectly predicted (total) to least incorrectly predicted, not necessairly the same as least accurate to most accurate (i.e. less representation means less opportunities for error).

**Goal**: > 95% accuracy on each possible target (outcome).

In [20]:
n_numbers_possible_correct = list(comp_df.loc[comp_df.predicted == comp_df.actual].actual.value_counts())
numbers_possible_correct = comp_df.loc[comp_df.predicted == comp_df.actual].actual.value_counts().index

n_numbers_possible_incorrect = list(comp_df.loc[comp_df.predicted != comp_df.actual].actual.value_counts())
numbers_possible_incorrect = comp_df.loc[comp_df.predicted != comp_df.actual].actual.value_counts().index

correct_dct = {}
for i in range(len(numbers_possible_correct)):
    correct_dct.update({numbers_possible_correct[i]:n_numbers_possible_correct[i]})

incorrect_dct = {}
for i in range(len(numbers_possible_incorrect)):
    incorrect_dct.update({numbers_possible_incorrect[i]:n_numbers_possible_incorrect[i]})
    
seen = []
correct_keys = [k for k in correct_dct.keys()]
incorrect_keys = [k for k in incorrect_dct.keys()]

# accepted_error = 0.5 * 100
accepted_error = 0.4 * 100  # 12 Sept (0.35)
# accepted_error = 0.2 * 100  # 19 Sept
# accepted_error = 0.08 * 100  # 26 Sept
# accepted_error = 0.04999995 * 100  # 3 Oct
need_more = []

In [21]:
for i in incorrect_keys:
    if i not in seen:
        print(i)
        print(f'incorrect: {incorrect_dct[i]}')
        try:
            print(f'# correct: {correct_dct[i]}')
            error_per = float(str(incorrect_dct[i]/(incorrect_dct[i]+correct_dct[i])*100)[:7])
            print(f'per error: {error_per}%')
            if error_per > accepted_error:
                try:
                    need_more.append(int(i))
                except:
                    need_more.append(i)
        except:
            try:
                need_more.append(int(i))
            except:
                need_more.append(i)
        print()
        seen.append(i)

for i in correct_keys:
    if i not in seen:
        print(i)
        print(f'correct: {correct_dct[i]}')
        try:
            print(f'incorrect: {incorrect_dct[i]}')
            error_per = float(str(incorrect_dct[i]/(incorrect_dct[i]+correct_dct[i])*100)[:7])
            print(f'per error: {error_per}%')
            if error_per > accepted_error:
                try:
                    need_more.append(int(i))
                except:
                    need_more.append(i)
        except:
            pass
        print()
        seen.append(i)

need_more_digits = []
need_more_others = []
for target in need_more:
    try:
        need_more_digits.append(int(target))
    except:
        need_more_others.append(target)
        
print(f'len(need_more_digits) == {len(need_more_digits)}')
print(f'len(need_more_others) == {len(need_more_others)}')

153.0
incorrect: 103
# correct: 526
per error: 16.3751%

20.0
incorrect: 46
# correct: 199
per error: 18.7755%

4.0
incorrect: 40
# correct: 379
per error: 9.54653%

18.0
incorrect: 38
# correct: 192
per error: 16.5217%

9.0
incorrect: 35
# correct: 151
per error: 18.8172%

28.0
incorrect: 35
# correct: 48
per error: 42.1686%

3.0
incorrect: 35
# correct: 292
per error: 10.7033%

16.0
incorrect: 34
# correct: 218
per error: 13.492%

23.0
incorrect: 34
# correct: 64
per error: 34.6938%

17.0
incorrect: 33
# correct: 167
per error: 16.5%

5.0
incorrect: 30
# correct: 185
per error: 13.9534%

7.0
incorrect: 28
# correct: 170
per error: 14.1414%

21.0
incorrect: 26
# correct: 57
per error: 31.3253%

19.0
incorrect: 25
# correct: 153
per error: 14.0449%

8.0
incorrect: 25
# correct: 175
per error: 12.5%

22.0
incorrect: 24
# correct: 130
per error: 15.5844%

11.0
incorrect: 24
# correct: 274
per error: 8.05369%

12.0
incorrect: 22
# correct: 144
per error: 13.253%

10.0
incorrect: 21
# corr

In [22]:
sorted(need_more_digits)

[28, 52, 55, 56, 78, 79, 82, 83, 88, 90, 94, 97, 98, 102, 108, 127, 147, 152]

In [23]:
sorted(need_more_others)

[]