# KNN Digits
Implementing a simple KNN to classify digits.

In [None]:
import cv2 as cv
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

## Data Prep

#### Load Reference Files and Labels

In [None]:
def _label_path_pairs(frame, label_column, path_column):
    """Return the non-null rows of two columns, renamed to ('numbers', 'file_path')."""
    pairs = frame[[label_column, path_column]].dropna()
    pairs.columns = ['numbers', 'file_path']
    return pairs


# 2,114 (total) n_kills (28, 28) crops
df_28x28 = pd.read_csv('numbers.csv')

# 11,024 (total) n_kills & n_pr (38, 28) crops
df_38x28 = pd.read_csv('digits_only_numbers.csv')

# 7,717 (each) n_kills & n_pr crops (38, 28) & 150 (total) n_tr crops (38, 28)
# This file keeps one label column and one file-path column per counter, so
# each (label, path) pair is pulled out separately and stacked into long form.
df_38x28_s = pd.read_csv('labeled_screenshots.csv')
n_teams_numbers = _label_path_pairs(df_38x28_s, 'n_teams_remaining', 'tr_reference_file')
n_players_numbers = _label_path_pairs(df_38x28_s, 'n_players_remaining', 'pr_reference_file')
n_kills_numbers = _label_path_pairs(df_38x28_s, 'n_kills', 'k_reference_file')
df_38x28_s = pd.concat([n_teams_numbers, n_players_numbers, n_kills_numbers])

# Unified dataframe of numbers (labels) & file paths, re-indexed from 0.
df = pd.concat([df_28x28, df_38x28, df_38x28_s], ignore_index=True)
df

#### Clean Up Labels

In [None]:
# Switches for the label clean-up cell below, listed in the order they are used.

# When True, digit-like labels are normalised (numeric strings become numbers,
# blurry variants become 'b', icon/error variants are removed).
fix_digits = True

# When True, every null-like marker is merged into a single class.
standard_nulls = True

# Largest label value kept; a falsy value disables the filter.
# Alternative setting: 33
max_val = 153

# Maximum number of rows kept per label; False keeps every row.
# Alternative setting: 500
max_label_sample = False

In [None]:
# Class that stands in for every "no readable number" label.
NULL_LABEL = 153
# Raw markers that are merged into NULL_LABEL when `standard_nulls` is on.
NULL_MARKERS = frozenset(['b', 'e', 'r', 'n', 'bb', 'ib', 'ibb', 'ie', 'nn', ''])


def _build_label_rules(max_number=152, scan_limit=160):
    """Build the lookup tables used to normalise raw string labels.

    Args:
        max_number: Highest number that is kept as a digit label.
        scan_limit: Numbers ``0 .. scan_limit - 1`` are considered when
            generating label spellings.

    Returns:
        A ``(relabel, dropped)`` tuple. ``relabel`` maps a raw string label to
        its replacement (an ``int``, or ``'b'`` for blurry crops). ``dropped``
        is the set of raw string labels whose rows are removed.
    """
    relabel = {'00': 0}
    dropped = set()
    for number in range(scan_limit):
        as_int = str(number)
        as_float = str(float(number))
        if number <= max_number:
            relabel[as_int] = number
            relabel[as_float] = number
            # relabel: partially blurry > blurry
            blurry = [f'b{as_int}', f'{as_int}b', f'{as_int}bb',
                      f'b{as_float}', f'{as_float}b', f'{as_float}bb']
            if number < 10:
                blurry.append(f'b0{number}')
            relabel.update(dict.fromkeys(blurry, 'b'))
            # remove all icon issue numbers
            dropped.update([
                f'i{as_int}', f'{as_int}i', f'i{as_float}', f'{as_float}i',
                f'i{as_float}b', f'b{as_float}i', f'b{as_int}i', f'i{as_int}b',
                f'ie{as_int}', f'ie{as_float}', f'i{as_int}e', f'i{as_float}e',
            ])
            # remove other error issue numbers
            dropped.update([f'e{as_int}', f'{as_int}e', f'e{as_float}', f'{as_float}e'])
        else:
            # remove any numbers over max_number
            dropped.update([
                as_int, f'i{as_int}', f'{as_int}i', f'i{as_float}', f'{as_float}i',
                f'b{as_int}', f'{as_int}b',
                f'e{as_int}', f'e{as_float}',
            ])
    return relabel, dropped


# fix digits
# Only string labels are touched; values that are already numeric pass through.
# Columns are replaced with `assign` rather than chained `.loc` assignment,
# which is unreliable on filtered frames.
if fix_digits:
    relabel, dropped = _build_label_rules()
    is_dropped = df['numbers'].map(
        lambda label: isinstance(label, str) and label in dropped
    ).astype(bool)
    df = df.loc[~is_dropped]
    df = df.assign(numbers=df['numbers'].map(
        lambda label: relabel.get(label, label) if isinstance(label, str) else label
    ))

# fix nulls (standardize): every null-like marker becomes NULL_LABEL
if standard_nulls:
    df = df.assign(numbers=df['numbers'].map(
        lambda label: NULL_LABEL if isinstance(label, str) and label in NULL_MARKERS else label
    ))

# keep only labels up to max_val
if max_val:
    # Coerce so that a leftover non-numeric label is dropped instead of making
    # the `<=` comparison raise TypeError on a mixed str/number column.
    numeric_labels = pd.to_numeric(df['numbers'], errors='coerce')
    unrecognised = df.loc[
        numeric_labels.isna() & df['numbers'].notna() & (df['numbers'] != ''),
        'numbers',
    ]
    if len(unrecognised):
        print(f'dropping {len(unrecognised)} rows with unrecognised labels: '
              f'{sorted(set(map(str, unrecognised)))}')
    df = df.loc[numeric_labels <= max_val]

# limit number of each label
if max_label_sample:
    for value in df.numbers.unique():
        is_value = df.numbers == value
        if is_value.sum() > max_label_sample:
            # keep a random subset of the over-represented label
            df = pd.concat([df.loc[~is_value], df.loc[is_value].sample(max_label_sample)])
        print(f'{value} | {len(df.loc[df.numbers == value])}')

# convert numbers column to float
try:
    df = df.assign(numbers=df.numbers.astype('float'))
except (ValueError, TypeError):
    # Non-numeric labels remain (e.g. clean-up switches turned off); keep the
    # column as it is and let the dtype printed below show that.
    pass
print(f'df.numbers.dtype == {df.numbers.dtype}')

# let's see how it looks
print(f'len(df) == {len(df)}')
df

#### What target values are in the dataset?

In [None]:
# Number of distinct labels, followed by the labels themselves.
unique_labels = df['numbers'].unique()
len(unique_labels), unique_labels

In [None]:
# Collect the distinct labels that can be read as integers.
actual_numbers = []
for un in df.numbers.unique():
    try:
        actual_numbers.append(int(un))
    except (ValueError, TypeError, OverflowError):
        # NaN, infinities and non-numeric labels are not counted; any other
        # error is unexpected and is allowed to surface.
        continue

len(actual_numbers)#, sorted(actual_numbers)

In [None]:
# Rows per label, most frequent first.
df['numbers'].value_counts()

### Load in Data (Images)
For `X`: Make list of lists, each holding an array (image) and its file path. `.flatten()` the arrays so they're 1D.

For `y`: Target values are found in the `numbers` column.

In [None]:
%%time
def _load_flat_image(file_path, size=(38, 28), crop_box=(-3, 0, 35, 28)):
    """Load one crop as a flat pixel array in OpenCV's BGR channel order.

    Args:
        file_path: Path of the image file.
        size: ``(width, height)`` of crops that are used as they are.
        crop_box: PIL crop box applied to every other crop. The default gives
            a 38 x 28 result from a 28 x 28 crop by padding 3 px on the left
            and 7 px on the right.

    Returns:
        A 1-D numpy array of pixel values.
    """
    # The context manager closes the file handle; the image is opened once
    # here instead of once for the size check and again for the crop.
    with Image.open(file_path) as img:
        if img.size != size:
            # PIL gives RGB while cv.imread gives BGR. Convert and reverse the
            # channel axis so both kinds of crop share one feature layout
            # (and always have three channels).
            padded = np.array(img.convert('RGB').crop(crop_box))
            return padded[:, :, ::-1].flatten()
    return cv.imread(file_path).flatten()


# Each entry pairs the flattened image with its path so the path survives the
# train/test split.
X = [[_load_flat_image(fp), fp] for fp in df.file_path.values]
y = df.numbers.values

#### Train / Test Split
After train/test splitting, split the file paths from the arrays (images) so we have an array of file paths and an array of arrays (images) for training and for testing (4 arrays total).

The arrays of file paths (`train_file_paths`, `test_file_paths`) are of no use to our model, and are only recorded so that we can examine particular instances (e.g. to see an incorrectly predicted image).

In [None]:
def _unzip_pairs(pairs):
    """Split a list of [image, file_path] pairs into two numpy arrays."""
    images, file_paths = zip(*pairs)
    return np.array(images), np.array(file_paths)


# 70/30 random split of the [image, file_path] pairs and their labels.
train_pairs, test_pairs, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Separate the images (model input) from the file paths (kept for inspection).
X_train, train_file_paths = _unzip_pairs(train_pairs)
X_test, test_file_paths = _unzip_pairs(test_pairs)

In [None]:
# Display the training labels produced by the split.
y_train#, pd.DataFrame(y_train).value_counts()

## Create & Train Model
And output an array of predictions (just to see what they look like).

In [None]:
# Nearest-neighbour classifier: each prediction copies the label of the single
# closest training image.
knn = KNeighborsClassifier(
    n_neighbors=1,
    n_jobs=8,  # parallel jobs for the neighbour search
)

In [None]:
# Fit the classifier on the flattened training images and their labels.
knn.fit(X_train, y_train)

In [None]:
#### Make Predictions

In [None]:
%%time
# Predict a label for every held-out image.
preds = knn.predict(X_test)

In [None]:
# Display the predicted labels.
preds

#### Score Model

In [None]:
# Accuracy on the held-out set: share of predictions equal to the label.
is_match = preds == y_test
n_correct = is_match.sum()
n_possible = len(y_test)
pct_correct = n_correct / n_possible * 100

print(f'n_correct:  {n_correct}\nn_possible: {n_possible}\n% correct:  {pct_correct}%')

#### Current 
- 4796 rows, k=1, test_size=0.3, % correct: 93.12022237665045%

#### Previous Scores
- Week of 31 August 2020
    - 1740 rows, k=1, test_size=0.2, % correct: 89.65517241379311%
    - 2114 rows, k=1, test_size=0.2, % correct: 88.88888888888889%
    - 2114 rows, k=2, test_size=0.2, % correct: 88.65248226950354%
    - 2114 rows, k=3, test_size=100, % correct: 94.0% (one off, more range variation than above, higher highs, lower lows)
    - 2114 rows, k=1, test_size=100, % correct: 91.0% (consistent, some variation ranging 84-93%)
    - 2114 rows, k=1, test_size=0.3, % correct (7 runs avg): 88.008998875%

#### Goal Score (18 September 2020)
- n rows, k=k, test_size=test_size, % correct: > 94%

#### Goal Score (30 September 2020)
- n rows, k=k, test_size=test_size, % correct: > 98.1%

#### Goal Deployed Score (31 October 2020)
- n rows, k=k, test_size=live_feed, % correct: > 95.1%+

## What's wrong? Predicted v Actual
Incorrect predictions on the left, actual values (labels) on the right. (Assumes labels are correct.)

In [None]:
# One row per test image: predicted label, true label, and the source file.
comp_df = pd.DataFrame({
    'predicted': preds,
    'actual': y_test,
    'reference_file': test_file_paths,
})

# Show only the rows the model got wrong.
comp_df[comp_df['predicted'] != comp_df['actual']]

What are the top 7 targets we are missing?

In [None]:
# The seven labels with the most incorrect predictions.
# `.head(7)` is positional. `[:7]` on the float-valued label index is a
# label-based slice, which does not select the first seven rows.
comp_df.loc[comp_df.predicted != comp_df.actual].actual.value_counts().head(7)

What are the top 7 targets we are hitting?

In [None]:
# The seven labels with the most correct predictions.
# `.head(7)` is positional. `[:7]` on the float-valued label index is a
# label-based slice, which does not select the first seven rows.
comp_df.loc[comp_df.predicted == comp_df.actual].actual.value_counts().head(7)

For each number, print
- number
- n incorrect predictions
- n correct predictions

Note: this displays numbers from most incorrectly predicted (total) to least incorrectly predicted, which is not necessarily the same as least accurate to most accurate (i.e. less representation means fewer opportunities for error).

**Goal**: > 95% accuracy on each possible target (outcome).

In [None]:
# Per-label counts of correct and incorrect predictions. Each value_counts()
# is computed once and reused.
correct_counts = comp_df.loc[comp_df.predicted == comp_df.actual].actual.value_counts()
incorrect_counts = comp_df.loc[comp_df.predicted != comp_df.actual].actual.value_counts()

n_numbers_possible_correct = list(correct_counts)
numbers_possible_correct = correct_counts.index

n_numbers_possible_incorrect = list(incorrect_counts)
numbers_possible_incorrect = incorrect_counts.index

# label -> number of correct / incorrect predictions for that label
correct_dct = dict(zip(numbers_possible_correct, n_numbers_possible_correct))
incorrect_dct = dict(zip(numbers_possible_incorrect, n_numbers_possible_incorrect))

# Labels already reported by the cell below.
seen = []
correct_keys = list(correct_dct)
incorrect_keys = list(incorrect_dct)

# Largest per-label error rate (in percent) that is tolerated before the label
# is added to `need_more`.
# Settings tried: 0.5 -> 0.4 (12 Sept, 0.35 noted) -> 0.2 (19 Sept)
#                 -> 0.08 (26 Sept) -> 0.04999995 (3 Oct)
accepted_error = 0.4 * 100  # 12 Sept (0.35)
need_more = []

In [None]:
def _as_int_if_possible(label):
    """Return ``int(label)`` when the label is numeric, else the label unchanged."""
    try:
        return int(label)
    except (ValueError, TypeError, OverflowError):
        return label


# Labels with at least one incorrect prediction, most-missed first.
for label in incorrect_keys:
    if label in seen:
        continue
    misses = incorrect_dct[label]
    print(label)
    print(f'incorrect: {misses}')
    if label in correct_dct:
        hits = correct_dct[label]
        print(f'# correct: {hits}')
        # Keep the first seven characters of the percentage so the value that
        # is printed and compared stays the same as before.
        error_per = float(str(misses / (misses + hits) * 100)[:7])
        print(f'per error: {error_per}%')
        too_many_errors = error_per > accepted_error
    else:
        # Never predicted correctly, so it always needs more data.
        too_many_errors = True
    if too_many_errors:
        need_more.append(_as_int_if_possible(label))
    print()
    seen.append(label)

# Labels not reported above. Every label with an incorrect prediction is in
# `seen` by now, so these have no incorrect predictions and no error rate.
for label in correct_keys:
    if label in seen:
        continue
    print(label)
    print(f'correct: {correct_dct[label]}')
    print()
    seen.append(label)

# Split the flagged labels into numeric ones and everything else.
need_more_digits = []
need_more_others = []
for target in need_more:
    try:
        need_more_digits.append(int(target))
    except (ValueError, TypeError, OverflowError):
        need_more_others.append(target)

print(f'len(need_more_digits) == {len(need_more_digits)}')
print(f'len(need_more_others) == {len(need_more_others)}')

In [None]:
# Numeric labels flagged as needing more data, in ascending order.
sorted(need_more_digits)

In [None]:
# Non-numeric labels flagged as needing more data, sorted.
sorted(need_more_others)