Help: https://youtu.be/1gQR24B3ISE

#### Imports

In [None]:
import torch
# import torchvision

import pandas as pd

from tqdm import tqdm

import cv2 as cv
import numpy as np
from PIL import Image

from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## ETL Data (Images)

In [None]:
%%time
def _label_path_pairs(frame, label_column, path_column):
    """Return the non-null (label, file path) rows of `frame` as numbers/file_path columns."""
    pairs = frame[[label_column, path_column]].dropna()
    pairs.columns = ['numbers', 'file_path']
    return pairs


# 2,114 (total) n_kills (28, 28) crops
df_28x28 = pd.read_csv('numbers.csv')

# 11,024 (total) n_kills & n_pr (38, 28) crops
df_38x28 = pd.read_csv('digits_only_numbers.csv')

# 7,717 (each) n_kills & n_pr crops (38, 28) & 150 (total) n_tr crops (38, 28)
# this file keeps one label column and one reference-file column per counter,
# so each pair is pulled out separately and stacked into the shared layout
df_38x28_s = pd.read_csv('labeled_screenshots.csv')
n_teams_numbers = _label_path_pairs(df_38x28_s, 'n_teams_remaining', 'tr_reference_file')
n_players_numbers = _label_path_pairs(df_38x28_s, 'n_players_remaining', 'pr_reference_file')
n_kills_numbers = _label_path_pairs(df_38x28_s, 'n_kills', 'k_reference_file')
df_38x28_s = pd.concat([n_teams_numbers, n_players_numbers, n_kills_numbers])

# combine into unified dataframe of numbers (labels) & file paths
df = pd.concat([df_28x28, df_38x28, df_38x28_s], ignore_index=True)

# Largest numeric label kept for training; the network gets max_val + 1 outputs.
# max_val = 152
max_val = 33

# Upper bound on the number of samples kept per label (falsy disables the cap).
max_label_sample = 500
# max_label_sample = 200

# When True, every "unreadable" marker label is collapsed into the single class 153.
standard_nulls = True

# When True, textual number labels are normalised to ints and flagged labels are handled.
fix_digits = True


def _relabel(frame, old_labels, new_label):
    """Overwrite `numbers` with `new_label` on every row whose label is in `old_labels`.

    Uses a single `.loc` assignment on the frame itself; the chained form
    `frame.numbers.loc[mask] = value` may write to a temporary copy and is a
    no-op under pandas copy-on-write.
    """
    frame.loc[frame.numbers.isin(old_labels), 'numbers'] = new_label


def _drop_labels(frame, labels):
    """Return `frame` without the rows whose label is in `labels`."""
    return frame.loc[~frame.numbers.isin(labels)]


# fix digits
if fix_digits:
    _relabel(df, ['00'], 0)
    for n in range(160):
        as_int = f'{n}'
        as_float = f'{float(n)}'
        if n <= 152:
            # plain numbers, whether written as "7" or "7.0", become the int 7
            _relabel(df, [as_int, as_float], n)
            # relabel: partially blurry > blurry
            blurry = [f'b{as_int}', f'{as_int}b', f'{as_int}bb',
                      f'b{as_float}', f'{as_float}b', f'{as_float}bb']
            if n < 10:
                # zero-padded single digits, e.g. "b07"
                blurry.append(f'b0{n}')
            _relabel(df, blurry, 'b')
            # remove all icon issue numbers
            df = _drop_labels(df, [
                f'i{as_int}', f'{as_int}i', f'i{as_float}', f'{as_float}i',
                f'i{as_float}b', f'b{as_float}i', f'b{as_int}i', f'i{as_int}b',
                f'ie{as_int}', f'ie{as_float}', f'i{as_int}e', f'i{as_float}e',
            ])
            # remove other error issue numbers
            df = _drop_labels(df, [f'e{as_int}', f'{as_int}e', f'e{as_float}', f'{as_float}e'])
        else:
            # remove any numbers over 152
            df = _drop_labels(df, [
                as_int, f'i{as_int}', f'{as_int}i', f'i{as_float}', f'{as_float}i',
                f'b{as_int}', f'{as_int}b',
                f'e{as_int}', f'e{as_float}',
            ])

# fix nulls (standardize): every marker label, and the empty label, becomes class 153
if standard_nulls:
    _relabel(df, ['b', 'e', 'r', 'n', 'bb', 'ib', 'ibb', 'ie', 'nn', ''], 153)

# keep only labels 0..max_val (this also drops the 153 null class when max_val < 153)
# NOTE(review): the <= comparison raises TypeError if any unrecognised string label
# is still present at this point - confirm the label vocabulary above is complete.
if max_val:
    df = df.loc[df.numbers != '']
    df = df.loc[df.numbers <= max_val]

# limit number of each label: over-represented labels are randomly down-sampled
if max_label_sample:
    for value in df.numbers.unique():
        is_value = df.numbers == value
        if is_value.sum() > max_label_sample:
            kept = df.loc[is_value].sample(max_label_sample)
            df = pd.concat([df.loc[~is_value], kept])
        print(f'{value} | {len(df.loc[df.numbers==value])}')

file_paths = df.file_path.values

# Crop box (left, upper, right, lower) per source size (width, height).
# Both boxes reach outside the source image, so the crop comes out as a
# 38x38 canvas for either source size before it is resized.
CROP_BOXES = {
    (38, 28): (0, 0-5, 38, 28+5),
    (28, 28): (0-3, 0-5, 28+7, 28+5),
}

# load & resize image arrays
image_arrays = []
for path in file_paths:
    # only the size is needed from PIL; the context manager closes the file
    # handle right away instead of leaking one handle per image
    with Image.open(path) as probe:
        base_size = probe.size
    if base_size not in CROP_BOXES:
        raise Exception(f'\nerror: unknown size | {base_size}')
    img = cv.imread(path, cv.IMREAD_GRAYSCALE)
    img = Image.fromarray(img)
    img = img.crop(CROP_BOXES[base_size])
    img = np.array(img)
    img = cv.resize(img, (50, 50))
    image_arrays.append(img)

def _one_hot(label):
    """Return row `label` of a (max_val + 1)-sized identity matrix."""
    return np.eye(max_val+1)[label]


# image labels
y = df.numbers.values

# pair every loaded image with its file path so both survive the shuffle together
X = [[img, path] for img, path in zip(image_arrays, file_paths)]

# shuffle and split data into training & testing sets (sacrificing 10%)
train_pairs, test_pairs, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True)

# separate the file paths from the image arrays again
train_file_paths = [pair[1] for pair in train_pairs]
test_file_paths = [pair[1] for pair in test_pairs]
X_train = [pair[0] for pair in train_pairs]
X_test = [pair[0] for pair in test_pairs]

# final samples: [image, one-hot label]
train = [[img, _one_hot(label)] for img, label in zip(X_train, y_train)]
test = [[img, _one_hot(label)] for img, label in zip(X_test, y_test)]

#### Save Train/Test Data

In [None]:
def _as_object_array(samples):
    """Pack [image, one_hot_label] pairs into an (N, 2) object array.

    The two members of a pair have different shapes, so the list is ragged.
    Recent NumPy versions refuse to convert a ragged list implicitly, so the
    object array is built explicitly and filled cell by cell.
    """
    packed = np.empty((len(samples), 2), dtype=object)
    for row, (img, label) in enumerate(samples):
        packed[row, 0] = img
        packed[row, 1] = label
    return packed


# object arrays are pickled by np.save; they must be loaded with allow_pickle=True
np.save('training_data.npy', _as_object_array(train))
np.save('testing_data.npy', _as_object_array(test))

#### Load in Train Data

In [None]:
# Reload the saved training samples (presumably an object array of
# [image, one-hot label] pairs - it is indexed that way further down).
# NOTE(review): allow_pickle=True unpickles the file contents, which can run
# arbitrary code - only load .npy files produced by this notebook.
training_data = np.load('training_data.npy', allow_pickle=True)

# number of loaded samples (displayed as the cell output)
len(training_data)

## Create Model

In [None]:
class Net(nn.Module):
    """Small CNN classifier for 50x50 single-channel image crops.

    Three 5x5 convolutions (32, 64 and 128 channels), each followed by ReLU
    and 2x2 max pooling, feed a 512-unit hidden layer and an output layer
    with one unit per class.

    Args:
        n_classes: number of output classes. When omitted it falls back to
            ``max_val + 1`` using the module-level ``max_val``, which is the
            width the original hard-coded.
    """

    def __init__(self, n_classes=None):
        super().__init__()  # initialise nn.Module bookkeeping
        if n_classes is None:
            n_classes = max_val + 1

        # 1 input channel (grayscale) -> 32 -> 64 -> 128 feature maps, 5x5 kernels
        self.conv1 = nn.Conv2d(1, 32, 5)
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.conv3 = nn.Conv2d(64, 128, 5)

        # Push one dummy 50x50 image through the conv stack to discover the
        # flattened feature size. No gradients are needed for this probe.
        self._to_linear = None
        with torch.no_grad():
            self.convs(torch.randn(50, 50).view(-1, 1, 50, 50))

        self.fc1 = nn.Linear(self._to_linear, 512)  # flattened features -> hidden
        self.fc2 = nn.Linear(512, n_classes)        # hidden -> one unit per class

    def convs(self, x):
        """Run the convolutional stack on a (batch, 1, 50, 50) tensor."""
        # each stage: convolution -> ReLU -> max pooling over 2x2
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))

        # first call only: remember how many features one sample flattens to
        if self._to_linear is None:
            self._to_linear = x[0].numel()
        return x

    def forward(self, x):
        """Return per-class probabilities, shape (batch, n_classes).

        The output is already softmax-normalised; this file trains it with
        MSELoss against one-hot targets.
        """
        x = self.convs(x)
        x = x.view(-1, self._to_linear)  # flatten each sample for the dense layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)  # output layer: no ReLU, softmax is applied below
        return F.softmax(x, dim=1)


# Build the network and print its layer summary.
net = Net()
print(net)

In [None]:
# Adam over every parameter of the network, learning rate 1e-3.
optimizer = optim.Adam(net.parameters(), lr=0.001)
# Mean-squared error between the network's softmax output and the one-hot labels.
# NOTE(review): cross-entropy is the usual loss for classification; switching
# would also require the network to return raw logits - confirm before changing.
loss_function = nn.MSELoss()

In [None]:
# Stack the per-sample arrays in NumPy first: building a tensor straight from a
# Python list of ndarrays converts element by element and is very slow.
images = np.stack([sample[0] for sample in training_data]).astype(np.float32)
X = torch.from_numpy(images).view(-1,50,50)
X = X/255.0  # scale pixel values from 0..255 down to 0..1

# one-hot label vectors, as float32 to match the network output
labels = np.stack([sample[1] for sample in training_data]).astype(np.float32)
y = torch.from_numpy(labels)

#### Resplit Data

In [None]:
VAL_PCT = 0.3  # reserve 30% of the loaded data for validation
val_size = int(len(X)*VAL_PCT)  # number of held-out samples, rounded down
print(val_size)

In [None]:
# Hold out the last val_size samples. An explicit split index is used because
# negative slicing breaks when val_size == 0: X[:-0] is empty and X[-0:] is everything.
split_at = len(X) - val_size

train_X = X[:split_at]
train_y = y[:split_at]

test_X = X[split_at:]
test_y = y[split_at:]

In [None]:
# sizes of the training split and the held-out split
print(len(train_X), len(test_X))

### Train & Score the Model

In [None]:
# BATCH_SIZE = 180
BATCH_SIZE = 120
EPOCHS = 1
total_epochs = EPOCHS  # running count of epochs trained so far (first round)

for epoch in range(EPOCHS):
    # walk the training set in consecutive slices of BATCH_SIZE samples
    for i in tqdm(range(0, len(train_X), BATCH_SIZE)):
        batch_X = train_X[i:i+BATCH_SIZE].view(-1, 1, 50, 50)
        batch_y = train_y[i:i+BATCH_SIZE]

        net.zero_grad()  # clear gradients left over from the previous batch

        outputs = net(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()    # apply the parameter update

    # Label the line with this epoch's own number, not the final total, so the
    # log stays correct when EPOCHS > 1. The loss shown is the last batch's loss.
    print(f"Epoch: {total_epochs - EPOCHS + epoch + 1}. Loss: {loss}")

In [None]:
# Accuracy on the held-out split, one sample at a time, without gradient tracking.
correct = 0
total = 0
with torch.no_grad():
    for idx in tqdm(range(len(test_X))):
        sample = test_X[idx].view(-1, 1, 50, 50)
        # the network returns one row per sample; [0] takes the only row
        predicted_class = torch.argmax(net(sample)[0])
        real_class = torch.argmax(test_y[idx])

        total += 1
        if predicted_class == real_class:
            correct += 1
accuracy = round(correct/total, 3)
print(f"{total_epochs} Epoch Accuracy: ", accuracy)

In [None]:
# BATCH_SIZE = 180
BATCH_SIZE = 120
EPOCHS = 1
total_epochs += EPOCHS

for epoch in range(EPOCHS):
    for i in tqdm(range(0, len(train_X), BATCH_SIZE)): # from 0, to the len of x, stepping BATCH_SIZE at a time. [:50] ..for now just to dev
        #print(f"{i}:{i+BATCH_SIZE}")
        batch_X = train_X[i:i+BATCH_SIZE].view(-1, 1, 50, 50)
        batch_y = train_y[i:i+BATCH_SIZE]

        net.zero_grad()

        outputs = net(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()    # Does the update

    print(f"Epoch: {total_epochs}. Loss: {loss}")

In [None]:
# Re-score the held-out split after the second training round.
correct = 0
total = 0
with torch.no_grad():
    for idx in tqdm(range(len(test_X))):
        real_class = torch.argmax(test_y[idx])
        probabilities = net(test_X[idx].view(-1, 1, 50, 50))[0]  # single-row batch -> that row
        predicted_class = torch.argmax(probabilities)

        total += 1
        if predicted_class == real_class:
            correct += 1
accuracy = round(correct/total, 3)
print(f"{total_epochs} Epoch Accuracy: ", accuracy)

In [None]:
# BATCH_SIZE = 120
BATCH_SIZE = 80
EPOCHS = 1
total_epochs += EPOCHS

for epoch in range(EPOCHS):
    for i in tqdm(range(0, len(train_X), BATCH_SIZE)): # from 0, to the len of x, stepping BATCH_SIZE at a time. [:50] ..for now just to dev
        #print(f"{i}:{i+BATCH_SIZE}")
        batch_X = train_X[i:i+BATCH_SIZE].view(-1, 1, 50, 50)
        batch_y = train_y[i:i+BATCH_SIZE]

        net.zero_grad()

        outputs = net(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()    # Does the update

    print(f"Epoch: {total_epochs}. Loss: {loss}")

In [None]:
# Re-score the held-out split after the third training round.
correct = 0
total = 0
with torch.no_grad():
    for idx in tqdm(range(len(test_X))):
        sample = test_X[idx].view(-1, 1, 50, 50)
        # the network returns one row per sample; [0] takes the only row
        predicted_class = torch.argmax(net(sample)[0])
        real_class = torch.argmax(test_y[idx])

        total += 1
        if predicted_class == real_class:
            correct += 1
accuracy = round(correct/total, 3)
print(f"{total_epochs} Epoch Accuracy: ", accuracy)

In [None]:
# BATCH_SIZE = 120
BATCH_SIZE = 80
EPOCHS = 1
total_epochs += EPOCHS

for epoch in range(EPOCHS):
    for i in tqdm(range(0, len(train_X), BATCH_SIZE)): # from 0, to the len of x, stepping BATCH_SIZE at a time. [:50] ..for now just to dev
        #print(f"{i}:{i+BATCH_SIZE}")
        batch_X = train_X[i:i+BATCH_SIZE].view(-1, 1, 50, 50)
        batch_y = train_y[i:i+BATCH_SIZE]

        net.zero_grad()

        outputs = net(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()    # Does the update

    print(f"Epoch: {total_epochs}. Loss: {loss}")

In [None]:
# Re-score the held-out split after the fourth training round.
correct = 0
total = 0
with torch.no_grad():
    for idx in tqdm(range(len(test_X))):
        real_class = torch.argmax(test_y[idx])
        probabilities = net(test_X[idx].view(-1, 1, 50, 50))[0]  # single-row batch -> that row
        predicted_class = torch.argmax(probabilities)

        total += 1
        if predicted_class == real_class:
            correct += 1
accuracy = round(correct/total, 3)
print(f"{total_epochs} Epoch Accuracy: ", accuracy)

In [None]:
# BATCH_SIZE = 150
BATCH_SIZE = 100
EPOCHS = 1
total_epochs += EPOCHS

for epoch in range(EPOCHS):
    for i in tqdm(range(0, len(train_X), BATCH_SIZE)): # from 0, to the len of x, stepping BATCH_SIZE at a time. [:50] ..for now just to dev
        #print(f"{i}:{i+BATCH_SIZE}")
        batch_X = train_X[i:i+BATCH_SIZE].view(-1, 1, 50, 50)
        batch_y = train_y[i:i+BATCH_SIZE]

        net.zero_grad()

        outputs = net(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()    # Does the update

    print(f"Epoch: {total_epochs}. Loss: {loss}")

In [None]:
# Re-score the held-out split after the fifth training round.
correct = 0
total = 0
with torch.no_grad():
    for idx in tqdm(range(len(test_X))):
        sample = test_X[idx].view(-1, 1, 50, 50)
        # the network returns one row per sample; [0] takes the only row
        predicted_class = torch.argmax(net(sample)[0])
        real_class = torch.argmax(test_y[idx])

        total += 1
        if predicted_class == real_class:
            correct += 1
accuracy = round(correct/total, 3)
print(f"{total_epochs} Epoch Accuracy: ", accuracy)

In [None]:
# BATCH_SIZE = 150
BATCH_SIZE = 100
EPOCHS = 1
total_epochs += EPOCHS

for epoch in range(EPOCHS):
    for i in tqdm(range(0, len(train_X), BATCH_SIZE)): # from 0, to the len of x, stepping BATCH_SIZE at a time. [:50] ..for now just to dev
        #print(f"{i}:{i+BATCH_SIZE}")
        batch_X = train_X[i:i+BATCH_SIZE].view(-1, 1, 50, 50)
        batch_y = train_y[i:i+BATCH_SIZE]

        net.zero_grad()

        outputs = net(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()    # Does the update

    print(f"Epoch: {total_epochs}. Loss: {loss}")

In [None]:
# Accuracy on the held-out split, one sample at a time, without gradient tracking.
correct = 0
total = 0
with torch.no_grad():
    for i in tqdm(range(len(test_X))):
        real_class = torch.argmax(test_y[i])
        # the network returns one row per sample; [0] takes the only row
        net_out = net(test_X[i].view(-1, 1, 50, 50))[0]
        predicted_class = torch.argmax(net_out)

        if predicted_class == real_class:
            correct += 1
        total += 1
# Report the tracked epoch count like the earlier scoring cells do, instead of
# a hard-coded "6" that goes stale whenever a training cell is re-run or EPOCHS changes.
print(f"{total_epochs} Epoch Accuracy: ", round(correct/total, 3))