In [None]:
# the fast.ai library
from fastai import *
from fastai.vision import *

# to inspect the directory
import os
from pathlib import Path

# for data manipulation (in this Kernel mainly used to read .csv files)
import pandas as pd

# for numerical analysis
import numpy as np

# to display images
from PIL import Image

#import this for more CNN architectures
import torchvision.models

# Directory holding the competition CSV files.
INPUT = Path("../input/digit-recognizer")
os.listdir(INPUT)

# Training table; the image-saving step later treats the first column as
# the label and the remaining columns as pixel values.
train_df = pd.read_csv(INPUT.joinpath("train.csv"))
train_df.head(3)

# Test table; each row is later reshaped into one 28x28 image.
test_df = pd.read_csv(INPUT.joinpath("test.csv"))
test_df.head(3)

# Output directories for the images extracted from the CSV files.
TRAIN = Path("../train")
TEST = Path("../test")

# Create one training sub-directory per digit class (0-9).
# exist_ok=True keeps re-runs idempotent while still surfacing real failures
# (e.g. permission errors), which a bare `except: pass` would silently hide.
for index in range(10):
    os.makedirs(TRAIN / str(index), exist_ok=True)

# Test whether creating the training directory was successful
sorted(os.listdir(TRAIN))

# Create test directory
os.makedirs(TEST, exist_ok=True)

def _save_digit(pixels, destination):
    """Write one flattened digit to *destination* as an image file.

    pixels: 1-D array-like of 784 values; reshaped to 28x28 and cast to uint8.
    destination: path of the image file to create (format is chosen by PIL
        from the file extension).
    """
    digit = np.asarray(pixels).reshape(28, 28).astype(np.uint8)
    Image.fromarray(digit).save(destination)


# save training images
for index, row in train_df.iterrows():
    # Use .iloc for positional access: plain row[0] on a Series indexed by
    # column names relies on a deprecated positional fallback.
    label, digit = row.iloc[0], row.iloc[1:]
    # Each image goes into the sub-directory named after its label.
    _save_digit(digit.values, TRAIN / str(label) / f"{index}.jpg")

# save testing images
for index, digit in test_df.iterrows():
    # Test rows carry no label column; the whole row is pixel data.
    _save_digit(digit.values, TEST / f"{index}.jpg")

# Data augmentation: flipping is disabled and max_zoom is set to 1.2.
tfms = get_transforms(do_flip=False, max_zoom=1.2)

# Build the DataBunch from the class-named folders under TRAIN, holding out
# 20% of the images for validation and attaching TEST as the test set.
data = ImageDataBunch.from_folder(
    path=TRAIN,
    test=TEST,
    valid_pct=0.2,
    ds_tfms=tfms,
    size=28,
    bs=16,
    # num_workers=0,
)

mnist_stats

# Normalize the data using the mnist_stats statistics.
data.normalize(mnist_stats)

# all the classes in data
print(data.classes)

# CNN learner on a resnet18 backbone, tracking accuracy; model files go to
# /tmp/models and ShowGraph is attached as a callback.
learn = cnn_learner(
    data,
    base_arch=models.resnet18,
    metrics=accuracy,
    model_dir="/tmp/models",
    callback_fns=ShowGraph,
)

# Train with the one-cycle policy, cyc_len=5.
learn.fit_one_cycle(cyc_len=5)

# Inspect the results: top losses and the confusion matrix.
interp = ClassificationInterpretation.from_learner(learn)

interp.plot_top_losses(9, figsize=(7, 7))

interp.plot_confusion_matrix()

# Per-class scores for every item of the test set.
class_score, y = learn.get_preds(DatasetType.Test)

# Scores of the first test item, listed per class index.
probabilities = class_score[0].tolist()
[f"{index}: {probability}" for index, probability in enumerate(probabilities)]

# Reduce the scores to the index of the highest-scoring class per item.
class_score = np.argmax(class_score, axis=1)

# Predicted class of the first test item.
class_score[0].item()

sample_submission = pd.read_csv(INPUT / "sample_submission.csv")
display(sample_submission.head(2))
display(sample_submission.tail(2))

# Take the file names from the test dataset itself rather than from
# os.listdir(TEST): listdir order is arbitrary and need not match the order
# in which the predictions were produced, which would pair labels with the
# wrong images.
# NOTE(review): assumes get_preds(DatasetType.Test) yields rows in the order
# of data.test_ds.x.items (fastai v1 test loaders are not shuffled).
test_files = data.test_ds.x.items

# File names are "<row index>.jpg"; +1 because index starts at 1 in the
# submission file.
ImageId = [int(Path(path).stem) + 1 for path in test_files]

submission = pd.DataFrame({
    "ImageId": ImageId,
    # Convert explicitly so the column holds plain integers whether
    # class_score is a tensor or already a NumPy array.
    "Label": np.asarray(class_score),
})
# Order rows by ImageId, as in the sample submission.
submission.sort_values(by="ImageId", inplace=True)
submission.to_csv("submission.csv", index=False)
display(submission.head(3))
display(submission.tail(3))
