# Digit recognizer

Le but est de reconnaître un chiffre à partir d'une image.

## Dépendances

In [46]:
import numpy as np 
import pandas as pd 

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Chargement des données

In [47]:
# On charge les données en float et non pas en double pour les rendre compatible avec PyTorch
train = pd.read_csv("../input/digit-recognizer/train.csv", dtype=np.float32) 
test = pd.read_csv("../input/digit-recognizer/test.csv", dtype=np.float32)
submission = pd.read_csv("/kaggle/input/digit-recognizer/sample_submission.csv")

test


Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Train/test split 

In [48]:
y = train.label.values
X = train.loc[:,train.columns != "label"].values/255

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 1) 

# PyTorch

Adaptation du tutoriel "Quickstart" de PyTorch pour les jeux de données de Kaggle.

https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html

## Préparation des données 

On rend les donnéees compatibles avec PyTorch.

In [49]:
train_X = torch.from_numpy(train_X)
train_y = torch.from_numpy(train_y).type(torch.LongTensor)

test_X = torch.from_numpy(test_X)
test_y = torch.from_numpy(test_y).type(torch.LongTensor)

batch_size = 64
n_iters = 10000
num_epochs = n_iters / (len(train_X) / batch_size)
num_epochs = int(num_epochs)

pytorch_train = torch.utils.data.TensorDataset(train_X,train_y)
pytorch_test = torch.utils.data.TensorDataset(test_X,test_y)

train_dataloader = DataLoader(pytorch_train, batch_size = batch_size, shuffle = False)
test_dataloader = DataLoader(pytorch_test, batch_size = batch_size, shuffle = False)

## Création du modèle

In [50]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork().to(device)
print(model)

Using cpu device
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


## Fonction de coût

In [51]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

## Apprentissage

In [52]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

## Test

In [53]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [54]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 2.303183  [    0/33600]
loss: 2.295168  [ 6400/33600]
loss: 2.287284  [12800/33600]
loss: 2.288007  [19200/33600]
loss: 2.279348  [25600/33600]
loss: 2.273867  [32000/33600]
Test Error: 
 Accuracy: 29.8%, Avg loss: 2.277852 

Epoch 2
-------------------------------
loss: 2.277400  [    0/33600]
loss: 2.272997  [ 6400/33600]
loss: 2.261866  [12800/33600]
loss: 2.259529  [19200/33600]
loss: 2.249523  [25600/33600]
loss: 2.241884  [32000/33600]
Test Error: 
 Accuracy: 41.8%, Avg loss: 2.250355 

Epoch 3
-------------------------------
loss: 2.249249  [    0/33600]
loss: 2.246854  [ 6400/33600]
loss: 2.232117  [12800/33600]
loss: 2.225528  [19200/33600]
loss: 2.213697  [25600/33600]
loss: 2.203237  [32000/33600]
Test Error: 
 Accuracy: 50.7%, Avg loss: 2.216329 

Epoch 4
-------------------------------
loss: 2.215198  [    0/33600]
loss: 2.213667  [ 6400/33600]
loss: 2.193999  [12800/33600]
loss: 2.181334  [19200/33600]
loss: 2.167392  [25600/3