<h2>Starter Code</h2>

<h3>Libraries</h3>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from PIL import Image

#import torch modules 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import TensorDataset

import os
# Show which files and folders the competition's input directory provides.
input_entries = os.listdir("../input")
print(input_entries)

<h3>Sample Submission</h3>
We are supposed to make 5 predictions for each image.

In [None]:
# Read the CSV straight from its path: pandas opens and closes the file itself,
# so no file handle stays open for the rest of the session.
sample = pd.read_csv('../input/sample_submission.csv')
sample.head()

<h3>Training Data</h3>
It lists each image file name together with the Id assigned to it.

In [None]:
# Read the CSV straight from its path: pandas opens and closes the file itself,
# so no file handle stays open for the rest of the session.
# NOTE: a later cell defines a function that is also called `train`, so this
# DataFrame has to be consumed before that cell runs.
train = pd.read_csv('../input/train.csv')
train.head()

<h3>Data Loading & Preprocessing</h3>
Reference: [Whales. A Simple Guide!](https://www.kaggle.com/jhonatansilva31415/whales-a-simple-guide)

First, let's take a look at the images.

In [None]:
%matplotlib inline

train_dir = '../input/train/'
for whale in os.listdir(train_dir)[:3]:
    img = Image.open(train_dir+whale)
    plt.imshow(img)
    plt.show()

Since the images come in different sizes and formats, we need to standardize them.

Convert the images to grayscale and resize them to (128, 128).<br>
Then convert them to torch tensors and normalize the data.

In [None]:
# The pipeline below reduces every image to ONE grayscale channel, so Normalize
# must be given exactly one mean and one std. Three-element statistics cannot be
# broadcast in place onto a 1-channel tensor.
# The single value kept here is the first entry of the previous 3-channel lists.
normalize = transforms.Normalize(
   mean=[0.485],
   std=[0.229]
)
# Order matters: PIL image -> grayscale -> fixed size -> float tensor -> normalized.
preprocess = transforms.Compose([
   transforms.Grayscale(num_output_channels=1),
   transforms.Resize((128,128)),
   transforms.ToTensor(),
   normalize
])

It will take about 10 minutes to load all the images.

In [None]:
def _load_image_tensors(image_dir):
    """Run every file found in `image_dir` through `preprocess`.

    Returns a pair `(file_names, images)`: the alphabetically sorted file names
    and one stacked tensor holding the preprocessed images in that same order.
    """
    file_names = sorted(os.listdir(image_dir))
    tensors = []
    for file_name in file_names:
        # Close each image file as soon as it has been turned into a tensor,
        # instead of leaving thousands of handles to the garbage collector.
        with Image.open(image_dir + file_name) as image:
            tensors.append(preprocess(image))
    return file_names, torch.stack(tensors)


train_dir = '../input/train/'
train_file, train_x = _load_image_tensors(train_dir)

test_dir = '../input/test/'
test_file, test_x = _load_image_tensors(test_dir)

In [None]:
# Target labels: the 'Id' column of the training table.
# NOTE(review): pairing these with train_x by position assumes the CSV rows are
# in the same order as the sorted file listing used to build train_x — confirm.
train_y = train.loc[:, 'Id']

Separate data into training data and validation data.<br>
However, I am considering training without validation because the data is too small.

I decided not to use validation data.

In [None]:
#valid_x = train_x[:1500]
#valid_y = train_y[:1500]
#train_x = train_x[1500:]
#train_y = train_y[1500:].reset_index(drop=True)

It'd be better to train without 'new_whale'.

In [None]:
# Boolean Series: True for every sample whose label is not 'new_whale'.
idx = (train_y != 'new_whale')
# Select the kept images with one boolean-mask indexing operation instead of
# picking them one by one in a Python loop and re-stacking them.
train_x = train_x[torch.as_tensor(idx.values)]
train_y = train_y[idx].reset_index(drop=True)

# Both lengths must match: images and labels are paired by position.
print(len(train_x), len(train_y))

Factorize training data labels.

In [None]:
# Give every distinct label a consecutive integer code, in order of first
# appearance; `encoding` maps label -> code.
unique_classes = pd.unique(train_y)
encoding = {label: code for code, label in enumerate(unique_classes)}
# Series.map does one dictionary lookup per row and yields an integer Series.
# Series.replace with a dictionary this large is much slower and, depending on
# the pandas version, may leave the column as object dtype, which torch.tensor
# cannot convert.
train_y = train_y.map(encoding)

# nll_loss expects integer (long) class indices as targets.
train_y = torch.tensor(train_y.values, dtype=torch.long)

Factorize validation data labels as well.

In [None]:
# Disabled cell: the validation-label encoding below is wrapped in a string
# literal, so none of it executes; the string is merely the cell's value.
'''encoding['new_whale'] = len(encoding)

valid_y = valid_y.replace(encoding)
for i in range(len(valid_y)):
    try:
        int(valid_y[i])
    except:
        valid_y[i] = len(encoding) - 1
        
valid_y = torch.tensor(valid_y.values)'''

Now let's look at the processed images.<br>
They look good enough to get started.

In [None]:
%matplotlib inline

for img in train_x[:3]:
    plt.imshow(img[0], cmap='gray')
    plt.show()

<h3>Simple CNN Model</h3>

In [None]:
# Number of distinct labels held by the label encoding.
num_classes = len(encoding)
print(num_classes)

In [None]:
class CNN(nn.Module):
    """Small convolutional classifier that returns per-class log-probabilities.

    Two stages of (conv, conv, 2x2 max-pool, 2-D dropout) are followed by two
    fully connected layers and a log-softmax.

    Args:
        num_classes: number of output classes (size of the last layer).
        hidden_units: width of the hidden fully connected layer.
        input_size: height/width of the square, single-channel input images.

    The defaults reproduce the original fixed architecture:
    128x128 inputs -> 53824 flattened features -> 10000 -> 5004.
    """

    def __init__(self, num_classes=5004, hidden_units=10000, input_size=128):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=3)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3)
        self.conv4 = nn.Conv2d(64, 64, kernel_size=3)
        self.conv_drop = nn.Dropout2d(0.25)
        self.fc1 = nn.Linear(self._flat_features(input_size), hidden_units)
        self.fc2 = nn.Linear(hidden_units, num_classes)

    @staticmethod
    def _flat_features(input_size):
        """Return the flattened feature count the conv stack yields per sample."""
        side = input_size
        for _ in range(2):
            # Two unpadded 3x3 convolutions trim 2 pixels each; the 2x2 pooling
            # then halves the side length, rounding down.
            side = (side - 4) // 2
        if side <= 0:
            raise ValueError('input_size is too small for this network')
        # 64 is the number of channels produced by the last convolution.
        return 64 * side * side

    def forward(self, x):
        """Map a (batch, 1, input_size, input_size) tensor to (batch, num_classes) log-probabilities."""
        x = F.relu(self.conv1(x))
        x = self.conv_drop(F.max_pool2d(F.relu(self.conv2(x)), 2))
        x = F.relu(self.conv3(x))
        x = self.conv_drop(F.max_pool2d(F.relu(self.conv4(x)), 2, stride=2))
        # Flatten everything except the batch dimension.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        # Functional dropout must be told explicitly whether we are training.
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

In [None]:
def train(model, device, train_loader, optimizer, epoch):
    """Run one optimisation pass over every batch in `train_loader`.

    The model is put into training mode; each batch is moved to `device`, the
    negative log-likelihood loss of the model output is back-propagated and the
    optimizer takes one step. `epoch` is accepted but not used. Returns None.
    """
    model.train()
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        log_probs = model(inputs)
        batch_loss = F.nll_loss(log_probs, labels)
        batch_loss.backward()
        optimizer.step()
            
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.3f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    return pred

In [None]:
# Hyper-parameters and run configuration.
batch_size = 128
#test_batch_size = 7960
test_batch_size = 128
epochs = 10
lr = 0.0002
# Use the GPU only when one is actually present, so the notebook also runs on
# CPU-only machines instead of failing when the model is moved to "cuda".
use_cuda = torch.cuda.is_available()
seed = 1

# Seed torch's random number generator.
torch.manual_seed(seed)
device = torch.device("cuda" if use_cuda else "cpu")

# Extra DataLoader options are passed only when running on the GPU.
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

train_data = TensorDataset(train_x, train_y)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, **kwargs)
#valid_data = TensorDataset(valid_x, valid_y)
#valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, shuffle=True, **kwargs)
# The test set has no labels, so each of its batches holds the images only.
# shuffle=False keeps predictions in the same order as test_x.
test_data = TensorDataset(test_x)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=test_batch_size, shuffle=False, **kwargs)

model = CNN().to(device) 
optimizer = optim.RMSprop(model.parameters(), lr=lr)

In [None]:
# Train for the configured number of epochs.
for epoch in range(epochs):
    print('epoch', epoch, end=' - ')
    train(model, device, train_loader, optimizer, epoch)
    print('done')

# Predict once, after training. test_loader is built from TensorDataset(test_x),
# so every batch holds only the images and there are no targets to score
# against. Predictions from ALL batches are collected, in loader order.
model.eval()
batch_preds = []
with torch.no_grad():
    for batch in test_loader:
        output = model(batch[0].to(device))
        # Index of the largest log-probability, kept as a column: shape (batch, 1).
        batch_preds.append(output.max(1, keepdim=True)[1].cpu())
pred = torch.cat(batch_preds)

In [None]:
# Flatten the prediction tensor into a plain list of Python integers.
# NOTE(review): these are the integer codes from the label encoding, not the
# original Id strings — confirm that is what the submission should contain.
prediction = pred.reshape(-1).tolist()

In [None]:
# One row per prediction: a running 'id' used as the index, plus the 'label'.
num_rows = len(prediction)
submission_columns = {'id': range(num_rows), 'label': prediction}
submission = pd.DataFrame(submission_columns).set_index('id')

submission.head()

In [None]:
# Write the 'label' column, together with the 'id' index, to the output CSV.
output_path = 'submission.csv'
submission.to_csv(output_path, columns=['label'])