In [2]:
import torch
import torch.nn.functional as F
import torchvision.transforms as tf
from torchvision import datasets, models
from PIL import Image
from torch.optim import lr_scheduler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import datetime
import tarfile
import os
from timeit import default_timer as timer
from typing import Tuple, List, Type, Dict, Any

In [3]:
!wget -P /content/gdrive/My\ Drive https://www.dropbox.com/s/nyy2q2zb1bkw3mi/train.tar.gz

##  Load and examine the data

In [4]:
# Unpack the downloaded training archive into a sibling "train" folder on Drive.
# The context manager closes the archive even when extraction fails part-way.
# NOTE(review): extractall trusts the member paths stored in the archive; only use it on archives from a trusted source.
path = "/content/gdrive/My Drive/train.tar.gz"
with tarfile.open(path, "r:gz") as tar:
    tar.extractall(r"/content/gdrive/My Drive/train")

In [5]:
# Read the pickled sample index that ships inside the training archive.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted archives.
with open('/content/gdrive/My Drive/train/index.pkl', mode='rb') as f:
    data_index = pickle.load(f)

# Peek at the second and third records to see what an entry looks like
data_index[1:3]

In [6]:
# Turn the loaded index records into a DataFrame so they can be filtered by column.
dataframe = pd.DataFrame(data_index)

In [7]:
# Restrict the annotation table to the columns of interest
dataframe = dataframe[[
    'jpg_filename',
    'mask_fname',
    'mission',
    'observations_dt',
    'observed_TCC',
]]

Let's drop the mission AI49: all of its labels are equal to 8, which is clearly erroneous.

In [8]:
# Index the rows by mission so every 'AI49' row can be removed by label.
# The following cell turns 'mission' back into a regular column.
dataframe = dataframe.set_index('mission')
dataframe = dataframe.drop(index='AI49')

In [9]:
# Move the current index back into the frame as a column and renumber the rows from 0
dataframe = dataframe.reset_index()

In [10]:
# Display the filtered annotation table.
dataframe

## Create custom dataset

In [11]:
class CustomDataset(torch.utils.data.Dataset):
    """Image dataset driven by an annotation table.

    Every row of ``annotations`` must provide ``mission``, ``observations_dt``
    (an object with a ``.date()`` method) and ``jpg_filename``; in training
    mode ``observed_TCC`` is read as well. Images are looked up at
    ``<root_dir>/<mission>/snapshots/<day folder>/<jpg_filename>``.

    Args:
        annotations: pandas DataFrame with one row per image.
        root_dir: directory that holds the per-mission folders; stored as an
            absolute path.
        train: when True, items are ``(image, label)`` with the label being an
            integer tensor built from ``observed_TCC``; when False, items are
            ``(image, jpg_filename)``.
        transforms: optional callable applied to the array returned by
            ``plt.imread``.
    """

    def __init__(self, annotations, root_dir, train = True, transforms = None):
        super().__init__()

        self.annotations = annotations
        self.root_dir = os.path.abspath(root_dir)
        self.train = train
        self.transforms = transforms

    def __len__(self):
        """Return the number of annotated images."""
        return len(self.annotations)

    def _image_path(self, row):
        """Build the on-disk path of the image described by one annotation row."""
        day = str(row['observations_dt'].date())
        mission = row['mission']

        # Outside training mode the AI49 mission uses day folders prefixed with
        # the case-swapped mission name ('ai49-snapshots-<date>').
        if not self.train and mission == 'AI49':
            day_folder = mission.swapcase() + '-snapshots-' + day
        else:
            day_folder = 'snapshots-' + day

        return os.path.join(self.root_dir, mission, 'snapshots', day_folder, row['jpg_filename'])

    def __getitem__(self, index):
        """Load the image at ``index`` and return it with its label or filename."""
        # Fetch the row once: each .iloc call builds a new Series
        row = self.annotations.iloc[index]
        img_path = self._image_path(row)

        if self.train:
            label = torch.tensor(int(row['observed_TCC']))
        else:
            # No ground truth in this mode; the filename identifies the sample
            label = row['jpg_filename']

        img = plt.imread(img_path)

        if self.transforms:
            img = self.transforms(img)

        return (img, label)

In [12]:
# Training-time preprocessing: convert the raw array to PIL, resize to 256x256,
# apply light augmentation (small rotation, horizontal flip), convert to a tensor
# and normalize each channel with the commonly used ImageNet mean / std.
train_steps = [
    tf.ToPILImage(),
    tf.Resize([256, 256]),
    tf.RandomRotation(degrees=10),
    tf.RandomHorizontalFlip(),
    tf.ToTensor(),
    tf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
transforms = tf.Compose(train_steps)

In [13]:
# Training dataset: annotations plus the images extracted to Drive, with augmentation
SkyData = CustomDataset(
    annotations=dataframe,
    root_dir='/content/gdrive/My Drive/train',
    transforms=transforms,
)

In [14]:
# Smoke test: load a single sample and show the (image, label) pair it returns.
SkyData[254]

In [15]:
# Show a 3x3 grid of training samples with their labels.
# SkyData yields tensors that went through tf.Normalize, so the normalization is
# undone before plotting; otherwise imshow clips the out-of-range values and the
# colours come out wrong. The constants must match the ones given to tf.Normalize.
norm_mean = np.array([0.485, 0.456, 0.406])
norm_std = np.array([0.229, 0.224, 0.225])

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(10, 10))
for i, row in enumerate(axes):
    for j, ax in enumerate(row):
        # Spread the picks over the dataset; the largest index used is 2*325 + 2*13122
        sample, label = SkyData[i*325+j*13122]
        image = sample.cpu().numpy().transpose(1, 2, 0)  # CHW -> HWC for imshow
        image = np.clip(image * norm_std + norm_mean, 0.0, 1.0)
        ax.imshow(image)
        ax.set_title('Label: {}'.format(int(label)))

In [16]:
def train_single_epoch(model : torch.nn.Module,
                       optimizer : torch.optim.Optimizer,
                       loss_function : torch.nn.Module,
                       data_loader : torch.utils.data.DataLoader):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: network to train; batches are moved to the device of its parameters.
        optimizer: optimizer that updates ``model``'s parameters.
        loss_function: callable returning a scalar loss for ``(output, target)``.
            If it exposes ``reduction == 'sum'`` the batch loss is treated as a
            sum over samples, otherwise as a per-sample mean.
        data_loader: iterable of ``(inputs, targets)`` batches.

    Returns:
        The average loss per sample over the epoch, as a Python float.

    Raises:
        ValueError: if ``data_loader`` yields no samples.
    """
    model.train()

    # Use the device the model already lives on instead of a notebook-level global
    device = next(model.parameters()).device
    sum_reduced = getattr(loss_function, 'reduction', 'mean') == 'sum'

    loss_total = 0.0
    samples_seen = 0

    for X, y in data_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        output = model(X)

        loss = loss_function(output, y)
        loss.backward()
        optimizer.step()

        # .item() detaches the value, so no autograd history is kept across batches.
        # A mean-reduced loss is weighted by the batch size, which keeps the epoch
        # average correct even when the last batch is smaller.
        batch_size = X.size(0)
        batch_loss = loss.item()
        loss_total += batch_loss if sum_reduced else batch_loss * batch_size
        samples_seen += batch_size

    if samples_seen == 0:
        raise ValueError('data_loader yielded no samples')

    return loss_total / samples_seen

In [17]:
@torch.no_grad()
def validate_single_epoch(model: torch.nn.Module,
                          loss_function: torch.nn.Module, 
                          data_loader: torch.utils.data.DataLoader):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: network to evaluate; it is switched to eval mode and batches are
            moved to the device of its parameters.
        loss_function: callable returning a scalar loss for ``(output, target)``.
            If it exposes ``reduction == 'sum'`` the batch loss is treated as a
            sum over samples, otherwise as a per-sample mean.
        data_loader: iterable of ``(inputs, targets)`` batches with integer
            class targets.

    Returns:
        Dict with ``'loss'`` (average loss per sample, Python float) and
        ``'accuracy'`` (percentage of samples whose arg-max class equals the target).

    Raises:
        ValueError: if ``data_loader`` yields no samples.
    """
    model.eval()

    # Use the device the model already lives on instead of a notebook-level global
    device = next(model.parameters()).device
    sum_reduced = getattr(loss_function, 'reduction', 'mean') == 'sum'

    loss_total = 0.0
    correct_total = 0
    samples_seen = 0

    for X, y in data_loader:
        X, y = X.to(device), y.to(device)

        output = model(X)

        # Weight a mean-reduced loss by the batch size so the epoch average is
        # exact even when the last batch is smaller.
        batch_size = X.size(0)
        batch_loss = loss_function(output, y).item()
        loss_total += batch_loss if sum_reduced else batch_loss * batch_size
        samples_seen += batch_size

        y_pred = output.argmax(dim = 1, keepdim = True)
        correct_total += y_pred.eq(y.view_as(y_pred)).sum().item()

    if samples_seen == 0:
        raise ValueError('data_loader yielded no samples')

    loss_avg = loss_total / samples_seen
    accuracy_avg = 100.0 * correct_total / samples_seen

    return {'loss' : loss_avg, 'accuracy' : accuracy_avg}

In [18]:
def _plot_learning_curves(loss_list, accuracy_list, best_epoch):
    """Plot per-epoch train/validation loss and validation accuracy side by side."""
    fig, (loss_ax, acc_ax) = plt.subplots(ncols=2, figsize=(12, 4))
    epochs = range(len(loss_list['train']))

    loss_ax.plot(epochs, loss_list['train'], label='train')
    loss_ax.plot(epochs, loss_list['valid'], label='valid')
    if best_epoch is not None:
        loss_ax.axvline(best_epoch, linestyle='--', color='grey', label='best epoch')
    loss_ax.set(title='Loss', xlabel='epoch', ylabel='loss')
    loss_ax.legend()

    acc_ax.plot(epochs, accuracy_list)
    acc_ax.set(title='Validation accuracy', xlabel='epoch', ylabel='accuracy, %')

    plt.show()


def train_model(model: torch.nn.Module, 
                train_dataset: torch.utils.data.Dataset,
                valid_dataset: torch.utils.data.Dataset,
                loss_function: torch.nn.Module = torch.nn.CrossEntropyLoss(),
                optimizer_class: Type[torch.optim.Optimizer] = torch.optim.Adam,
                optimizer_params: Dict = None,
                initial_lr = 0.01,
                lr_scheduler_class: Any = torch.optim.lr_scheduler.ReduceLROnPlateau,
                lr_scheduler_params: Dict = None,
                batch_size = 64,
                max_epochs = 1000,
                early_stopping_patience = 20, 
                best_model_root = './best_model.pth'):
    """Train ``model`` with early stopping and checkpoint the best epoch.

    After every epoch the model is evaluated on ``valid_dataset``. Whenever the
    validation loss improves, the whole model object is saved to
    ``best_model_root``. Training stops when ``max_epochs`` is reached or when
    more than ``early_stopping_patience`` epochs pass without improvement. The
    learning curves are plotted once training ends.

    Args:
        model: network to train (already placed on the target device).
        train_dataset: dataset used for optimisation; shuffled every epoch.
        valid_dataset: dataset used for validation metrics.
        loss_function: loss passed to the per-epoch train / validate helpers.
        optimizer_class: optimizer constructor called as
            ``optimizer_class(params, lr=initial_lr, **optimizer_params)``.
        optimizer_params: extra keyword arguments for the optimizer.
        initial_lr: starting learning rate.
        lr_scheduler_class: scheduler constructor called as
            ``lr_scheduler_class(optimizer, **lr_scheduler_params)``.
        lr_scheduler_params: extra keyword arguments for the scheduler.
        batch_size: batch size of both data loaders.
        max_epochs: upper bound on the number of epochs.
        early_stopping_patience: number of epochs without improvement that is
            still tolerated.
        best_model_root: path the best model is written to with ``torch.save``.

    Returns:
        None. The best model is available on disk at ``best_model_root``.
    """
    # None instead of {} as default avoids sharing one dict between calls
    optimizer_params = {} if optimizer_params is None else optimizer_params
    lr_scheduler_params = {} if lr_scheduler_params is None else lr_scheduler_params

    # The former default was the torch.optim module itself while Adam was always
    # used; keep accepting that value and map it to Adam.
    if optimizer_class is torch.optim:
        optimizer_class = torch.optim.Adam

    optimizer = optimizer_class(model.parameters(), lr=initial_lr, **optimizer_params)
    scheduler = lr_scheduler_class(optimizer, **lr_scheduler_params)
    
    train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size, pin_memory = True, num_workers = 1)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, num_workers = 1)

    best_valid_loss = None
    best_epoch = None
    loss_list = {'train' : list(), 'valid' : list()}
    accuracy_list = list()
    
    for epoch in range(max_epochs):
        
        print(f'Epoch {epoch}')
        
        start = timer()
        
        # float() accepts both plain numbers and 0-dim tensors, so the history
        # holds plain floats that can be compared and plotted on any device
        train_loss = float(train_single_epoch(model, optimizer, loss_function, train_loader))
        
        # Record performance on the training set
        loss_list['train'].append(train_loss)
        
        # Evaluate performance on the validation set
        valid_metrics = validate_single_epoch(model, loss_function, valid_loader)
        valid_loss = float(valid_metrics['loss'])
        loss_list['valid'].append(valid_loss)
        accuracy_list.append(valid_metrics['accuracy'])
        
        print('time:', timer() - start)
        print(f'Validation metrics: \n{valid_metrics}')

        # Only ReduceLROnPlateau takes the monitored metric; other schedulers step without it
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(valid_loss)
        else:
            scheduler.step()
        
        if best_valid_loss is None or best_valid_loss > valid_loss:
            print(f'-----Best model yet, saving-----')
            best_valid_loss = valid_loss
            best_epoch = epoch
            torch.save(model, best_model_root)
            
        if epoch - best_epoch > early_stopping_patience:
            print('Early stopping triggered')
            break

    # Plot whenever at least one epoch ran, not only after early stopping
    if loss_list['train']:
        _plot_learning_curves(loss_list, accuracy_list, best_epoch)

## Create a model

In [19]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0") if use_gpu else torch.device('cpu')

if use_gpu:
    print('Using GPU', f'({torch.cuda.get_device_name()})')
else:
    print('Using CPU')

In [20]:
# Start from a ResNet-34 with pretrained weights
model = models.resnet34(pretrained=True)

# Freeze every pretrained weight so that only the new head below is trained
for weight in model.parameters():
    weight.requires_grad_(False)

# Replace the final fully connected layer with a fresh 9-output linear head;
# newly created layers have requires_grad=True by default
num_ftrs = model.fc.in_features
new_head = torch.nn.Linear(num_ftrs, 9)
model.fc = torch.nn.Sequential(new_head)

In [21]:
# Move the network to the selected device, then report its layout and how many
# parameters will actually receive gradients
model.to(device)

print(model)
trainable_count = sum(
    weight.numel()
    for weight in model.parameters()
    if weight.requires_grad
)
print('Total number of trainable parameters', trainable_count)

In [22]:
# Hold out 15000 samples for validation and train on the rest.
# A seeded generator makes the split identical after every kernel restart, so
# validation metrics stay comparable between runs.
# NOTE(review): both subsets wrap SkyData, so validation images also go through
# the random augmentations of the training transforms.
train_dataset, valid_dataset = torch.utils.data.random_split(
    SkyData,
    [len(SkyData) - 15000, 15000],
    generator=torch.Generator().manual_seed(42),
)

### Train

In [23]:
# Short training run: 3 epochs, batch size 128, starting learning rate 1e-3
train_model(
    model,
    train_dataset=train_dataset,
    valid_dataset=valid_dataset,
    loss_function=torch.nn.CrossEntropyLoss(),
    initial_lr=0.001,
    batch_size=128,
    max_epochs=3,
)

In [24]:
# Reload the checkpoint written by train_model for the epoch with the lowest validation loss.
# map_location places the weights on the current device, so a checkpoint saved on
# a GPU can also be restored in a CPU-only session.
# NOTE(review): the file holds a fully pickled model; recent PyTorch releases may
# require weights_only=False here - confirm against the installed version.
model = torch.load('./best_model.pth', map_location=device)

## Load test data

In [25]:
!wget -P /content/gdrive/My\ Drive https://www.dropbox.com/s/a9k38jd1keuwnnv/test.tar.gz

In [26]:
# Unpack the downloaded test archive into a sibling "test" folder on Drive.
# The context manager closes the archive even when extraction fails part-way.
# NOTE(review): extractall trusts the member paths stored in the archive; only use it on archives from a trusted source.
path = "/content/gdrive/My Drive/test.tar.gz"
with tarfile.open(path, "r:gz") as tar:
    tar.extractall(r"/content/gdrive/My Drive/test")

In [27]:
# Read the pickled sample index that ships inside the test archive.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted archives.
with open('/content/gdrive/My Drive/test/index.pkl', mode='rb') as f:
    data_index = pickle.load(f)

# Peek at the second and third records to see what an entry looks like
data_index[1:3]

In [28]:
# Build the test annotation table. This rebinds `dataframe`, the name that held
# the training annotations in the cells above.
dataframe = pd.DataFrame(data_index)

In [29]:
# Same column selection as for training, minus the label column, which is not selected here
dataframe = dataframe[[
    'jpg_filename',
    'mask_fname',
    'mission',
    'observations_dt',
]]

In [30]:
# Inference-time preprocessing: no random augmentation, only the deterministic
# resize / tensor conversion / normalization steps
test_steps = [
    tf.ToPILImage(),
    tf.Resize([256, 256]),
    tf.ToTensor(),
    tf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
transforms = tf.Compose(test_steps)

In [31]:
# Test dataset: train=False makes each item (image, jpg_filename) instead of (image, label)
SkyDataTest = CustomDataset(
    annotations=dataframe,
    root_dir='/content/gdrive/My Drive/test',
    train=False,
    transforms=transforms,
)

In [32]:
# Keep the samples in annotation order so predictions line up with the filenames
test_loader = torch.utils.data.DataLoader(
    SkyDataTest,
    batch_size=256,
    shuffle=False,
)

## Make predictions

In [33]:
def predict(model, test_loader):
    """Return class probabilities for every sample produced by ``test_loader``.

    Args:
        model: trained network; it is switched to eval mode and batches are moved
            to the device of its parameters.
        test_loader: iterable of batches whose first element is the input tensor
            (any further elements are ignored).

    Returns:
        NumPy array with one row per sample, holding the softmax of the model
        outputs over the last dimension, in loader order.
    """
    # eval mode only has to be set once, not for every batch
    model.eval()
    # Use the device the model already lives on instead of a notebook-level global
    device = next(model.parameters()).device

    logits = []
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch[0].to(device)
            # Collect on the CPU so accelerator memory is released batch by batch
            logits.append(model(inputs).cpu())

    probs = torch.nn.functional.softmax(torch.cat(logits), dim=-1).numpy()
    return probs

In [34]:
# Run the model over the whole test loader and collect the per-class probabilities.
probs = predict(model, test_loader)

In [35]:
# Most probable class for every test image
preds = np.argmax(probs, axis=1)

# Read the filenames straight from the annotation table. Iterating SkyDataTest
# would decode and transform every test image a second time just to get the same
# 'jpg_filename' values. The order matches `preds` because test_loader does not shuffle.
test_filenames = SkyDataTest.annotations['jpg_filename'].tolist()

submit = pd.DataFrame({'jpg_filename': test_filenames, 'TCC': preds})
submit.head()

## Our submission

In [36]:
# Write the predictions to a CSV file in the working directory, without the row index.
submit.to_csv('./submission.csv', index=False)