## Load libraries

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os,sys
import re
import math
from datetime import datetime
import time
sys.dont_write_bytecode = True

In [None]:
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
from skimage.color import rgb2gray
from skimage.transform import resize

from pathlib import Path
from typing import List, Set, Dict, Tuple, Optional, Iterable, Mapping, Union, Callable

from pprint import pprint
from ipdb import set_trace as brpt

In [None]:
# import holoviews as hv
# from holoviews import opts
# hv.extension('bokeh')

In [None]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from  torch.linalg import norm as tnorm
from torch.utils.data import Dataset, DataLoader, random_split

from torchvision import datasets, transforms

import pytorch_lightning as pl
from pytorch_lightning.core.lightning import LightningModule

# Select Visible GPU
# Both variables are written into this process's environment in one call.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # see issue #152
    "CUDA_VISIBLE_DEVICES": "1",
})

## Set Path 
1. Add project root and src folders to `sys.path`
2. Set DATA_ROOT to the `maptiles_v2` folder

In [None]:
# Locations used throughout the notebook: the working directory of this
# notebook, the project root one level above it, and the `src` folder.
this_nb_path = Path.cwd()
ROOT = this_nb_path.parent
SRC = ROOT / 'src'
DATA_ROOT = Path("/data/hayley-old/maptiles_v2/")
paths2add = [this_nb_path, ROOT, SRC]

for label, location in (
    ("Project root: ", ROOT),
    ('Src folder: ', SRC),
    ("This nb path: ", this_nb_path),
):
    print(label, str(location))


# Prepend every folder that is not already on sys.path.
for p in paths2add:
    p_str = str(p)
    if p_str in sys.path:
        continue
    sys.path.insert(0, p_str)
    print(f"\n{p_str} added to the path.")
        
# print(sys.path)



In [None]:
from src.data.datasets.maptiles import Maptiles, MapStyles


In [None]:
# Collect the style names returned by MapStyles.get_longnames().
# NOTE: later cells rebind `styles` to explicit lists of style names.
styles = MapStyles.get_longnames()

In [None]:
def test_maptiles_1():
    """Smoke test: build a Maptiles dataset for one city/style/zoom, then
    print its metadata and show some samples."""
    dset = Maptiles(
        data_root=DATA_ROOT,
        cities=['la'],
        styles=['OSMDefault'],
        zooms=['14'],
    )
    dset.print_meta()
    dset.show_samples()


test_maptiles_1()

In [None]:
def test_maptiles_xform(in_size=64):
    """Smoke test: build a Maptiles dataset with an image transform attached.

    Args:
        in_size: size passed to `transforms.Resize`.
    """
    to_resized_tensor = transforms.Compose(
        [transforms.ToTensor(), transforms.Resize(in_size)]
    )
    dset = Maptiles(
        data_root=DATA_ROOT,
        cities=['paris'],
        styles=['OSMDefault'],
        zooms=['14'],
        transform=to_resized_tensor,
    )
    dset.print_meta()
    # dset.show_samples(order='chw')


test_maptiles_xform()

In [None]:
# Train a simple NN model to classify the map style of a tile
# LightningModule is a subclass of nn.Module. We can think of it as an abstraction of a NN model (plus sugar for easier experimentation)
# Model specification
# - architecture and parameters in `__init__` method
# - forward computation in `forward` method
# - training step: in `training_step(self, batch, batch_idx)`

class LitModel(LightningModule):
    """Fully connected classifier with two ReLU hidden layers.

    Each input sample is flattened, passed through the two hidden layers and
    mapped to per-class log-probabilities (`log_softmax`), which are trained
    with the negative log-likelihood loss.

    Args:
        nh1: number of units in the first hidden layer.
        nh2: number of units in the second hidden layer.
        dim_in: number of features of one flattened input sample.
        n_classes: number of output classes.
    """

    def __init__(self, nh1, nh2,
                 dim_in=28*28, n_classes=10):
        super().__init__()

        # Define model architecture
        self.layer1 = nn.Linear(dim_in, nh1)
        self.layer2 = nn.Linear(nh1, nh2)
        self.layer3 = nn.Linear(nh2, n_classes)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, n_classes)."""
        # Flatten everything but the batch dimension, e.g. (b, 1, 28, 28) -> (b, 1*28*28)
        x = x.view(x.size(0), -1)
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return F.log_softmax(self.layer3(x), dim=1)

    def _nll(self, batch):
        """Run the model on an (x, y) batch and return the NLL loss."""
        x, y = batch
        log_probs = self(x)  # forward() already applies log_softmax
        return F.nll_loss(log_probs, y)

    def training_step(self, batch, batch_idx):
        "Implements one mini-batch iteration from batch intake -> pass through model -> return loss (ie. computational graph)"
        loss = self._nll(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_ids):
        "Computes the loss on one validation mini-batch and logs it as `val_loss`."
        loss = self._nll(batch)
        # Without this call the validation loss is computed but never recorded.
        self.log('val_loss', loss, prog_bar=True, logger=True)
        return loss

    # Second component: Optimization Solver
    def configure_optimizers(self):
        "Adam over all model parameters with a fixed learning rate of 1e-3."
        return torch.optim.Adam(self.parameters(), lr=1e-3)
    
        
        

In [None]:
# Data loading
# -- we need torch.utils.data.DataLoader objects that specify the datasets and how to load the data for train/val/test
from src.data.datasets.maptiles import MapStyles
from src.data.transforms.transforms import Identity

# Loader settings
bs = 16
pin_memory = True
num_workers = 8

# Input settings
in_size = 64
n_channels = 3

# Dataset selection
cities = ['la', 'seoul']
# styles = ['StamenTerrainLines', 'OSMDefault', 'CartoVoyagerNoLabels']
styles = ['OSMDefault', 'CartoVoyagerNoLabels']
zooms = ['14']

# Image pipeline: to tensor -> resize -> grayscale when n_channels > 1.
# NOTE(review): Grayscale() is applied while `n_channels` stays 3, so code that
# sizes a model from `n_channels` may not match the transformed images -- confirm.
channel_step = transforms.Grayscale() if n_channels > 1 else Identity
xform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(in_size),
    channel_step,
    # transforms.Lambda(lambda t: t[[0]]) # get the first channel only
])


def _style_to_binary_label(label_dict):
    """Return 0 when the sample's style equals `styles[0]`, otherwise 1."""
    return 0 if label_dict["style"] == styles[0] else 1


target_xform = transforms.Lambda(_style_to_binary_label)

dset = Maptiles(
    data_root=DATA_ROOT,
    cities=cities,
    styles=styles,
    zooms=zooms,  # verbose=True,
    transform=xform,
    target_transform=target_xform,
)
dset.print_meta()

dl_train = DataLoader(
    dset,
    batch_size=bs,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=pin_memory,
)


In [None]:
from torchvision.utils import make_grid
from src.visualize.utils import show_timgs

In [None]:
# Pull a single mini-batch from the training loader to inspect the image
# tensor's shape and the label values.
x, y = next(iter(dl_train))
print(x.shape)
print(y)

In [None]:
# Split the inspected batch by label so each class can be viewed separately.
is_label0, is_label1 = (y == 0), (y == 1)
y0, y1 = y[is_label0], y[is_label1]
x0, x1 = x[is_label0], x[is_label1]

# Print the per-class summaries explicitly: a bare expression in the middle of
# a cell is evaluated but never displayed.
print(len(y0), len(y1))
print(x0.shape, x1.shape)

show_timgs(x0, cmap='gray')

In [None]:
# Show the images of the batch whose label is 1, passing their labels as titles.
show_timgs(x1, titles=y1.numpy(), cmap='gray')

In [None]:
# grid = make_grid(x, padding=10)
# grid.size()
# show(grid)

In [None]:
# Write a function to split a Maptiles (dataset) object into two groups by splitting the indices into 2 random groups
# -- usage: define the whole maptiles dataset, then split it into train and val datasets
from src.data.datasets.maptiles import Maptiles
def test_random_split_maptiles():
    """Check that `Maptiles.random_split` divides a dataset into two parts
    whose sizes add up to the size of the original dataset."""
    full_ds = Maptiles(
        data_root=DATA_ROOT,
        cities=['paris'],
        styles=['OSMDefault'],
        zooms=['14'],
    )
    first, second = Maptiles.random_split(full_ds, 0.5)
    for part in (first, second):
        part.show_samples()

    n_first, n_second, n_total = len(first), len(second), len(full_ds)
    print(n_first, n_second, n_total)
    assert n_first + n_second == n_total


test_random_split_maptiles()

## Start experiment
Instantiate the LightningModule and PL's `Trainer`

In [None]:
# Instantiate the pl Module
nh1, nh2 = 100,100
# Size the first layer from an actual transformed sample instead of
# in_size**2*n_channels: the image transform can change the channel count
# (e.g. Grayscale), and a mismatched dim_in makes the first Linear layer fail
# with a shape error on the first batch.
sample_x, _ = dl_train.dataset[0]
dim_in = sample_x.numel()
model = LitModel(nh1=nh1, nh2=nh2, dim_in=dim_in, n_classes=2)

# Instantiate a PL `Trainer` object
# -- most basic trainer: uses good defaults, eg: auto-tensorboard logging, checkpoints, logs, etc.
max_eps = 300
trainer = pl.Trainer(gpus=1, max_epochs=max_eps)
trainer.fit(model, dl_train)


## Lightning Data Module
A better way to encapsulate/modularize the train/val datasets and train/val dataloading. Also handles:
- setting up the data in a machine: eg. downloading

Dec 9, 2020
- lightning callbacks
- logging

Let's transfer the data loading script to pl.LightningDataModule
```python
# Data loading
# -- we need torch.utils.data.DataLoader objects that specify the datasets and how to load the data for train/val/test
from src.data.datasets.maptiles import MapStyles
from src.data.transforms.transforms import Identity
bs = 16
pin_memory = True
num_workers = 8
in_size = 64
n_channels = 3
cities = ['la', 'seoul']
# styles = ['StamenTerrainLines', 'OSMDefault', 'CartoVoyagerNoLabels']
styles = ['OSMDefault', 'CartoVoyagerNoLabels']

zooms = ['14']
xform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(in_size),
    transforms.Grayscale() if n_channels > 1 else Identity,
#     transforms.Lambda(lambda t: t[[0]]) # get the first channel only
])


target_xform = transforms.Lambda(
    lambda label_dict: 0 if label_dict["style"]==styles[0] else 1
)
dset = Maptiles(DATA_ROOT, cities, styles, zooms,# verbose=True,
                transform=xform, target_transform=target_xform)
dset.print_meta()

dl_train = DataLoader(dset, batch_size=bs, shuffle=True, 
                      num_workers=num_workers, pin_memory=pin_memory)
```


In [None]:
class MaptilesModule(pl.LightningDataModule):
    """LightningDataModule that wraps a `Maptiles` dataset.

    It stores the arguments needed to build the dataset, fills in default
    image/target transforms, splits the dataset in `setup`, and exposes one
    DataLoader per split.
    """

    def __init__(self, *,
                 cities: Iterable[str],
                 styles: Iterable[str],
                 zooms: Iterable[str],
                 transform: Optional[Callable] = None,
                 target_transform: Optional[Callable] = None,
                 df_fns: Optional[pd.DataFrame] = None,
                 data_root: Optional[Path] = None,  # --end of Maptile init args
                 in_size: int = 64,
                 n_channels: int = 3,
                 bs: int = 32,
                 verbose: bool = False,
                 pin_memory: bool = True,
                 num_workers: int = 8):
        """
        Args:
            cities: forwarded to `Maptiles(cities=...)`.
            styles: forwarded to `Maptiles(styles=...)`. The default target
                transform labels a sample 0 when its style equals
                `styles[0]` and 1 otherwise.
            zooms: forwarded to `Maptiles(zooms=...)`.
            transform: image transform. Defaults to `ToTensor` followed by
                `Resize(in_size)`.
            target_transform: transform applied to a sample's label dict.
                Defaults to the binary style label described above.
            df_fns: forwarded to `Maptiles(df_fns=...)`.
            data_root: forwarded to `Maptiles(data_root=...)`.
            in_size: size used by the default `Resize` transform.
            n_channels: stored on the module; the default transform does not
                use it.
            bs: batch size of every DataLoader.
            verbose: stored on the module; not otherwise used in this class.
            pin_memory: `pin_memory` argument of every DataLoader.
            num_workers: `num_workers` argument of every DataLoader.
        """
        super().__init__()
        self.df_fns = df_fns
        self.data_root = data_root
        self.cities = cities
        self.styles = styles
        self.zooms = zooms

        # transforms
        self.transform = transform
        self.target_transform = target_transform
        self.in_size = in_size
        self.n_channels = n_channels

        # default transforms
        if self.transform is None:
            self.transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Resize(self.in_size),
                # transforms.Grayscale() if self.n_channels > 1 else Identity,
            ])

        # Default transforms for Maptile dataset's target label_dict
        # Maptiles class's `__getitems__` returns (x, label_dict)
        # -- where label_dict = {
        #    "city": city,
        #    "style": style,
        #    "zoom": zoom,
        #    "coord": coord}
        # Default: returns the style label
        # -- ie. prepare a sample for the style prediction problem
        if self.target_transform is None:
            self.target_transform = transforms.Lambda(
                lambda label_dict: 0 if label_dict["style"] == styles[0] else 1
            )

        # data loading
        self.bs = bs
        self.verbose = verbose
        self.pin_memory = pin_memory
        self.num_workers = num_workers

    def prepare_data(self, download_dir: Path = None):
        """Placeholder for downloading the dataset; currently does nothing."""
        if download_dir is None:
            download_dir = Path.cwd()  # or self.data_dir?
        # TODO:
        # download maptile dataset to the download_dir
        pass

    def setup(self, stage: Optional[str] = None):
        """Build the `Maptiles` dataset and split it for the given stage.

        This function is called on every GPU in a node/machine.

        Args:
            stage: `'fit'` sets `train_ds`/`val_ds` with a 0.7 split.
                `'test'` (or `None`, meaning "set up everything") splits the
                whole dataset into tr:val:test=4:3:3 and sets `train_ds`,
                `val_ds` and `test_ds`.

        NOTE(review): every call builds and re-splits the dataset, so unless
        `Maptiles.random_split` is seeded, the test split of a `'test'` call
        is not guaranteed to be disjoint from the train split of an earlier
        `'fit'` call -- confirm before reporting test metrics.
        """
        dset = Maptiles(
            df_fns=self.df_fns,
            data_root=self.data_root,
            cities=self.cities,
            styles=self.styles,
            zooms=self.zooms,
            transform=self.transform,
            target_transform=self.target_transform)

        # split to train/val or test
        if stage == 'fit':
            self.train_ds, self.val_ds = Maptiles.random_split(dset, 0.7)
            assert len(self.train_ds)+len(self.val_ds) == len(dset)
        elif stage == 'test' or stage is None:
            # split the whole dataset into tr:val:test=4:3:3
            self.tv_ds, self.test_ds = Maptiles.random_split(dset, 0.7)
            self.train_ds, self.val_ds = Maptiles.random_split(self.tv_ds, 4./7.)
            print([len(x) for x in [self.train_ds, self.val_ds, self.test_ds]])

    def _make_loader(self, ds, shuffle: bool = False) -> DataLoader:
        """Build a DataLoader over `ds` with this module's loading settings."""
        return DataLoader(ds, batch_size=self.bs, shuffle=shuffle,
                          pin_memory=self.pin_memory, num_workers=self.num_workers)

    # return the dataloader for each split
    def train_dataloader(self):
        """DataLoader over the training split, reshuffled every epoch."""
        return self._make_loader(self.train_ds, shuffle=True)

    def val_dataloader(self):
        """DataLoader over the validation split (no shuffling)."""
        return self._make_loader(self.val_ds)

    def test_dataloader(self):
        """DataLoader over the test split (no shuffling)."""
        return self._make_loader(self.test_ds)
        

In [None]:
def test_maptiles_module():
    """Smoke test: set up a MaptilesModule for the 'test' stage, create all
    three dataloaders and show samples from the training split."""
    dm = MaptilesModule(
        data_root=DATA_ROOT,
        cities=['paris'],
        styles=['OSMDefault'],
        zooms=['14'],
    )
    dm.setup(stage='test')

    # Create the loaders in train/val/test order; only the first one is inspected.
    loaders = [dm.train_dataloader(), dm.val_dataloader(), dm.test_dataloader()]
    loaders[0].dataset.show_samples(order='chw')


test_maptiles_module()

## Start experiment - version 2



In [None]:
# Instantiate data module
# -- `zooms` and `DATA_ROOT` come from earlier cells
cities = ['paris']
styles = ['OSMDefault', 'CartoVoyagerNoLabels']
dm = MaptilesModule(data_root=DATA_ROOT, cities=cities, styles=styles, zooms=zooms)

# Instantiate the pl Module
# -- the input dimension is the flattened size of one image as configured on the data module
nh1 = nh2 = 100
dim_in = dm.n_channels * dm.in_size ** 2
model = LitModel(nh1=nh1, nh2=nh2, dim_in=dim_in, n_classes=2)

# Instantiate a PL `Trainer` object
# -- most basic trainer: uses good defaults, eg: auto-tensorboard logging, checkpoints, logs, etc.
# -- Pass the data module along with a pl module
max_eps = 300
trainer_config = dict(
    gpus=1,
    max_epochs=max_eps,
    progress_bar_refresh_rate=10,
    auto_lr_find=True,
    terminate_on_nan=True,
    val_check_interval=10,  # iterations
)
trainer = pl.Trainer(**trainer_config)
# trainer = pl.Trainer(fast_dev_run=True)

trainer.fit(model, dm)


In [None]:
# NOTE(review): `logging` is not imported in any visible cell, so evaluating this
# bare name would raise NameError -- looks like a placeholder for a logging
# section; confirm or remove.
logging 