## Load libraries

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os,sys
import re
import math
from datetime import datetime
import time
sys.dont_write_bytecode = True

In [None]:
import pandas as pd
import joblib

import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from skimage.color import rgb2gray
from skimage.transform import resize

from pathlib import Path
from typing import List, Set, Dict, Tuple, Optional, Iterable, Mapping, Union, Callable

from pprint import pprint
from ipdb import set_trace as brpt

In [None]:
# import holoviews as hv
# from holoviews import opts
# hv.extension('bokeh')

In [None]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from  torch.linalg import norm as tnorm
from torch.utils.data import Dataset, DataLoader, random_split

from torchvision import datasets, transforms

import pytorch_lightning as pl
from pytorch_lightning.core.lightning import LightningModule

# Select Visible GPU
# Ask CUDA to enumerate devices by PCI bus id (see issue #152) -- presumably so
# that the index used below matches the numbering shown by other tools; TODO confirm.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Expose only the device with index "1" to this process.
# NOTE(review): these variables are set after `import torch`; presumably they
# only take effect if CUDA has not been initialized yet -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

## Set Path 
1. Add project root and src folders to `sys.path`
2. Set `DATA_ROOT` to the `maptiles_v2` folder

In [None]:
# Locations used throughout the notebook: the notebook's working directory,
# the project root one level above it, the `src` package folder, and the
# maptile dataset root.
this_nb_path = Path.cwd()
ROOT = this_nb_path.parent
SRC = ROOT / 'src'
DATA_ROOT = Path("/data/hayley-old/maptiles_v2/")
paths2add = [this_nb_path, ROOT, SRC]

print("Project root: ", str(ROOT))
print('Src folder: ', str(SRC))
print("This nb path: ", str(this_nb_path))


# Put each location at the front of `sys.path` unless it is already listed,
# so that project modules can be imported from the notebook.
for p in paths2add:
    if str(p) in sys.path:
        continue
    sys.path.insert(0, str(p))
    print(f"\n{str(p)} added to the path.")
        
# print(sys.path)



## Basic stats of the Maptile dataset 
DATA_ROOT structure
```
Maptiles
|- city
    |- style1
        |- zoom
            |- x_y_z.png
            |- ...
            |- lnglat 
                |- x_y_z.txt
                |- ...
```
Ex:
```
Maptiles
|- paris
    |- OSMDefault
        |- 15
            |- 16603_11278_15.png
```
Note: for the watercolor style, the image files are actually JPEGs even though their suffix is `.png`. 
So, when reading watercolor images, explicitly pass the format as a keyword argument to the `imread` function (e.g. `format='jpg'`).

- cities: 

In [None]:
from collections import defaultdict

class NestedDefaultDict(defaultdict):
    """
    A defaultdict whose default factory is `NestedDefaultDict` itself.

    Looking up a missing key therefore creates another NestedDefaultDict,
    which lets arbitrarily deep assignments such as `d[a][b][c] = v` work
    without creating the intermediate levels by hand.

    src: https://stackoverflow.com/a/56338725
    """

    def __init__(self, *args, **kwargs):
        # Remaining arguments are forwarded untouched to defaultdict/dict.
        super(NestedDefaultDict, self).__init__(NestedDefaultDict, *args, **kwargs)

    def __repr__(self):
        # Display like a plain dict rather than `defaultdict(<class ...>, {...})`
        as_plain_dict = dict(self)
        return repr(as_plain_dict)
    
    

In [None]:
# Notebook-level defaults.
# NOTE(review): `collect_fns` and `count_imgs` take `debug` and `n_show` as
# their own parameters, so those functions do not read the `debug`/`n_show`
# globals set here; `zoom` is re-assigned by later cells.
zoom = 15
n_show = 4
debug = True

def _as_name_set(names: Optional[Iterable[str]]) -> Optional[Set[str]]:
    "Turn a filter argument into a set of names; None (= no filtering) is passed through."
    if names is None:
        return None
    if isinstance(names, str):
        # A bare string is treated as one name rather than as a sequence of characters
        return {names}
    return set(names)


def collect_fns(data_root: Path, 
                cities: Optional[Iterable[str]]=None,
                styles: Optional[Iterable[str]]=None,
                zooms: Optional[Iterable[str]]=None,
                verbose: bool=False,
                debug: bool=False, 
                n_show: int=4) -> pd.DataFrame:
    """
    Collect the file paths of all maptiles from `cities`, for each style in `styles` 
    and at each zoom level in `zooms`.
    
    The folder layout walked is `data_root/<city>/<style>/<zoom>/<file>`.
    Only regular files directly inside a zoom folder are collected; 
    sub-folders of a zoom folder are not descended into.
    
    Args:
    - data_root (Path): Path object to the root folder for data
    - cities (Iterable[str], optional): names of the city folders to include. 
        If None, every city folder is included.
    - styles (Iterable[str], optional): names of the style folders to include.
        If None, every style folder is included.
    - zooms (Iterable[str], optional): names of the zoom folders to include (as strings, eg. '15').
        If None, every zoom folder is included.
    - verbose (bool): if True, print each visited/skipped city, style and zoom
    - debug (bool): if True, for each collected zoom folder print the number of files, 
        show up to `n_show` randomly sampled images and drop into the debugger (`brpt`)
    - n_show (int): number of images to sample and show for each city/style/zoom
    
    Note: If debug is false, n_show is ignored
    
    Returns:
    - fns (pd.DataFrame): with columns = ['city', 'style', 'zoom', 'fpath'], one row per file.
        The dataframe is empty (but has these columns) if nothing matches.
    
    TODO: the `fpath` column stores Path objects (rather than the string)? 
    -- or better to store str object?
    """ 
    # Materialize the filters once, so that one-shot iterables (eg. generators)
    # can be tested repeatedly.
    cities = _as_name_set(cities)
    styles = _as_name_set(styles)
    zooms = _as_name_set(zooms)
    
    # Collect as a record/row = [city(str), style(str), zoom(str), fpath(Path)] for a dataframe
    rows = [] 
    for city_dir in data_root.iterdir():
        if not city_dir.is_dir():
            continue
        city = city_dir.stem
        if verbose: print(f"\n{city}")
        if cities is not None and city not in cities:
            if verbose: print(f"Skipping... {city}")
            continue
            
        for style_dir in city_dir.iterdir():
            if not style_dir.is_dir():
                continue
            style = style_dir.stem
            if verbose: print(f"\n\t{style}")
            if styles is not None and style not in styles:
                if verbose: print(f"Skipping... {style}")
                continue
                
            for zoom_dir in style_dir.iterdir():
                if not zoom_dir.is_dir():
                    continue
                z = zoom_dir.stem
                if verbose: print(f"\n\t\t{z}")
                if zooms is not None and z not in zooms:
                    if verbose: print(f"Skipping... {z}")
                    continue
                    
                fpaths = [p for p in zoom_dir.iterdir() if p.is_file()]
                rows.extend([city, style, z, fpath] for fpath in fpaths)
                
                # Debug
                if debug:
                    print(f"{city}/{style}/{z}: {len(fpaths)}")
                    
                    # Show samples; never ask for more samples than there are files
                    n_samples = min(n_show, len(fpaths))
                    if n_samples > 0:
                        inds = np.random.choice(len(fpaths), size=n_samples, replace=False)
                        # squeeze=False: always get a 2d array of axes, even for a single column
                        f, ax = plt.subplots(nrows=1, ncols=n_samples, squeeze=False)
                        f.suptitle(f"{city}/{style}/{z}")
                        for a, ind in zip(ax.flatten(), inds):
                            try:
                                img = plt.imread(fpaths[ind])
                            except SyntaxError: # suffix is deceptive (ie. not PNG, but jpg)
                                img = plt.imread(fpaths[ind], format='jpg')
                            a.imshow(img)
                            print(img.min(), img.max(), img.dtype, img.shape)
                    brpt()
                    
    # Construct the dataframe once, after the whole tree has been walked
    return pd.DataFrame(rows, columns=['city', 'style', 'zoom', 'fpath'])

In [None]:
def test_collect_fns():
    "Smoke test: collect the paris / StamenTerrainLines / zoom 11 tiles and print the result"
    df_fns = collect_fns(DATA_ROOT, ['paris'], ['StamenTerrainLines'], ['11'])
    print(len(df_fns))
    print(df_fns)


test_collect_fns()

In [None]:
def count_imgs(data_root: Path, 
              return_type: str='dataframe',
              debug: bool=False, n_show: int=4) -> Union[pd.DataFrame,Dict[str, Dict[str, Dict[str, int]]]]:
    """
    Count the number of maptiles for each city,
    for each style, at each zoom level.
    
    The folder layout walked is `data_root/<city>/<style>/<zoom>/<file>`.
    Only regular files directly inside a zoom folder are counted.
    
    Args:
    - data_root (Path): Path object to the root folder for data
    - return_type (str): 'dataframe' or 'dict'
        - If 'dataframe': returns pd.Dataframe (flat)
        - Otherwise (eg. 'dict'): returns dictionary (nested), similar to json
        
    - debug (bool): if True, for each zoom folder print its count, show up to `n_show` 
        randomly sampled images and drop into the debugger (`brpt`)
    - n_show (int): number of images to sample and show for each city/style/zoom
    
    Note: If debug is false, n_show is ignored
    
    Returns:
    - If return_type is 'dataframe': pd.DataFrame with columns = ['city', 'style', 'zoom', 'count'],
        one row per zoom folder. It is empty (but has these columns) if no zoom folder is found.
    - Otherwise, counts (NestedDefaultDict): counts[city(str)][style(str)][zoom(str)] stores 
        the number of maptiles at zoom, of style, from city.
    
    """ 
    # Collect as a record/row = [city(str), style(str), zoom(str), count(int)]
    rows = [] 
    for city_dir in data_root.iterdir():
        if not city_dir.is_dir():
            continue
        city = city_dir.stem
        for style_dir in city_dir.iterdir():
            if not style_dir.is_dir():
                continue
            style = style_dir.stem
            for zoom_dir in style_dir.iterdir():
                if not zoom_dir.is_dir():
                    continue
                z = zoom_dir.stem
                # List the folder once; the listing is reused for the debug samples
                fpaths = [p for p in zoom_dir.iterdir() if p.is_file()]
                count = len(fpaths)
                rows.append([city, style, z, count])
                
                # Debug
                if debug:
                    print(f"{city}/{style}/{z}: {count}")
                    
                    # Show samples; never ask for more samples than there are files
                    n_samples = min(n_show, count)
                    if n_samples > 0:
                        inds = np.random.choice(count, size=n_samples, replace=False)
                        # squeeze=False: always get a 2d array of axes, even for a single column
                        f, ax = plt.subplots(nrows=1, ncols=n_samples, squeeze=False)
                        f.suptitle(f"{city}/{style}/{z}")
                        for a, ind in zip(ax.flatten(), inds):
                            try:
                                img = plt.imread(fpaths[ind])
                            except SyntaxError: # suffix is deceptive (ie. not PNG, but jpg)
                                img = plt.imread(fpaths[ind], format='jpg')
                            a.imshow(img)
                            print(img.min(), img.max(), img.dtype, img.shape)
                    brpt()
                    
    if return_type == 'dataframe':
        # Construct the dataframe once, after the whole tree has been walked
        return pd.DataFrame(rows, columns=['city', 'style', 'zoom', 'count'])
    
    # Or, as a nested dictionary
    counts = NestedDefaultDict()
    for city, style, z, count in rows:
        counts[city][style][z] = count
    return counts
                                

In [None]:
df_counts = count_imgs(DATA_ROOT)


In [None]:
df_counts[ (df_counts.city=='paris')];

In [None]:
from pretty_html_table import build_table

In [None]:
html_counts = build_table(df_counts, 'blue_dark')

In [None]:
# Save the counts table as an html file at the dataset root.
# The encoding is given explicitly so the file does not depend on the locale default.
with open(DATA_ROOT / 'counts.html', 'w', encoding='utf-8') as f:
    f.write(html_counts)
    

## Maptile style shortnames


In [None]:
# NOTE(review): `shortnames` is not read anywhere else in the visible notebook;
# the long->short mapping actually used is `MapStyles._long2short` -- confirm it can go.
shortnames = {}
# All distinct (long) style names present in the counts table
styles = df_counts["style"].unique()
pprint(styles)

In [None]:
class MapStyles():
    """
    Registry that maps a maptile style's long (folder) name to a short display name, 
    and back.
    
    The mapping lives on the class, so all methods are classmethods and the
    registry is shared by every user of the class.
    """
    
    # long (original) style name -> short name
    _long2short = {
        "EsriImagery": "Esri",
        "EsriWorldTopo": "EsriTopo",
        "CartoLightNoLabels": "CartoLight",
        "CartoVoyagerNoLabels": "CartoVoyager",
        "StamenTonerLines": "StamenTonerL",
        "StamenTonerBackground": "StamenTonerBg",
        "StamenTerrainLines": "StamenTerrainL",
        "StamenTerrainBackground": "StamenTerrainBg",
        "StamenWatercolor": "StamenWc",
        "OSMDefault": "OSM",
        "MtbmapDefault": "Mtb",
    }
    
    @classmethod
    def _short2long(cls):
        "Return the inverse mapping (short -> long), rebuilt from the current registry"
        return {short:long for long,short in cls._long2short.items()}
    
    @classmethod
    def shortname(cls, style:str):
        "Return the short name of the (long) `style`. Raises KeyError if `style` is not registered."
        return cls._long2short[style]
    
    @classmethod
    def longname(cls, short:str):
        "Return the long name for the short name `short`. Raises KeyError if `short` is not registered."
        return cls._short2long()[short]
    
    # TODO: Implement as delegation; Add "remove" method    
    @classmethod
    def update(cls, style:str, shortname:str) -> None:
        "Register `shortname` for the (long) `style`, or overwrite its existing short name"
        cls._long2short[style] = shortname
    
    
def test_mapstyles_long2short():
    "Print `<long>: <short>` for every style name in the global `styles`"
    for long_name in styles:
        print(f"{long_name}: {MapStyles.shortname(long_name)}")


def test_mapstyles_short2long():
    "Print `<short>: <long>` for every short name registered in MapStyles"
    for short_name in MapStyles._long2short.values():
        print(f"{short_name}: {MapStyles.longname(short_name)}")


test_mapstyles_short2long()


In [None]:
# Print the name of every city in the counts table, in upper case.
# The names are read from `df_counts` because no module-level `cities` is
# defined in this notebook; a plain loop replaces the side-effect-only list.
for c in df_counts["city"].unique():
    print(c.upper())

## Pie chart to show style proportions for each city

In [None]:
from src.visualize.utils import get_fig

def show_pie_chart(df_counts: pd.DataFrame, zoom: int, *, 
                   cities: Optional[Iterable[str]]=None, styles: Optional[Iterable[str]]=None)-> None:
    """
    Show style proportions of all maptiles at the specified zoom.
    Optionally, filter data to given cities and  styles.
    
    Args:
    - df_counts (pd.DataFrame): table with columns 'city', 'style', 'zoom', 'count',
        where the 'zoom' values are strings
    - zoom (int): zoom level to show; converted to str before matching the 'zoom' column
    - cities (Iterable[str], optional): keep only these cities. If None, keep all cities
    - styles (Iterable[str], optional): keep only these (long) style names. If None, keep all styles
    
    One pie chart is drawn per city that still has rows after filtering; 
    any axes left over from `get_fig` are removed from the figure.
    """
    zoom = str(zoom)
    df = df_counts[df_counts["zoom"]==zoom].drop('zoom', axis=1)
    
    if cities is not None:
        df = df[df["city"].isin(list(cities))]
    if styles is not None:
        df = df[df["style"].isin(list(styles))]

    # Group only after *all* filters are applied, so that the number of charts
    # equals the number of groups even when a style filter removes a whole city.
    # Grouping by the plain column name (not a one-element list) keeps each
    # group key a string rather than a 1-tuple.
    groups = list(df.groupby('city'))
    n_cities = len(groups)

    # NOTE(review): assumes `get_fig` returns (figure, flat iterable of axes) -- confirm
    f, axes = get_fig(n_cities, factor=5.)
    f.suptitle(f"Zoom: {zoom}")
    # per-city pie chart to show each style's proportion
    for i, ax in enumerate(axes):
        if i < n_cities:
            name, g = groups[i]
            labels = g["style"].apply(MapStyles.shortname)
            ax.pie(g["count"], labels=labels, autopct='%1.1f%%')
            ax.set_title(name)
        else:
            # more axes than cities: drop the unused ones
            f.delaxes(ax)
            

Train cities:
- berlin
- la
- montreal
- paris
- rome
- seoul
- chicago

In [None]:
# Draw one pie-chart figure per selected zoom level, restricted to the
# training cities and training styles.
zooms = df_counts['zoom'].unique()
train_cities = [
    "paris",
    "la",
    "manhattan",
    "boston",
    "berlin",
    "montreal",
]

# Training styles are chosen by short name, then stored under their long names
train_styles = list(map(
    MapStyles.longname,
    sorted(["CartoLight", "CartoVoyager", "Esri", "OSM", "StamenTonerL", "StamenWc"]),
))
for zoom in [15]:  # zooms:
    show_pie_chart(df_counts, zoom=zoom, cities=train_cities, styles=train_styles)

In [None]:
show_pie_chart(df_counts, zoom=12)

## Count stat per city 

In [None]:
# Counts of the training styles at a single zoom level, for every city
zoom = str(15)
condition = (df_counts["style"].isin(train_styles)) & (df_counts["zoom"] == zoom)
df = df_counts[condition].drop("zoom", axis=1).reset_index(drop=True)

# Make the dataframe into html table and save as a file at the dataset root.
# The encoding is given explicitly so the file does not depend on the locale default.
html = build_table(df, 'blue_dark')
with open(DATA_ROOT / f'train-styles_zoom:{zoom}_counts.html', 'w', encoding='utf-8') as f:
    f.write(html)
    

In [None]:
# show entire dataframe
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    # Total number of tiles per city. The 'count' column is selected explicitly:
    # the first positional argument of GroupBy.sum is `numeric_only`, not a column name.
    # (For per-city-and-zoom totals, group `df_counts` by ["city", "zoom"] instead.)
    pprint(df.groupby("city")[["count"]].sum())


## Organize maptiles into a structure that is standard for pytorch's `Dataset` class
- Ref: 
1. Finish the implementation v1
2. Test with vae

In [None]:
# How do I want to use this dataset object/ dataloader?
# How do I want to use this dataset object/ dataloader?
class MAPNIST(Dataset):
    """
    Dataset of maptile images, labelled by their style name.
    
    The files are looked up once, at construction, with `collect_fns` 
    for the given cities, styles and zooms; images are read from disk lazily 
    in `__getitem__`.
    
    Attributes:
    - df_fns (pd.DataFrame): one row per maptile file; columns = ['city', 'style', 'zoom', 'fpath']
    - df_counts (pd.DataFrame): number of files ('count' column) 
        for each (city, style, zoom) present in `df_fns`
    """
    
    def __init__(self, data_root: Path, 
                 cities: Iterable, 
                 styles: Iterable, 
                 zooms: Iterable[str], 
                 transform: Optional[Callable]=None,
                 target_transform: Optional[Callable]=None,
                verbose: bool=False):
        """
        Args:
        - data_root (Path): root folder of the maptiles
        - cities (Iterable): names of the cities to include
        - styles (Iterable): (long) names of the styles to include
        - zooms (Iterable[str]): zoom levels to include, as strings (eg. '15')
        - transform (Callable, optional): applied to the image in `__getitem__`
        - target_transform (Callable, optional): applied to the style name in `__getitem__`
        - verbose (bool): forwarded to `collect_fns`
        """
        self.data_root = data_root
        # Keep concrete lists: the names are used again in `__repr__` and `get_summary`, 
        # which would fail on an already-consumed one-shot iterable.
        self.cities = self._as_list(cities)
        self.styles = self._as_list(styles)
        self.zooms = self._as_list(zooms)
        self.xform =transform
        self.target_xform = target_transform
        self.df_fns = collect_fns(self.data_root, self.cities, self.styles, self.zooms,
                                 verbose=verbose)
        # Number of files per (city, style, zoom)
        self.df_counts = self.df_fns.groupby(['city', 'style', 'zoom']).size().to_frame('count')
        
    @staticmethod
    def _as_list(names: Iterable) -> list:
        "Return `names` as a list; a bare string counts as a single name"
        return [names] if isinstance(names, str) else list(names)
        
    def __getitem__(self, idx: int) -> Tuple[np.ndarray, str]:
        """
        Return `idx`th sample from the dataset
        
        -x: (np.ndarray) image with at most the first 3 channels kept. 
            Values are in range [0.,1.]: images that are read as uint8 
            (eg. the jpg fallback) are rescaled by 1/255.
            Presumably of shape H=256,W=256,C=3 -- depends on the files on disk.
        -y (str): style name (long/original name)
        
        If `transform` / `target_transform` were given, they are applied to x / y 
        and their outputs are returned instead.
        """
        row = self.df_fns.iloc[idx]
        fpath = row['fpath']
        try: 
            x = plt.imread(fpath)
        except SyntaxError: #read as jpg
            x = plt.imread(fpath, format='jpg')
        x = x[...,:3]
        
        # `imread` gives float arrays in [0,1] for PNG but integer arrays for other formats
        if x.dtype == np.uint8:
            x = x.astype(np.float32) / 255.
            
        y = row["style"]
        
        if self.xform is not None:
            x = self.xform(x)
        if self.target_xform is not None:
            y = self.target_xform(y)

        return (x,y)
    
    def __len__(self):
        "Return the number of samples in the dataset"
        return len(self.df_fns)
    
    def __repr__(self):
        return f"MAPNIST_{'-'.join(self.cities)}_{'-'.join(self.styles)}_{'-'.join(self.zooms)}"
        
    def get_summary(self) -> pd.DataFrame:
        """
        Returns a dataframe of the counts of the maptiles for this object's cities, styles and zooms.
        
        The counts are recomputed from disk with `count_imgs` on this object's `data_root`. 
        The 'zoom' column is kept because the dataset may cover several zoom levels.
        """
        df_counts = count_imgs(self.data_root)
        condition = (
            df_counts["city"].isin(self.cities) 
            & df_counts["style"].isin(self.styles) 
            & df_counts["zoom"].isin(self.zooms)
        )
        df_summary = df_counts[condition].reset_index(drop=True)
        return df_summary

                                  

In [None]:
def test_MAPNIST():
    "Smoke test: build a la / OSMDefault / zoom 14 dataset and display 10 randomly picked tiles"
    dset = MAPNIST(DATA_ROOT, ['la'], ['OSMDefault'], ['14'])
    n_trials = 10
    for _ in range(n_trials):
        idx = np.random.randint(0, len(dset))
        print(len(dset))
        x, y = dset[idx]
        plt.imshow(x)
        plt.title(f"{idx}, {y}")
        plt.show()


test_MAPNIST()


todo:
- transform to smaller image size? eg. 64,64,3
- train with pl in the playground - just swap mnist with maptiles
--> Dec 7, 2020 3:56pm


In [None]:
ALFAFOLD?