In [3]:
import sys
# Make the project root importable so the `src.*` and `bin.*` imports used in
# later cells resolve.
# NOTE(review): hard-coded absolute path - assumes the repository is mounted
# at /b2p; confirm for your environment.
sys.path.append("/b2p")

In [None]:
%matplotlib nbagg
import matplotlib.pyplot as plt
import os
from glob import glob 
import rasterio
from rasterio import features
from rasterio.windows import Window
import geopandas as gpd
import numpy as np
import copy
from tqdm.notebook import tqdm 
import warnings
# filter out RuntimeWarnings, due to geopandas/fiona read file spam
# https://stackoverflow.com/questions/64995369/geopandas-warning-on-read-file
warnings.filterwarnings("ignore",category=RuntimeWarning)

In [None]:
from sklearn import preprocessing as p
def plotNormalizedImg(data, ax):
    """Draw a 2-D array on ``ax`` after min-max scaling each column to 0-255.

    Scaling is done per column (axis 0), i.e. every column is stretched
    independently to the full 0-255 range. It is implemented with NumPy only
    so the helper does not depend on a module-level alias that other cells
    may rebind.

    Parameters
    ----------
    data : array-like
        2-D numeric array (rows x columns), e.g. a single raster band.
    ax : matplotlib.axes.Axes
        Axes to draw on; only ``ax.imshow`` is called.

    Notes
    -----
    * Columns with a single distinct value have zero range and are rendered
      as 0 instead of dividing by zero.
    * NaNs are ignored when computing each column's min/max and rendered as 0.
    * This helper does not call ``plt.show()``; the caller decides when the
      figure is displayed, so several axes of one figure can be filled first.
    """
    band = np.asarray(data, dtype=np.float64)

    col_min = np.nanmin(band, axis=0)
    col_range = np.nanmax(band, axis=0) - col_min
    # Zero range -> divide by 1 so constant columns map to 0 rather than NaN.
    col_range = np.where(col_range == 0, 1.0, col_range)

    scaled = (band - col_min) / col_range * 255
    # NaN has no defined uint8 representation; pin it to 0 before casting.
    normalizedData = np.round(np.nan_to_num(scaled, nan=0.0)).astype(np.uint8)

    ax.imshow(normalizedData)
# Preview bands 1-3 of the 35MPS composite side by side and report whether
# any of them contains NaNs.
with rasterio.open('./data/test_composite/Rwanda/all/35MPS_multiband.tiff') as rf:
    fig, axs = plt.subplots(1,3)
    b2, b3, b4 = (rf.read(band_ix) for band_ix in (1, 2, 3))

    plotNormalizedImg(b2, axs[0])
    for band in (b2, b3, b4):
        print(np.isnan(band).any())
    for band, band_ax in zip((b3, b4), axs[1:]):
        plotNormalizedImg(band, band_ax)
    plt.show()

In [None]:
# set inputs
# Presumably the directory holding the raw Sentinel-2 downloads - confirm.
s2_dir = '/b2p/data/tmp_s2/Rwanda/all'
# Band identifiers to process. NOTE(review): these look like the Sentinel-2
# blue/green/red band names - confirm.
bands = ['B02', 'B03','B04']
# Presumably the output directory for the composites - confirm.
composite_dir = './data/test_composite'
# Data type used when the corrected raster is written out in a later cell.
dtype = np.float32
# NOTE(review): meaning not visible in this notebook section - confirm with
# the compositing code that consumes it.
num_slices = 6

In [None]:
# Write `median_corrected` to `outpath` as band 1 of a single-band raster.
# NOTE(review): outpath, g_ncols, g_nrows, crs, transform and
# median_corrected are not defined in the visible cells - they must already
# exist in the kernel.
raster_profile = dict(
    driver='Gtiff',
    width=g_ncols,
    height=g_nrows,
    count=1,
    crs=crs,
    transform=transform,
    dtype=dtype,
)
with rasterio.open(outpath, 'w', **raster_profile) as wf:
    corrected_band = median_corrected.astype(dtype)
    wf.write(corrected_band, 1)

In [None]:
from scipy.stats import norm

# Difference between horizontally adjacent columns, averaged over rows:
# one value per adjacent column pair.
dif = np.diff(median_corrected, axis=1) # right col minus left
data = dif.mean(axis=0)

# Fit a normal distribution to those per-column mean differences.
mu, std = norm.fit(data)

hist_fig, hist_ax = plt.subplots()
hist_ax.hist(data, bins=25, density=True, alpha=0.6, color='b')

# Evaluate the fitted PDF across the histogram's x-range.
xmin, xmax = hist_ax.get_xlim()
x = np.linspace(xmin, xmax, 100)
# Deliberately not named `p`: `p` is already bound to sklearn.preprocessing
# by an earlier import, and reusing the name here would clobber that alias.
fitted_pdf = norm.pdf(x, mu, std)

hist_ax.plot(x, fitted_pdf, 'k', linewidth=2)
hist_ax.set(
    title=f'Mean adjacent-column difference (mu={mu:.3g}, std={std:.3g})',
    xlabel='mean(right col - left col)',
    ylabel='density',
)
plt.show()

In [None]:
from src.api.sentinel2 import SinergiseSentinelAPI

# Download New Data

In [None]:
from src.api.sentinel2 import SinergiseSentinelAPI
# Passed as the second positional argument to `api.download(...)` in the
# download cells below.
# NOTE(review): units/meaning are not visible here - confirm against
# SinergiseSentinelAPI.download.
buffer = 100
# Single API client instance shared by the download cells below.
api = SinergiseSentinelAPI()

In [None]:
# Rwanda (dry season; the window below runs 2018-06-01 through 2018-07-31)
# NOTE(review): bbox looks like [min_lon, min_lat, max_lon, max_lat] -
# confirm the order expected by api.download.
bbox = [28.7869, -3.6889, 30.9732, -0.1978]
outdir = './data/tmp_s2/Rwanda/all'
start_date = '2018-06-01'
end_date = '2018-07-31'

In [None]:
# Create the output directory if it does not exist yet, then run the download
# for the bbox / date window configured in the previous cell.
os.makedirs(outdir, exist_ok=True)
api.download(bbox, buffer, outdir, start_date, end_date)

In [None]:
# Uganda
# District name -> bounding box. Keys are string literals: they are joined
# into the output path below (bare identifiers would raise NameError).
d_to_bbox = {
    'Kabarole': [30.0675, 0.1847, 30.6141, 1.0581],
    'Kasese': [29.4585, -0.7429, 30.5516, 1.0038],
    'Ibanda': [30.1691, -0.4964, 30.7157, 0.3770],
}

# get first dry szn (same window for every district, so set once)
start_date = '2019-06-15'
end_date = '2019-08-15'
# TODO(review): a second window, 2019-01-01 to 2019-02-28, was previously set
# at the end of the loop body but never passed to api.download. Add a second
# download call if that season is needed.

for district, bbox in d_to_bbox.items():
    # NOTE(review): these are Uganda districts, so they are written under
    # tmp_s2/Uganda (previously tmp_s2/Rwanda) - confirm the expected layout.
    outdir = os.path.join('./data/tmp_s2/Uganda', district)
    os.makedirs(outdir, exist_ok=True)
    api.download(bbox, buffer, outdir, start_date, end_date)

# Subset of Training Data

In [65]:
from glob import glob
from tqdm.notebook import tqdm
import os 
import pandas as pd
import numpy as np
import multiprocessing as mp
import random 
import subprocess


In [40]:
# Every CSV anywhere under final_tiles whose path mentions Uganda or Rwanda.
wanted_countries = ('Uganda', 'Rwanda')
geo_loc_files = [
    csv_path
    for csv_path in glob('/b2p/data/final_tiles/**/*.csv', recursive=True)
    if any(country in csv_path for country in wanted_countries)
]

In [41]:
# Sanity check: 29NPE is a Cote d'Ivoire tile, so its geoloc CSV should be
# absent after the Uganda/Rwanda filter (expected result: False).
any('29NPE_geoloc.csv' in csv_path for csv_path in geo_loc_files)

False

In [26]:
dfs = [] 
def _task(csv):
    df = pd.read_csv(csv)
    root, _ = os.path.split(csv)
    df['tile'] = df.apply(lambda row: os.path.join(root, row['tile'].split('tmp_tiles/')[1]), axis = 1)
    return df 

In [38]:
# test on one file
# Runs `_task` in-process on the first geoloc CSV as a quick check; `df` is
# reset and rebuilt from all files in the next cell.
csv = geo_loc_files[0]
df = _task(csv)

In [42]:
df = None
# Load every geoloc CSV in parallel and stack the results.
# `imap` yields results lazily in input order, so tqdm advances as files
# finish; `map` only returns once everything is done, which leaves the bar
# at 0% for the whole run. `total` is required because imap has no len().
# The pool is named `pool` rather than `p`, which an earlier import binds to
# sklearn.preprocessing.
with mp.Pool(6) as pool:
    dfs = list(tqdm(pool.imap(_task, geo_loc_files), total=len(geo_loc_files)))
df = pd.concat(dfs, ignore_index=True)

  0%|          | 0/26 [00:00<?, ?it/s]

In [45]:
# Country is the 4th path component from the end of each tile path, e.g.
# /b2p/data/final_tiles/Rwanda/all/36MTD/<file> -> 'Rwanda'.
# Vectorised string ops replace a row-wise apply over the full frame.
# A path with fewer than 4 components yields NaN instead of raising.
df['country'] = df['tile'].str.split('/').str[-4]

In [48]:
# Count rows flagged is_bridge among the Uganda rows only; the label below
# says so explicitly, since the second line reports the size of the whole frame.
num_found = df.loc[df['country'] == 'Uganda',"is_bridge"].to_numpy().sum()
print(f'Number of bridge tiles in Uganda, {num_found}')
print(f'Number of tiles, {df.shape[0]}')


Number of bridge locations, 169
Number of ties, 2340000


In [46]:
# Display the combined frame; Jupyter shows a truncated first/last-rows view.
df

Unnamed: 0.1,Unnamed: 0,tile,bbox,is_bridge,bridge_loc,country
0,0,/b2p/data/final_tiles/Rwanda/all/36MTD/36MTD_0...,"((-0.903361498186389, 30.30446420444574), (-0....",False,,Rwanda
1,1,/b2p/data/final_tiles/Rwanda/all/36MTD/36MTD_0...,"((-0.9066691181835521, 30.304461760911664), (-...",False,,Rwanda
2,2,/b2p/data/final_tiles/Rwanda/all/36MTD/36MTD_0...,"((-0.9099767381134334, 30.304459308443338), (-...",False,,Rwanda
3,3,/b2p/data/final_tiles/Rwanda/all/36MTD/36MTD_0...,"((-0.9132843579757893, 30.304456847040722), (-...",False,,Rwanda
4,4,/b2p/data/final_tiles/Rwanda/all/36MTD/36MTD_0...,"((-0.9165919777703728, 30.304454376703767), (-...",False,,Rwanda
...,...,...,...,...,...,...
2339995,89995,/b2p/data/final_tiles/Uganda/Kabarole/36NTG/36...,"((0.8326110119395832, 31.28732880325484), (0.8...",False,,Uganda
2339996,89996,/b2p/data/final_tiles/Uganda/Kabarole/36NTG/36...,"((0.8293011913316961, 31.2873302290497), (0.82...",False,,Uganda
2339997,89997,/b2p/data/final_tiles/Uganda/Kabarole/36NTG/36...,"((0.8259913706657966, 31.28733164916344), (0.8...",False,,Uganda
2339998,89998,/b2p/data/final_tiles/Uganda/Kabarole/36NTG/36...,"((0.8226815499421166, 31.287333063596094), (0....",False,,Uganda


In [49]:
# Persist the combined Rwanda + Uganda table.
# NOTE(review): to_csv writes the index as an extra column by default; the
# 'Unnamed: 0' / 'Unnamed: 0.1' columns in the frame shown above look like
# indexes saved this way on earlier round-trips. Consider index=False if
# downstream readers allow it.
df.to_csv('data/final_tiles/geoloc_rwanda_uganda_training.csv')

In [51]:
out_dir = 'data/final_tiles/'
# Separate the training and validation sets into separate files.
# Bridge and no-bridge tiles are split independently, 70% train / 30% val each.
TRAIN_FRACTION = 0.7
# Local seeded generator: the split is reproducible on re-run and the global
# `random` state is left untouched.
split_rng = random.Random(42)

is_bridge = df['is_bridge'].astype(bool)
b_ix = df.index[is_bridge].tolist()
nb_ix = df.index[~is_bridge].tolist()
b_train_ix = split_rng.sample(b_ix, int(round(TRAIN_FRACTION*len(b_ix))))
nb_train_ix = split_rng.sample(nb_ix, int(round(TRAIN_FRACTION*len(nb_ix))))
# Validation = everything of that class not drawn for training.
b_val_ix = np.setdiff1d(b_ix, b_train_ix)
nb_val_ix = np.setdiff1d(nb_ix, nb_train_ix)
print(f'b_train_ix: {len(b_train_ix)}')
print(f'nb_train_ix: {len(nb_train_ix)}')
print(f'b_val_ix: {len(b_val_ix)}')
print(f'nb_val_ix: {len(nb_val_ix)}')

train_csv = os.path.join(out_dir, 'train_df.csv')
val_csv = os.path.join(out_dir, 'val_df.csv')
# The *_ix values are index *labels* taken from df.index, so rows are selected
# with .loc. Positional .iloc is only equivalent while the index happens to
# be exactly 0..n-1.
train_df = pd.concat(
    [df.loc[b_train_ix], df.loc[nb_train_ix]],
    ignore_index=True
)
val_df = pd.concat(
    [df.loc[b_val_ix], df.loc[nb_val_ix]],
    ignore_index=True
)
train_df.to_csv(train_csv)
val_df.to_csv(val_csv)
print(f'Saving to {train_csv} and {val_csv}')

b_train_ix: 385
nb_train_ix: 1637615
b_val_ix: 165
nb_val_ix: 701835
Saving to data/final_tiles/train_df.csv and data/final_tiles/val_df.csv


In [58]:
out_dir = 'data/final_tiles/'
# Make a training set where the # of bridge tiles == # of no bridge tiles.
# Bridge tiles are split 70% train / 30% val; no-bridge tiles are then
# subsampled so each split holds as many of them as it has bridge tiles.
TRAIN_FRACTION = 0.7
# Local seeded generator: the split is reproducible on re-run and the global
# `random` state is left untouched.
split_rng = random.Random(42)

is_bridge = df['is_bridge'].astype(bool)
b_ix = df.index[is_bridge].tolist()
nb_ix = df.index[~is_bridge].tolist()
b_train_ix = split_rng.sample(b_ix, int(round(TRAIN_FRACTION*len(b_ix))))
# Same count as the bridge training sample, drawn from the no-bridge pool.
nb_train_ix = split_rng.sample(nb_ix, int(round(TRAIN_FRACTION*len(b_ix))))
b_val_ix = np.setdiff1d(b_ix, b_train_ix)
# Draw the validation negatives only from no-bridge tiles not used for training.
nb_val_pool = np.setdiff1d(nb_ix, nb_train_ix)
nb_val_ix = split_rng.sample(nb_val_pool.tolist(), len(b_val_ix))
print(f'b_train_ix: {len(b_train_ix)}')
print(f'nb_train_ix: {len(nb_train_ix)}')
print(f'b_val_ix: {len(b_val_ix)}')
print(f'nb_val_ix: {len(nb_val_ix)}')

train_csv = os.path.join(out_dir, 'train_df_lite.csv')
val_csv = os.path.join(out_dir, 'val_df_lite.csv')
# The *_ix values are index *labels* taken from df.index, so rows are selected
# with .loc. Positional .iloc is only equivalent while the index happens to
# be exactly 0..n-1.
train_df = pd.concat(
    [df.loc[b_train_ix], df.loc[nb_train_ix]],
    ignore_index=True
)
val_df = pd.concat(
    [df.loc[b_val_ix], df.loc[nb_val_ix]],
    ignore_index=True
)
train_df.to_csv(train_csv)
val_df.to_csv(val_csv)
print(f'Saving to {train_csv} and {val_csv}')

b_train_ix: 385
nb_train_ix: 385
b_val_ix: 165
nb_val_ix: 165
Saving to data/final_tiles/train_df_lite.csv and data/final_tiles/val_df_lite.csv


In [63]:
from bin.get_optical_data import upload_to_s3
print('-----------------------------')
# NOTE(review): the message says "raw s2", but the directory archived below
# is ./data/final_tiles - the text looks copy-pasted; confirm.
print('Compressing raw s2')
tile_dir = './data/final_tiles'
tar_file = './final_tiles.tar.gz'
# Command string only: it is printed here and executed in the next cell.
tar_cmd = f'tar -czvf {tar_file} {tile_dir}'
print(tar_cmd)


-----------------------------
Compressing raw s2
tar -czvf ./final_tiles.tar.gz ./data/final_tiles


In [66]:
# Build the archive. check=True raises CalledProcessError when tar exits
# non-zero, so a failed or partial archive is never uploaded.
# NOTE: tar_cmd is split on whitespace, which is only safe while neither
# path contains spaces.
tar_result = subprocess.run(tar_cmd.split(), shell=False, check=True)
# Upload raw s2 to s3
# NOTE(review): the archive holds final tiles but is uploaded under the
# 'sentinel2_raw' prefix - confirm that destination is intended.
print('Uploading to s3')
upload_to_s3(
    tar_file,
    os.path.join('sentinel2_raw', os.path.basename(tar_file))
)

KeyboardInterrupt: 

## Inspect tiles

In [71]:
import torch 
from glob import glob 
from bin.composites_to_tiles import get_bridge_locations
from bin.composites_to_tiles import bridge_in_bbox
from src.utilities.coords import tiff_to_bbox

In [69]:
# Bridge locations read from the ground-truth directory; handed to
# bridge_in_bbox in the search below.
# NOTE(review): the return type is not visible in this notebook - see
# bin.composites_to_tiles.
bridge_locs = get_bridge_locations('data/ground_truth')

In [74]:
# Find the first composite whose bounding box contains a known bridge.
# `multiband_tiff` stays None when no composite qualifies.
multiband_tiff = None
composite_paths = glob('data/composites/**/*.tiff', recursive=True)
for c in composite_paths:
    bbox = tiff_to_bbox(c)
    if not bridge_in_bbox(bbox, bridge_locs):
        continue
    multiband_tiff = c
    print(c)
    break

data/composites/Rwanda/all/36MTD_multiband.tiff


In [77]:
# The first 5 characters of the composite file name (e.g. '36MTD' from
# '36MTD_multiband.tiff') are used as the tile directory name.
# NOTE(review): raises TypeError if multiband_tiff is still None, i.e. the
# search found no composite containing a bridge.
prefix = os.path.basename(multiband_tiff)[:5]
# Everything directly inside any final_tiles/.../<prefix>/ directory.
tile = glob(f'data/final_tiles/**/{prefix}/*', recursive=True)

In [82]:
# Print the shape of the first 10 saved tiles.
# NOTE(review): torch.load unpickles the file - only run this on tile files
# you trust.
for t in tile[:10]:
    print(torch.load(t).shape)

torch.Size([37, 3, 37])
torch.Size([37, 3, 37])
torch.Size([37, 3, 37])
torch.Size([37, 3, 37])
torch.Size([37, 3, 37])
torch.Size([37, 3, 37])
torch.Size([37, 3, 37])
torch.Size([37, 3, 37])
torch.Size([37, 3, 37])
torch.Size([37, 3, 37])


torch.Size([37, 3, 37])