In [1]:
import logging
logging.getLogger().setLevel(logging.INFO)

from glob import glob
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split 

In [2]:
RANDOM_STATE = 10
VALIDATE_SIZE = 0.05

In [3]:
DATA_DIR = "../data/raw/"
zone_to_crs = {'gros_islet': '+init=epsg:32620',
               'castries': '+init=epsg:32620',
               'dennery': '+init=epsg:32620',
               'mixco_3': '+init=epsg:32616',
               'mixco_1_and_ebenezer': '+init=epsg:32616',
               'borde_rural': '+init=epsg:32618',
               'borde_soacha': '+init=epsg:32618'}

geojsons = glob(DATA_DIR + "**/**/*.geojson", recursive=True)

In [4]:
trainset = {}
testset = {}
for geojson in geojsons:
    df = gpd.read_file(geojson)    
    df["path"] = geojson
    df["area"] = df["geometry"].area
    place = geojson.split("/")[-3]    
    zone = geojson.split("/")[-2]
    df["zone"] = zone
    df["place"] = place
    #df = df.to_crs(zone_to_crs[zone])
    if "train" in geojson:
        trainset[zone] = df
    else:
        testset[zone] = df
logging.info(f"trainset: {', '.join(list(trainset.keys()))}")        
logging.info(f"testset: {', '.join(list(testset.keys()))}") 

INFO:root:trainset: gros_islet, castries, dennery, mixco_3, mixco_1_and_ebenezer, borde_rural, borde_soacha
INFO:root:testset: dennery, mixco_3, mixco_1_and_ebenezer, borde_rural, borde_soacha


In [5]:
df = pd.concat(trainset.values())
df = df[df["verified"]].copy()
df["stratify"] = df["place"] + '-' + df["roof_material"]

In [6]:
df.head()

Unnamed: 0,id,roof_material,verified,geometry,path,area,zone,place,stratify
0,7a3d643c,healthy_metal,True,"POLYGON ((-60.89448 13.90834, -60.89439 13.908...",../data/raw/st_lucia/dennery/train-dennery.geo...,9.666502e-09,dennery,st_lucia,st_lucia-healthy_metal
1,7a26d820,healthy_metal,True,"POLYGON ((-60.89363 13.90841, -60.89363 13.908...",../data/raw/st_lucia/dennery/train-dennery.geo...,3.379775e-09,dennery,st_lucia,st_lucia-healthy_metal
2,7a394078,healthy_metal,True,"POLYGON ((-60.89311 13.90847, -60.89304 13.908...",../data/raw/st_lucia/dennery/train-dennery.geo...,2.924767e-08,dennery,st_lucia,st_lucia-healthy_metal
3,7a3f5742,healthy_metal,True,"POLYGON ((-60.89240 13.90854, -60.89238 13.908...",../data/raw/st_lucia/dennery/train-dennery.geo...,7.166416e-09,dennery,st_lucia,st_lucia-healthy_metal
4,7a1cc592,healthy_metal,True,"POLYGON ((-60.89263 13.90856, -60.89261 13.908...",../data/raw/st_lucia/dennery/train-dennery.geo...,9.548593e-09,dennery,st_lucia,st_lucia-healthy_metal


In [7]:
df.tail()

Unnamed: 0,id,roof_material,verified,geometry,path,area,zone,place,stratify
5990,7a39e44c,incomplete,True,"POLYGON ((-74.16782 4.57210, -74.16774 4.57204...",../data/raw/colombia/borde_soacha/train-borde_...,5.953899e-09,borde_soacha,colombia,colombia-incomplete
5991,7a3d882c,incomplete,True,"POLYGON ((-74.16773 4.57212, -74.16764 4.57218...",../data/raw/colombia/borde_soacha/train-borde_...,5.46367e-09,borde_soacha,colombia,colombia-incomplete
5992,7a3df9d8,irregular_metal,True,"POLYGON ((-74.16791 4.57185, -74.16787 4.57190...",../data/raw/colombia/borde_soacha/train-borde_...,6.502043e-09,borde_soacha,colombia,colombia-irregular_metal
5993,7a2f5522,irregular_metal,True,"POLYGON ((-74.16778 4.57216, -74.16769 4.57223...",../data/raw/colombia/borde_soacha/train-borde_...,6.058331e-09,borde_soacha,colombia,colombia-irregular_metal
5994,7a3e1b8e,irregular_metal,True,"POLYGON ((-74.16752 4.57189, -74.16744 4.57196...",../data/raw/colombia/borde_soacha/train-borde_...,5.642552e-09,borde_soacha,colombia,colombia-irregular_metal


In [8]:
train, validate = train_test_split(df, test_size=VALIDATE_SIZE, stratify=df['stratify'], random_state=RANDOM_STATE)

In [9]:
display(train.head())
len(train)

Unnamed: 0,id,roof_material,verified,geometry,path,area,zone,place,stratify
412,7a201fee,irregular_metal,True,"POLYGON ((-74.17097 4.57116, -74.17091 4.57114...",../data/raw/colombia/borde_soacha/train-borde_...,3.494962e-09,borde_soacha,colombia,colombia-irregular_metal
3750,7a22246a,healthy_metal,True,"POLYGON ((-74.16979 4.56719, -74.16971 4.56724...",../data/raw/colombia/borde_soacha/train-borde_...,1.044841e-08,borde_soacha,colombia,colombia-healthy_metal
766,7a380640,irregular_metal,True,"POLYGON ((-74.15841 4.55054, -74.15837 4.55055...",../data/raw/colombia/borde_rural/train-borde_r...,1.519204e-09,borde_rural,colombia,colombia-irregular_metal
211,7a21a882,healthy_metal,True,"POLYGON ((-74.16981 4.57313, -74.16979 4.57325...",../data/raw/colombia/borde_soacha/train-borde_...,6.56208e-09,borde_soacha,colombia,colombia-healthy_metal
877,7a36a4d0,irregular_metal,True,"POLYGON ((-74.16898 4.57093, -74.16895 4.57096...",../data/raw/colombia/borde_soacha/train-borde_...,2.826957e-09,borde_soacha,colombia,colombia-irregular_metal


14126

In [10]:
display(validate.head())
len(validate)

Unnamed: 0,id,roof_material,verified,geometry,path,area,zone,place,stratify
3530,7a2f23ea,irregular_metal,True,"POLYGON ((-74.16071 4.54610, -74.16068 4.54614...",../data/raw/colombia/borde_rural/train-borde_r...,1.304878e-09,borde_rural,colombia,colombia-irregular_metal
3738,7a3ad0dc,healthy_metal,True,"POLYGON ((-74.16974 4.56796, -74.16968 4.56797...",../data/raw/colombia/borde_soacha/train-borde_...,7.720474e-09,borde_soacha,colombia,colombia-healthy_metal
2141,7a35bb2e,irregular_metal,True,"POLYGON ((-74.17017 4.56986, -74.17016 4.56990...",../data/raw/colombia/borde_soacha/train-borde_...,2.093354e-08,borde_soacha,colombia,colombia-irregular_metal
3354,7a1d6434,concrete_cement,True,"POLYGON ((-74.16974 4.56634, -74.16964 4.56637...",../data/raw/colombia/borde_soacha/train-borde_...,5.995396e-09,borde_soacha,colombia,colombia-concrete_cement
91,7a2c5aac,irregular_metal,True,"POLYGON ((-90.57700 14.61226, -90.57698 14.612...",../data/raw/guatemala/mixco_3/train-mixco_3.ge...,3.636595e-09,mixco_3,guatemala,guatemala-irregular_metal


744

In [11]:
train["roof_material"].value_counts(normalize=True)*100

healthy_metal      49.638964
irregular_metal    35.247062
concrete_cement     9.323234
incomplete          4.495257
other               1.295484
Name: roof_material, dtype: float64

In [12]:
validate["roof_material"].value_counts(normalize=True)*100

healthy_metal      49.596774
irregular_metal    35.215054
concrete_cement     9.408602
incomplete          4.435484
other               1.344086
Name: roof_material, dtype: float64

# Save DataFrames

In [13]:
OUTDIR = "../data/processed/"
total = len(zone_to_crs) - 1
for idx, (zone, crs) in enumerate(sorted(zone_to_crs.items())):
    #train
    subset = train[train["zone"]==zone].drop(columns=["area", "stratify"]).copy()
    subset["crs"] = crs
    subset = subset.to_crs(crs)
    if len(subset):
        subset.to_pickle(OUTDIR + f"train/trainset_{idx:01d}_of_{total:01d}_{zone}.pkl")
    #validate
    subset = validate[validate["zone"]==zone].drop(columns=["area", "stratify"]).copy()
    subset["crs"] = crs
    subset = subset.to_crs(crs)
    if len(subset):
        subset.to_pickle(OUTDIR + f"validate/valset_{idx:01d}_of_{total:01d}_{zone}.pkl")

# Process testset

In [14]:
test = pd.concat(testset.values())

In [15]:
OUTDIR = "../data/processed/"
total = len(zone_to_crs) - 1
for idx, (zone, crs) in enumerate(sorted(zone_to_crs.items())):
    #train
    subset = test[test["zone"]==zone].drop(columns=["area"]).copy()
    subset["crs"] = crs
    subset = subset.to_crs(crs)
    if len(subset):
        subset.to_pickle(OUTDIR + f"test/testset_{idx:01d}_of_{total:01d}_{zone}.pkl")

In [2]:
freqs = {"healthy_metal":49.638964,
"irregular_metal":35.247062,
"concrete_cement" :9.323234,
"incomplete":4.495257,
"other":1.295484}

In [5]:
w = {k:100/v for k, v in freqs.items()}
w_max = max(list(w.values()))
w = {k:v/w_max for k, v in w.items()}

In [6]:
w

{'healthy_metal': 0.0260981272695377,
 'irregular_metal': 0.03675438253548622,
 'concrete_cement': 0.1389522133628739,
 'incomplete': 0.2881890846285318,
 'other': 1.0}