In [1]:
import logging
logging.getLogger().setLevel(logging.INFO)

from glob import glob
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split 

In [2]:
RANDOM_STATE = 10
VALIDATE_SIZE = 0.10

In [3]:
DATA_DIR = "../data/raw/"
zone_to_crs = {'gros_islet': '+init=epsg:32620',
               'castries': '+init=epsg:32620',
               'dennery': '+init=epsg:32620',
               'mixco_3': '+init=epsg:32616',
               'mixco_1_and_ebenezer': '+init=epsg:32616',
               'borde_rural': '+init=epsg:32618',
               'borde_soacha': '+init=epsg:32618'}

geojsons = glob(DATA_DIR + "**/**/*.geojson", recursive=True)

In [4]:
trainset = {}
testset = {}
for geojson in geojsons:
    df = gpd.read_file(geojson)    
    df["path"] = geojson
    df["area"] = df["geometry"].area
    place = geojson.split("/")[-3]    
    zone = geojson.split("/")[-2]
    df["zone"] = zone
    df["place"] = place
    #df = df.to_crs(zone_to_crs[zone])
    if "train" in geojson:
        trainset[zone] = df
    else:
        testset[zone] = df
logging.info(f"trainset: {', '.join(list(trainset.keys()))}")        
logging.info(f"testset: {', '.join(list(testset.keys()))}") 

INFO:root:trainset: gros_islet, castries, dennery, mixco_3, mixco_1_and_ebenezer, borde_rural, borde_soacha
INFO:root:testset: dennery, mixco_3, mixco_1_and_ebenezer, borde_rural, borde_soacha


In [5]:
df = pd.concat(trainset.values())
df["stratify"] = df["place"] + '-' + df["roof_material"]

In [6]:
df.head()

Unnamed: 0,id,roof_material,verified,geometry,path,area,zone,place,stratify
0,7a262f60,healthy_metal,False,"POLYGON ((-60.94632 14.06951, -60.94650 14.069...",../data/raw/st_lucia/gros_islet/train-gros_isl...,1.872463e-08,gros_islet,st_lucia,st_lucia-healthy_metal
1,7a1e905c,healthy_metal,False,"POLYGON ((-60.94563 14.06945, -60.94569 14.069...",../data/raw/st_lucia/gros_islet/train-gros_isl...,2.394356e-08,gros_islet,st_lucia,st_lucia-healthy_metal
2,7a1f3142,healthy_metal,False,"POLYGON ((-60.94521 14.06958, -60.94530 14.069...",../data/raw/st_lucia/gros_islet/train-gros_isl...,1.705274e-08,gros_islet,st_lucia,st_lucia-healthy_metal
3,7a29c97c,healthy_metal,False,"POLYGON ((-60.94603 14.06961, -60.94582 14.069...",../data/raw/st_lucia/gros_islet/train-gros_isl...,2.23428e-08,gros_islet,st_lucia,st_lucia-healthy_metal
4,7a35e2b6,healthy_metal,False,"POLYGON ((-60.94556 14.06968, -60.94559 14.069...",../data/raw/st_lucia/gros_islet/train-gros_isl...,4.442277e-09,gros_islet,st_lucia,st_lucia-healthy_metal


In [7]:
df.tail()

Unnamed: 0,id,roof_material,verified,geometry,path,area,zone,place,stratify
5990,7a39e44c,incomplete,True,"POLYGON ((-74.16782 4.57210, -74.16774 4.57204...",../data/raw/colombia/borde_soacha/train-borde_...,5.953899e-09,borde_soacha,colombia,colombia-incomplete
5991,7a3d882c,incomplete,True,"POLYGON ((-74.16773 4.57212, -74.16764 4.57218...",../data/raw/colombia/borde_soacha/train-borde_...,5.46367e-09,borde_soacha,colombia,colombia-incomplete
5992,7a3df9d8,irregular_metal,True,"POLYGON ((-74.16791 4.57185, -74.16787 4.57190...",../data/raw/colombia/borde_soacha/train-borde_...,6.502043e-09,borde_soacha,colombia,colombia-irregular_metal
5993,7a2f5522,irregular_metal,True,"POLYGON ((-74.16778 4.57216, -74.16769 4.57223...",../data/raw/colombia/borde_soacha/train-borde_...,6.058331e-09,borde_soacha,colombia,colombia-irregular_metal
5994,7a3e1b8e,irregular_metal,True,"POLYGON ((-74.16752 4.57189, -74.16744 4.57196...",../data/raw/colombia/borde_soacha/train-borde_...,5.642552e-09,borde_soacha,colombia,colombia-irregular_metal


In [8]:
train, validate = train_test_split(df, test_size=VALIDATE_SIZE, stratify=df['stratify'], random_state=RANDOM_STATE)

In [9]:
display(train.head())
len(train)

Unnamed: 0,id,roof_material,verified,geometry,path,area,zone,place,stratify
4542,7a3a1610,healthy_metal,True,"POLYGON ((-74.16462 4.56226, -74.16472 4.56233...",../data/raw/colombia/borde_soacha/train-borde_...,3.75646e-09,borde_soacha,colombia,colombia-healthy_metal
2287,7a224346,concrete_cement,True,"POLYGON ((-74.16381 4.54883, -74.16372 4.54878...",../data/raw/colombia/borde_rural/train-borde_r...,3.694646e-09,borde_rural,colombia,colombia-concrete_cement
1857,7a2a16ca,irregular_metal,True,"POLYGON ((-74.16995 4.56916, -74.16991 4.56920...",../data/raw/colombia/borde_soacha/train-borde_...,2.882309e-09,borde_soacha,colombia,colombia-irregular_metal
1258,7a2abe0e,concrete_cement,True,"POLYGON ((-90.58623 14.62263, -90.58617 14.622...",../data/raw/guatemala/mixco_1_and_ebenezer/tra...,6.101722e-09,mixco_1_and_ebenezer,guatemala,guatemala-concrete_cement
2339,7a1e8d8c,irregular_metal,True,"POLYGON ((-90.58346 14.61994, -90.58341 14.619...",../data/raw/guatemala/mixco_1_and_ebenezer/tra...,2.218616e-09,mixco_1_and_ebenezer,guatemala,guatemala-irregular_metal


20297

In [10]:
display(validate.head())
len(validate)

Unnamed: 0,id,roof_material,verified,geometry,path,area,zone,place,stratify
1207,7a2d3ab2,healthy_metal,True,"POLYGON ((-90.58506 14.62286, -90.58503 14.622...",../data/raw/guatemala/mixco_1_and_ebenezer/tra...,1.508116e-09,mixco_1_and_ebenezer,guatemala,guatemala-healthy_metal
1524,7a2f9f5a,irregular_metal,True,"POLYGON ((-90.58291 14.62608, -90.58290 14.626...",../data/raw/guatemala/mixco_1_and_ebenezer/tra...,1.791103e-09,mixco_1_and_ebenezer,guatemala,guatemala-irregular_metal
853,7a36ca46,irregular_metal,True,"POLYGON ((-90.58318 14.62182, -90.58313 14.621...",../data/raw/guatemala/mixco_1_and_ebenezer/tra...,4.85063e-09,mixco_1_and_ebenezer,guatemala,guatemala-irregular_metal
249,7a2e64c8,irregular_metal,True,"POLYGON ((-74.17010 4.57346, -74.17009 4.57351...",../data/raw/colombia/borde_soacha/train-borde_...,5.737366e-09,borde_soacha,colombia,colombia-irregular_metal
3864,7a2308d0,healthy_metal,True,"POLYGON ((-74.16086 4.54482, -74.16078 4.54476...",../data/raw/colombia/borde_rural/train-borde_r...,5.062026e-09,borde_rural,colombia,colombia-healthy_metal


2256

In [11]:
train["roof_material"].value_counts(normalize=True)*100

healthy_metal      65.694438
irregular_metal    23.239888
concrete_cement     6.730059
incomplete          2.965956
other               1.369661
Name: roof_material, dtype: float64

In [12]:
validate["roof_material"].value_counts(normalize=True)*100

healthy_metal      65.735816
irregular_metal    23.226950
concrete_cement     6.737589
incomplete          2.969858
other               1.329787
Name: roof_material, dtype: float64

# Save DataFrames

In [14]:
OUTDIR = "../data/processed/"
total = len(zone_to_crs) - 1
for idx, (zone, crs) in enumerate(sorted(zone_to_crs.items())):
    #train
    subset = train[train["zone"]==zone].drop(columns=["area", "stratify"]).copy()
    subset["crs"] = crs
    subset = subset.to_crs(crs)
    subset.to_pickle(OUTDIR + f"train/trainset_{idx:01d}_of_{total:01d}_{zone}.pkl")
    #validate
    subset = validate[validate["zone"]==zone].drop(columns=["area", "stratify"]).copy()
    subset["crs"] = crs
    subset = subset.to_crs(crs)
    subset.to_pickle(OUTDIR + f"validate/valset_{idx:01d}_of_{total:01d}_{zone}.pkl")