## Eurofab model training

In [3]:
import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from core.utils import used_keys

from palettable.colorbrewer.qualitative import Set3_12
from sklearn.metrics import davies_bouldin_score, f1_score

from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn import model_selection
from sklearn.metrics import accuracy_score, balanced_accuracy_score, make_scorer

In [4]:
# Version tag of the cluster-mapping file that is loaded further down.
v = 'v3'
# Number of rows drawn per class when the training set is undersampled.
sample_size = 150_000
# Column of the cluster mapping used to aggregate the labels; levels 3 and 4 are handled.
mapping_level = 3

# Suffix selecting which training/testing parquet files are read.
train_test_iteration = 4


Read the training data and labels

In [5]:
%%time
# Load the training features and their labels for the selected train/test iteration.
X_train = pd.read_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_data{train_test_iteration}.pq')
y = pd.read_parquet(f'/data/uscuni-eurofab/processed_data/train_test_data/training_labels{train_test_iteration}.pq')

CPU times: user 1min 50s, sys: 2min 41s, total: 4min 31s
Wall time: 32.8 s


In [6]:
# Display (rows, columns) of the loaded training features.
X_train.shape

(42428296, 288)

Map the labels to the appropriate aggregation level

In [7]:
# Table whose columns hold, per aggregation level, the cluster each original
# label belongs to; column `mapping_level` is used to relabel `y` later on.
cluster_mapping = pd.read_parquet(f'/data/uscuni-ulce/processed_data/clusters/cluster_mapping_{v}.pq')

if mapping_level == 3:
    level_cut = cluster_mapping[3].astype(str)
    # fold cluster 2 into cluster 8; cluster 2 has no name of its own below
    level_cut[level_cut == '2'] = '8'
    cluster_names = {
        '1': 'Central Urban Developments',
        '3': 'Dense Urban Developments',
        '4': 'Street-aligned Developments',
        '5': 'Sparse Rural Development',
        '6': 'Linear Road Network Developments',
        '7': 'Sparse Road Network Developments',
        '8': 'Large Scale Developments',
    }

elif mapping_level == 4:
    # assign outliers to the industrial cluster
    level_cut = cluster_mapping[4].astype(str)
    level_cut[level_cut == '3'] = '15'
    level_cut[level_cut == '4'] = '15'
    level_cut[level_cut == '10'] = '15'
    cluster_names = {
        '1': 'Dense Connected Developments',
        '2': 'Large Interconnected Blocks',
        '3': 'Extensive Courtyard Complexes',
        '4': 'Massive Connected Aggregations',
        '5': 'Dense Standalone Buildings',
        '6': 'Compact Development',
        '7': 'Cul-de-Sac Layout',
        '8': 'Aligned Winding Streets',
        '9': 'Sparse Rural Development',
        '10': 'Large Wide-Spaced Complexes',
        '11': 'Dispersed Linear Development',
        '12': 'Linear Development',
        '13': 'Sparse Open Layout',
        '14': 'Sparse Road-Linked Development',
        '15': 'Large Utilitarian Development',
        '16': 'Extensive Wide-Spaced Developments',
    }

else:
    # Fail here rather than with a NameError (or stale values) in a later cell.
    raise ValueError(f'Unsupported mapping_level {mapping_level!r}; expected 3 or 4.')

In [None]:
# Rows whose last '_'-separated index token starts with '-' are treated as
# having no building (presumably a negative id — TODO confirm) and are dropped.
index_suffix = y.index.str.split('_').str[-1]
has_building = ~index_suffix.str.startswith('-')

X_train, y = X_train[has_building], y[has_building]

# Sanity checks: no missing labels, no -1 labels, features and labels aligned.
assert not y.final_without_noise.isna().any()
assert not (y.final_without_noise == -1).any()
assert (X_train.index == y.index).all()

y.shape

In [None]:
%%time
# Relabel every training row with its cluster at the chosen aggregation level,
# then show how many rows each aggregated class has.
level_lookup = level_cut.to_dict()
y['final_without_noise'] = y['final_without_noise'].map(level_lookup)
y['final_without_noise'].value_counts()

Undersample each class to `sample_size` rows. Only the building classification is predicted, so empty ETCs are dropped.

In [None]:
%%time

# Seed NumPy's global RNG so the per-class draws below are reproducible.
np.random.seed(123)
train_indices = []
classes = y.final_without_noise.unique()
# Recomputed here so the mask always has the same length as the current `y`.
has_building = ~y.index.str.split('_').str[-1].str.startswith('-')

for cluster in classes:
    # positional indices of the building rows that belong to this class
    candidates = np.where((y.final_without_noise == cluster) & (has_building))[0]
    if len(candidates) < sample_size:
        raise ValueError(
            f'Class {cluster!r} has only {len(candidates)} rows, '
            f'fewer than sample_size={sample_size}.'
        )
    # draw the same number of rows from every class, without replacement
    random_indices = np.random.choice(candidates, sample_size, replace=False)
    train_indices.append(random_indices)

# np.concatenate works on every NumPy release; np.concat only exists from NumPy 2.0 on.
train_indices = np.concatenate(train_indices)

In [None]:
# Keep only the sampled rows; `train_indices` holds positional indices, hence `.iloc`.
# (`groups` is never loaded in this notebook and is not used afterwards, so it is
# not subset here — doing so raised a NameError on a fresh kernel.)
X_train = X_train.iloc[train_indices]
y = y.iloc[train_indices]

assert y.final_without_noise.isna().sum() == 0

In [None]:
# No further resampling is applied: the undersampled frames are the model inputs.
X_resampled = X_train
y_resampled = y

#### Random Forest

In [None]:
%%time
# RandomForestClassifier comes from the notebook's import cell.
best_model = RandomForestClassifier(
    random_state=123,
    n_jobs=-1,
    oob_score=True,
    n_estimators=50,
    max_samples=200_000,
    max_features=.5,
    min_samples_leaf=5,
)

# Pass the label column as a 1-D target; handing scikit-learn the whole
# single-column frame triggers a DataConversionWarning.
best_model.fit(X_resampled, y_resampled['final_without_noise'])

In [None]:
# Out-of-bag score next to the score on the (already seen) training sample.
best_model.oob_score_, best_model.score(X_resampled, y_resampled)

In [None]:
# Load the held-out features and labels for the same train/test iteration.
X_test = pd.read_parquet(
    f'/data/uscuni-eurofab/processed_data/train_test_data/testing_data{train_test_iteration}.pq'
)
y_test = pd.read_parquet(
    f'/data/uscuni-eurofab/processed_data/train_test_data/testing_labels{train_test_iteration}.pq'
)

# Relabel the test rows with the same aggregation level used for training.
test_level_lookup = level_cut.to_dict()
y_test['final_without_noise'] = y_test['final_without_noise'].map(test_level_lookup)

In [18]:
# Same filter as for the training data: drop rows whose last '_'-separated
# index token starts with '-'.
test_index_suffix = y_test.index.str.split('_').str[-1]
has_building_test = ~test_index_suffix.str.startswith('-')

X_test, y_test = X_test[has_building_test], y_test[has_building_test]

# Sanity checks: every test row got a mapped label, features and labels aligned.
assert not y_test.final_without_noise.isna().any()
assert (X_test.index == y_test.index).all()

y_test.shape

(2846671, 1)

In [20]:
%%time
# Predict on exactly the feature columns (and column order) the model was fitted on.
# The previous `to_keep` selector is not defined anywhere in this notebook.
predictions = best_model.predict(X_test[X_resampled.columns])

CPU times: user 37.8 s, sys: 8.77 s, total: 46.6 s
Wall time: 3.93 s


In [21]:
# No remapping is applied to the predictions; this alias is what the evaluation cells use.
mapped_predictions = predictions

In [25]:
## vienna accuracy is a good predictor of overall austrian accuracy
# All scikit-learn metrics take (y_true, y_pred); keep that order everywhere.
acc = accuracy_score(y_test, mapped_predictions)
weighted = f1_score(y_test, mapped_predictions, average='weighted')
micro = f1_score(y_test, mapped_predictions, average='micro')
macro = f1_score(y_test, mapped_predictions, average='macro')

0.4010375628233821

In [29]:
# With average=None, f1_score returns one value per label in the sorted union of
# the true and predicted labels. Build that union explicitly and pass it in, so
# the scores and their names stay aligned even when a class is never predicted.
f1_labels = np.union1d(np.ravel(y_test), mapped_predictions)
f1s_vals = f1_score(y_test, mapped_predictions, labels=f1_labels, average=None)

f1s = pd.Series(
    f1s_vals,
    index=[cluster_names[k] for k in f1_labels],
)
# Order classes from worst to best F1 for display.
f1s = f1s.sort_values()
f1s

Dense Connected Developments          0.293044
Dispersed Linear Development          0.069147
Linear Development                    0.146127
Sparse Open Layout                    0.512451
Sparse Road-Linked Development        0.176687
Large Utilitarian Development         0.300625
Extensive Wide-Spaced Developments    0.162850
Large Interconnected Blocks           0.259278
Dense Standalone Buildings            0.539817
Compact Development                   0.086306
Cul-de-Sac Layout                     0.462139
Aligned Winding Streets               0.272353
Sparse Rural Development              0.509370
dtype: float64

In [None]:
# Collect the headline metrics into one labelled series.
overall_acc = pd.Series(
    {
        'Overall accuracy': acc,
        'Weighted F1': weighted,
        'Micro F1': micro,
        'Macro F1': macro,
    }
)

In [None]:
# Persist the headline metrics; the file name encodes aggregation level and train/test iteration.
overall_acc.to_csv(f'/data/uscuni-eurofab/processed_data/results/overall_acc_{mapping_level}_{train_test_iteration}.csv')

In [None]:
# Persist the per-class F1 scores; the file name encodes aggregation level and train/test iteration.
f1s.to_csv(f'/data/uscuni-eurofab/processed_data/results/class_f1s_{mapping_level}_{train_test_iteration}.csv')