## Eurofab model training

In [4]:
import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from core.utils import used_keys

from palettable.colorbrewer.qualitative import Set3_12
from sklearn.metrics import davies_bouldin_score, f1_score

from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn import model_selection
from sklearn.metrics import accuracy_score, balanced_accuracy_score, make_scorer

In [5]:
# Version tag of the cluster mapping file that is loaded further down.
v = 'v3'
# Requested number of rows drawn per class during undersampling
# (reduced later if the smallest class has fewer rows).
sample_size = 150_000


# Column of the cluster mapping used to aggregate the labels; the mapping cell handles 3 and 4.
mapping_level = 4

# Suffix selecting which train/test split files are read and which result files are written.
train_test_iteration = 5

Read the training data and labels

In [6]:
%%time
# Resolve the parquet files of the selected train/test split, then load features and labels.
training_features_path = f'/data/uscuni-eurofab/processed_data/train_test_data/training_data{train_test_iteration}.pq'
training_labels_path = f'/data/uscuni-eurofab/processed_data/train_test_data/training_labels{train_test_iteration}.pq'

X_train = pd.read_parquet(training_features_path)
y = pd.read_parquet(training_labels_path)

CPU times: user 53.1 s, sys: 56.7 s, total: 1min 49s
Wall time: 21.6 s


In [7]:
# Display the (rows, columns) of the loaded training features.
X_train.shape

(42356391, 72)

Map the labels to the appropriate aggregation level

In [8]:
# Lookup table from final cluster ids to their ids at each aggregation level.
cluster_mapping = pd.read_parquet(f'/data/uscuni-ulce/processed_data/clusters/cluster_mapping_{v}.pq')

# Build `level_cut` (final cluster id -> aggregated label, as strings) and
# `cluster_names` (aggregated label -> readable name) for the chosen level.
if mapping_level == 3:
    level_cut = cluster_mapping[3].astype(str)
    # label '2' is merged into '8'
    level_cut[level_cut == '2'] = '8'
    cluster_names = {
        '1': 'Central Urban Developments',
        '3': 'Dense Urban Developments',
        '4': 'Street-aligned Developments',
        '5': 'Sparse Rural Development',
        '6': 'Linear Road Network Developments',
        '7': 'Sparse Road Network Developments',
        '8': 'Large Scale Developments',
    }

elif mapping_level == 4:
    level_cut = cluster_mapping[4].astype(str)
    # assign outliers to the industrial cluster: labels '3', '4' and '10' are merged into '15'
    level_cut[level_cut.isin(['3', '4', '10'])] = '15'
    cluster_names = {
        '1': 'Dense Connected Developments',
        '2': 'Large Interconnected Blocks',
        '3': 'Extensive Courtyard Complexes',
        '4': 'Massive Connected Aggregations',
        '5': 'Dense Standalone Buildings',
        '6': 'Compact Development',
        '7': 'Cul-de-Sac Layout',
        '8': 'Aligned Winding Streets',
        '9': 'Sparse Rural Development',
        '10': 'Large Wide-Spaced Complexes',
        '11': 'Dispersed Linear Development',
        '12': 'Linear Development',
        '13': 'Sparse Open Layout',
        '14': 'Sparse Road-Linked Development',
        '15': 'Large Utilitarian Development',
        '16': 'Extensive Wide-Spaced Developments',
    }

else:
    # Fail here rather than with a NameError on `level_cut` in a later cell.
    raise ValueError(f'Unsupported mapping_level {mapping_level!r}; expected 3 or 4.')

In [9]:
# Row ids are '_'-separated; rows whose last id token starts with '-' are dropped
# (presumably the cells without a building -- confirm against the data docs).
id_suffix = y.index.str.split('_').str[-1]
has_building = ~id_suffix.str.startswith('-')

X_train, y = X_train[has_building], y[has_building]

# Sanity checks: every kept row has a real label and features/labels stay aligned.
assert y.final_without_noise.isna().sum() == 0
assert (y.final_without_noise == -1).sum() == 0
assert (X_train.index == y.index).all()

y.shape

(42328361, 1)

In [10]:
%%time
# Translate the final cluster ids into labels of the chosen aggregation level,
# then show how many rows fall into each aggregated class.
label_lookup = level_cut.to_dict()
y['final_without_noise'] = y['final_without_noise'].map(label_lookup)
y['final_without_noise'].value_counts()

CPU times: user 1.03 s, sys: 621 ms, total: 1.65 s
Wall time: 1.42 s


final_without_noise
13    6611359
8     6345371
6     5311809
14    5309003
7     4902473
5     3754986
1     3470687
9     2712652
12    1931229
11     681755
15     619206
16     506201
2      171630
Name: count, dtype: int64

Undersample every class to the same number of rows. Only the building classification is predicted, so empty ETCs are dropped.

In [11]:
%%time

# Draw the same number of rows from every class without replacement.
# The smallest class bounds how many rows can be drawn.
smallest_class_size = y['final_without_noise'].value_counts().min()
if sample_size > smallest_class_size:
    sample_size = smallest_class_size - 1_000
if sample_size <= 0:
    raise ValueError(
        f"Smallest class has only {smallest_class_size} rows; cannot draw a positive per-class sample."
    )

np.random.seed(123)
classes = y.final_without_noise.unique()

# Rows without buildings were already removed when X_train / y were filtered,
# so the class label alone selects the candidate rows.
labels = y['final_without_noise'].to_numpy()

# Positional indices into X_train / y, one equally sized draw per class, in `classes` order.
train_indices = np.concatenate([
    np.random.choice(np.flatnonzero(labels == cluster), sample_size, replace=False)
    for cluster in classes
])

CPU times: user 43.6 s, sys: 1.85 s, total: 45.4 s
Wall time: 45.4 s


In [12]:
# Keep only the sampled rows; features and labels are selected by the same positions.
X_train, y = X_train.iloc[train_indices], y.iloc[train_indices]

# Every sampled row must carry an aggregated label.
assert y.final_without_noise.isna().sum() == 0

In [13]:
# Names used by the model cells: the sampled features and the label column as a Series.
X_resampled = X_train
y_resampled = y.final_without_noise

#### Random Forest

In [14]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Forest hyperparameters collected in one place; oob_score=True exposes
# `oob_score_` after fitting, and each tree is fit on max_samples rows.
rf_params = dict(
    n_estimators=500,
    max_samples=200_000,
    max_features=.5,
    min_samples_leaf=5,
    oob_score=True,
    n_jobs=-1,
    random_state=123,
)
best_model = RandomForestClassifier(**rf_params)

best_model.fit(X_resampled, y_resampled)

CPU times: user 1h 53min 32s, sys: 29.5 s, total: 1h 54min 2s
Wall time: 7min 27s


In [15]:
# Out-of-bag score next to the score on the training rows themselves.
oob_accuracy = best_model.oob_score_
train_accuracy = best_model.score(X_resampled, y_resampled)
oob_accuracy, train_accuracy

(0.5490276923076923, 0.6166579487179488)

In [16]:
# Load the held-out features and labels of the same split.
testing_features_path = f'/data/uscuni-eurofab/processed_data/train_test_data/testing_data{train_test_iteration}.pq'
testing_labels_path = f'/data/uscuni-eurofab/processed_data/train_test_data/testing_labels{train_test_iteration}.pq'

X_test = pd.read_parquet(testing_features_path)
y_test = pd.read_parquet(testing_labels_path)

# Aggregate the test labels with the same lookup that was applied to the training labels.
test_label_lookup = level_cut.to_dict()
y_test['final_without_noise'] = y_test['final_without_noise'].map(test_label_lookup)

In [17]:
# Same id-based filter as for the training rows: drop ids whose last token starts with '-'.
test_id_suffix = y_test.index.str.split('_').str[-1]
has_building_test = ~test_id_suffix.str.startswith('-')

X_test, y_test = X_test[has_building_test], y_test[has_building_test]

# All remaining test rows must be labelled and aligned with their features.
assert y_test.final_without_noise.isna().sum() == 0
assert (X_test.index == y_test.index).all()

y_test.shape

(2919465, 1)

In [18]:
%%time
# Predict an aggregated class label for every test row with the fitted forest.
predictions = best_model.predict(X_test)

CPU times: user 8min 19s, sys: 34.2 s, total: 8min 54s
Wall time: 29 s


In [19]:
# No remapping is applied: the raw predictions are used under the name the metric cells read.
mapped_predictions = predictions

In [20]:
# Score against the label column as a 1-D Series, always in (y_true, y_pred) order.
# Accuracy is symmetric, so the value is unchanged; the order now matches the F1 calls.
y_test_labels = y_test['final_without_noise']

acc = accuracy_score(y_test_labels, mapped_predictions)
weighted = f1_score(y_test_labels, mapped_predictions, average='weighted')
micro = f1_score(y_test_labels, mapped_predictions, average='micro')
macro = f1_score(y_test_labels, mapped_predictions, average='macro')

In [21]:
# Per-class F1. f1_score(average=None) returns one score per label, so the
# labels are pinned explicitly to the sorted union of true and predicted labels.
# Taking them from the predictions alone breaks the index when a class present
# in the test set is never predicted.
f1_true_labels = y_test['final_without_noise']
f1_label_order = np.union1d(f1_true_labels, mapped_predictions)

f1s_vals = f1_score(f1_true_labels, mapped_predictions, labels=f1_label_order, average=None)

f1s = pd.Series(
    f1s_vals,
    index=[cluster_names[k] for k in f1_label_order],
)
# Worst-performing classes first.
f1s = f1s.sort_values()
f1s

Compact Development                   0.071677
Large Interconnected Blocks           0.135922
Linear Development                    0.136729
Aligned Winding Streets               0.163077
Sparse Road-Linked Development        0.170178
Dispersed Linear Development          0.194825
Extensive Wide-Spaced Developments    0.241953
Dense Connected Developments          0.249593
Sparse Open Layout                    0.273417
Cul-de-Sac Layout                     0.298000
Large Utilitarian Development         0.304041
Sparse Rural Development              0.444935
Dense Standalone Buildings            0.521541
dtype: float64

In [22]:
# Gather the four headline scores into one labelled Series for display and export.
overall_acc = pd.Series({
    'Overall accuracy': acc,
    'Weighted F1': weighted,
    'Micro F1': micro,
    'Macro F1': macro,
})
overall_acc

Overall accuracy    0.271828
Weighted F1         0.291152
Micro F1            0.271828
Macro F1            0.246607
dtype: float64

In [23]:
# Persist the headline scores; the file name encodes the mapping level and the split iteration.
overall_acc.to_csv(f'/data/uscuni-eurofab/processed_data/results/overall_acc_{mapping_level}_{train_test_iteration}.csv')

In [24]:
# Persist the per-class F1 scores; the file name encodes the mapping level and the split iteration.
f1s.to_csv(f'/data/uscuni-eurofab/processed_data/results/class_f1s_{mapping_level}_{train_test_iteration}.csv')

## All Scores

In [2]:
import glob
import pandas as pd


# Column labels for the collected results; they are assigned by position to the
# result files sorted by file name, so the order here must match that file order.
test_countries_names = ['Slovakia', 'Poland', 'Germany', 'Austria', 'Czechia']

# Aggregation level whose result files are collected below.
mapping_level = 4

In [3]:
# Collect the headline scores of every split for this mapping level.
# The pattern matches exactly the `overall_acc_<level>_<iteration>.csv` files
# written by the training section, not every name that starts with the level digit.
overall_acc_files = sorted(
    glob.glob(f'/data/uscuni-eurofab/processed_data/results/overall_acc_{mapping_level}_*.csv')
)

# Country names are attached by position, so the file count has to match.
if len(overall_acc_files) != len(test_countries_names):
    raise ValueError(
        f"Expected {len(test_countries_names)} overall_acc files for mapping level "
        f"{mapping_level}, found {len(overall_acc_files)}: {overall_acc_files}"
    )

# Each file holds one score column named '0', indexed by metric name.
overall_accs = pd.concat(
    [pd.read_csv(fp, index_col=0)['0'] for fp in overall_acc_files],
    axis=1,
)
overall_accs.index.name = ''
overall_accs.columns = test_countries_names
overall_accs

Unnamed: 0,Slovakia,Poland,Germany,Austria,Czechia
,,,,,
Overall accuracy,0.372471,0.397033,0.342773,0.394995,0.271828
Weighted F1,0.369049,0.387912,0.344311,0.436307,0.291152
Micro F1,0.372471,0.397033,0.342773,0.394995,0.271828
Macro F1,0.290357,0.323803,0.282813,0.296239,0.246607


In [4]:
# Collect the per-class F1 scores of every split for this mapping level.
# The pattern matches exactly the `class_f1s_<level>_<iteration>.csv` files
# written by the training section, not every name that starts with the level digit.
class_f1_files = sorted(
    glob.glob(f'/data/uscuni-eurofab/processed_data/results/class_f1s_{mapping_level}_*.csv')
)

# Country names are attached by position, so the file count has to match.
if len(class_f1_files) != len(test_countries_names):
    raise ValueError(
        f"Expected {len(test_countries_names)} class_f1s files for mapping level "
        f"{mapping_level}, found {len(class_f1_files)}: {class_f1_files}"
    )

# Each file holds one score column named '0', indexed by class name; concat aligns on that index.
f1s3 = pd.concat(
    [pd.read_csv(fp, index_col=0)['0'] for fp in class_f1_files],
    axis=1,
)
f1s3.index.name = ''
f1s3.columns = test_countries_names
f1s3

Unnamed: 0,Slovakia,Poland,Germany,Austria,Czechia
,,,,,
Large Interconnected Blocks,0.147011,0.116789,0.237069,0.283361,0.135922
Sparse Open Layout,0.170826,0.258573,0.285227,0.497482,0.273417
Compact Development,0.172981,0.084837,0.295487,0.089964,0.071677
Large Utilitarian Development,0.216145,0.299314,0.261595,0.302493,0.304041
Dense Connected Developments,0.216633,0.253915,0.375377,0.296466,0.249593
Extensive Wide-Spaced Developments,0.227086,0.410615,0.079021,0.167235,0.241953
Dispersed Linear Development,0.249629,0.386398,0.233418,0.094352,0.194825
Sparse Rural Development,0.27648,0.593759,0.148764,0.510575,0.444935
Aligned Winding Streets,0.304806,0.188761,0.324487,0.266017,0.163077
