## Eurofab model training

In [1]:
import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from core.utils import used_keys

from palettable.colorbrewer.qualitative import Set3_12
from sklearn.metrics import davies_bouldin_score

from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn import model_selection
from sklearn.metrics import accuracy_score, balanced_accuracy_score, make_scorer

Read the training data, groups and labels

In [2]:
%%time
# Load the training features, class labels and hexagon ids (in that order).
# The three tables are later indexed with the same positional row numbers,
# so they are assumed to share row order.
X_train, y, groups = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '/data/uscuni-eurofab/processed_data/train_test_data/training_data4.pq',
        '/data/uscuni-eurofab/processed_data/train_test_data/training_labels4.pq',
        '/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons4.pq',
    )
)

CPU times: user 59.5 s, sys: 56.9 s, total: 1min 56s
Wall time: 27.2 s


In [3]:
# Despite its name this holds a row COUNT, not a class id: value_counts() sorts
# by descending frequency, so the last entry is the size of the rarest class.
class_sizes = y.value_counts()
most_underrepresented_class = class_sizes.iloc[-1]

In [4]:
%%time
# Seeded generator so the balanced subsample -- and every model trained on it --
# is reproducible across kernel restarts.
rng = np.random.default_rng(123)

# Every class contributes the same number of rows: the size of the rarest class
# minus 1000 (presumably headroom for the rows dropped by `has_building` -- TODO confirm).
samples_per_class = most_underrepresented_class - 1000

label_values = y.final_without_noise.to_numpy()
classes = y.final_without_noise.unique()
# The last "_"-separated token of the index is inspected; rows where it starts
# with "-" are excluded (per the variable name, these rows have no building).
has_building = ~y.index.str.split('_').str[-1].str.startswith('-')
# prague_only = y.index.str.startswith('65806')

train_indices = []
for cluster in classes:
    # Positional row numbers of this class that are eligible for sampling.
    candidates = np.flatnonzero((label_values == cluster) & has_building)
    if samples_per_class > candidates.size:
        raise ValueError(
            f"class {cluster!r} has only {candidates.size} rows with a building, "
            f"cannot draw {samples_per_class} without replacement"
        )
    random_indices = rng.choice(candidates, samples_per_class, replace=False)
    train_indices.append(random_indices)


CPU times: user 35.4 s, sys: 1.94 s, total: 37.3 s
Wall time: 37.1 s


In [5]:
# Flatten the per-class index arrays into one positional index.
# np.concatenate is used instead of np.concat, which only exists in NumPy >= 2.0.
# NOTE: this rebinds `train_indices` (list of arrays -> flat array), so the cell
# must not be re-run without first re-running the sampling cell.
train_indices = np.concatenate(train_indices)

In [6]:
# # limit to prague only
# X_train = X_train[X_train.index.str.startswith('65806')]
# y = y[y.index.str.startswith('65806')]
# groups = groups[groups.index.str.startswith('65806')]

Only buildings are classified, so keep just the sampled rows that contain a building (dropping empty ETCs) and fill the remaining missing values with 0.

In [7]:
# Keep only the sampled rows. `train_indices` holds positional row numbers
# (hence `.iloc`) and the same positions are applied to all three tables.
# Missing values that remain are replaced with 0.
# NOTE: each name is rebound to its own subset, so this cell is not idempotent --
# re-running it would apply the same positions to the already-reduced frames.
X_train = X_train.iloc[train_indices].fillna(0)
y = y.iloc[train_indices].fillna(0)
groups = groups.iloc[train_indices].fillna(0)

Set up a k-fold splitter based on spatial contiguity

In [8]:
# Five folds, stratified on the class label and grouped by hexagon id so that
# rows sharing a hexagon never end up on both the train and validation side.
gkf = model_selection.StratifiedGroupKFold(n_splits=5)

# `split` returns a one-shot generator. Materialise it so the folds can be
# inspected and the search cell can be re-run without silently receiving an
# exhausted iterator.
splits = list(
    gkf.split(
        X_train.values,
        y.final_without_noise.values,
        groups=groups.hexagons.values,
    )
)

# Both metrics are reported by the search; which one drives model selection is
# chosen through the `refit` argument of the search itself.
scoring = {
    "Balanced Accuracy": make_scorer(balanced_accuracy_score),
    "Accuracy": make_scorer(accuracy_score),
}


In [9]:
# (codes, uniques) pair: element 0 is the integer code of every row and is used
# as the training target; element 1 maps a code back to the original label.
factorize_y = pd.factorize(y["final_without_noise"])

Set up the grid search and evaluation pipeline

In [10]:
# Disabled: grid search over a calibrated random forest, kept for reference only.
# Nothing in this cell runs; the XGBoost search further down is the one executed.

# calibrated_forest = CalibratedClassifierCV(
#    estimator=RandomForestClassifier(random_state=123, n_jobs=-1))

# param_grid = {
#     'estimator__max_depth': [2, 4, 6, 8, 20],
#     # 'estimator__max_depth': [2, 4, 6],
#     'estimator__min_samples_split': [2, 50, 100, 200, 400]
#     # 'estimator__min_samples_split': [200, 400]
# }


# search = GridSearchCV(calibrated_forest, param_grid, cv=splits, scoring=scoring, refit="Accuracy", return_train_score=True)

# search.fit(
#     X_train.values,
#     factorize_y[0]
# )

In [11]:
import xgboost

In [12]:
%%time
from xgboost import XGBClassifier
# Base booster: each fit is single-threaded; parallelism comes from the
# grid search running several fits at once (see n_jobs below).
xgb_model = XGBClassifier(nthread=1, learning_rate=0.02, n_estimators=600)

# 3 * 5 * 3 * 3 * 3 = 405 candidate settings, each evaluated on every CV fold.
param_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
}

search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring=scoring,
    # With several metrics, `refit` names the one used to pick and refit the winner.
    refit="Accuracy",
    cv=splits,
    n_jobs=-5,
    verbose=1,
    return_train_score=True,
)

# Train on the raw feature matrix against the integer-coded labels.
search.fit(X_train.values, factorize_y[0])

Fitting 5 folds for each of 405 candidates, totalling 2025 fits
CPU times: user 3h 25s, sys: 1h 4min 10s, total: 4h 4min 36s
Wall time: 16d 20h 23min 5s


Select the best model and predict the test data

In [13]:
# Estimator refit by the grid search on the candidate that won on the
# "Accuracy" metric (the metric named in the search's `refit` argument).
best_model = search.best_estimator_

In [14]:
import pickle
# Persist the selected model. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("best_clasif.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

In [31]:
# Load the held-out features and their labels (in that order).
X_test, y_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '/data/uscuni-eurofab/processed_data/train_test_data/testing_data4.pq',
        '/data/uscuni-eurofab/processed_data/train_test_data/testing_labels4.pq',
    )
)

# # # limit to vienna only
# X_test = X_test[X_test.index.str.startswith('84986')]
# y_test = y_test[y_test.index.str.startswith('84986')]

In [32]:
def _keep_rows_with_building(frame):
    """Drop rows whose last "_"-separated index token starts with "-", then fill NaNs with 0.

    This is the same rule the training sample uses for its `has_building` mask.
    """
    id_suffix = frame.index.str.split('_').str[-1]
    return frame[~id_suffix.str.startswith('-')].fillna(0)


# Each table is filtered by its own index, exactly as before.
X_test = _keep_rows_with_building(X_test)
y_test = _keep_rows_with_building(y_test)


In [33]:
%%time
# The model was fitted on a bare ndarray (`X_train.values`), so predict on the
# same kind of input instead of a DataFrame carrying column names.
# Features are matched by column position; X_test is assumed to have the same
# column order as the training data -- TODO confirm.
predictions = best_model.predict(X_test.values)

CPU times: user 6min 56s, sys: 0 ns, total: 6min 56s
Wall time: 6min 56s


In [34]:
# Translate the predicted integer codes back to the original class labels
# using the `uniques` half of the factorize result.
label_lookup = factorize_y[1]
mapped_predictions = label_lookup[predictions]

In [36]:
# Vienna accuracy is a good predictor of overall Austrian accuracy.
# Arguments are passed by keyword in the documented (y_true, y_pred) roles;
# the original call had them swapped, which is harmless for accuracy but
# would silently break asymmetric metrics.
accuracy_score(y_true=y_test, y_pred=mapped_predictions)

0.19537944497274184

### Plot predictions

In [23]:
from lonboard import SolidPolygonLayer, Map
from lonboard.basemap import CartoBasemap
from lonboard.colormap import apply_categorical_cmap
from palettable.colorbrewer.qualitative import Set3_12
from core.cluster_validation import get_color

In [24]:
# Region whose buildings are mapped below. It is the same id prefix as the
# commented-out "limit to vienna only" filter on the test data.
region_id = 84986

In [25]:
# Building geometries of the selected region.
buildings_path = f'/data/uscuni-eurofab/processed_data/buildings/buildings_{region_id}.parquet'
buildings = gpd.read_parquet(buildings_path)

In [26]:
# True labels of the test rows whose index starts with the region id.
in_region = y_test.index.str.startswith(str(region_id))
labels = y_test.loc[in_region, "final_without_noise"]

# Re-key by the integer after the last "_" so the labels can be aligned with
# the buildings table (assumes that table is indexed by the same integer ids).
building_ids = labels.index.str.split('_').str[-1].astype(int)
labels.index = building_ids

In [27]:
# `mapped_predictions` has one entry per row of the filtered test set, whereas
# `labels` only covers the selected region. Apply the same region mask to the
# predictions so lengths match even when the test set spans several regions
# (with a single-region test set the mask is all True and nothing changes).
in_region = y_test.index.str.startswith(str(region_id))
region_predictions = np.asarray(mapped_predictions)[in_region]

# Both columns are aligned on the building id index; buildings without a test
# row end up with NaN.
buildings["predicted_label"] = pd.Series(region_predictions, index=labels.index)
buildings["true_label"] = labels

In [28]:
# Colour every building by its TRUE label first.
# NOTE(review): buildings without a label hold NaN, and casting NaN to int is
# what triggers the warning printed below this cell -- confirm those rows are
# meant to be drawn.
true_label_colors = get_color(buildings['true_label'].values.astype(int))

layer = SolidPolygonLayer.from_geopandas(
    gdf=buildings[["geometry", "predicted_label", 'true_label']],
    get_fill_color=true_label_colors,
    opacity=0.15,
)

  gdf=buildings[["geometry", "predicted_label", 'true_label']], get_fill_color=get_color(buildings['true_label'].values.astype(int)), opacity=0.15


In [29]:
# Put the polygon layer on a Positron basemap; the bare `m` on the last line
# makes the notebook display the map.
m = Map(layer, basemap_style=CartoBasemap.Positron)
m

Map(basemap_style=<CartoBasemap.Positron: 'https://basemaps.cartocdn.com/gl/positron-gl-style/style.json'>, la…

In [30]:
# Switch the existing layer's fill colours from the true to the PREDICTED label.
# NaN entries are cast to int here as well, hence the repeated warning.
predicted_label_colors = get_color(buildings['predicted_label'].values.astype(int))
layer.get_fill_color = predicted_label_colors

  layer.get_fill_color = get_color(buildings['predicted_label'].values.astype(int))
