In [1]:
import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from core.utils import used_keys

In [2]:
from palettable.colorbrewer.qualitative import Set3_12
from sklearn.metrics import davies_bouldin_score

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons


In [4]:
from sklearn import model_selection
from sklearn.metrics import accuracy_score, balanced_accuracy_score, make_scorer


In [5]:
%%time
X_train = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_data4.pq')
y = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_labels4.pq')
groups = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/training_hexagons4.pq')

CPU times: user 56.1 s, sys: 39.6 s, total: 1min 35s
Wall time: 25.3 s


In [6]:
# limit to prague only
X_train = X_train[X_train.index.str.startswith('65806')]
y = y[y.index.str.startswith('65806')]
groups = groups[groups.index.str.startswith('65806')]



In [7]:
## drop empty cells
X_train = X_train[~X_train.index.str.split('_').str[-1].str.startswith('-')].fillna(0)
y = y[~y.index.str.split('_').str[-1].str.startswith('-')].fillna(0)
groups = groups[~groups.index.str.split('_').str[-1].str.startswith('-')].fillna(0)

In [8]:
# setup kfold splits based on spatial contiguity
gkf = model_selection.StratifiedGroupKFold(n_splits=5)
splits = gkf.split(
    X_train.values,
    y.final_without_noise.values,
    groups=groups.hexagons.values,
)

In [9]:
# setup grid search and evaluation
calibrated_forest = CalibratedClassifierCV(
   estimator=RandomForestClassifier(random_state=123, n_jobs=-1))

param_grid = {
    'estimator__max_depth': [2, 4, 6, 8, 20],
    'estimator__min_samples_split': [2, 50, 100, 200, 400]
}

scoring = {"Balanced Accuracy": make_scorer(balanced_accuracy_score), "Accuracy": make_scorer(accuracy_score)}

search = GridSearchCV(calibrated_forest, param_grid, cv=splits, scoring=scoring, refit="Accuracy", return_train_score=True)

In [10]:
%%time
search.fit(
    X_train.values,
    y.final_without_noise.values
)



KeyboardInterrupt: 

In [11]:
best_model = search.best_estimator

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator'

In [None]:
X_test = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/testing_data4.pq')
y_test = pd.read_parquet('/data/uscuni-eurofab/processed_data/train_test_data/testing_labels4.pq')

# limit to prague only
X_test = X_test[X_test.index.str.startswith('65806')]
y_test = y_test[y_test.index.str.startswith('65806')]

In [None]:
predictions = best_model.predict(X_test)

In [None]:
accuracy(predictions, y_test)

In [None]:
## testing data...

In [None]:
1

### why are we using spatial kfold -  predictive model

In [3]:
region_id = 4182

tessellations_dir = graph_dir = enclosures_dir = '../data/ms_buildings/'
chars_dir = '../data/ms_buildings/chars/'

In [4]:
primary = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')

In [5]:
tessellation = gpd.read_parquet(
        tessellations_dir + f"tessellation_{region_id}.parquet"
)

In [6]:
X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_train_subset, y, test_size=0.15, random_state=42)

In [19]:
clf = RandomForestClassifier(random_state=0, n_jobs=-1, verbose=True)

In [20]:
%%time
clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.5s


CPU times: user 2min 24s, sys: 345 ms, total: 2min 24s
Wall time: 8.15 s


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.0s finished


In [21]:
clf.score(X_test, y_test)

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.1s finished


0.9484593837535014

In [22]:
from sklearn import model_selection

gkf = model_selection.StratifiedGroupKFold(n_splits=5)
splits = gkf.split(
    X_train_subset,
    y,
    groups=tessellation_subset.enclosure_index,
)
split_label = np.empty(len(X_train_subset), dtype=float)
for i, (train, test) in enumerate(splits):
    split_label[test] = i

In [23]:
train = split_label != 0
X_train = X_train_subset.loc[train]
y_train = y[train]

test = split_label == 0
X_test = X_train_subset.loc[test]
y_test = y[test]

In [24]:
rf_spatial_cv = RandomForestClassifier(random_state=0, n_jobs=-1)
rf_spatial_cv.fit(X_train, y_train)

In [25]:
rf_spatial_cv.score(X_test, y_test)

0.6201246008062405

In [26]:
new_labels = clf.predict(X_train_subset)

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.6s finished


In [27]:
new_labels = rf_spatial_cv.predict(X_train_subset)

### Plot predictions

In [28]:
from lonboard import SolidPolygonLayer, Map
from lonboard.basemap import CartoBasemap
from lonboard.colormap import apply_categorical_cmap
from palettable.colorbrewer.qualitative import Set3_12
from core.cluster_validation import get_color

In [29]:
plotting = tessellation_subset

plotting["label"] = new_labels

In [30]:
layer = SolidPolygonLayer.from_geopandas(
    gdf=plotting[["geometry", "label"]], get_fill_color=get_color(plotting['label'].values.astype(int)), opacity=0.15
)



In [32]:
m = Map(layer, basemap_style=CartoBasemap.Positron)
m

Map(basemap_style=<CartoBasemap.Positron: 'https://basemaps.cartocdn.com/gl/positron-gl-style/style.json'>, la…