## Dissertation Dataset Exploration (v2)
### Gavin Rolls

I've queried OSM data using different building type indicators. I'll give some additional modelling/exploration a try here

## Imports and Clean Up

In [4]:
# Library Imports

#Basics
import pandas as pd
import geopandas as gpd
import numpy as np

#Shapely
from shapely import wkt
import shapely.geometry
from shapely.geometry import Polygon, MultiPolygon

#Plots and Stats
import matplotlib.pyplot as plt
import seaborn as sns

#Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder

#ML from mljar-supervised
from supervised.automl import AutoML

#Warning Supression
import warnings

In [6]:
# Import Cleaned Employment Geography File (Version 2)

empl_geog = gpd.read_file('data/combined_data_v2/empl_geog.geojson')

# WKT strings back to Shapely geometries
empl_geog['building_poly'] = empl_geog['building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog['commercial_building_poly'] = empl_geog['commercial_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog['retail_building_poly'] = empl_geog['retail_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog['office_building_poly'] = empl_geog['office_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
empl_geog['residential_building_poly'] = empl_geog['residential_building_poly'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)

empl_geog['place_points'] = empl_geog['place_points'].apply(lambda x: shapely.wkt.loads(x) if isinstance(x, str) else x)
# Convert string representations of lists back to lists
empl_geog['category_list'] = empl_geog['category_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

empl_geog.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,num_office_buildings,num_places,building_poly,commercial_building_poly,retail_building_poly,office_building_poly,residential_building_poly,place_points,category_list,geometry
0,E01000001,City of London 001A,1465,lsoa2011:E01000001 : City of London 001A,0,0,0,0,10,0,...,0,243,"MULTIPOLYGON (((-0.0983297 51.5158158, -0.0983...","MULTIPOLYGON (((-0.0966079 51.5147477, -0.0967...","MULTIPOLYGON (((-0.098412 51.5155493, -0.09835...",,"MULTIPOLYGON (((-0.0980623 51.5182674, -0.0983...","MULTIPOINT ((-0.0987982 51.5176724), (-0.09860...","[pub, coffee_shop, immigration_law, shared_off...","POLYGON ((-0.09729 51.52158, -0.09652 51.52027..."
1,E01000002,City of London 001B,1436,lsoa2011:E01000002 : City of London 001B,0,0,0,0,0,0,...,0,419,"MULTIPOLYGON (((-0.0945314 51.5147658, -0.0944...","MULTIPOLYGON (((-0.0951296 51.5145545, -0.0951...","MULTIPOLYGON (((-0.0940575 51.5145934, -0.0940...",,"MULTIPOLYGON (((-0.0936221 51.5181541, -0.0946...","MULTIPOINT ((-0.0958288 51.5168656), (-0.09565...","[business_manufacturing_and_supply, education,...","POLYGON ((-0.08813 51.51941, -0.08929 51.51752..."
2,E01000003,City of London 001C,1346,lsoa2011:E01000003 : City of London 001C,0,0,0,0,0,0,...,0,55,"MULTIPOLYGON (((-0.0959637 51.5209148, -0.0959...",,"POLYGON ((-0.0959633 51.5209414, -0.0955256 51...",,"MULTIPOLYGON (((-0.0959637 51.5209148, -0.0959...","MULTIPOINT ((-0.0976057 51.5226912), (-0.09757...","[health_and_medical, hospital, parking, indust...","POLYGON ((-0.09679 51.52325, -0.09647 51.52282..."
3,E01000005,City of London 001E,985,lsoa2011:E01000005 : City of London 001E,0,10,0,0,0,0,...,0,534,"MULTIPOLYGON (((-0.0757822 51.511071, -0.07577...","MULTIPOLYGON (((-0.0752739 51.5111313, -0.0754...","MULTIPOLYGON (((-0.0779263 51.5147252, -0.0780...","POLYGON ((-0.0731987 51.5118923, -0.0731919 51...",,"MULTIPOINT ((-0.0799241 51.5156172), (-0.07992...","[professional_services, dance_club, insurance_...","POLYGON ((-0.07323 51.51000, -0.07553 51.50974..."
4,E01000006,Barking and Dagenham 016A,1703,lsoa2011:E01000006 : Barking and Dagenham 016A,0,0,0,0,0,0,...,0,11,"MULTIPOLYGON (((0.085895 51.5386732, 0.0859413...",,,,,"MULTIPOINT ((0.0856198 51.538706), (0.0858424 ...","[landmark_and_historical_building, constructio...","POLYGON ((0.09115 51.53909, 0.09326 51.53787, ..."


In [8]:
# Import Category Encoded File (POIs)

encoded_categories = pd.read_csv('data/combined_data/empl_geog_category_encoded.csv')

#Drop extraneous column
encoded_categories = encoded_categories.drop(columns=['Unnamed: 0'])

#Store category columns
category_columns = encoded_categories.columns[1:]

encoded_categories.head()

Unnamed: 0,LSOA11CD,category_list_3d_printing_service,category_list_abortion_clinic,category_list_abuse_and_addiction_treatment,category_list_accommodation,category_list_accountant,category_list_acoustical_consultant,category_list_active_life,category_list_acupuncture,category_list_addiction_rehabilitation_center,...,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_instructor,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo,category_list_None
0,E01000001,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,E01000002,0.0,0.0,0.0,1.0,5.0,0.0,0.0,1.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,E01000003,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,0.0,0.0,0.0,1.0,2.0,0.0,2.0,1.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,E01000006,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Geometric Features Extraction (copied from earlier but includes building types now)

#I'm going to exclude num polygons, average perimeter, and total perimeter cause they're not so helpful (cause Collinearity matrix shows its very close to count)

def extract_multipolygon_features(multipolygon, lsoa_geometry):
    if isinstance(multipolygon, MultiPolygon):
        polygons = list(multipolygon.geoms)
    elif isinstance(multipolygon, Polygon):
        polygons = [multipolygon]
    else:
        return pd.Series({
            'total_area': 0,
            'avg_building_area': 0,
            'lsoa_area_ratio': 0,
        })

    num_polygons = len(polygons)
    areas = [polygon.area for polygon in polygons]
    total_area = sum(areas)
    avg_building_area = total_area / num_polygons if num_polygons > 0 else 0

    #Built-up area ratio could result in a div by zero error if there are no buildings in an LSOA so this logic has to be included here
    try:
        lsoa_area_ratio = total_area / lsoa_geometry.area
    except ZeroDivisionError:
        lsoa_area_ratio = 0

    return pd.Series({
        'total_area': total_area,
        'lsoa_area_ratio': lsoa_area_ratio,
        'avg_building_area': avg_building_area,
    })


In [99]:
#Apply function above

#All Buildings
# all_buildings_geometry_features = empl_geog.apply(
#    lambda row: extract_multipolygon_features(row['building_poly'], row['geometry']),
#    axis=1
#)
#all_buildings_geometry_features.add_prefix('all_')


#Residential                          
residential_buildings_geometry_features = empl_geog.apply(
    lambda row: extract_multipolygon_features(row['residential_building_poly'], row['geometry']),
    axis=1
)
all_buildings_geometry_features.add_prefix('residential_')

# Commercial
commercial_buildings_geometry_features = empl_geog.apply(
    lambda row: extract_multipolygon_features(row['commercial_building_poly'], row['geometry']),
    axis=1
)
commercial_buildings_geometry_features = commercial_buildings_geometry_features.add_prefix('commercial_')

# Office
office_buildings_geometry_features = empl_geog.apply(
    lambda row: extract_multipolygon_features(row['office_building_poly'], row['geometry']),
    axis=1
)
office_buildings_geometry_features = office_buildings_geometry_features.add_prefix('office_')

# Retail
retail_buildings_geometry_features = empl_geog.apply(
    lambda row: extract_multipolygon_features(row['retail_building_poly'], row['geometry']),
    axis=1
)
retail_buildings_geometry_features = retail_buildings_geometry_features.add_prefix('retail_')

all_geom_features = pd.concat([
    all_buildings_geometry_features,
    residential_buildings_geometry_features,
    commercial_buildings_geometry_features,
    office_buildings_geometry_features,
    retail_buildings_geometry_features
], axis=1, ignore_index=True)

#Capture Column Names
geo_features = all_geom_features.columns

all_geom_features.head(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,6.954275e-07,0.966502,1.669026e-05,1.142252e-07,0.026458,4.569009e-07,2.656669e-07,0.076921,1.328335e-06,0.0,0.0,0.0,2.200346e-07,0.101934,1.760277e-06
1,4.409557e-07,0.722431,2.116588e-05,1.405123e-07,0.014388,4.215369e-07,2.245372e-07,0.09963,2.918983e-06,0.0,0.0,0.0,4.283602e-07,0.131587,3.855242e-06
2,7.60204e-07,1.126545,8.362244e-06,1.051174e-07,0.056645,4.204698e-07,0.0,0.0,0.0,0.0,0.0,0.0,1.375725e-07,0.018533,1.375725e-07
3,2.474802e-07,0.540976,1.336393e-05,0.0,0.0,0.0,3.214321e-07,0.065058,1.60716e-06,2.773609e-08,0.001123,2.773609e-08,3.156538e-07,0.127778,3.156538e-06
4,1.598686e-07,0.179683,3.35724e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,7.216192e-08,0.275445,7.071869e-06,4.436179e-08,0.006911,1.774472e-07,0.0,0.0,0.0,2.032864e-07,0.007918,2.032864e-07,5.505318e-08,0.004289,1.101064e-07
6,1.2218e-07,0.209613,5.25374e-06,0.0,0.0,0.0,0.0,0.0,0.0,2.35158e-07,0.009382,2.35158e-07,2.354605e-08,0.000939,2.354605e-08
7,8.681309e-08,0.317807,5.295598e-06,0.0,0.0,0.0,0.0,0.0,0.0,5.894408e-08,0.021225,3.536645e-07,0.0,0.0,0.0
8,1.452742e-07,0.298902,1.35105e-05,5.623035e-08,0.007464,3.373821e-07,6.004655e-08,0.002657,1.200931e-07,1.248022e-07,0.057983,2.620846e-06,8.596902e-08,0.00951,4.298451e-07
9,1.955352e-08,0.243971,2.854813e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
# Combine Datasets

data_with_geom = pd.concat([empl_geog, all_geom_features])

all_data = pd.merge(data_with_geom, encoded_categories, on='LSOA11CD')

all_data.head()

Unnamed: 0,LSOA11CD,LSOA11NM_x,population,Area,"01 : Crop and animal production, hunting and related service activities",02 : Forestry and logging,03 : Fishing and aquaculture,05 : Mining of coal and lignite,06 : Extraction of crude petroleum and natural gas,07 : Mining of metal ores,...,category_list_women's_clothing_store,category_list_women's_health_clinic,category_list_wood_and_pulp,category_list_woodworking_supply_store,category_list_writing_service,category_list_yoga_instructor,category_list_yoga_studio,category_list_youth_organizations,category_list_zoo,category_list_None
0,E01000001,City of London 001A,1465.0,lsoa2011:E01000001 : City of London 001A,0.0,0.0,0.0,0.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,E01000002,City of London 001B,1436.0,lsoa2011:E01000002 : City of London 001B,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,E01000003,City of London 001C,1346.0,lsoa2011:E01000003 : City of London 001C,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E01000005,City of London 001E,985.0,lsoa2011:E01000005 : City of London 001E,0.0,10.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,E01000006,Barking and Dagenham 016A,1703.0,lsoa2011:E01000006 : Barking and Dagenham 016A,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Test AutoML with new Data

In [109]:
# create training and testing data
features = ['num_buildings', 'num_places', 'population'] + list(geo_features) + list(category_columns)
target = 'total_employment'

# Split the dataset - 80/20 train test
X_train, X_test, y_train, y_test = train_test_split(all_data[features], all_data[target], test_size=0.2, random_state=42)

# the results_path should point to a folder that you have already created to store the model results
automl = AutoML(results_path="automl_results_v2/basic_building_categories/", mode='Explain')
# fit the model
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
r2 = r2_score(y_test, predictions)
print(f'R^2 Score: {r2}')


Linear algorithm was disabled.
This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.
R^2 Score: 0.6131872834806253


 [  458.48956299]
 [  173.14588928]
 [  732.44445801]
 [  220.64738464]
 [   64.53336716]
 [  686.12097168]
 [   90.78416443]
 [  125.45743561]
 [  216.39910889]
 [  527.15093994]
 [  330.07846069]
 [  450.22036743]
 [  575.32165527]
 [  601.29278564]
 [ 2527.45922852]
 [  426.37042236]
 [  335.65753174]
 [  582.30987549]
 [  516.32287598]
 [  570.64453125]
 [  250.34088135]
 [ 3192.63891602]
 [  567.04803467]
 [  234.87542725]
 [  208.5448761 ]
 [  131.15454102]
 [  176.0307312 ]
 [  231.52787781]
 [  738.16894531]
 [ 2370.3918457 ]
 [   90.19581604]
 [  135.15077209]
 [  741.06555176]
 [  348.76568604]
 [  502.46340942]
 [   23.66993713]
 [  135.53430176]
 [  804.90734863]
 [ 1382.9239502 ]
 [  117.89431763]
 [  260.13381958]
 [11508.76171875]
 [  535.72192383]
 [   68.41119385]
 [  378.15322876]
 [  168.08595276]
 [  123.43571472]
 [  380.24755859]
 [  275.76986694]
 [  238.43829346]
 [ 1653.51330566]
 [  320.17147827]
 [ 1441.00878906]
 [  154.97052002]
 [  523.13372803]
 [  133.92

## Filter POIs and Collapse Categories

In [None]:
# This work is currently storedin DataExploration.ipynb - some adaptation will be needed to migrate things over

## Target Employment Density