## Install requirements

In [None]:
!pip install -r requirements.txt

### Explore the functions natively to fetch data

In [3]:
import pandas as pd
import geopandas as gpd
import yaml

from src.fetch_poi import FetchPoi
from src.hexes import Hexes
from src.score_poi import ScorePoi

In [4]:
    # Load the notebook configuration; the context manager closes the file once it is parsed.
    with open("config/config.yaml") as config_file:
        config = yaml.safe_load(config_file)


In [5]:
    # Build the hexagon grid for the configured place at the configured H3 resolution.
    place_name = config['location']
    h3_resolution = config['h3_resolution']
    hexes = Hexes(place=place_name)
    hexes.get_place_hex(h3_resolution=h3_resolution)

In [15]:
    # Cache of fetched POI tables, keyed by dataset name (filled by the fetch loop).
    datasets = dict()
    # Fetcher for the configured place, querying the configured Overpass endpoint.
    fetch = FetchPoi(
        config['location'],
        osm_endpoint=config['overpass_endpoint'],
    )

In [17]:
# Seed the score table with the place's hexagons; the scoring loop reassigns it once per dataset.
# NOTE(review): this binds the same object as hexes.place_hexes (no copy) - confirm score_poi does not mutate it.
hex_scores = hexes.place_hexes

In [18]:
    # Fetch, clean and score every POI dataset declared in the configuration.
    for dataset, dataset_spec in config['datasets'].items():
        print(f'fetching {dataset}')
        if dataset in datasets:
            # Already fetched and scored in this kernel session.
            continue
        tag = dataset_spec['tag']
        values = dataset_spec['values']
        # fetch_data returns a pair; only its second element (the POI table) is used here.
        _, pois = fetch.fetch_data(tag=tag, values=values)
        # Expose the centroid coordinates as plain lat/lon columns.
        pois = pois.assign(
            lat=pois['centroid'].apply(lambda point: point.y),
            lon=pois['centroid'].apply(lambda point: point.x),
        )
        # Keep only rows that actually carry the requested tag.
        pois = pois.dropna(subset=[tag])
        # Drop sparse columns: keep those with fewer than 30% missing values.
        pois = pois.loc[:, pois.isnull().mean() < .3]
        # k and d are the scoring parameters logged by ScorePoi as "distances k and d".
        scoring = ScorePoi(pois, k=100, d=500)
        hex_scores = scoring.score_poi(hex_scores, poi_name=dataset, aggregation='sum')
        # Cache only after scoring succeeded, so a failed iteration is retried on re-run
        # instead of being skipped by the membership check above.
        datasets[dataset] = pois

fetching amenity


100%|██████████| 9/9 [00:00<00:00, 1001.80it/s]
INFO:root:getting data per grid for London....

100%|██████████| 33/33 [01:17<00:00,  2.36s/it]
INFO:root:Computing geometry from lat, lon
INFO:root:Computing circles of interest based on parameters k and d. 
                      current values: k = 100, d = 500

  self.data['centroid'] = self.data.geometry.centroid
INFO:root:getting hexagons influenced by POI according to distances k and d...
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:3035
Right CRS: None

  candidate_hexes = gpd.sjoin(hexagons.to_crs(3035), self.data[['circle_of_interest']])
INFO:root:filtering POIs matching hexes...
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:3035

  candidate_poi = gpd.sjoin(self.data, hexagons[['geometry']].to_crs(3035), how='inner')
INFO:root:computing scores for data...
INFO:root:getting distances between poi and h

fetching shop


100%|██████████| 9/9 [00:00<00:00, 982.89it/s]
INFO:root:getting data per grid for London....

100%|██████████| 33/33 [00:48<00:00,  1.47s/it]
INFO:root:Computing geometry from lat, lon
INFO:root:Computing circles of interest based on parameters k and d. 
                      current values: k = 100, d = 500

  self.data['centroid'] = self.data.geometry.centroid
INFO:root:getting hexagons influenced by POI according to distances k and d...
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:3035
Right CRS: None

  candidate_hexes = gpd.sjoin(hexagons.to_crs(3035), self.data[['circle_of_interest']])
INFO:root:filtering POIs matching hexes...
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:3035

  candidate_poi = gpd.sjoin(self.data, hexagons[['geometry']].to_crs(3035), how='inner')
INFO:root:computing scores for data...
INFO:root:getting distances between poi and he

fetching leisure


100%|██████████| 9/9 [00:00<00:00, 949.27it/s]
INFO:root:getting data per grid for London....

100%|██████████| 33/33 [00:37<00:00,  1.14s/it]
INFO:root:Computing geometry from lat, lon
INFO:root:Computing circles of interest based on parameters k and d. 
                      current values: k = 100, d = 500

  self.data['centroid'] = self.data.geometry.centroid
INFO:root:getting hexagons influenced by POI according to distances k and d...
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:3035
Right CRS: None

  candidate_hexes = gpd.sjoin(hexagons.to_crs(3035), self.data[['circle_of_interest']])
INFO:root:filtering POIs matching hexes...
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:3035

  candidate_poi = gpd.sjoin(self.data, hexagons[['geometry']].to_crs(3035), how='inner')
INFO:root:computing scores for data...
INFO:root:getting distances between poi and he

fetching highway


100%|██████████| 9/9 [00:00<00:00, 837.48it/s]
INFO:root:getting data per grid for London....

100%|██████████| 33/33 [00:55<00:00,  1.69s/it]
INFO:root:Computing geometry from lat, lon
INFO:root:Computing circles of interest based on parameters k and d. 
                      current values: k = 100, d = 500

  self.data['centroid'] = self.data.geometry.centroid
INFO:root:getting hexagons influenced by POI according to distances k and d...
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:3035
Right CRS: None

  candidate_hexes = gpd.sjoin(hexagons.to_crs(3035), self.data[['circle_of_interest']])
INFO:root:filtering POIs matching hexes...
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:3035

  candidate_poi = gpd.sjoin(self.data, hexagons[['geometry']].to_crs(3035), how='inner')
INFO:root:computing scores for data...
INFO:root:getting distances between poi and he

fetching railway


100%|██████████| 9/9 [00:00<00:00, 998.14it/s]
INFO:root:getting data per grid for London....

100%|██████████| 33/33 [00:32<00:00,  1.02it/s]
INFO:root:Computing geometry from lat, lon
INFO:root:Computing circles of interest based on parameters k and d. 
                      current values: k = 100, d = 500

  self.data['centroid'] = self.data.geometry.centroid
INFO:root:getting hexagons influenced by POI according to distances k and d...
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:3035
Right CRS: None

  candidate_hexes = gpd.sjoin(hexagons.to_crs(3035), self.data[['circle_of_interest']])
INFO:root:filtering POIs matching hexes...
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:3035

  candidate_poi = gpd.sjoin(self.data, hexagons[['geometry']].to_crs(3035), how='inner')
INFO:root:computing scores for data...
INFO:root:getting distances between poi and he

In [19]:
# Show the hexagon table, which now carries one score_<dataset> column per scored dataset.
hex_scores

Unnamed: 0,hex_id,geometry,score_amenity,score_shop,score_leisure,score_highway,score_railway
0,88195dadb3fffff,"POLYGON ((-0.48146 51.50856, -0.48818 51.50734...",8.340623,0.385595,0.000000,16.020005,0.302102
1,88195db485fffff,"POLYGON ((-0.09288 51.66737, -0.09960 51.66619...",17.070924,0.577442,0.401325,34.676886,1.190289
2,88194ad08bfffff,"POLYGON ((-0.00511 51.40646, -0.01180 51.40527...",11.552341,0.000000,6.326450,51.740510,0.670263
3,88195dacc5fffff,"POLYGON ((-0.38926 51.54225, -0.39599 51.54104...",6.432914,2.281679,0.000000,25.557636,0.000000
4,88194ad5c1fffff,"POLYGON ((0.05349 51.31897, 0.04681 51.31778, ...",5.278275,0.157626,1.540873,15.927384,0.000000
...,...,...,...,...,...,...,...
2401,88194ad987fffff,"POLYGON ((-0.33633 51.40676, -0.34304 51.40555...",20.409464,0.266839,1.000000,24.665717,1.087481
2402,88194e6d11fffff,"POLYGON ((0.11433 51.47015, 0.10764 51.46897, ...",15.908132,3.621407,0.416125,41.867790,0.579252
2403,88194e6db5fffff,"POLYGON ((0.09947 51.41650, 0.09278 51.41532, ...",7.051774,0.350400,2.361531,27.010386,0.000000
2404,88194e6aedfffff,"POLYGON ((0.09494 51.60693, 0.08823 51.60575, ...",6.550627,1.195497,4.097375,31.087521,1.719024


In [76]:
# Inspect the boundary record the fetcher holds for the configured place.
# NOTE(review): this cell's execution count is out of sequence with its neighbours - confirm on a fresh top-to-bottom run.
fetch.place_map

Unnamed: 0,geometry,bbox_north,bbox_south,bbox_east,bbox_west,place_id,osm_type,osm_id,lat,lon,class,type,place_rank,importance,addresstype,name,display_name
0,"POLYGON ((-0.51038 51.46809, -0.51036 51.46795...",51.691874,51.28676,0.334016,-0.510375,273846562,relation,65606,51.489334,-0.144055,boundary,ceremonial,25,0.882089,city,London,"London, Greater London, England, United Kingdom"


## Merge all GeoDataFrames into one

In [20]:
import pandas as pd


In [21]:
# Initialise the combined POI table as an empty frame.
all_data = pd.DataFrame()

In [22]:
# Normalise every dataset to a shared schema: the tag column (named after the dataset)
# becomes 'poi', and the dataset name is recorded in 'category'. The cached frames in
# `datasets` are left untouched, so this cell can be re-run safely.
poi_frames = [
    pois.rename(columns={name: 'poi'}).assign(category=name)
    for name, pois in datasets.items()
]
# Concatenate once rather than growing the frame inside a loop; with no datasets,
# fall back to an empty frame because pd.concat rejects an empty list.
all_data = pd.concat(poi_frames, ignore_index=True) if poi_frames else pd.DataFrame()

In [23]:
# List the columns of the combined POI table.
all_data.columns

Index(['poi', 'place', 'geometry', 'shape', 'name', 'addr:street', 'centroid',
       'lat', 'lon', 'category', 'id', 'naptan:AtcoCode', 'naptan:Bearing',
       'naptan:CommonName', 'naptan:Indicator', 'naptan:verified',
       'public_transport', 'network', 'wikidata', 'wikipedia', 'operator',
       'wheelchair'],
      dtype='object')

In [24]:
# Keep only the columns used downstream, then remove exact duplicate rows.
keep_columns = [
    'category',
    'poi',
    'place',
    'geometry',
    'shape',
    'name',
    'centroid',
    'lat',
    'lon',
    'id',
]
all_data = all_data[keep_columns].drop_duplicates()

In [25]:
# Preview the trimmed, de-duplicated POI table.
all_data

Unnamed: 0,category,poi,place,geometry,shape,name,centroid,lat,lon,id
0,amenity,school,London,"POLYGON ((-0.56590 51.49836, -0.56473 51.49856...",Polygon,,POINT (-0.56490 51.49764),51.497639,-0.564903,
1,amenity,school,London,"POLYGON ((-0.51036 51.48113, -0.50933 51.48110...",Polygon,Pippins School,POINT (-0.51001 51.48075),51.480750,-0.510011,
2,amenity,school,London,"POLYGON ((-0.52434 51.48335, -0.52450 51.48263...",Polygon,Colnbrook Church of England Primary School,POINT (-0.52387 51.48271),51.482713,-0.523871,
3,amenity,school,London,"POLYGON ((-0.55224 51.49684, -0.55066 51.49813...",Polygon,Langley Grammar School,POINT (-0.55261 51.49843),51.498430,-0.552611,
4,amenity,school,London,"POLYGON ((-0.60874 51.49046, -0.60880 51.49050...",Polygon,Eton College,POINT (-0.60665 51.49489),51.494888,-0.606654,
...,...,...,...,...,...,...,...,...,...,...
44891,railway,station,London,POINT (0.23420 51.59322),Point,Harold Wood,POINT (0.23420 51.59322),51.593217,0.234200,
44892,railway,train_station_entrance,London,POINT (0.29965 51.61376),Point,,POINT (0.29965 51.61376),51.613763,0.299647,
44893,railway,train_station_entrance,London,POINT (0.29938 51.61383),Point,,POINT (0.29938 51.61383),51.613825,0.299385,
44894,railway,train_station_entrance,London,POINT (0.29942 51.61382),Point,,POINT (0.29942 51.61382),51.613823,0.299416,


### Filter out points of interest that fall outside the map region (the fetcher may return some just beyond the borders because of the grid built by the fetch_poi module)

In [26]:
# Inner spatial join against the place boundary keeps only POIs matched to it;
# the helper column added by the join is dropped afterwards.
place_boundary = fetch.place_map[['geometry']]
pois_joined = gpd.sjoin(all_data, place_boundary, how='inner')
data_within_boundaries = pois_joined.drop(columns=['index_right'])

In [27]:
# Preview the POIs kept by the spatial join with the place boundary.
data_within_boundaries

Unnamed: 0,category,poi,place,geometry,shape,name,centroid,lat,lon,id
328,amenity,school,London,"POLYGON ((-0.40022 51.47024, -0.39988 51.47038...",Polygon,Beavers Community Primary School,POINT (-0.40025 51.46926),51.469264,-0.400250,
329,amenity,school,London,"POLYGON ((-0.40970 51.48567, -0.41017 51.48612...",Polygon,Cranford Community College,POINT (-0.40826 51.48754),51.487541,-0.408256,
330,amenity,school,London,"POLYGON ((-0.42208 51.44957, -0.42413 51.44819...",Polygon,Southville Primary School,POINT (-0.42178 51.44886),51.448857,-0.421782,
331,amenity,school,London,"POLYGON ((-0.40856 51.49096, -0.40864 51.49079...",Polygon,The Old Rectory Nursery School,POINT (-0.40844 51.49066),51.490663,-0.408439,
332,amenity,school,London,"POLYGON ((-0.40969 51.48305, -0.40979 51.48314...",Polygon,The Cedars Primary School,POINT (-0.41015 51.48421),51.484212,-0.410154,
...,...,...,...,...,...,...,...,...,...,...
44886,railway,station,London,POINT (0.23477 51.55827),Point,Upminster Bridge,POINT (0.23477 51.55827),51.558265,0.234769,
44887,railway,station,London,POINT (0.21811 51.55387),Point,Hornchurch,POINT (0.21811 51.55387),51.553875,0.218109,
44889,railway,station,London,POINT (0.20573 51.58179),Point,Gidea Park,POINT (0.20573 51.58179),51.581785,0.205728,
44890,railway,station,London,POINT (0.22024 51.56860),Point,Emerson Park,POINT (0.22024 51.56860),51.568601,0.220238,


## Ingest with mlrun

In [2]:
import mlrun
import yaml

In [3]:
# Load the MLRun project, creating it when missing. With user_project=True the
# project name is suffixed with the current user (see the log output of this cell).
project = mlrun.get_or_create_project(
    "application-runtime",
    "./",
    user_project=True,
)

> 2024-09-16 14:45:19,744 [info] Created and saved project: {"context":"./","from_template":null,"name":"application-runtime-felipe","overwrite":false,"save":true}
> 2024-09-16 14:45:19,746 [info] Project created successfully: {"project_name":"application-runtime-felipe","stored_in_db":true}


In [4]:
# Point the project at its Git repository; pull_at_runtime=True asks for the code to be
# fetched when a run starts (the run log shows it being extracted to /mlrun/code).
project.set_source(
    source="git://github.com/felipenv/application-runtime-demo.git#main",
    pull_at_runtime=True,
)

In [5]:
# Load the shared configuration; the context manager closes the file once it is parsed.
with open("config/config.yaml") as config_file:
    config = yaml.safe_load(config_file)


In [6]:
# Register the ingestion script as an MLRun job whose default handler is `fetch`.
# The listed packages are added as build requirements on top of the base image.
ingest_requirements = [
    'geojson',
    'geopandas',
    'h3pandas',
    'osmnx',
    'overpy',
    'pandas',
    'h3',
    'tobler',
]
function = project.set_function(
    'src/ingest.py',
    name='ingest-geo',
    kind='job',
    image='mlrun/mlrun',
    requirements=ingest_requirements,
    with_repo=True,
    handler='fetch',
)

In [7]:
# Explicit deployment is left disabled; the run_function call in this notebook passes auto_build=True instead.
#function.deploy()

In [8]:
# Dump the function spec to check the base image, requirements and default handler before running it.
function.to_dict()

{'spec': {'image': '',
  'description': '',
  'state_thresholds': {'pending_scheduled': '1h',
   'pending_not_scheduled': '-1',
   'image_pull_backoff': '1h',
   'executing': '24h'},
  'command': 'src/ingest.py',
  'build': {'source': './',
   'base_image': 'mlrun/mlrun',
   'requirements': ['geojson',
    'geopandas',
    'h3pandas',
    'osmnx',
    'overpy',
    'pandas',
    'h3',
    'tobler']},
  'priority_class_name': 'igz-workload-medium',
  'disable_auto_mount': False,
  'default_handler': 'fetch',
  'preemption_mode': 'prevent',
  'resources': {'requests': {'memory': '1Mi', 'cpu': '25m'},
   'limits': {'memory': '20Gi', 'cpu': '2'}},
  'tolerations': None,
  'affinity': {'nodeAffinity': {'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': [{'matchExpressions': [{'key': 'spot',
         'operator': 'NotIn',
         'values': ['true']}]}]}}},
  'node_selector': {}},
 'kind': 'job',
 'metadata': {'project': 'application-runtime-felipe',
  'name': 'ingest-geo

In [9]:
# Run the ingestion job remotely, building the image first if needed, and log the
# handler's two return values as the 'pois' and 'hexes' artifacts.
# NOTE(review): the whole config dict is recorded as a run parameter, including
# plots.mapbox_token (visible in this cell's output) - consider passing only the keys
# the handler needs, or supplying the token as a secret.
get_data_run = project.run_function(
    "ingest-geo",
    params={"config": config},
    returns=['pois', 'hexes'],
    local=False,
    auto_build=True,
)

> 2024-09-16 14:45:59,596 [error] error getting build status: details: MLRunNotFoundError('Function tag not found application-runtime-felipe/ingest-geo:latest'), caused by: 404 Client Error: Not Found for url: http://mlrun-api:8080/api/v1/build/status?name=ingest-geo&project=application-runtime-felipe&tag=latest&logs=no&offset=0&last_log_timestamp=0.0&verbose=no
> 2024-09-16 14:45:59,596 [info] Function is not deployed and auto_build flag is set, starting deploy...
> 2024-09-16 14:45:59,928 [info] Started building image: .mlrun/func-application-runtime-felipe-ingest-geo:latest
> 2024-09-16 14:47:38,678 [info] Storing function: {"db":"http://mlrun-api:8080","name":"ingest-geo-fetch","uid":"9b779647a1b047f6b0ba59089e7c12ae"}
> 2024-09-16 14:47:38,943 [info] Job is running in the background, pod: ingest-geo-fetch-dz5ml
> 2024-09-16 14:47:45,085 [info] extracting source from git://github.com/felipenv/application-runtime-demo.git#main to /mlrun/code
The `utils.config` function is deprecated

project,uid,iter,start,state,kind,name,labels,inputs,parameters,results,artifacts
application-runtime-felipe,...9e7c12ae,0,Sep 16 14:47:45,completed,run,ingest-geo-fetch,v3io_user=felipekind=jobowner=felipemlrun/client_version=1.7.0-rc43mlrun/client_python_version=3.9.18host=ingest-geo-fetch-dz5ml,,"config={'location': 'London', 'h3_resolution': 8, 'overpass_endpoint': 'https://overpass-api.de/api/interpreter', 'datasets': {'amenity': {'tag': 'amenity', 'values': ['school', 'hospital', 'pub', 'restaurant'], 'weight': 0.2}, 'shop': {'tag': 'shop', 'values': ['supermarket'], 'weight': 0.2}, 'leisure': {'tag': 'leisure', 'values': ['fitness_centre', 'swimming_pool'], 'weight': 0.2}, 'highway': {'tag': 'highway', 'values': ['bus_stop'], 'weight': 0.2}, 'railway': {'tag': 'railway', 'values': ['station', 'tram_stop'], 'weight': 0.2}}, 'plots': {'mapbox_token': 'pk.eyJ1IjoibWFja2xhcnMiLCJhIjoiY2w2dXo2OGRvMWh0MzNmcGJ3eDQyNDBrOCJ9.uyOgYliAIB1EZ8xWWyH1BQ', 'zoom': 7, 'center': {'lat': 51.5, 'lon': -0.07}, 'pois': {'hover_info': ['category', 'poi', 'name'], 'colors': {'amenity': 'purple', 'shop': '#EC9706', 'leisure': 'brown', 'highway': '#ED7014', 'railway': '#AAFF00'}}, 'hexes': {'hover_info': ['hex_id', 'score'], 'palette': 'RdYlGn'}}}",,poishexes





> 2024-09-16 14:53:15,083 [info] Run execution finished: {"name":"ingest-geo-fetch","status":"completed"}


## Verify that the data is present in the artifacts

In [10]:
# Look up the two artifacts logged by the ingestion run.
pois, hexes = (get_data_run.artifact(key) for key in ('pois', 'hexes'))

#### Unpack the GeoDataFrame artifacts. The packagers manager must be told the expected data type (via `type_hint`)

In [11]:
from geopandas import GeoDataFrame

# Unpack the run artifacts into GeoDataFrames; type_hint tells the packagers manager
# which type to rebuild. The artifacts are read from the run object here, so re-running
# this cell never feeds an already-unpacked GeoDataFrame back into unpack().
packager = mlrun.package.packagers_manager.PackagersManager()
pois = packager.unpack(get_data_run.artifact('pois'), type_hint=GeoDataFrame)
hexes = packager.unpack(get_data_run.artifact('hexes'), type_hint=GeoDataFrame)

> 2024-09-16 14:57:51,677 [info] downloading v3io:///projects/application-runtime-felipe/artifacts/ingest-geo-fetch/0/pois.pkl to local temp file
> 2024-09-16 14:57:51,973 [info] downloading v3io:///projects/application-runtime-felipe/artifacts/ingest-geo-fetch/0/hexes.pkl to local temp file


In [12]:
# Confirm the unpacked artifact came back as a GeoDataFrame.
type(pois)

geopandas.geodataframe.GeoDataFrame

In [13]:
# Preview the POI table returned by the remote run.
pois

Unnamed: 0,category,poi,place,geometry,shape,name,centroid,lat,lon,id
344,amenity,school,London,"POLYGON ((-0.40022 51.47024, -0.39988 51.47038...",Polygon,Beavers Community Primary School,POINT (-0.40025 51.46926),51.469264,-0.400250,
345,amenity,school,London,"POLYGON ((-0.40970 51.48567, -0.41017 51.48612...",Polygon,Cranford Community College,POINT (-0.40826 51.48754),51.487541,-0.408256,
346,amenity,school,London,"POLYGON ((-0.42208 51.44957, -0.42413 51.44819...",Polygon,Southville Primary School,POINT (-0.42178 51.44886),51.448857,-0.421782,
347,amenity,school,London,"POLYGON ((-0.40856 51.49096, -0.40864 51.49079...",Polygon,The Old Rectory Nursery School,POINT (-0.40844 51.49066),51.490663,-0.408439,
348,amenity,school,London,"POLYGON ((-0.40969 51.48305, -0.40979 51.48314...",Polygon,The Cedars Primary School,POINT (-0.41015 51.48421),51.484212,-0.410154,
...,...,...,...,...,...,...,...,...,...,...
44914,railway,station,London,POINT (0.23477 51.55827),Point,Upminster Bridge,POINT (0.23477 51.55827),51.558265,0.234769,
44915,railway,station,London,POINT (0.21811 51.55387),Point,Hornchurch,POINT (0.21811 51.55387),51.553875,0.218109,
44917,railway,station,London,POINT (0.20573 51.58179),Point,Gidea Park,POINT (0.20573 51.58179),51.581785,0.205728,
44918,railway,station,London,POINT (0.22024 51.56860),Point,Emerson Park,POINT (0.22024 51.56860),51.568601,0.220238,


In [14]:
# Preview the scored hexagons returned by the remote run.
hexes

Unnamed: 0,hex_id,geometry,score_amenity,score_shop,score_leisure,score_highway,score_railway
0,88194ac319fffff,"POLYGON ((-0.12924 51.37152, -0.13593 51.37032...",14.276499,2.528629,0.280687,32.735134,0.852986
1,88195da411fffff,"POLYGON ((-0.17117 51.56846, -0.17789 51.56727...",24.566962,0.481855,6.191345,20.363966,0.579007
2,88195da755fffff,"POLYGON ((-0.20172 51.64388, -0.20845 51.64269...",3.639815,0.678248,10.200763,16.924445,0.112831
3,88194e680dfffff,"POLYGON ((0.03029 51.60408, 0.02358 51.60290, ...",22.111656,3.413863,2.221552,41.154421,0.784496
4,88194ac151fffff,"POLYGON ((-0.18096 51.34518, -0.18765 51.34398...",2.146308,0.108535,0.468924,18.182788,0.170729
...,...,...,...,...,...,...,...
2401,88194ad9ddfffff,"POLYGON ((-0.30040 51.42178, -0.30710 51.42057...",40.586922,4.242969,4.970946,53.791511,1.590637
2402,88195dae97fffff,"POLYGON ((-0.32018 51.53760, -0.32690 51.53640...",9.613247,1.648059,0.207915,26.949379,1.381755
2403,88195db5d1fffff,"POLYGON ((-0.13383 51.68140, -0.14055 51.68022...",0.802176,0.000000,1.000000,5.789776,0.000000
2404,88194aca6bfffff,"POLYGON ((-0.30700 51.37366, -0.31370 51.37245...",11.147009,0.626548,1.262218,25.593025,0.611944
