# Testing QuackOSM, a new library for reading OSM PBF files using DuckDB

---

## Import the main library and other supporting ones

In [10]:
import quackosm as qosm
import pandas as pd
import duckdb

---

## Let's try some first examples

### Load data as GeoDataFrame

In [3]:
# Berlin extract, downloaded from Geofabrik ("https://download.geofabrik.de/").
berlin_pbf = 'data/berlin-latest.osm.pbf'

# Leaving the call as the cell's last expression lets Jupyter display the result.
qosm.convert_pbf_to_geodataframe(berlin_pbf)

Unnamed: 0_level_0,tags,geometry
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1
node/1722876789,"{'river:waterway_distance': '26', 'seamark:dis...",POINT (13.06828 52.39526)
node/617225609,"{'maxheight': '6.27', 'maxheight_referenceleve...",POINT (13.06278 52.39295)
node/2727015653,{'barrier': 'gate'},POINT (13.08744 52.38818)
node/1456258887,"{'crossing:barrier': 'no', 'railway': 'level_c...",POINT (13.09465 52.39162)
node/617362180,"{'name': 'S Babelsberg/Wattstraße', 'public_tr...",POINT (13.09417 52.39159)
...,...,...
way/312338059,"{'leaf_type': 'broadleaved', 'natural': 'wood'}","POLYGON ((13.74445 52.41704, 13.74439 52.41698..."
way/388929487,{'natural': 'wood'},"POLYGON ((13.74758 52.41744, 13.74755 52.41747..."
way/312338064,{'natural': 'wood'},"POLYGON ((13.74778 52.41737, 13.74782 52.41734..."
way/312338063,"{'leaf_type': 'broadleaved', 'natural': 'wood'}","POLYGON ((13.7463 52.4167, 13.74635 52.41668, ..."


### Convert PBF to GeoParquet


In [8]:
# Convert the Berlin extract and keep the returned path for the cells below.
source_pbf = 'data/berlin-latest.osm.pbf'
gpq_path = qosm.convert_pbf_to_parquet(source_pbf)

# Show where the result was written, as a forward-slash path string.
gpq_path.as_posix()

'files/berlin-latest_nofilter_noclip_compact_sorted.parquet'

### Try to read this GeoParquet file using Pandas

In [9]:
# Read the file produced by the conversion cell above. Reusing `gpq_path`
# (instead of retyping the generated file name) keeps this cell correct if
# QuackOSM changes its output naming scheme or the output directory moves.
df = pd.read_parquet(gpq_path)

# Plain pandas has no geometry support here: as the output shows, the
# geometry column comes back as raw bytes.
print(df.head())

        feature_id                                               tags  \
0  node/1722876789  [(river:waterway_distance, 26), (seamark:dista...   
1   node/617225609  [(maxheight, 6.27), (maxheight_referencelevel,...   
2  node/2727015653                                  [(barrier, gate)]   
3  node/1456258887  [(crossing:barrier, no), (railway, level_cross...   
4   node/617362180  [(name, S Babelsberg/Wattstraße), (public_tran...   

                                            geometry  
0  b'\x01\x01\x00\x00\x00\xb80\x1c\x19\xf5"*@\xfa...  
1  b'\x01\x01\x00\x00\x00\x17\xdb\xff[$ *@29\xb53...  
2  b'\x01\x01\x00\x00\x00\xd1t\x1by\xc5,*@\xe8\x0...  
3  b'\x01\x01\x00\x00\x00\xbd\xea\xb7ev0*@\xba\xc...  
4  b'\x01\x01\x00\x00\x00\x96vj.70*@\x14\xba\xa6\...  


### Inspect the file with DuckDB

In [11]:
# `load_extension` fails if the extension has never been installed in this
# environment, so install it first. Per the DuckDB docs, installing an
# extension that is already present does not download it again.
duckdb.install_extension('spatial')
duckdb.load_extension('spatial')

# Query the converted file directly, ordered by feature id; the relation is
# the cell's last expression, so Jupyter renders it as a table.
duckdb.read_parquet(str(gpq_path)).order("feature_id")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌──────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────┐
│    feature_id    │                                                   tags                                                    │           geometry            │
│     varchar      │                                           map(varchar, varchar)                                           │           geometry            │
├──────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────┼───────────────────────────────┤
│ node/10000041539 │ {noexit=yes}                                                                                              │ POINT (13.542045 52.569623)   │
│ node/10000166557 │ {amenity=bench, backrest=yes, material=wood, seats=4}                                                     │ POINT (13.4299284 52.4291085) │
│ node/10000166558 │ {leaf_cycle=d