In [None]:
!pip install -q duckdb  --pre --upgrade
!pip install -q pyarrow --upgrade
!pip install geopandas  --upgrade
import sys
sys.exit(0)

In [None]:
import duckdb
import urllib.request
import zipfile
import os
# Install (first run only) and load DuckDB's `spatial` extension; the later
# cells call ST_Read and ST_AsText, which come from it.
spatial_setup_sql = "INSTALL spatial; LOAD spatial;"
duckdb.sql(spatial_setup_sql)

In [None]:
# --- World Bank total population indicator (SP.POP.TOTL), served as a zip of CSV files ---
population_url = "https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv"
local_population_file = "world_population.csv.zip"  # written to the current working directory
extract_dir = "/lakehouse/default/Files/zip"

urllib.request.urlretrieve(population_url, local_population_file)
os.makedirs(extract_dir, exist_ok=True)

# The population view is built from the glob `API*.csv` inside extract_dir, and
# extractall() only ever adds files. Remove data files left by earlier runs so a
# differently named file from a previous download cannot be picked up alongside
# the new one and duplicate rows. This runs only after the download succeeded,
# so a failed download does not wipe the previous data.
# NOTE(review): World Bank export file names appear to carry a version suffix
# that changes between releases — confirm.
for stale_name in os.listdir(extract_dir):
    if stale_name.startswith("API") and stale_name.endswith(".csv"):
        os.remove(os.path.join(extract_dir, stale_name))

with zipfile.ZipFile(local_population_file, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# --- Country boundaries as GeoJSON ---
url = "https://raw.githubusercontent.com/datasets/geo-countries/master/data/countries.geojson"
local_file = "/lakehouse/default/Files/countries.geojson"
urllib.request.urlretrieve(url, local_file)

In [None]:
# Define the `population` view over the extracted World Bank CSV file(s)
# (anything matching API*.csv in extract_dir), then preview four rows.
# NOTE(review): normalize_names=1 is presumably what turns the year column
# headers into identifiers such as `_2024`, which the export cell relies on —
# confirm against the DuckDB read_csv documentation.
population_preview_sql = f"""
                CREATE OR REPLACE view population AS
                SELECT * from read_csv('{extract_dir}/API*.csv',header=1,normalize_names=1);
                from population limit 4
          """
population_preview = duckdb.sql(population_preview_sql)
population_preview.show(max_width=100)

In [None]:
# Define the `countries` view by reading the downloaded GeoJSON with ST_Read
# (spatial extension), then preview four rows.
countries_preview_sql = f"""
                  CREATE OR REPLACE view countries AS
                  SELECT * FROM ST_Read('{local_file}');
                  from countries limit 4
            """
countries_preview = duckdb.sql(countries_preview_sql)
countries_preview.show(max_width=100)

In [None]:
# Join country shapes to their population figure on the ISO alpha-3 code and
# write name, geometry and the `_2024` column (renamed to `pop`) to a Parquet
# file, keeping the geometry column in the output.
# The join is an inner join: rows without a matching code on both sides are dropped.
export_sql = """
    COPY (select name,geom,_2024 as pop from countries join population on "ISO3166-1-Alpha-3" = country_code)   TO '/lakehouse/default/Files/countries.parquet' (FORMAT PARQUET);
"""
duckdb.sql(export_sql)

In [None]:
# Inspect the written file's Parquet metadata: column path, physical type and
# the geo_bbox / geo_types fields reported by parquet_metadata().
metadata_sql = "select path_in_schema ,type,geo_bbox, geo_types   from parquet_metadata( '/lakehouse/default/Files/countries.parquet')"
parquet_metadata_rel = duckdb.sql(metadata_sql)
parquet_metadata_rel.show(max_width=150)

In [None]:
# Read the Parquet file back and show each geometry next to its WKT text form,
# as a check that the geometry column survived the round trip.
wkt_preview_sql = "select geom, ST_AsText(geom) as wkt  from '/lakehouse/default/Files/countries.parquet' "
wkt_preview = duckdb.sql(wkt_preview_sql)
wkt_preview.show(max_width=150)

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
# Load the exported Parquet file into a GeoDataFrame and draw a choropleth
# coloured by the `pop` column.
gdf = gpd.read_parquet('/lakehouse/default/Files/countries.parquet')

# Single large axes so individual countries remain distinguishable.
fig, ax = plt.subplots(figsize=(15, 10))
gdf.plot(column='pop', ax=ax, legend=True)  # legend=True adds the colour scale
ax.set(
    title='World Population by Country (2024)',
    xlabel='Longitude',
    ylabel='Latitude',
)
plt.show()