In [7]:
from vectorgeo.raster import RasterPatches
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import os

In [8]:
import os
import pandas as pd
from collections import Counter
from h3 import h3
from scipy.sparse import csr_matrix

def calculate_h3_index(row, resolution=7):
    """Return the H3 cell index for the centre of a bounding box.

    Args:
        row: Subscriptable object exposing 'minx', 'maxx', 'miny' and 'maxy'
            entries (in this notebook, one element of the 'bbox' column).
        resolution: H3 resolution forwarded to ``h3.geo_to_h3``.

    Returns:
        The value ``h3.geo_to_h3`` produces for the midpoint of the box.
    """
    # The x extent is treated as longitude and the y extent as latitude.
    center_lng = (row['minx'] + row['maxx']) / 2
    center_lat = (row['miny'] + row['maxy']) / 2
    return h3.geo_to_h3(center_lat, center_lng, resolution)

# Running total of places per main category, accumulated over every loaded file
global_counter = Counter()

# Per-cell category counts: maps an H3 index to a Counter of main categories.
# NOTE: kept as a Counter for compatibility, but it is used purely as a
# dict-of-Counters, so entries are always created explicitly via setdefault.
h3_counter = Counter()

# Directory containing the parquet files
places_dir = "/home/ubuntu/vectorgeo/tmp/theme=places/type=place"

# Maximum number of files to load (keeps exploratory runs fast).
# Set to None to process every file in the directory.
MAX_FILES = 2

# os.listdir returns entries in arbitrary order, so sort them to make the
# selection reproducible, and discard non-files *before* applying the cap so
# that the cap counts actual data files.
place_filenames = sorted(
    name
    for name in os.listdir(places_dir)
    if os.path.isfile(os.path.join(places_dir, name))
)
files_to_load = place_filenames[:MAX_FILES]

for i, place_filename in enumerate(files_to_load):
    print(f"    Loading file {i+1} / {len(files_to_load)}")
    filepath = os.path.join(places_dir, place_filename)

    places_df = pd.read_parquet(filepath)

    # Extract the main category once per file and reuse it for both tallies.
    # Assumes every 'categories' entry is subscriptable with a 'main' key;
    # a null entry would raise here -- TODO confirm against the data.
    main_categories = places_df['categories'].apply(lambda x: x['main'])

    # Update the global counter for place types
    global_counter.update(main_categories.value_counts().to_dict())

    # Assign each place to an H3 cell, then tally categories cell by cell
    places_df['h3_index'] = places_df['bbox'].apply(calculate_h3_index)
    for h3_index, cell_categories in main_categories.groupby(places_df['h3_index']):
        cell_counter = h3_counter.setdefault(h3_index, Counter())
        cell_counter.update(cell_categories.value_counts().to_dict())

# Keep the 1000 most frequent place types, ordered alphabetically so that
# column positions follow the sorted names.
top_1000_places = sorted(name for name, _ in global_counter.most_common(1000))

# Create a sparse matrix
# Rows follow the sorted H3 indices; columns follow top_1000_places.
h3_indices = sorted(h3_counter)
place_indices = dict(zip(top_1000_places, range(len(top_1000_places))))

# Collect the entries in coordinate form: parallel lists of row position,
# column position and count.
rows, cols, data = [], [], []
for row, h3_index in enumerate(h3_indices):
    for place, count in h3_counter[h3_index].items():
        col = place_indices.get(place)
        if col is None:
            # Place type outside the top 1000: it has no column.
            continue
        rows.append(row)
        cols.append(col)
        data.append(count)

sparse_matrix = csr_matrix(
    (data, (rows, cols)),
    shape=(len(h3_indices), len(top_1000_places)),
)

# Create a DataFrame to map row indices to H3 indices.
# This must be pd.DataFrame: pd.Series built from the same dict would hold only
# two object entries (the range and the list) instead of one row per H3 cell.
h3_index_df = pd.DataFrame({'row_index': range(len(h3_indices)), 'h3_index': h3_indices})

# Create a DataFrame to map column indices to place names
place_name_df = pd.DataFrame({'col_index': range(len(top_1000_places)), 'place_name': top_1000_places})

# Now, sparse_matrix is the feature matrix you can use for downstream machine learning.
# h3_index_df and place_name_df can be used to map row and column indices back to H3 indices and place names, respectively.


    Loading file 1 / 2
    Loading file 2 / 2


In [15]:
# Preview only the first 50 H3 indices; the full list has one entry per matrix
# row and is far too long to render usefully.
h3_indices[:50]

['870016d5bffffff',
 '870018943ffffff',
 '87001a648ffffff',
 '870020165ffffff',
 '870020169ffffff',
 '87002016bffffff',
 '87002016cffffff',
 '87002031dffffff',
 '870020322ffffff',
 '870020331ffffff',
 '870020335ffffff',
 '8700208c9ffffff',
 '8700208cbffffff',
 '8700208cdffffff',
 '870020a82ffffff',
 '870020a86ffffff',
 '870020a9cffffff',
 '870020ab0ffffff',
 '870020ab3ffffff',
 '870020ab4ffffff',
 '870020b9affffff',
 '870020bb0ffffff',
 '870020bb2ffffff',
 '870024892ffffff',
 '8700259b3ffffff',
 '8700283a9ffffff',
 '8700286e1ffffff',
 '870028858ffffff',
 '8700294eeffffff',
 '870029528ffffff',
 '870029854ffffff',
 '87002bdb3ffffff',
 '87002caf6ffffff',
 '87002d764ffffff',
 '87002d86cffffff',
 '87002d96bffffff',
 '87004d245ffffff',
 '870069491ffffff',
 '87006bb71ffffff',
 '87006c584ffffff',
 '870071146ffffff',
 '8700926f2ffffff',
 '87009349affffff',
 '8700966daffffff',
 '8700c4025ffffff',
 '8700e4320ffffff',
 '8700e5990ffffff',
 '8700f29a9ffffff',
 '8700f4009ffffff',
 '8700f4021ffffff',


In [9]:
# Matrix dimensions: (number of H3 indices, number of retained place types).
sparse_matrix.shape

(663368, 1000)

In [10]:
# Number of explicitly stored entries in the sparse matrix.
sparse_matrix.nnz

2768525

In [12]:
# Density of the matrix: stored entries divided by the total number of cells.
# The dimensions are read from the matrix itself rather than hard-coded, so
# this stays correct when the input data (and therefore the shape) changes.
sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1])

0.004173437669589127

In [13]:
# Lookup table from matrix column position ('col_index') to place type name.
place_name_df

Unnamed: 0,col_index,place_name
0,0,abortion_clinic
1,1,abuse_and_addiction_treatment
2,2,accommodation
3,3,accountant
4,4,active_life
...,...,...
995,995,wood_and_pulp
996,996,writing_service
997,997,yoga_studio
998,998,youth_organizations


In [14]:
# Mapping from matrix row position ('row_index') to H3 index ('h3_index').
h3_index_df

Unnamed: 0,row_index,h3_index
0,0,870016d5bffffff
1,1,870018943ffffff
2,2,87001a648ffffff
3,3,870020165ffffff
4,4,870020169ffffff
...,...,...
663363,663363,87f3b4b00ffffff
663364,663364,87f3b4b18ffffff
663365,663365,87f3b4b6bffffff
663366,663366,87f3b4b84ffffff
