# Altscore `cost_of_living` prediction preprocessing

Example: converting a latitude/longitude pair to an H3 hex ID

In [None]:
# Example code snippet to convert latitude/longitude coordinates to an H3 index
import h3

# Sample point to index, as a (latitude, longitude) pair in degrees
latitude, longitude = 37.7749, -122.4194
# H3 resolution: higher values give smaller cells
resolution = 8

h3_index = h3.latlng_to_cell(latitude, longitude, resolution)
print(f"H3 Index: {h3_index}")

## Hex ID extraction

Define a function that adds a `hex_id` column, and apply it partition by partition with Dask so the full dataset never has to fit in memory.

In [None]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [None]:
def add_h3_column(df, resolution=8):
    """Return a copy of ``df`` with an H3 cell id added as ``hex_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``device_id``, ``lat``, ``lon`` and ``timestamp``
        columns. It is not modified.
    resolution : int, default 8
        H3 resolution passed to ``h3.latlng_to_cell``.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns ``device_id``, ``lat``, ``lon``,
        ``timestamp`` and ``hex_id`` (in that order).
    """
    df = df.copy()
    # Build the ids straight from the two coordinate columns. Compared with
    # ``df.apply(..., axis=1)`` this avoids materialising a Series per row,
    # and for a partition with no rows it yields an empty column instead of
    # an empty DataFrame that cannot be assigned to a single column.
    hex_ids = [
        h3.latlng_to_cell(lat, lon, resolution)
        for lat, lon in zip(df['lat'], df['lon'])
    ]
    df['hex_id'] = pd.Series(hex_ids, index=df.index, dtype=object)
    return df[['device_id', 'lat', 'lon', 'timestamp', 'hex_id']]


In [None]:
# Build a Dask dataframe over the mobility data, reading only the four
# columns that the hex-id step uses.
ddf = dd.read_parquet(
    'mobility_data.parquet',
    columns=['device_id', 'lat', 'lon', 'timestamp'],
    blocksize="100MB",
)

In [None]:
# Output schema of add_h3_column, passed explicitly as ``meta`` to
# ``map_partitions``.
meta = dict(
    device_id='int64',
    lat='f8',
    lon='f8',
    timestamp='int64',
    hex_id='str',
)
# Attach the hex_id column one partition at a time
ddf = ddf.map_partitions(add_h3_column, meta=meta)

In [None]:
# Compute and write the partitions one after another, each to its own file.
delayed_partitions = ddf.to_delayed()
for i, partition in enumerate(delayed_partitions):
    output_path = f"processed_data_{i}.parquet"
    partition_df = partition.compute()
    partition_df.to_parquet(output_path)
    print(f"Processed partition {i}")

## Area computation

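Compute, for each `hex_id`, the area in square kilometres covered by its points (the convex hull of the lat/lon points observed in that hex, not the area of the H3 cell itself).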

Define function for Area

In [None]:
import geopandas as gpd
from shapely.geometry import Polygon
# Function to compute area of a hex_id using lat/lon points
def compute_hex_area(hex_id, df, area_crs="EPSG:6933"):
    """Area, in square kilometres, of the convex hull of a hex's points.

    Parameters
    ----------
    hex_id : str
        Value to look up in ``df['hex_id']``.
    df : pandas.DataFrame
        Frame with ``hex_id``, ``lat`` and ``lon`` columns; ``lat``/``lon``
        are read as WGS84 degrees.
    area_crs : str, default "EPSG:6933"
        Projected CRS (metre units) in which the area is measured. The
        default is an equal-area projection, so the result is not inflated
        with latitude the way a Web Mercator (EPSG:3857) area is.

    Returns
    -------
    float
        Hull area in km². ``0.0`` when fewer than three points belong to
        ``hex_id`` or when the points do not span a 2-D region.
    """
    # Shapely and GeoPandas treat coordinates as (x, y) == (lon, lat), so the
    # columns are selected in that order.
    hex_points = df.loc[df['hex_id'] == hex_id, ['lon', 'lat']].to_numpy()

    # Fewer than three points cannot enclose any area
    if len(hex_points) < 3:
        return 0.0

    # Imported here because the notebook's import cell only brings in Polygon.
    from shapely.geometry import MultiPoint

    # Hull of the raw point set; the points need not be ordered as a ring.
    hull = MultiPoint(hex_points.tolist()).convex_hull

    # Reproject from WGS84 degrees to the metre-based CRS before measuring
    gdf = gpd.GeoDataFrame(geometry=[hull], crs="EPSG:4326")
    gdf = gdf.to_crs(area_crs)

    return float(gdf.area.iloc[0]) / 1e6  # m² -> km²

In [None]:
from pathlib import Path

# Collect the per-partition files that the export loop writes as
# "processed_data_<i>.parquet"; nothing else in the notebook defines this list.
parquet_files = sorted(Path(".").glob("processed_data_*.parquet"))

for file in parquet_files:
    print(file)
    df = pd.read_parquet(file)

    # Group once and hand each hex only its own rows, instead of re-scanning
    # the whole frame for every distinct hex_id.
    hex_areas = {
        hex_id: compute_hex_area(hex_id, points)
        for hex_id, points in df.groupby('hex_id', sort=False)
    }

    # Mapping (rather than merging) overwrites an existing area_sq_km column,
    # so re-running this cell on already-enriched files does not produce
    # duplicated area_sq_km_x / area_sq_km_y columns.
    df['area_sq_km'] = df['hex_id'].map(hex_areas)
    df.to_parquet(file, index=False)