In [2]:
import time
from functools import reduce
from pathlib import Path

import networkx as nx
import polars as pl
from locus.utils.pl_utils import batch_iter
from tqdm import tqdm

%matplotlib widget

In [3]:
# The notebook is expected to run from a sub-directory of the project, so the
# project root is the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent
PROCESSED_DATA_DIR = PROJECT_ROOT.joinpath("data", "processed")

In [4]:
# Lazily scan every LDoGI shard, preview the first rows, then count all rows.
shards_glob = PROCESSED_DATA_DIR / "LDoGI/shards/*.parquet"
df = pl.scan_parquet(shards_glob)
print(df.head().collect())
c = df.select(pl.len()).collect().item()  # count
c

shape: (5, 4)
┌─────┬────────────┬────────────┬───────────────────────────────────┐
│ id  ┆ latitude   ┆ longitude  ┆ image                             │
│ --- ┆ ---        ┆ ---        ┆ ---                               │
│ i64 ┆ f64        ┆ f64        ┆ binary                            │
╞═════╪════════════╪════════════╪═══════════════════════════════════╡
│ 0   ┆ 41.906     ┆ 12.455     ┆ b"\xff\xd8\xff\xe0\x00\x10JFIF\x… │
│ 1   ┆ 48.211072  ┆ 16.36736   ┆ b"\xff\xd8\xff\xe0\x00\x10JFIF\x… │
│ 2   ┆ 43.942876  ┆ 12.774091  ┆ b"\xff\xd8\xff\xe0\x00\x10JFIF\x… │
│ 3   ┆ 41.339055  ┆ 14.507789  ┆ b"\xff\xd8\xff\xe0\x00\x10JFIF\x… │
│ 4   ┆ -23.210269 ┆ -44.693223 ┆ b"\xff\xd8\xff\xe0\x00\x10JFIF\x… │
└─────┴────────────┴────────────┴───────────────────────────────────┘


3993900

In [19]:
# Re-scan the shards without the "id" and "image" columns, preview, and count rows.
df = pl.scan_parquet(PROCESSED_DATA_DIR / "LDoGI/shards/*.parquet").drop("id", "image")
print(df.head().collect())
c = df.select(pl.len()).collect().item()  # count
c

shape: (5, 2)
┌────────────┬────────────┐
│ latitude   ┆ longitude  │
│ ---        ┆ ---        │
│ f64        ┆ f64        │
╞════════════╪════════════╡
│ 41.906     ┆ 12.455     │
│ 48.211072  ┆ 16.36736   │
│ 43.942876  ┆ 12.774091  │
│ 41.339055  ┆ 14.507789  │
│ -23.210269 ┆ -44.693223 │
└────────────┴────────────┘


4233900

In [20]:
# Load the previously saved quadtree graph from its GML file.
quadtree_path = PROCESSED_DATA_DIR / "LDoGI/quadtrees/quadtree.gml"
G = nx.read_gml(quadtree_path)

In [21]:
# Integer codes for cell states (a plain namespace class, not an enum.Enum).
class CellState:
    """Named integer constants compared against a node's "state" attribute."""

    STOPPED, EVALUATING, ACTIVE = range(3)

In [22]:
# Ids of every node whose "state" attribute equals CellState.ACTIVE.
active_cells = [
    node
    for node, attrs in G.nodes(data=True)
    if attrs["state"] == CellState.ACTIVE
]

In [23]:
# Number of nodes whose state is ACTIVE.
len(active_cells)

6398

In [25]:
# Peek at one cell id to see its format (a string of quadrant digits).
active_cells[0]

'000'

In [26]:
# Id-length statistics over the active cells: the id length is used as the path length.
count_active_cells = len(active_cells)
sum_path_lens = sum(len(cell_id) for cell_id in active_cells)
max_path_len = max(map(len, active_cells))
avg_path_len = sum_path_lens / count_active_cells

print(max_path_len)
print(avg_path_len)

18
10.36214442013129


In [27]:
def calc_enclosing_cell(lon: float, lat: float, active_cells: list[str]):
    """
    Return the id of the active quadtree cell that encloses the point (lon, lat).

    The search starts from the whole world (longitude -180..180, latitude
    -90..90). At every step the current bounding box is halved along both axes
    and the digit of the quadrant containing the point is appended to the id:

        0 = west/north, 1 = east/north, 2 = west/south, 3 = east/south

    A point lying exactly on a dividing line is assigned to the west half
    (longitude) or the north half (latitude).

    Args:
        lon: Longitude of the point.
        lat: Latitude of the point.
        active_cells: Candidate cell ids, each a string of quadrant digits.
            The collection is not modified.

    Returns:
        The cell id once it is the only remaining candidate and matches the
        path walked so far, or ``None`` when no candidate shares that path
        (i.e. no active cell encloses the point).
    """
    west_lon, east_lon = -180, 180
    south_lat, north_lat = -90, 90

    cell = ""
    # No defensive copy needed: the pool is rebuilt as a new list on every step.
    cell_pool = active_cells

    while True:
        quad = 0

        # East half contributes 1 to the quadrant digit.
        half_lon = (west_lon + east_lon) / 2
        if lon > half_lon:
            quad += 1
            west_lon = half_lon
        else:
            east_lon = half_lon

        # South half contributes 2 to the quadrant digit.
        half_lat = (south_lat + north_lat) / 2
        if lat < half_lat:
            quad += 2
            north_lat = half_lat
        else:
            south_lat = half_lat

        cell += str(quad)
        # Keep only candidates that still lie on the path walked so far.
        cell_pool = [c for c in cell_pool if c.startswith(cell)]

        if not cell_pool:
            # The point has left every candidate behind; nothing encloses it.
            return None

        if len(cell_pool) == 1 and cell_pool[0] == cell:
            return cell

In [28]:
# Smoke test with the first dataset row (latitude 41.906, longitude 12.455); arguments are lon first.
calc_enclosing_cell(12.455, 41.906, active_cells)

'1200120013211231'

In [29]:
# Materialise the first 10,000 rows in memory for a quick experiment.
h = df.head(10000).collect()
print(h)

shape: (10_000, 2)
┌────────────┬─────────────┐
│ latitude   ┆ longitude   │
│ ---        ┆ ---         │
│ f64        ┆ f64         │
╞════════════╪═════════════╡
│ 41.906     ┆ 12.455      │
│ 48.211072  ┆ 16.36736    │
│ 43.942876  ┆ 12.774091   │
│ 41.339055  ┆ 14.507789   │
│ -23.210269 ┆ -44.693223  │
│ …          ┆ …           │
│ 34.13809   ┆ -118.353404 │
│ 14.098951  ┆ -87.907104  │
│ 15.474857  ┆ -88.176269  │
│ 55.689835  ┆ 12.570998   │
│ 34.277928  ┆ 132.570133  │
└────────────┴─────────────┘


In [30]:
# Row at index 14 (the 15th row), returned as a (latitude, longitude) tuple
h.row(14)

(38.700515, -9.056854)

In [31]:
# Rows are (latitude, longitude) but calc_enclosing_cell takes lon first, hence indices [1] then [0].
problem_line = calc_enclosing_cell(h.row(14)[1], h.row(14)[0], active_cells)

In [32]:
# None here means calc_enclosing_cell found no active cell enclosing the point.
print(problem_line)

None


In [33]:
# Time the lookup over the whole sample and list the rows with no enclosing cell.
start_time = time.time()

for i, row in enumerate(h.iter_rows()):
    # Rows are (latitude, longitude); the lookup takes longitude first.
    lat, lon = row[0], row[1]
    enclosing = calc_enclosing_cell(lon, lat, active_cells)
    if enclosing is None:
        print(i)

end_time = time.time()
print(end_time - start_time)

14
180
527
923
949
1109
1111
1153
1376
1416
1728
2596
2916
3098
3343
3377
3457
3521
3862
3951
4201
4495
4517
4534
4633
4721
5167
5531
5637
5651
5981
6109
6156
6258
7315
7376
7585
7631
7847
7982
8031
8225
8613
9027
9273
9465
9746
7.17464017868042
