In [1]:
from typing import List

import pandas as pd
import polars as pl
from sqlalchemy import select

from data.data import Session, engine
from data.model import Trip, Route, Location
from data import locations
from visualisations import maps
from analysis.bunching import is_bunched, is_bunched_pl

In [2]:
%%html
<!-- Injects CSS into the page: dataframe cells never wrap their text, so each row stays on one line and tables render compactly -->
<style>.dataframe td{white-space:nowrap;}</style>

In [3]:
# Example: composing a SELECT with SQLAlchemy, one step at a time.
# The columns come from three mapped classes; Location is the anchor table,
# and Route and Trip are joined to it on their id columns.
locations_stmt = select(
    Location.lat, Trip.direction_id, Trip.route_direction, Route.short_name
).select_from(Location)
locations_stmt = locations_stmt.join(Route, Location.route_id == Route.id)
locations_stmt = locations_stmt.join(Trip, Location.trip_id == Trip.id)

# print() renders the statement as plain SQL text
print(locations_stmt)

SELECT locations.lat, trips.direction_id, trips.route_direction, routes.short_name 
FROM locations JOIN routes ON locations.route_id = routes.id JOIN trips ON locations.trip_id = trips.id


In [4]:
# Load the location records through the project helper in data.locations.
# The cells that follow treat the result as a pandas DataFrame (hence `_pd`).
# NOTE(review): the frame previewed below has more columns (lon, bearing, speed,
# request_timestamp) than the example statement selects, so the real query is
# presumably defined inside get_locations — confirm there.
locations_pd = locations.get_locations()

To compare the performance of different dataframe libraries, we'll also
create Polars dataframes in an eager and a lazy variant.

In [5]:
# Eager Polars DataFrame converted from the pandas frame
locations_pl = pl.from_pandas(locations_pd)
# Lazy counterpart of the same data, derived from the eager frame so the
# pandas -> Polars conversion happens only once
locations_pl_lazy = locations_pl.lazy()

In [6]:
# Preview a few random rows. The fixed seed keeps the preview identical on
# every Restart & Run All; change `random_state` to look at different rows.
locations_pd.sample(5, random_state=0)

Unnamed: 0,lat,lon,bearing,speed,request_timestamp,direction_id,route_direction,short_name
1666750,-33.872795,151.205185,84.0,5.6,1695335217,1,Chiswick to City Domain,504
1324460,-33.739471,151.068787,260.0,0.0,1695250017,1,Pennant Hills Station to Pennant Hills HS,8076
1529295,-33.894173,151.112839,75.0,10.5,1695333358,1,Mortlake to Ashfield,464
641507,-33.891201,151.249374,276.0,0.0,1695160498,0,Bronte to North Bondi,379
30001,-33.785126,150.899872,6.0,7.7,1695071158,1,Blacktown to Arndell Park via Huntingwood (Loo...,724


Each of the three cells below performs the same calculation using a
different method. The last time I ran this, the runtimes were:

* Pandas: 2.5s
* Polars (eager): 1.0s
* Polars (lazy): 0.5s

Using the lazy evaluation in this example, **Polars is 5x faster than
Pandas**!!

In Pandas, we typically run one calculation at a time and save the
interim results. In Polars, the recommended method is lazy evaluation.
This approach builds up the full query first and then evaluates it in one
step, which lets Polars optimise the calculation as a whole.

Other benefits of Polars in this case are:

* The structure is still similar to Pandas
* There is no need to handle indexes
* The "where" condition can be written pythonically without string
  interpolation

In [7]:
# Pandas variant: run the bunching calculation on the pandas frame.
# The result (displayed further down) carries an extra `bunched` column.
bunched_df = is_bunched(locations_pd)

In [8]:
# Polars (eager) variant: the Polars implementation applied to the eager DataFrame
bunched_df_pl = is_bunched_pl(locations_pl)

In [9]:
# Polars (lazy) variant: the same function, this time fed the LazyFrame.
# NOTE(review): whether the result is already collected depends on
# is_bunched_pl — confirm before comparing runtimes or using the result.
bunched_df_pl_lazy = is_bunched_pl(locations_pl_lazy)

In [10]:
# Plot a single polling snapshot: the rows that are not bunched are passed to
# position_map to create the figure, then the bunched rows are added to it.
# The timestamp is the request_timestamp of the first rows of bunched_df.
SNAPSHOT_TIMESTAMP = 1695026696

# Filter down to the snapshot once, then split on the boolean `bunched` flag
snapshot_df = bunched_df[bunched_df["request_timestamp"] == SNAPSHOT_TIMESTAMP]
fig = maps.position_map(snapshot_df[~snapshot_df["bunched"]])
maps.add_positions(fig, snapshot_df[snapshot_df["bunched"]])

In [11]:
# Show the pandas result; the notebook display truncates the frame to its
# first and last rows, so this does not print every record.
bunched_df

Unnamed: 0,lat,lon,bearing,speed,request_timestamp,direction_id,route_direction,short_name,bunched
0,-33.806393,150.846405,277.0,13.700000,1695026696,1,Mount Druitt to Eastern Creek via Rooty Hill (...,738,False
1,-33.923782,150.927826,200.0,0.000000,1695026696,1,Liverpool to Parramatta via T-way,T80,True
2,-33.822212,151.191467,151.0,0.000000,1695026696,0,Royal North Shore Hospital to Balmoral,114,False
3,-33.929504,150.983047,258.0,16.400000,1695026696,0,Burwood to Liverpool,M90,False
4,-33.783604,151.280960,156.0,13.100000,1695026696,1,Palm Beach to Manly via Mona Vale & Dee Why,199,False
...,...,...,...,...,...,...,...,...,...
1805151,-33.786228,151.108582,230.0,0.000000,1695337737,1,Macquarie University to Meadowbank Wharf via Ryde,518,False
1805152,-33.797546,151.179077,250.0,0.000000,1695337737,0,Chatswood to Colwell Cres & Beaconsfield Rd,255,False
1805153,-33.821083,151.204681,145.0,16.200001,1695337737,0,City King Street Wharf to Gladesville via Nort...,252,False
1805154,-33.771362,151.088776,230.0,0.000000,1695337737,0,McMahons Point to Epping via North Sydney,291,False
