In [244]:
import pandas as pd
import os
from pathlib import Path
from datetime import datetime, timezone, timedelta
import pytz
from timezonefinder import TimezoneFinder
from tqdm.autonotebook import tqdm as notebook_tqdm
notebook_tqdm.pandas()

In [245]:
# Directory (relative to the notebook's working directory) that holds the input
# parquet files read below and receives the parquet files written at the end.
CACHE_DIR = Path("../cache")

In [246]:
# Files this notebook itself writes into CACHE_DIR in later cells. They carry
# extra columns and an already-converted timestamp, so reading them back as raw
# input on a re-run would corrupt the combined frame.
DERIVED_OUTPUTS = {"combined.parquet", "left.parquet", "right.parquet"}

# os.listdir returns names in arbitrary order; sorting makes the load order
# (and therefore the concatenation order) deterministic between runs.
dfs = [
    pd.read_parquet(CACHE_DIR / name)
    for name in sorted(os.listdir(CACHE_DIR))
    if name.endswith(".parquet") and name not in DERIVED_OUTPUTS
]

In [247]:
assert len(dfs) > 0, "no input parquet files were loaded"

# Concatenate all chunks in one call instead of growing the frame pairwise
# (which re-copies the accumulated rows on every iteration). pd.concat also
# returns a fresh frame, so the entries of `dfs` are never mutated and this
# cell can be re-run safely even when only one file was loaded.
df = (
    pd.concat(dfs, axis=0)
    .sort_values(by="timestamp", ascending=True)
    .reset_index(drop=True)
)

# The raw timestamp column is interpreted as epoch seconds and converted to
# timezone-aware UTC datetimes.
df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True, unit="s")

### Prepare the timezone calculation

In [248]:
tzf = TimezoneFinder()

# Reference instant: the last second of 2023, once as a naive wall-clock value
# (used for the UTC-offset lookup) and once labelled as UTC (used for filtering).
new_year = datetime(2023, 12, 31, 23, 59, 59)
new_year_utc = new_year.replace(tzinfo=timezone.utc)


def determine_timezone(r):
    """Return the UTC offset in effect at the position described by ``r``.

    Despite the name, the result is an offset, not a timezone object.

    Parameters
    ----------
    r : mapping-like
        Must provide ``'latitude'`` and ``'longitude'`` entries.

    Returns
    -------
    The value of ``utcoffset(new_year)`` for the pytz zone found at that
    position, i.e. the offset that applies at local 2023-12-31 23:59:59.
    """
    zone_name = tzf.timezone_at(lat=r['latitude'], lng=r['longitude'])
    zone = pytz.timezone(zone_name)
    return zone.utcoffset(new_year)

### Calculate the timezone offset for each flight position and keep only points whose local time is before New Year

In [249]:
# Look up the UTC offset for every position. Only the two coordinate columns
# are handed to the row-wise apply, so each row is a small float Series rather
# than a full object Series. The former engine='python' /
# engine_kwargs={"parallel": True} arguments were dropped: engine_kwargs is
# only honoured by the numba engine, so they never parallelised anything, and
# older pandas versions forward them to the applied function and fail.
df["tzoffset"] = df[["latitude", "longitude"]].progress_apply(determine_timezone, axis=1)

# "Local time" here is the UTC timestamp shifted by the offset; it keeps its
# UTC label, which is why it is compared against the UTC-labelled cutoff.
df["localtime"] = df["timestamp"] + df["tzoffset"]

# NOTE(review): the cutoff is 23:59:59 with a strict '<', so samples at exactly
# 23:59:59 local time are dropped -- confirm this is intended.
df = df[df["localtime"] < new_year_utc]

100%|██████████| 308904/308904 [00:20<00:00, 14744.90it/s]


### For each flight, select the point with the latest local time

In [250]:
# For every flight, find the index label of its row with the greatest
# localtime, then keep exactly those rows (one per flight).
idxes = df.groupby("flightid")[["localtime"]].idxmax()
df = df.loc[idxes["localtime"]]

### Filter out uninteresting planes (on the ground, or without a known origin or destination)

In [256]:
# Display the current frame for inspection before filtering.
df

Unnamed: 0,timestamp,flightid,latitude,longitude,track,altitude,ground_speed,on_ground,callsign,source,registration,origin,destination,typecode,eta,vertical_speed,squawk,position_buffer,tzoffset,localtime
172236,2023-12-31 22:59:08+00:00,862168858,45.807999,8.772266,146,787,0,False,LILC,2,LILC,,,GLID,0,0,0,[],0 days 01:00:00,2023-12-31 23:59:08+00:00
215256,2024-01-01 02:59:50+00:00,862224180,-33.377903,-70.579254,61,2250,1,False,STM32,2,STM32,,,GLID,0,0,0,[],-1 days +21:00:00,2023-12-31 23:59:50+00:00
172982,2023-12-31 22:59:52+00:00,862234339,49.806431,19.000584,121,1318,1,False,SPABB2,2,SP-ABB2,,,GLID,0,0,0,[],0 days 01:00:00,2023-12-31 23:59:52+00:00
172669,2023-12-31 22:59:41+00:00,862403967,51.803802,9.277600,217,1541,1,False,WINDRAD,2,WINDRAD,,,GLID,0,0,0,[],0 days 01:00:00,2023-12-31 23:59:41+00:00
108618,2023-12-31 17:59:42+00:00,862545510,47.167969,22.855984,331,908,1,False,HA8107,2,HA-8107,,,TRIN,0,0,0,[],0 days 02:00:00,2023-12-31 19:59:42+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282274,2024-01-01 09:59:57+00:00,863244255,21.297958,-157.865143,51,550,88,False,N4230H,0,N4230H,HNL,,R44,0,0,0,[],-1 days +14:00:00,2023-12-31 23:59:57+00:00
279209,2024-01-01 09:59:36+00:00,863244950,21.333334,-158.066666,181,2100,80,False,N19867,3,N19867,HNL,,C172,0,0,0,[],-1 days +14:00:00,2023-12-31 23:59:36+00:00
301253,2024-01-01 11:59:56+00:00,863245468,-3.757401,-173.889572,31,37004,502,False,FJI810,5,DQ-FAI,NAN,LAX,A359,0,0,0,[],-1 days +12:00:00,2023-12-31 23:59:56+00:00
299510,2024-01-01 11:59:52+00:00,863247483,-7.277201,-173.875977,39,37004,503,False,FJI880,5,DQ-FAM,NAN,YVR,A359,0,0,0,[],-1 days +12:00:00,2023-12-31 23:59:52+00:00


In [266]:
# Keep only rows that are not on the ground and have both a non-empty origin
# and a non-empty destination.
df = df.loc[
    ~df["on_ground"]
    & df["origin"].ne('')
    & df["destination"].ne('')
]

In [None]:
# Persist the filtered per-flight frame. The path is built from CACHE_DIR, like
# the other outputs of this notebook, instead of repeating the directory as a
# string literal.
df.to_parquet(CACHE_DIR / "combined.parquet")

In [272]:
# Independent (deep) snapshot of the filtered frame.
dfc = df.copy(deep=True)

### Split all planes into two geozones by the sign of their UTC offset

In [288]:
# Negative UTC offsets go to the "left" frame; zero and positive offsets go to
# the "right" frame.
dfleft = df[df["tzoffset"] < pd.Timedelta(0)]
dfright = df[df["tzoffset"] >= pd.Timedelta(0)]

In [293]:
# Write each half to its own parquet file inside CACHE_DIR.
for frame, filename in ((dfleft, "left.parquet"), (dfright, "right.parquet")):
    frame.to_parquet(CACHE_DIR / filename)