In [None]:
import logging
from datetime import datetime as DT
from datetime import timedelta as TD
from dateutil import parser

from rich import print
import pandas as pd
from pandas import Series
from pandas import DataFrame as DF
import plotly.express as px

from core.s3_utils import S3_Bucket
from jobs.base_jobs.job_interval import Jobinterval
from core.constants import *
from core.time_series_processing import preprocess_date, estimate_dummy_soh
from jobs.high_mobility.constants import *

In [None]:
# Build, for every handled brand, a DataFrame that maps each processed time series
# S3 key ("key") to the VIN encoded in its file name ("vin").
bucket = S3_Bucket()
keys = {}

for brand in HM_HANDLED_BRANDS:
    brand_keys = Series(bucket.list_keys(f"processed_ts/{brand}/time_series/"), dtype="string")
    # Only retain .parquet files.
    # The filter runs *before* the emptiness check: a folder that only holds
    # non-parquet objects must be skipped too, otherwise the VIN extraction
    # below fails because the split produces no path-component columns.
    brand_keys = brand_keys[brand_keys.str.endswith(".parquet")]
    if brand_keys.empty:
        print(f"""
            No time series found in the 'processed_ts/{brand}/time_series/' folder.
            No processed time series have been generated.
        """)
        continue
    # The VIN is taken to be the 4th "/"-separated component of the key
    # (the file name right under the listing prefix), up to its first ".".
    brand_keys = DF({
        "key": brand_keys,
        "vin": brand_keys.str.split("/", expand=True)[3].str.split(".", expand=True)[0],
    })
    keys[brand] = brand_keys

keys

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
def _max_odometer(ts_key):
    """Read the parquet time series stored under `ts_key` and return the max of its "odometer" column."""
    return bucket.read_parquet_df(ts_key)["odometer"].max()


# One Series of per-file maximum odometer values for each brand, indexed like that brand's key table.
max_odo_dict = {}
for brand, brand_keys in keys.items():
    max_odo_dict[brand] = brand_keys["key"].apply(_max_odometer)

In [None]:
# Stack the per-brand series into one long frame: the brand becomes a regular
# column and the values column (named "key" after the series it was derived
# from) is renamed to "odometer".
max_odos = (
    pd.concat(max_odo_dict, keys=max_odo_dict.keys(), names=["brand"])
    .reset_index(level=0, drop=False)
    .rename(columns={"key": "odometer"})
)
max_odos

In [None]:
# Distribution of the maximum odometer value per time series, one colour per brand.
# `x` is passed explicitly instead of relying on plotly's wide-form inference, so the
# axis is labelled after the plotted column rather than the generic "value".
px.histogram(
    max_odos,
    x="odometer",
    color="brand",
    opacity=0.6,
    title="Maximum odometer per time series, by brand",
    labels={"odometer": "max odometer"},
)