In [None]:
from core.pandas_utils import *
from core.s3_utils import S3_Bucket
from core.singleton_s3_bucket import bucket
from core.caching_utils import cache_result
from transform.raw_tss.tesla_raw_tss import get_raw_tss
# Alias of the stored raw time series loader: a later cell rebinds the bare
# name `get_raw_tss` to a notebook-local function with a different signature.
from transform.raw_tss.tesla_raw_tss import get_raw_tss as get_stored_raw_tss

In [None]:
@cache_result("./tesla_response_keys.parquet", "local_storage")
def get_tesla_keys() -> DF:
    """List the response keys stored in the bucket for the "tesla" brand.

    The listing is delegated to ``bucket.list_responses_keys_of_brand``. The
    call is wrapped by ``cache_result`` (presumably persisting the listing to
    ``./tesla_response_keys.parquet`` so later runs can skip the bucket
    listing -- confirm against ``core.caching_utils``).
    """
    tesla_keys = bucket.list_responses_keys_of_brand("tesla")
    return tesla_keys

# Listing of the Tesla response keys, consumed by the following cells.
response_keys = get_tesla_keys()

In [None]:
# Load the already parsed raw time series, requesting only the `vin` and
# `readable_date` columns (the only ones used to find the last parsed date).
# The loader is called through its `get_stored_raw_tss` alias because a later
# cell rebinds `get_raw_tss` to a two-argument function: with the bare name,
# re-running this cell after that definition would raise a TypeError.
raw_tss_subset = get_stored_raw_tss(read_parquet_kwargs={"columns":["vin", "readable_date"]})

In [None]:
# Derive `date` from the file name by dropping its last five characters
# (presumably a ".json" suffix -- confirm against the bucket listing).
response_keys["date"] = response_keys["file"].str.slice(stop=-5)
response_keys

In [None]:
# Latest `readable_date` already present in the parsed raw time series, per VIN.
last_parsed_date = (
    raw_tss_subset
    .groupby("vin", observed=True, as_index=False)
    .agg(last_parsed_date=pd.NamedAgg("readable_date", "max"))
)

# Left merge: VINs that were never parsed get a missing `last_parsed_date`.
keys_with_last_parsed = response_keys.merge(last_parsed_date, how="left", on="vin")

# A comparison against a missing value is always False, so filtering only on
# `date > last_parsed_date` would silently discard every key of a VIN that has
# never been parsed. Those keys are kept explicitly.
# NOTE(review): `date` is a string sliced from the file name; confirm it is
# directly comparable with `readable_date`.
never_parsed = keys_with_last_parsed["last_parsed_date"].isna()
newer_than_parsed = keys_with_last_parsed["date"] > keys_with_last_parsed["last_parsed_date"]
response_keys_to_parse = keys_with_last_parsed[never_parsed | newer_than_parsed]

# Share of the listed responses that still have to be parsed.
len(response_keys_to_parse) / len(response_keys)

In [None]:
# Inspect the response keys selected for parsing.
response_keys_to_parse

In [None]:
# Same display as the previous cell (duplicate inspection of the selected keys).
response_keys_to_parse

In [None]:
# def get_raw_tss(keys:DF) -> DF:
#     return (
#         keys
#         .apply(parse_response_as_raw_ts, axis="columns")
#     )

# def parse_response_as_raw_ts(key: Series) -> DF:
#     # print(key)
#     response = bucket.read_json_file(key["key"])
#     if response is None:
#         print(f"Did not parse key {key['key']} because the object returned by read_json_file was None.")
#         return DF()
#     raw_ts = DF.from_records(response)
#     raw_ts["vin"] = key["vin"]

#     return raw_ts

# raw_tss = get_raw_tss(response_keys_to_parse.iloc[:1000])
# sanity_check(raw_tss)

In [None]:
# import os
# import json
# import logging
# import boto3
# import aioboto3
# import pandas as pd
# from concurrent.futures import ThreadPoolExecutor
# from pandas import DataFrame as DF, Series

# class Async_S3_Bucket:
#     def __init__(self, creds: dict[str, str] = None):
#         """Initialize S3 client with given credentials and environment variables."""
#         assert "S3_ENDPOINT" in os.environ, "S3_ENDPOINT variable is not in the environment."
        
#         if creds is None:
#             creds = S3_Bucket.get_creds_from_dot_env()
#         self.creds = creds

#         self._s3_client = boto3.client(
#             "s3",
#             region_name="fr-par",
#             endpoint_url=os.getenv("S3_ENDPOINT"),
#             aws_access_key_id=creds["aws_access_key_id"],
#             aws_secret_access_key=creds["aws_secret_access_key"],
#         )
#         self.bucket_name = creds["bucket_name"]
#         self.logger = logging.getLogger("S3_BUCKET")

#     def read_json_file(self, key: str):
#         """Reads a single JSON file from S3."""
#         try:
#             response = self._s3_client.get_object(Bucket=self.bucket_name, Key=key)
#             object_content = response["Body"].read().decode("utf-8")
#             return json.loads(object_content)
#         except Exception as e:
#             self.logger.error(f"Failed to read key {key}: {e}")
#             return None

#     def read_multiple_json_files(self, keys: list, max_workers=32):
#         """Reads multiple JSON files concurrently using ThreadPoolExecutor."""
#         with ThreadPoolExecutor(max_workers=max_workers) as executor:
#             results = list(executor.map(self.read_json_file, keys))
#         return results


In [None]:
def get_raw_tss(keys: DF, bucket: S3_Bucket) -> DF:
    """Fetch the JSON responses listed in ``keys`` and stack them into one frame.

    Note: this definition rebinds the name ``get_raw_tss`` that the notebook's
    import cell brings in from ``transform.raw_tss.tesla_raw_tss``.

    Args:
        keys: Frame with a ``key`` column (object keys handed to the bucket)
            and a ``vin`` column (value stamped on the matching parsed rows).
        bucket: Object exposing ``read_multiple_json_files``. It is assumed to
            return one entry per key, in the same order, with ``None`` for a
            response that could not be read -- TODO confirm.

    Returns:
        The parsed responses concatenated with a fresh index, or an empty
        frame when no response could be parsed.
    """
    responses = bucket.read_multiple_json_files(keys["key"].tolist())

    frames = []
    for vin, payload in zip(keys["vin"], responses):
        # Entries that came back as None are skipped.
        if payload is None:
            continue
        frames.append(parse_response_as_raw_ts(payload, vin))

    if not frames:
        return DF()
    return concat(frames, ignore_index=True)

def parse_response_as_raw_ts(response: dict, vin: str) -> DF:
    """Turn one JSON response into a frame and tag every row with its VIN.

    Args:
        response: Decoded JSON payload, passed as-is to ``DF.from_records``.
        vin: Value written to the ``vin`` column of every row.

    Returns:
        The records as a frame with a ``vin`` column set to ``vin``.
    """
    return DF.from_records(response).assign(vin=vin)

# Parsed frames of the newly processed weeks, reused by the final merge cell.
new_week_raw_tss = []

# Parse the selected responses one calendar week at a time, so that each week
# is written to its own parquet file.
keys_by_week = (
    response_keys_to_parse
    .astype({"date":"datetime64[ms]"})
    .groupby(pd.Grouper(key='date', freq='W'))
)
for week, keys in keys_by_week:
    # A time-based Grouper also yields the weeks that contain no key at all;
    # skip them instead of writing an empty parquet file.
    if keys.empty:
        continue
    print(keys.shape)
    raw_tss = get_raw_tss(keys, bucket)
    # get_raw_tss returns an empty frame when no response could be parsed.
    if raw_tss.empty:
        continue
    bucket.save_df_as_parquet(raw_tss, f"raw_ts/tesla/tesla_raw_tss_{week.date()}.parquet")
    new_week_raw_tss.append(raw_tss)

In [None]:
# Merge the already stored raw time series with the newly parsed weeks.
# The stored data is loaded through the `get_stored_raw_tss` alias: by the time
# this cell runs, the bare name `get_raw_tss` refers to the notebook-local
# function that requires `keys` and `bucket`, so calling it with no argument
# raises a TypeError.
new_raw_tss = concat((get_stored_raw_tss(), *new_week_raw_tss))
# NOTE(review): this key starts with "/" and uses the "raw_tss" prefix, whereas
# the weekly files are written under "raw_ts/tesla/..." without a leading
# slash -- confirm the intended destination.
bucket.save_df_as_parquet(new_raw_tss, "/raw_tss/tesla/new_raw_tss.parquet")