# Number of unique VINs throughout the transform pipeline
The goal of this notebook is to check where VINs are "lost" in the pipeline.  
To make our estimations as accurate as possible, we have to prune out some parts of the data.  
Sometimes this removes all of the data covering the entire life of a vehicle.  
This, in turn, causes the monitor website to have no results for those vehicles.

## Setup

### Imports

In [None]:
from core.sql_utils import * 
from core.pandas_utils import * 
from transform.fleet_info.main import fleet_info
from pyspark.sql import functions as F


In [None]:
from core.s3.s3_utils import S3Service, S3Settings
from core.spark_utils import create_spark_session
# Settings object exposing the S3 credentials (S3_KEY / S3_SECRET) used below.
# NOTE(review): presumably populated from the environment / a config file — confirm.
settings = S3Settings()

# Spark session created with the S3 key and secret taken from the settings;
# it is reused by every parquet read and by the fleet table conversion further down.
spark = create_spark_session(
    settings.S3_KEY,
    settings.S3_SECRET
)

# S3 helper used by the later cells through `s3.read_parquet_df_spark(...)`.
s3 = S3Service()

## Pipeline analysis

In [None]:
# Makes whose pipeline outputs are inspected. Only "bmw" is enabled right now.
# Currently left out of the run:
#   "ford", "kia", "mercedes_benz", "renault", "tesla_fleet_telemetry", "volvo_cars", "volkswagen"
#   "tesla", "opel", "ds"
MAKES = ["bmw"]

In [None]:
# Reference figure: number of distinct VINs present in the fleet table.
print(fleet_info["vin"].nunique())

# Spark copy of the fleet table with the key column renamed to "VIN",
# which is the column name the joins in the following cells use.
fleet_info_vin_upper = fleet_info.rename(columns={"vin": "VIN"})
fleet_info_spark = spark.createDataFrame(fleet_info_vin_upper)


In [None]:
# Per make: number of distinct VINs in the raw time series that also appear
# in the fleet table (inner join on "VIN").
nunique_vins_in_raw_tss = Series(
    {
        make: (
            s3.read_parquet_df_spark(spark, f'raw_ts/{make}/time_series/raw_ts_spark.parquet')
            .join(fleet_info_spark, on="VIN", how="inner")
            .select(F.countDistinct("VIN").alias("unique_vin_count"))
            .collect()[0][0]  # single row, single column -> the count itself
        )
        for make in MAKES
    }
)

In [None]:
# Per make: number of distinct VINs in the result phases that also appear
# in the fleet table (inner join on "VIN").
nunique_vins_in_result_phases = Series(
    {
        make: (
            s3.read_parquet_df_spark(spark, f'result_phases/result_phases_{make}.parquet')
            .join(fleet_info_spark, on="VIN", how="inner")
            .select(F.countDistinct("VIN").alias("unique_vin_count"))
            .collect()[0][0]  # single row, single column -> the count itself
        )
        for make in MAKES
    }
)

In [None]:
# Per make: number of distinct VINs in the processed phases that also appear
# in the fleet table (inner join on "VIN").
nunique_vins_in_processed_phases = Series(
    {
        make: (
            s3.read_parquet_df_spark(spark, f"processed_phases/processed_phases_{make}.parquet")
            .join(fleet_info_spark, on="VIN", how="inner")
            .select(F.countDistinct("VIN").alias("unique_vin_count"))
            .collect()[0][0]  # single row, single column -> the count itself
        )
        for make in MAKES
    }
)

In [None]:
from core.sql_utils import *
# Every vehicle_data row, LEFT JOINed to its vehicle, vehicle model, OEM and fleet.
# The SQL text is kept verbatim.
vehicle_data_query = text("""SELECT * from vehicle_data
        LEFT join vehicle on vehicle.id = vehicle_data.vehicle_id
        LEFT join vehicle_model on vehicle.vehicle_model_id = vehicle_model.id
        LEFT join oem on vehicle_model.oem_id = oem.id
        LEFT join fleet on vehicle.fleet_id = fleet.id;""")

engine = get_sqlalchemy_engine()
# The connection is only needed for the duration of the read.
with engine.connect() as con:
    dbeaver_df = pd.read_sql(vehicle_data_query, con)



In [None]:
# Distinct VINs per OEM among the rows fetched from the database.
vins_grouped_by_oem = dbeaver_df.groupby("oem_name")["vin"]
nunique_vins_in_vehicle_data = vins_grouped_by_oem.nunique()

In [None]:
# One Series of distinct-VIN counts per stage; the dict keys become the column labels.
vin_counts_by_stage = {
    "vehicle": fleet_info.groupby("make")["vin"].nunique(),
    "raw_tss": nunique_vins_in_raw_tss,
    "processed_phases": nunique_vins_in_processed_phases,
    "results_phases": nunique_vins_in_result_phases,
    "vehicle_data": nunique_vins_in_vehicle_data,
}
# Side-by-side table: one row per index label, one column per stage.
unique_vin_counts = pd.concat(vin_counts_by_stage, axis="columns")

# Append a "total" row holding the sum of each column.
# NOTE(review): if a column contains NaN (a label missing from that stage) it is
# presumably float, so the int cast on the totals likely has no visible effect — confirm.
stage_totals = unique_vin_counts.sum().astype("int")
unique_vin_counts.loc["total"] = stage_totals

unique_vin_counts