# Number of unique VINs throughout the transform pipeline
The goal of this notebook is to check where VINs are "lost" in the pipeline.  
To make our estimations as accurate as possible, we have to prune out some parts of the data.  
Sometimes this removes all of the data for the entire life of a vehicle.  
This, in turn, causes the monitor website to have no results for those vehicles.

## Setup

### Imports

In [None]:
import json
from os.path import exists

from core.sql_utils import * 
from core.pandas_utils import * 
from transform.fleet_info.main import fleet_info
from transform.processed_tss.config import *
from transform.raw_tss.main import get_raw_tss
from core.singleton_s3_bucket import bucket

## Pipeline analysis

In [None]:
# Baseline for the comparison: number of distinct VINs listed in fleet_info.
fleet_info.loc[:, "vin"].nunique()

In [None]:
# Map each make to the distinct VINs found in its raw time series.
# Only the "vin" column is requested from the parquet reader.
# NOTE(review): uniques_as_series presumably returns a Series of distinct values
# (the next cell calls .isin/.nunique on it) — confirm in core.pandas_utils.
unique_vins_in_raw_tss = {
    make: uniques_as_series(
        get_raw_tss(make, read_parquet_kwargs={"columns": ["vin"]})["vin"]
    )
    for make in ALL_MAKES
}

In [None]:
# Per make: how many of the raw time series VINs are also present in fleet_info.
nunique_vins_in_raw_tss = Series(
    {
        make: raw_vins.loc[raw_vins.isin(fleet_info["vin"])].nunique()
        for make, raw_vins in unique_vins_in_raw_tss.items()
    }
)
nunique_vins_in_raw_tss

In [None]:
# Makes whose raw results parquet file is read from the bucket below.
MAKES_WITH_RAW_RES = [
    "bmw",
    "ford",
    "kia",
    "mercedes-benz",
    "renault",
    "tesla",
    "volvo-cars",
]
# Per make: distinct VINs in the raw results that are also present in fleet_info.
nunique_vins_in_raw_results = Series(
    {
        make: (
            bucket.read_parquet_df(f"raw_results/{make}.parquet")
            .query("vin in @fleet_info.vin")
            ["vin"]
            .nunique()
        )
        for make in MAKES_WITH_RAW_RES
    }
)
nunique_vins_in_raw_results

In [None]:
# Distinct VINs per OEM among activated 'Ayvens' vehicles that have at least one
# vehicle_data row (the inner join on vehicle_data drops vehicles without any).
# Only the two columns used below are fetched: `select *` over the five joined
# tables transferred every column of every vehicle_data row just to count VINs.
# `distinct` does not change the result, since nunique() ignores repeated pairs.
# NOTE(review): oem_name and vin are left unqualified, mirroring the column names
# the pandas code selects; qualify them (presumably oem.oem_name / vehicle.vin)
# if the database reports either as ambiguous.
nuniques_vins_in_vehicle_data = (
    pd.read_sql_query(
        """
        select distinct oem_name, vin from vehicle_data
        join vehicle on vehicle.id = vehicle_data.vehicle_id
        join vehicle_model on vehicle.vehicle_model_id = vehicle_model.id
        join oem on vehicle_model.oem_id = oem.id
        join fleet on vehicle.fleet_id = fleet.id
        where fleet.fleet_name = 'Ayvens' and vehicle.activation_status
        """,
        con
    )
    .groupby("oem_name")
    ["vin"]
    .nunique()
)
nuniques_vins_in_vehicle_data

In [None]:
# Side-by-side table of distinct VIN counts at each pipeline stage, one row per
# make/OEM plus a final "total" row.
stage_counts = pd.concat(
    {
        "vehicle": fleet_info.groupby("oem_name")["vin"].nunique(),
        "raw_tss": nunique_vins_in_raw_tss,
        "raw_results": nunique_vins_in_raw_results,
        "processed_results": nuniques_vins_in_vehicle_data,
    },
    axis="columns",
)
# Column sums skip the NaN left by makes that are absent from a given stage.
stage_counts.loc["total"] = stage_counts.sum()
# Any column with a missing make is float, so casting only the totals to int was
# undone on assignment. Cast the finished table to nullable Int64 instead: counts
# render as integers and absent stages show as <NA>.
unique_vin_counts = stage_counts.astype("Int64")
unique_vin_counts