In [1]:
import pandas as pd

##### Checking if there are conflicting `make`, `model`, `year` and `body` assigned to the same VIN (VIN without the serial number part)

In [2]:
# Load the raw challenge data, discard fully duplicated rows and order the rest by VIN.
# NOTE(review): nothing in this cell strips the serial-number part of the VIN; the VINs
# shown below already end in a redacted "XXXXX" block — confirm that comes from the raw file.
df = (
    pd.read_csv("../data/raw/ml-engineer-challenge-redacted-data.csv")
    .drop_duplicates()
    .sort_values(by="vin")
)
df

Unnamed: 0,vin,make,model,year,body
1882,3MW5R1J0XM8CXXXXX,BMW,3 Series,2021.0,
2930,4USBT33443LRXXXXX,BMW,Z4,2003.0,
1622,5UXCR4C06M9FXXXXX,BMW,X5,2021.0,
2211,5UXCY6C04P9PXXXXX,BMW,,,
2210,5UXCY6C04P9PXXXXX,BMW,X6,2023.0,SUV
...,...,...,...,...,...
2109,WUAZZZFX9J79XXXXX,Audi,R8,2018.0,
2115,WUAZZZFX9J79XXXXX,Audi,R8,,
2112,WUAZZZFX9J79XXXXX,Audi,R8,,crossover
1059,WUAZZZGY4NA9XXXXX,Audi,,,


In [3]:
from pandas import DataFrame
from typing import Callable, List

def check_for_conflicts(df: DataFrame, col: str, gr_col: str = "vin") -> "pd.core.groupby.DataFrameGroupBy":
    """Report whether any `gr_col` value is paired with more than one non-null `col` value.

    The frame is reduced to distinct (`gr_col`, `col`) pairs, rows where `col` is
    null are dropped, and the remainder is grouped by `gr_col`. A group holding
    more than one distinct `col` value is a conflict.

    Prints "Conflicts found!" when at least one group conflicts, otherwise
    "There are no conflicts!".

    Args:
        df: Frame containing at least the columns `gr_col` and `col`.
        col: Column whose values are checked for disagreement.
        gr_col: Column identifying the entity the values belong to.

    Returns:
        The groupby over ALL distinct non-null pairs (not only the conflicting
        groups). Keys whose `col` is always null do not appear in it.
    """
    # Column selection and drop_duplicates both return new frames, so no copy is needed.
    distinct_pairs = df[[gr_col, col]].drop_duplicates()
    non_null_pairs = distinct_pairs[distinct_pairs[col].notnull()]
    grouped = non_null_pairs.groupby(gr_col)

    # Vectorised per-group distinct count; avoids building the filtered frame
    # just to test whether it is empty.
    has_conflicts = bool((grouped[col].nunique() > 1).any())
    if has_conflicts:
        print("Conflicts found!")
    else:
        print("There are no conflicts!")
    return grouped


def get_conflicting_vins(df: DataFrame, col: str, func_list: List[Callable | str] = [set, 'count']) -> DataFrame:
    result = df.agg(func_list).rename(columns={"shortened_vin": "count"})
    result.columns = result.columns.to_flat_index().str.join("_")
    result = result[result[col + "_count"] > 1].drop(columns=[col + "_count"])
    return result.reset_index()


def display_df(df: DataFrame) -> None:
    """Show `df` with pandas' row and column display limits lifted for this call only."""
    unlimited = ("display.max_rows", None, "display.max_columns", None)
    # option_context restores the previous display settings when the block exits.
    with pd.option_context(*unlimited):
        display(df)


##### Checking if there are conflicting make values

In [4]:
# Group the distinct non-null (vin, make) pairs and report whether any VIN has several makes.
col = "make"
vin_make_conflicts_df = check_for_conflicts(df, col)

There are no conflicts!


##### Checking if there are conflicting models

In [5]:
# Group the distinct non-null (vin, model) pairs and report whether any VIN has several models.
col = "model"
vin_model_conflicts_df = check_for_conflicts(df, col)

Conflicts found!


These VINs should be flagged for human inspection and, until then, should not be included in the training of the vehicle model classifier!

In [6]:
# The column name is passed explicitly instead of through the global `col`, so this
# cell still works when re-run after a later section has reassigned `col`.
conflicting_models_df = get_conflicting_vins(df=vin_model_conflicts_df, col="model")
display_df(conflicting_models_df)

Unnamed: 0,vin,model_set
0,5UXFE43568L0XXXXX,"{X5, X Series}"
1,5UXFG43578L2XXXXX,"{X6, X Series}"
2,5UXTS3C57K0ZXXXXX,"{X3, X Series}"
3,WA1B4AFY6L20XXXXX,"{SQ5, Q5}"
4,WAURV68T59A0XXXXX,"{A5, S5}"
5,WAUZZZ44ZJN2XXXXX,"{A6, 100}"
6,WAUZZZ4G1EN1XXXXX,"{A6, A7, allroad}"
7,WAUZZZ4G9DN0XXXXX,"{A6, A7}"
8,WAUZZZ4GXHN1XXXXX,"{A6, A7}"
9,WAUZZZ8V2DA0XXXXX,"{A4, A3}"


Splitting the whole dataset into good and bad records. Bad records are the ones that require human intervention. We'll use records only from files with the `_good.csv` suffix.

In [7]:
# Rows whose VIN is NOT among the VINs with conflicting models.
vin_model_pairs_good = df.loc[~df["vin"].isin(conflicting_models_df["vin"])]
vin_model_pairs_good.to_csv(
    "../data/data_validation/vin_model_pairs_good.csv",
    index=False,
    header=True,
)

In [8]:
# Rows whose VIN is among the VINs with conflicting models; these await manual review.
vin_model_pairs_bad = df.loc[df["vin"].isin(conflicting_models_df["vin"])]
vin_model_pairs_bad.to_csv(
    "../data/data_validation/vin_model_pairs_bad.csv",
    index=False,
    header=True,
)

##### Checking if there are conflicting year values

In [9]:
# Group the distinct non-null (vin, year) pairs and report whether any VIN has several years.
col = "year"
vin_year_conflicts_df = check_for_conflicts(df, col)

Conflicts found!


These VINs should be flagged for human inspection and, until then, should not be included in the training of the vehicle year classifier!

In [10]:
# The column name is passed explicitly instead of through the global `col`, so this
# cell still works when re-run after another section has reassigned `col`.
conflicting_year_df = get_conflicting_vins(df=vin_year_conflicts_df, col="year")
display_df(conflicting_year_df)

Unnamed: 0,vin,year_set
0,WAUZZZ4G4GN0XXXXX,"{2016.0, 2015.0}"
1,WAUZZZ8U6ER1XXXXX,"{2014.0, 2015.0}"


Splitting the whole dataset into good and bad records. Bad records are the ones that require human intervention. We'll use records only from files with the `_good.csv` suffix.

In [11]:
# Rows whose VIN is NOT among the VINs with conflicting years.
vin_year_pairs_good = df.loc[~df["vin"].isin(conflicting_year_df["vin"])]
vin_year_pairs_good.to_csv(
    "../data/data_validation/vin_year_pairs_good.csv",
    index=False,
    header=True,
)

In [12]:
# Rows whose VIN is among the VINs with conflicting years; these await manual review.
vin_year_pairs_bad = df.loc[df["vin"].isin(conflicting_year_df["vin"])]
vin_year_pairs_bad.to_csv(
    "../data/data_validation/vin_year_pairs_bad.csv",
    index=False,
    header=True,
)

##### Checking if there are conflicting body values

In [13]:
# Group the distinct non-null (vin, body) pairs and report whether any VIN has several bodies.
col = "body"
vin_body_conflicts_df = check_for_conflicts(df, col)

Conflicts found!


These VINs should be flagged for human inspection and, until then, should not be included in the training of the vehicle body set classifier!

In [14]:
# The column name is passed explicitly instead of through the global `col`, so this
# cell still works when re-run after another section has reassigned `col`.
conflicting_body_df = get_conflicting_vins(df=vin_body_conflicts_df, col="body")
display_df(conflicting_body_df)

Unnamed: 0,vin,body_set
0,WA1LFAFP3FA0XXXXX,"{multi-purpose vehicle, SUV}"
1,WAUZZZ4F17N0XXXXX,"{estate, sedan}"
2,WAUZZZ4G1EN1XXXXX,"{estate, hatchback}"
3,WAUZZZ4G9HN0XXXXX,"{van, estate}"
4,WAUZZZ4L98D0XXXXX,"{van, SUV}"
5,WAUZZZ4M1JD0XXXXX,"{van, SUV}"
6,WAUZZZ4M3GD0XXXXX,"{crossover, SUV}"
7,WAUZZZ4M3KD0XXXXX,"{van, SUV}"
8,WAUZZZ8K4DA2XXXXX,"{estate, SUV}"
9,WAUZZZ8K9DA1XXXXX,"{estate, hatchback}"


Splitting the whole dataset into good and bad records. Bad records are the ones that require human intervention. We'll use records only from files with the `_good.csv` suffix.

In [15]:
# Rows whose VIN is NOT among the VINs with conflicting body values.
vin_body_pairs_good = df.loc[~df["vin"].isin(conflicting_body_df["vin"])]
vin_body_pairs_good.to_csv(
    "../data/data_validation/vin_body_pairs_good.csv",
    index=False,
    header=True,
)

In [16]:
# Rows whose VIN is among the VINs with conflicting body values; these await manual review.
vin_body_pairs_bad = df.loc[df["vin"].isin(conflicting_body_df["vin"])]
vin_body_pairs_bad.to_csv(
    "../data/data_validation/vin_body_pairs_bad.csv",
    index=False,
    header=True,
)