In [17]:
import pandas as pd
import datacompy

# Join keys used to align rows of the old and new version of each asset.
# The top-level key is the asset name, which is also the CSV file stem.
VALIDATIONS = {
    "players": {
        "joining_keys": ["player_id"]
    },
    "player_valuations": {
        "joining_keys": ["player_id", "date"]
    }
}

# NOTE(review): machine-specific absolute path; point it at your local checkout.
PROJECTPATH = "/Users/dcereijo/transfermarkt-datasets/"

# PROJECTPATH already ends with "/", so drop it before appending sub-directories;
# otherwise the derived paths contain a doubled separator
# (".../transfermarkt-datasets//data/prep").
_PROJECT_ROOT = PROJECTPATH.rstrip("/")

OLDPATH = f"{_PROJECT_ROOT}/data/prep"
NEWPATH = f"{_PROJECT_ROOT}/data/prep_new"

def load_old(asset_name: str) -> pd.DataFrame:
    """Read the old version of an asset.

    Args:
        asset_name: File stem of the CSV to read (no extension).

    Returns:
        The contents of ``<OLDPATH>/<asset_name>.csv`` as a DataFrame,
        parsed with pandas' default ``read_csv`` settings.
    """
    csv_path = f"{OLDPATH}/{asset_name}.csv"
    return pd.read_csv(csv_path)

def load_new(asset_name: str) -> pd.DataFrame:
    """Read the new version of an asset.

    Args:
        asset_name: File stem of the CSV to read (no extension).

    Returns:
        The contents of ``<NEWPATH>/<asset_name>.csv`` as a DataFrame,
        parsed with pandas' default ``read_csv`` settings.
    """
    csv_path = f"{NEWPATH}/{asset_name}.csv"
    return pd.read_csv(csv_path)

def show_datacompy_diff(asset_name: str):
    """Print a datacompy comparison of the old and new versions of an asset.

    Both versions are read through ``load_old`` / ``load_new`` and joined on
    the keys registered for ``asset_name`` in ``VALIDATIONS``. In the report
    the old frame is labelled "pandas" and the new frame "dbt".

    Args:
        asset_name: Key into ``VALIDATIONS``; also passed to the loaders.

    Raises:
        KeyError: If ``asset_name`` has no entry in ``VALIDATIONS``.
    """
    # Load both frames before looking up the join keys, so failures surface
    # in the order: old file, new file, missing validation entry.
    previous = load_old(asset_name)
    current = load_new(asset_name)
    keys = VALIDATIONS[asset_name]["joining_keys"]

    comparison = datacompy.Compare(
        df1=previous,
        df2=current,
        df1_name="pandas",
        df2_name="dbt",
        join_columns=keys,
    )

    # html_file is datacompy's option for also saving the report as HTML.
    html_report = f"{PROJECTPATH}/notebooks/{asset_name}.html"
    report_text = comparison.report(html_file=html_report)
    print(report_text)

def show_schema_diff(asset_name: str):
    """Print the columns that exist in the old asset but not in the new one.

    The difference is one-directional: columns that only exist in the new
    version are not reported.

    Args:
        asset_name: Name passed to ``load_old`` and ``load_new``.
    """
    old_columns = set(load_old(asset_name).columns.values)
    new_columns = set(load_new(asset_name).columns.values)

    only_in_old = old_columns - new_columns
    print(only_in_old)

In [18]:
# Compare the old and new "player_valuations" CSVs and print the datacompy report.
show_datacompy_diff("player_valuations")

DataComPy Comparison
--------------------

DataFrame Summary
-----------------

  DataFrame  Columns    Rows
0    pandas        7  421564
1       dbt        5  405510

Column Summary
--------------

Number of columns in common: 5
Number of columns in pandas but not in dbt: 2
Number of columns in dbt but not in pandas: 0

Row Summary
-----------

Matched on: player_id, date
Any duplicates on match values: Yes
Absolute Tolerance: 0
Relative Tolerance: 0
Number of rows in common: 404,447
Number of rows in pandas but not in dbt: 17,117
Number of rows in dbt but not in pandas: 1,063

Number of rows with some compared columns unequal: 128,964
Number of rows with all compared columns equal: 275,483

Column Comparison
-----------------

Number of columns compared with some values unequal: 3
Number of columns compared with all values equal: 2
Total number of values which compare unequal: 253,903

Columns with Unequal Values or Types
------------------------------------

                        

In [19]:
# List the columns found in the old "player_valuations" CSV but not in the new one.
show_schema_diff("player_valuations")

{'dateweek', 'datetime'}
