In [4]:
import pandas as pd
import datacompy

VALIDATIONS = {
    "players": {
        "joining_keys": ["player_id"]
    },
    "player_valuations": {
        "joining_keys": ["player_id", "date"]
    }
}

PROJECTPATH = "/Users/dcereijo/transfermarkt-datasets/"

OLDPATH = f"{PROJECTPATH}/data/prep"
NEWPATH = f"{PROJECTPATH}/data/prep_new"

def load_old(asset_name: str) -> pd.DataFrame:
    return pd.read_csv(
        f"{OLDPATH}/{asset_name}.csv"
    )

def load_new(asset_name: str) -> pd.DataFrame:
    return pd.read_csv(
        f"{NEWPATH}/{asset_name}.csv.gz"
    )

def show_datacompy_diff(asset_name: str):

    old = load_old(asset_name)
    new = load_new(asset_name)

    comparison = datacompy.Compare(
        df1=old,
        df2=new,
        df1_name="pandas",
        df2_name="dbt",
        join_columns=VALIDATIONS[asset_name]["joining_keys"]
    )

    print(comparison.report())

def show_schema_diff(asset_name: str):

    old = load_old(asset_name)
    new = load_new(asset_name)

    print(set(old.columns.values) - set(new.columns.values))

In [5]:
show_datacompy_diff("player_valuations")

DataComPy Comparison
--------------------

DataFrame Summary
-----------------

  DataFrame  Columns    Rows
0    pandas        7  421564
1       dbt        9  404811

Column Summary
--------------

Number of columns in common: 7
Number of columns in pandas but not in dbt: 0
Number of columns in dbt but not in pandas: 2

Row Summary
-----------

Matched on: player_id, date
Any duplicates on match values: No
Absolute Tolerance: 0
Relative Tolerance: 0
Number of rows in common: 401,307
Number of rows in pandas but not in dbt: 20,257
Number of rows in dbt but not in pandas: 3,504

Number of rows with some compared columns unequal: 401,307
Number of rows with all compared columns equal: 0

Column Comparison
-----------------

Number of columns compared with some values unequal: 4
Number of columns compared with all values equal: 3
Total number of values which compare unequal: 458,893

Columns with Unequal Values or Types
------------------------------------

                               

In [6]:
show_schema_diff("player_valuations")

set()
