In [1]:
import sys
from datagit.connectors.github_connector import GithubConnector
from dotenv import load_dotenv
from github import Github
import os

# Load environment variables (GH_TOKEN, optionally REPO) from ../.env.
load_dotenv("../.env")

gh_token = os.getenv("GH_TOKEN")
if gh_token is None:
    # Raise instead of print + exit(1): inside a notebook kernel `exit(...)`
    # does not reliably abort the running cell, so the client below could
    # still be built with a missing token. An exception stops the cell here.
    raise RuntimeError(
        "GitHub token not found! Create a .env file at the root with a GH_TOKEN variable."
    )

github_client = Github(gh_token, timeout=60)
# Fall back to a placeholder repository name when REPO is unset or empty.
repo_name = os.getenv("REPO") or "gh_org/repo"
github_connector = GithubConnector(
    github_client=github_client,
    github_repository_name=repo_name,
    assignees=["Sammy"],
)


In [2]:
import sys
# Add the parent directory to the module search path.
# NOTE(review): the first cell already imports from `datagit` before this
# line runs — confirm the package resolves there on a fresh kernel.
sys.path.append('..')
import importlib
# Import each module and immediately reload it; presumably so that local
# edits to `datagit` are picked up without restarting the kernel — confirm.
import datagit.drift_evaluator.drift_evaluators
importlib.reload(datagit.drift_evaluator.drift_evaluators)

import datagit.connectors.github_connector
importlib.reload(datagit.connectors.github_connector)
# NOTE(review): `datagit.connectors.workflow` itself is not reloaded here.
from datagit.connectors.workflow import snapshot_table
import pandas as pd

## Test with file already existing and splitting new data and historical data

def formatDF(dict):
    df = pd.DataFrame(dict)
    df['unique_key'] = df.apply(lambda row: row['date'] + '-' + row['name'], axis=1)
    column_order = ['unique_key'] + [col for col in df.columns if col != 'unique_key']
    df = df.reindex(columns=column_order)
    return df


# Path of the snapshotted table; reused by every snapshot call below.
table_name = "path/to/metric_name_13.csv"

# First snapshot: three rows spread over 2022-12 and 2023-01.
dataMonth1 = {
    "name": ["Alice", "Bob", "Charlie"],
    "date": ["2022-12", "2023-01", "2023-01"],
    "age": [25, 30, 35],
}
snapshot_table(
    connector=github_connector,
    table_dataframe=formatDF(dataMonth1),
    table_name=table_name,
)

Table found, updating it
common_rows_initial
                    name        date age
unique_key                              
2022-12-Alice      Alice  2022-12-01  25
2023-01-Bob          Bob  2023-01-01  30
2023-01-Charlie  Charlie  2023-01-01  36
common_rows_final
                    name        date age
unique_key                              
2022-12-Alice      Alice  2022-12-01  25
2023-01-Bob          Bob  2023-01-01  30
2023-01-Charlie  Charlie  2023-01-01  35
Change detected
Update: DRIFT
https://github.com/Samox/data-history/commit/5d713db09510815679643807d8d0ce9c3fa7aab1


In [3]:
# Second snapshot: the three earlier rows are unchanged and three rows
# dated 2023-02 are added.
dataMonth2 = {
    "name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine"],
    "date": ["2022-12", "2023-01", "2023-01", "2023-02", "2023-02", "2023-02"],
    "age": [25, 30, 35, 40, 40, 40],
}
snapshot_table(
    connector=github_connector,
    table_dataframe=formatDF(dataMonth2),
    table_name=table_name,
)

Table found, updating it
Change detected
Update: NEW DATA
https://github.com/Samox/data-history/commit/5ace84b7bfc248ea0a1a00478909c4bd2db0f32a


In [4]:
# Third snapshot: adds three rows dated 2023-03 and changes two existing
# values: Charlie (2023-01) 35 -> 36 and Philipe (2023-02) 40 -> 42.
dataMonth3 = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 36, 40, 42, 40, 45, 45, 46],
}
snapshot_table(
    connector=github_connector,
    table_dataframe=formatDF(dataMonth3),
    table_name=table_name,
)

Table found, updating it
common_rows_initial
                    name        date age
unique_key                              
2022-12-Alice      Alice  2022-12-01  25
2023-01-Bob          Bob  2023-01-01  30
2023-01-Charlie  Charlie  2023-01-01  35
2023-02-Antoine  Antoine  2023-02-01  40
2023-02-Didier    Didier  2023-02-01  40
2023-02-Philipe  Philipe  2023-02-01  40
2023-03-Clement  Clement  2023-03-01  45
2023-03-Cyril      Cyril  2023-03-01  45
2023-03-Victor    Victor  2023-03-01  46
common_rows_final
                    name        date age
unique_key                              
2022-12-Alice      Alice  2022-12-01  25
2023-01-Bob          Bob  2023-01-01  30
2023-01-Charlie  Charlie  2023-01-01  36
2023-02-Antoine  Antoine  2023-02-01  40
2023-02-Didier    Didier  2023-02-01  40
2023-02-Philipe  Philipe  2023-02-01  42
2023-03-Clement  Clement  2023-03-01  45
2023-03-Cyril      Cyril  2023-03-01  45
2023-03-Victor    Victor  2023-03-01  46
Change detected
Update: NEW DATA
ht

In [5]:
# Fourth snapshot: no new rows. Charlie's age goes back to 35 while Philipe
# keeps the changed value 42. This call passes an explicit drift evaluator.
from datagit.drift_evaluator.drift_evaluators import AlertDriftEvaluator

dataMonth3And1Day = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 35, 40, 42, 40, 45, 45, 46],
}
snapshot_table(
    connector=github_connector,
    table_dataframe=formatDF(dataMonth3And1Day),
    table_name=table_name,
    drift_evaluator=AlertDriftEvaluator(),
)

Table found, updating it
common_rows_initial
                    name        date age
unique_key                              
2022-12-Alice      Alice  2022-12-01  25
2023-01-Bob          Bob  2023-01-01  30
2023-01-Charlie  Charlie  2023-01-01  36
2023-02-Antoine  Antoine  2023-02-01  40
2023-02-Didier    Didier  2023-02-01  40
2023-02-Philipe  Philipe  2023-02-01  42
2023-03-Clement  Clement  2023-03-01  45
2023-03-Cyril      Cyril  2023-03-01  45
2023-03-Victor    Victor  2023-03-01  46
common_rows_final
                    name        date age
unique_key                              
2022-12-Alice      Alice  2022-12-01  25
2023-01-Bob          Bob  2023-01-01  30
2023-01-Charlie  Charlie  2023-01-01  35
2023-02-Antoine  Antoine  2023-02-01  40
2023-02-Didier    Didier  2023-02-01  40
2023-02-Philipe  Philipe  2023-02-01  42
2023-03-Clement  Clement  2023-03-01  45
2023-03-Cyril      Cyril  2023-03-01  45
2023-03-Victor    Victor  2023-03-01  46
Change detected
Update: DRIFT
Branc

In [6]:
# Fifth snapshot: same values as dataMonth3And1Day plus one extra row,
# Alixe, dated 2022-12 (a row added to an already-snapshotted month).
dataMonth3And2Day = {
    "name": [
        "Alice", "Alixe", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 25, 30, 35, 40, 42, 40, 45, 45, 46],
}
snapshot_table(
    connector=github_connector,
    table_dataframe=formatDF(dataMonth3And2Day),
    table_name=table_name,
    drift_evaluator=AlertDriftEvaluator(),
)


Table found, updating it
common_rows_initial
                    name        date age
unique_key                              
2022-12-Alice      Alice  2022-12-01  25
2023-01-Bob          Bob  2023-01-01  30
2023-01-Charlie  Charlie  2023-01-01  36
2023-02-Antoine  Antoine  2023-02-01  40
2023-02-Didier    Didier  2023-02-01  40
2023-02-Philipe  Philipe  2023-02-01  42
2023-03-Clement  Clement  2023-03-01  45
2023-03-Cyril      Cyril  2023-03-01  45
2023-03-Victor    Victor  2023-03-01  46
common_rows_final
                    name        date age
unique_key                              
2022-12-Alice      Alice  2022-12-01  25
2023-01-Bob          Bob  2023-01-01  30
2023-01-Charlie  Charlie  2023-01-01  35
2023-02-Antoine  Antoine  2023-02-01  40
2023-02-Didier    Didier  2023-02-01  40
2023-02-Philipe  Philipe  2023-02-01  42
2023-03-Clement  Clement  2023-03-01  45
2023-03-Cyril      Cyril  2023-03-01  45
2023-03-Victor    Victor  2023-03-01  46
Change detected
Update: DRIFT
Branc