In [5]:
from driftdb.connectors import GithubConnector
from driftdb.version import version
print(version)
from dotenv import load_dotenv
from github import Github
import os
import pandas as pd

load_dotenv("../../.env")

gh_token = os.getenv("GH_TOKEN")
if gh_token is None:
    print("GitHub token not found! Create a .env file a the root with a GH_TOKEN variable.")
    exit(1)
github_client = Github(gh_token, timeout=60)
repo_name = os.getenv("REPO") or "gh_org/repo"
github_connector = GithubConnector(github_client=github_client, github_repository_name=repo_name, assignees=["Sammy"])


0.0.4-a1


In [6]:
table_name = "test/alerts/new_data_alert"

dataMonth1 = pd.DataFrame({"unique_key": ["Alice", "Bob", "Charlie"], "date": ["2022-12","2023-01","2023-01"], "age": [25, 30, 35]})

github_connector.snapshot_table(
    table_dataframe= dataMonth1,
    table_name= table_name,
)

driftdb.connectors.github_connector - INFO - Table found. Updating it
driftdb.connectors.github_connector - INFO - Nothing to update


In [7]:
from driftdb.drift_evaluator import DefaultDriftEvaluator
from driftdb.drift_evaluator.interface import DriftEvaluation, NewDataEvaluatorContext


def detect_outliers(before: pd.DataFrame, after: pd.DataFrame, added_rows: pd.DataFrame):
    old_df = before
    new_lines = added_rows
    outliers = pd.DataFrame()

    numerical_cols = old_df.select_dtypes(include=['number']).columns
    print("numerical_cols", numerical_cols)
    for col in numerical_cols:
        Q1 = old_df[col].quantile(0.25)
        Q3 = old_df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        is_outlier = (new_lines[col] < lower_bound) | (new_lines[col] > upper_bound)
        col_outliers = new_lines[is_outlier].copy()
        col_outliers['Reason'] = f"Column {col} out of boundaries"
        outliers = pd.concat([outliers, col_outliers])

    categorical_cols = old_df.select_dtypes(include=['object', 'category']).columns
    print("numerical_cols", numerical_cols)
    for col in categorical_cols:
        if col == 'unique_key':
            continue
        if col == 'date':
            continue
        old_categories = set(old_df[col].unique())

        new_categories = set(new_lines[col].unique()) - old_categories
        is_new_category = new_lines[col].isin(new_categories)
        cat_outliers = new_lines[is_new_category].copy()
        cat_outliers['Reason'] = f"Column {col} new unkown category"

        outliers = pd.concat([outliers, cat_outliers])


    # Drop duplicate rows
    outliers = outliers.drop_duplicates()

    return outliers
        

class MyEvaluator(DefaultDriftEvaluator):
    @staticmethod
    def compute_new_data_evaluation(
        new_data_context: NewDataEvaluatorContext,
    ) -> DriftEvaluation:
        outliers = detect_outliers(
            before=new_data_context.before,
            after=new_data_context.after,
            added_rows=new_data_context.added_rows,)
        if len(outliers) > 0:
            return DriftEvaluation(should_alert=True, message=f"Found {len(outliers)} outliers\n {outliers.to_markdown()}")
        return DriftEvaluation(should_alert=False, message="")

dataMonth1 = pd.DataFrame({"unique_key": ["Alice", "Bob", "Charlie", "Driss"], "date": ["2022-12","2023-01","2023-01", "2023-02"], "age": [25, 30, 35, 99]})

github_connector.snapshot_table(
    table_dataframe= dataMonth1,
    table_name= table_name,
    drift_evaluator=MyEvaluator()
)

driftdb.connectors.github_connector - INFO - Table found. Updating it
driftdb.connectors.github_connector - INFO - Change detected
driftdb.connectors.github_connector - INFO - Update: NEW DATA


numerical_cols Index(['age'], dtype='object')
numerical_cols Index(['age'], dtype='object')


driftdb.connectors.github_connector - INFO - Branch drift/2023-11-21-15-50-19/test-alerts-new-data-alert doesn't exist. Creating it...
driftdb.connectors.github_connector - INFO - Checkout branch: drift/2023-11-21-15-50-19/test-alerts-new-data-alert from branch: main
driftdb.connectors.github_connector - INFO - https://github.com/Samox/data-history/commit/c05c37bbd5b615062098c851f82ba2750ed55296
driftdb.connectors.github_connector - INFO - Pull request created: https://github.com/Samox/data-history/pull/220
driftdb.connectors.github_connector - INFO - Assignee Sammy does not exist


In [15]:
before = pd.DataFrame(
    {"unique_key": ["Alice", "Bob", "Charlie"], "date": ["2022-12", "2023-01", "2023-01"], "age": [25, 30, 35]}
)
after = pd.DataFrame(
    {
        "unique_key": ["Alice", "Bob", "Charlie", "Driss"],
        "date": ["2022-12", "2023-01", "2023-01", "2023-02"],
        "age": [25, 30, 35, 99],
    }
)
added_row = pd.DataFrame({"unique_key": ["Driss"], "date": ["2023-02"], "age": [99]})
outliers = detect_outliers(before=before, after=after, added_rows=added_row)
outliers

numerical_cols Index(['age'], dtype='object')
numerical_cols Index(['age'], dtype='object')


Unnamed: 0,unique_key,date,age,Reason
0,Driss,2023-02-01,99,age out of boundaries
