In [11]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env with the following content:
# GH_TOKEN=your_github_token
# REPO=$gh_org/$repo

# Load environment variables from the .env file (if present) into os.environ.
load_dotenv()


# Read the GitHub token from the environment and build the API client.
gh_token = os.getenv("GH_TOKEN")
if not gh_token:
    # Raise instead of calling exit(): `exit` is the interactive-shell helper
    # and is not meant for programs. An exception fails the cell with a clear
    # message before a client is built without credentials. An empty
    # GH_TOKEN= entry is treated the same as a missing one.
    raise RuntimeError(
        "GitHub token not found! Create a .env file at the root with a GH_TOKEN variable."
    )
gh_client = Github(gh_token, timeout=60)


In [27]:
import sys
sys.path.append('..')
import importlib
import datagit.drift_evaluators
importlib.reload(datagit.drift_evaluators)
from datagit.drift_evaluators import default_drift_evaluator, auto_merge_drift

import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

## Test with file already existing and splitting new data and historical data
# Target repository: taken from the REPO environment variable, falling back
# to a placeholder when the variable is unset or empty.
repo = os.getenv("REPO")
if not repo:
    repo = "gh_org/repo"

def formatDF(dict):
    df = pd.DataFrame(dict)
    df['unique_key'] = df.apply(lambda row: row['date'] + '-' + row['name'], axis=1)
    column_order = ['unique_key'] + [col for col in df.columns if col != 'unique_key']
    df = df.reindex(columns=column_order)
    return df


# Location of the metric file, prefixed with the repository name.
file_path = repo + "/path/to/metric_name_06-07-23.csv"

# Initial upload: three rows covering 2022-12 and 2023-01, using the
# auto-merge drift evaluator.
dataMonth1 = {
    "name": ["Alice", "Bob", "Charlie"],
    "date": ["2022-12", "2023-01", "2023-01"],
    "age": [25, 30, 35],
}
store_metric(
    gh_client,
    formatDF(dataMonth1),
    file_path,
    assignees=["Sammy"],
    drift_evaluator=auto_merge_drift,
)




Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/copy-libeo-data-history/reported/path/to/metric_name_06-07-23.csv?token=ABUWFP45YTN3ONUAQJE2UGLEVPK6G
Drift detected
Could not display drift
Drift evaluation: {'should_alert': False, 'message': 'Drift detected and automatically merged.'}
No alert needed, pushing on reported branch
Commit: Drift: path/to/metric_name_06-07-23.csv
https://github.com/Samox/copy-libeo-data-history/commit/ea479e197d0983236de17173db586a135f701e8c
https://github.com/Samox/copy-libeo-data-history/commit/0583ad7df3e94729cdab3a4af0c34407e889820d
Drift pushed on reported branch


In [25]:
# Second upload: the three rows of dataMonth1 unchanged, plus three new
# rows dated 2023-02. No drift_evaluator is passed here.
dataMonth2 = {
    "name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine"],
    "date": ["2022-12", "2023-01", "2023-01", "2023-02", "2023-02", "2023-02"],
    "age": [25, 30, 35, 40, 40, 40],
}
store_metric(
    gh_client,
    formatDF(dataMonth2),
    file_path,
    assignees=["Sammy"],
)



Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/copy-libeo-data-history/reported/path/to/metric_name_06-07-23.csv?token=ABUWFP46YXUOJLO6YKV6AR3EVPKHQ
New data found
Commit: New data: path/to/metric_name_06-07-23.csv
No drift detected


In [26]:
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

# Third upload: three new rows dated 2023-03, plus two changed ages on
# existing rows compared with dataMonth2 (Charlie 2023-01: 35 -> 36,
# Philipe 2023-02: 40 -> 42).
dataMonth3 = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 36, 40, 42, 40, 45, 45, 46],
}
store_metric(
    gh_client,
    formatDF(dataMonth3),
    file_path,
    assignees=["Sammy"],
)



Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/copy-libeo-data-history/reported/path/to/metric_name_06-07-23.csv?token=ABUWFPYNPJ3VGEM4O5VGK5DEVPKIE
New data found
Commit: New data: path/to/metric_name_06-07-23.csv
https://github.com/Samox/copy-libeo-data-history/commit/c78d56c7897be677a93ad62984b070950fe27ebc
https://github.com/Samox/copy-libeo-data-history/commit/d153630747ce9df253810f9dd8f0592afa930ff7
Drift detected
    age      
   self other
2  35.0  36.0
5  40.0  42.0
Drift evaluator failed: 'dict' object has no attribute 'reported_dataframe'
Using default drift evaluator
Drift evaluation: {'should_alert': True, 'message': 'Drift detected:\n- ~~🆕 0 addition~~\n- ♻️ 2 modifications\n- ~~🗑️ 0 deletion~~'}
Commit: Drift: path/to/metric_name_06-07-23.csv
https://github.com/Samox/copy-libeo-data-history/commit/937a49542a2e85dce858f6ad706cfa1ec5a76a29
https://github.com/Samox/copy-libeo-data-history/commit/2ed50cd465f9a6

In [5]:
# Same rows as dataMonth3 (no new dates). Charlie's age is back to 35, so
# the only remaining change versus dataMonth2 is Philipe (40 -> 42).
dataMonth3And1Day = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 35, 40, 42, 40, 45, 45, 46],
}
store_metric(
    gh_client,
    formatDF(dataMonth3And1Day),
    file_path,
    assignees=["Sammy"],
)


Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/copy-libeo-data-history/reported/path/to/metric_name_06-07-23.csv?token=ABUWFP27BWNYL2A2SPYST4TEU2BRI
Drift detected
    age      
   self other
5  40.0  42.0
Commit: Drift: path/to/metric_name_06-07-23.csv
Drift pushed
Creating pull request
Pull request created: https://github.com/Samox/copy-libeo-data-history/pull/89
Assignee Sammy does not exist


In [8]:
# No new dates. Inserts one extra row on an already-reported date
# (Alixe, 2022-12, age 25); every other row matches dataMonth3And1Day.
dataMonth3And2Day = {
    "name": [
        "Alice", "Alixe", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 25, 30, 35, 40, 42, 40, 45, 45, 46],
}
store_metric(
    gh_client,
    formatDF(dataMonth3And2Day),
    file_path,
    assignees=["Sammy"],
)


Storing metric...
Metric found, updating it on branch: reported
Drift detected
Commit: Drift: path/to/metric_name.csv
Drift pushed
Creating pull request
Pull request created: https://github.com/Samox/copy-libeo-data-history/pull/50
Assignee Sammy does not exist


In [8]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker to generate random data
fake = Faker()

# Seed both random sources so re-running the cell regenerates the same ids
# and metric values: fake.uuid4()/fake.date_between() draw from Faker, while
# the metric values come from the stdlib `random` module.
# NOTE: dates are relative to 'today', so they can still differ between days.
random_seed = 42
Faker.seed(random_seed)
random.seed(random_seed)

# Set the number of rows for the dataframe
num_rows = 600000

# Generate random IDs and dates (dates formatted as YYYY-MM-DD strings)
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-30y', end_date='today').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10, rounded to 2 decimals
metric_values = [round(random.uniform(0, 10),2) for _ in range(num_rows)]

# Create the dataframe with the column layout used by the other cells
# (unique_key first, then date and the metric column)
ultra_large_df = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values})

# Print the dataframe (pandas truncates the repr of a frame this large)
print(ultra_large_df)

                                  unique_key        date  metric_value
0       0432ef12-da7f-45c4-8913-824b04f83ef4  1994-04-06          5.76
1       a93f41e7-0ab7-4750-8c2f-874125a720ae  1995-12-07          5.93
2       96a2318f-631e-460a-8797-3045ae4ce64a  2005-03-22          5.74
3       980ad028-8f27-4c27-bb6c-e1c73c0741b5  2009-04-01          8.90
4       9b04a5c8-0e30-4649-96a9-84e945c29ecf  2014-04-30          2.98
...                                      ...         ...           ...
599995  3b8ac245-50d7-481d-8710-6d188f29316e  2010-12-18          1.56
599996  ded9dd37-9c33-456e-8045-3df8ea8597b0  1998-01-01          5.19
599997  82d30a01-9958-43aa-8908-627834047df1  2006-08-30          7.29
599998  affed857-153a-4991-a066-35395f0fb73f  2002-05-02          5.28
599999  9c5dda97-1171-4363-ac8c-5859f767e4e5  2016-08-16          1.32

[600000 rows x 3 columns]


In [21]:
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

# Upload the large generated frame (ultra_large_df) to its own metric file.
# The repository is re-read from REPO, with the same placeholder fallback.
repo = os.getenv("REPO")
if not repo:
    repo = "gh_org/repo"
store_metric(
    gh_client,
    ultra_large_df,
    repo + "/path/to/ultra_large_metric_name3.csv",
    assignees=["Sammy"],
)

Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/copy-libeo-data-history/reported/path/to/ultra_large_metric_name3.csv?token=ABUWFPZQHHSYRURECQWV2JLEUWQL4
Drift detected
Could not display drift
Commit: Drift: path/to/ultra_large_metric_name3.csv
Drift pushed
Creating pull request
Pull request created: https://github.com/Samox/copy-libeo-data-history/pull/80
Assignee Sammy does not exist


In [20]:
# Drop the last row of the large frame. This rebinds the same name, so every
# re-run of the cell removes one more row.
ultra_large_df = ultra_large_df.head(-1)
