In [2]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env file with the following content:
# GH_TOKEN=your_github_token
# REPO=$gh_org/$repo

# Load environment variables from the .env file (path is relative to the
# current working directory).
load_dotenv("../.env")


# Get GitHub token from environment variable
gh_token = os.getenv("GH_TOKEN")
if gh_token is None:
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down, whereas an exception only stops this cell and shows the cause.
    raise RuntimeError("GitHub token not found! Create a .env file at the root with a GH_TOKEN variable.")
gh_client = Github(gh_token, timeout=60)

# Keep the "org/name" string and the Repository object under different names
# so neither silently replaces the other.
repo_name = os.getenv("REPO") or "gh_org/repo"
repo = gh_client.get_repo(repo_name)

In [4]:
import sys
# Make the parent directory importable so the local `datagit` package resolves.
# Note: every execution of this cell appends another '..' entry to sys.path.
sys.path.append('..')
import importlib
# Import, then reload, then re-import the names: the order matters, because the
# `from ... import` must run after the reload to bind the freshly loaded objects.
import datagit.drift_evaluators
importlib.reload(datagit.drift_evaluators)
from datagit.drift_evaluators import default_drift_evaluator, auto_merge_drift

import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

## Test with file already existing and splitting new data and historical data
# `repo` is (re)bound here to the "org/name" string; it is used below as a path prefix.
# `os` is not imported in this cell; it must already be in scope from an earlier cell.
repo = os.getenv("REPO") or "gh_org/repo"

def formatDF(dict):
    """Build a DataFrame from column data and put a ``unique_key`` column first.

    Args:
        dict: Column data accepted by ``pd.DataFrame``. It must provide string
            ``date`` and ``name`` columns. (The parameter name shadows the
            built-in ``dict``; it is kept so existing callers keep working.)

    Returns:
        A DataFrame whose first column is ``unique_key`` (``"<date>-<name>"``),
        followed by the remaining columns in their original order.

    Raises:
        KeyError: If the ``date`` or ``name`` column is missing.
    """
    df = pd.DataFrame(dict)
    # Column-wise concatenation instead of a row-wise apply: it avoids one Python
    # call per row and also works on a zero-row frame, where apply(axis=1)
    # returns a DataFrame that cannot be assigned to a single column.
    df['unique_key'] = df['date'] + '-' + df['name']
    column_order = ['unique_key'] + [col for col in df.columns if col != 'unique_key']
    return df.reindex(columns=column_order)


# Location of the metric CSV: the repository name followed by the in-repo path.
file_path = "".join((repo, "/path/to/metric_name_23-09-truc3.csv"))

# Baseline snapshot: first upload of the metric.
dataMonth1 = {
    "name": ["Alice", "Bob", "Charlie"],
    "date": ["2022-12", "2023-01", "2023-01"],
    "age": [25, 30, 35],
}
store_metric(
    gh_client,
    formatDF(dataMonth1),
    file_path,
    assignees=["Sammy"],
)




Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_23-09-truc3.csv?token=ABUWFP3UHVPL52WULA7XXZ3FBLWCS
Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Branch drift/2023-09-20-14-56-12/path-to-metric-name-23-09-truc3-csv doesn't exist, creating it...
Checkout branch: drift/2023-09-20-14-56-12/path-to-metric-name-23-09-truc3-csv  from default branch:main
comparison Empty DataFrame
Columns: []
Index: []
No drift detected


In [5]:
# Second snapshot: the three baseline rows are unchanged and three rows dated
# 2023-02 are added.
dataMonth2 = {
    "name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine"],
    "date": ["2022-12", "2023-01", "2023-01", "2023-02", "2023-02", "2023-02"],
    "age": [25, 30, 35, 40, 40, 40],
}
store_metric(
    gh_client,
    formatDF(dataMonth2),
    file_path,
    assignees=["Sammy"],
)



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_23-09-truc3.csv?token=ABUWFP7YWPEZMTQRGH7MSRDFBLWD4
Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
New data found
Commit: New data: path/to/metric_name_23-09-truc3.csv
https://github.com/Samox/data-history/commit/e457c05839d7330798d5ecf06a7146972f01db7c
Branch drift/2023-09-20-14-56-33/path-to-metric-name-23-09-truc3-csv doesn't exist, creating it...
Checkout branch: drift/2023-09-20-14-56-33/path-to-metric-name-23-09-truc3-csv  from default branch:main
comparison Empty DataFrame
Columns: []
Index: []
No drift detected


In [6]:
import sys
# Note: every execution of this cell appends another '..' entry to sys.path.
sys.path.append('..')
import importlib
# Reload the connector before re-importing `store_metric` so that the name is
# bound to the freshly loaded module's function.
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

# Introduce new rows for 2023-03 and change two already-stored rows:
# Charlie (2023-01) age 35 -> 36 and Philipe (2023-02) age 40 -> 42.
dataMonth3 = {"name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine", "Clement", "Cyril", "Victor"], "date": ["2022-12","2023-01","2023-01","2023-02","2023-02","2023-02","2023-03","2023-03","2023-03"], "age": [25, 30, 36, 40, 42, 40, 45, 45, 46]}
store_metric(gh_client,  formatDF(dataMonth3), file_path, assignees=["Sammy"])



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_23-09-truc3.csv?token=ABUWFP7DVSSYEICZJSZ2XLDFBLWEU
Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
New data found
Commit: New data: path/to/metric_name_23-09-truc3.csv
https://github.com/Samox/data-history/commit/01716409adaae39514a50ab2d1823bdbc9c34259
Branch drift/2023-09-20-14-56-45/path-to-metric-name-23-09-truc3-csv doesn't exist, creating it...
Checkout branch: drift/2023-09-20-14-56-45/path-to-metric-name-23-09-truc3-csv  from default branch:main
comparison                  age      
                self other
unique_key                
2023-01-Charlie   35    36
2023-02-Philipe   40    42
Drift detected
Drift evaluation: {'should_alert': True,

In [8]:
# Same nine rows as dataMonth3 (no new dates). Philipe keeps the changed age
# of 42, while Charlie's age goes back to 35.
dataMonth3And1Day = {
    "name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine", "Clement", "Cyril", "Victor"],
    "date": ["2022-12", "2023-01", "2023-01", "2023-02", "2023-02", "2023-02", "2023-03", "2023-03", "2023-03"],
    "age": [25, 30, 35, 40, 42, 40, 45, 45, 46],
}
store_metric(
    gh_client,
    formatDF(dataMonth3And1Day),
    file_path,
    assignees=["Sammy"],
)


Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/data-history/reported/path/to/metric_name_06-07-23.csv?token=ABUWFP6YHRV7SELQ5WDUT7LEVPPUW
Drift detected
    age      
   self other
5  40.0  42.0
Drift evaluator failed: 'dict' object has no attribute 'reported_dataframe'
Using default drift evaluator
Drift evaluation: {'should_alert': True, 'message': 'Drift detected:\n- ~~🆕 0 addition~~\n- ♻️ 1 modification\n- ~~🗑️ 0 deletion~~'}
Commit: Drift: path/to/metric_name_06-07-23.csv
https://github.com/Samox/data-history/commit/d8f19d3f300dd793507785691e028580485f9d67
https://github.com/Samox/data-history/commit/f83a1d0e12695e36ad8b84ce4512bc4ef926b77a
Drift pushed
Creating pull request
Pull request created: https://github.com/Samox/data-history/pull/110
Assignee Sammy does not exist


In [5]:
# Same rows as dataMonth3And1Day plus one extra row in an already-stored
# month: "Alixe" dated 2022-12. No new dates are introduced.
dataMonth3And2Day = {
    "name": ["Alice", "Alixe", "Bob", "Charlie", "Didier", "Philipe", "Antoine", "Clement", "Cyril", "Victor"],
    "date": ["2022-12", "2022-12", "2023-01", "2023-01", "2023-02", "2023-02", "2023-02", "2023-03", "2023-03", "2023-03"],
    "age": [25, 25, 30, 35, 40, 42, 40, 45, 45, 46],
}
store_metric(
    gh_client,
    formatDF(dataMonth3And2Day),
    file_path,
    assignees=["Sammy"],
)


Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/data-history/reported/path/to/metric_name_06-07-23.csv?token=ABUWFP4WMUNYRMTGFK7FADLEVPPHC
Drift detected
Could not display drift
Drift evaluator failed: 'dict' object has no attribute 'reported_dataframe'
Using default drift evaluator
Drift evaluation: {'should_alert': True, 'message': 'Drift detected:\n- 🆕 1 addition\n- ♻️ 1 modification\n- ~~🗑️ 0 deletion~~'}
Commit: Drift: path/to/metric_name_06-07-23.csv
https://github.com/Samox/data-history/commit/7f0e93ad978b945892e79c96d2b21aaf882d4f2f
https://github.com/Samox/data-history/commit/914f6faaa1c9bca964a3da4db309e897b6b3c830
Drift pushed
Creating pull request
Pull request created: https://github.com/Samox/data-history/pull/108
Assignee Sammy does not exist


In [6]:
import pandas as pd
from faker import Faker
import random

# Seed both random sources so the generated ids and metric values are the same
# on every run. Without a seed each execution yields an entirely new set of
# unique keys. Dates are still drawn relative to 'today', so they are only
# reproducible for a given run date.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Initialize Faker to generate random data
fake = Faker()

# Set the number of rows for the dataframe
num_rows = 600000

# Generate random IDs and dates (dates as 'YYYY-MM-DD' strings within the last 30 years)
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-30y', end_date='today').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10, rounded to 2 decimals
metric_values = [round(random.uniform(0, 10),2) for _ in range(num_rows)]

# Create the dataframe
ultra_large_df = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values})

# Print the dataframe (pandas truncates the printed output for a frame this large)
print(ultra_large_df)

                                  unique_key        date  metric_value
0       28504be7-0150-41a4-95c5-747c4617ae11  2005-07-24          9.18
1       3a125331-e4c7-4087-bc5a-9982062f16ce  1993-11-01          8.61
2       6bec0b38-321a-4d66-a197-743e29fbc9b5  2016-01-15          7.18
3       f9b944c5-e212-4fcb-a348-f8420107d126  2010-06-21          7.79
4       789f37cb-36ba-4986-ad7a-27a514cad7bd  2016-04-21          4.46
...                                      ...         ...           ...
599995  ee5ab8f6-148c-44db-8934-2e810f023d5e  2007-11-15          5.46
599996  91742782-84cc-4222-abd5-7a87ba283317  2011-03-18          5.05
599997  8e8f2f9b-333c-46bd-822e-26f3de30c7ac  2006-04-20          9.34
599998  34fdf345-acbd-46aa-9b0f-9e35d2aa6af8  2014-03-18          7.97
599999  db96be9a-773e-455e-8bda-82eccd29d017  2008-09-20          8.54

[600000 rows x 3 columns]


In [7]:
import sys
# Note: every execution of this cell appends another '..' entry to sys.path.
sys.path.append('..')
import importlib
# Reload the connector before re-importing `store_metric` so that the name is
# bound to the freshly loaded module's function.
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

## Store the 600k-row generated frame under its own metric path
# `repo` is (re)bound to the "org/name" string and used as the path prefix.
# `os`, `gh_client` and `ultra_large_df` must already exist from earlier cells.
repo = os.getenv("REPO") or "gh_org/repo"
store_metric(gh_client,  ultra_large_df, repo+"/path/to/ultra_large_metric_name3.csv", assignees=["Sammy"])

Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/data-history/reported/path/to/ultra_large_metric_name3.csv?token=ABUWFP2DUDMBUHXB27WCPJLEVPPIQ
New data found
Commit: New data: path/to/ultra_large_metric_name3.csv
https://github.com/Samox/data-history/commit/176e28321049706248a4e3b7838bd777c15cdf98
https://github.com/Samox/data-history/commit/8bde95c4196be6c2cb9b42ead5ec26ab97610cf0
Drift detected
Could not display drift
Drift evaluator failed: 'dict' object has no attribute 'reported_dataframe'
Using default drift evaluator
Drift evaluation: {'should_alert': True, 'message': 'Drift detected:\n- 🆕 599732 additions\n- ~~♻️ 0 modification~~\n- 🗑️ 600000 deletions'}
Commit: Drift: path/to/ultra_large_metric_name3.csv
https://github.com/Samox/data-history/commit/d91e6e0e803a0781496dc7da0ab633384baf7e72
https://github.com/Samox/data-history/commit/70cfa8e100b4a55df39b87ce25986c32a1dd6a90
Drift pushed
Creating pull re

In [20]:
# Drop the final row of the frame. The result is assigned back to the same
# name, so each execution of this cell removes one more row.
ultra_large_df = ultra_large_df.head(-1)
