In [1]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env file with the following content:
# GH_TOKEN=your_github_token
# REPO=$gh_org/$repo

# Load environment variables from the .env file (path is relative to the
# current working directory).
load_dotenv("../.env")


# Get GitHub token from environment variable.
# Fail fast by raising: this stops the cell (and a "Run All") with a visible
# error. The `exit` helper is meant for the interactive shell, not for
# programmatic aborts.
gh_token = os.getenv("GH_TOKEN")
if gh_token is None:
    raise RuntimeError("GitHub token not found! Create a .env file a the root with a GH_TOKEN variable.")
gh_client = Github(gh_token, timeout=60)

# Keep the "<org>/<repo>" string and the fetched repository under two
# different names instead of rebinding `repo` from str to object.
# An unset or empty REPO falls back to the placeholder value.
repo_name = os.getenv("REPO") or "gh_org/repo"
repo = gh_client.get_repo(repo_name)

In [2]:
# Add the parent directory to the import path -- presumably where the local
# `datagit` package lives (TODO confirm against the repository layout).
import sys
sys.path.append('..')
import importlib
# Each local module is imported and then reloaded so that a version already
# cached by the running kernel is replaced by the current source.
import datagit.drift_evaluators
importlib.reload(datagit.drift_evaluators)

import datagit.github_connector
importlib.reload(datagit.github_connector)
# Imported after the reload so the name is bound to the reloaded function.
from datagit.github_connector import store_metric
import pandas as pd

## Test with file already existing and splitting new data and historical data
# NOTE: `repo` is rebound here to the plain "<org>/<repo>" string (the first
# cell left it holding the result of gh_client.get_repo). It is used below as
# the prefix of the metric file path. Relies on `os` from the first cell.
repo = os.getenv("REPO") or "gh_org/repo"

def formatDF(dict):
    """Build a DataFrame from column data and put a ``unique_key`` column first.

    The key of every row is ``"<date>-<name>"``, so the input must provide
    ``date`` and ``name`` columns holding strings.

    Args:
        dict: Column-oriented data accepted by ``pd.DataFrame`` (for example a
            mapping of column name to list of values). The name shadows the
            ``dict`` builtin inside this function; it is kept unchanged so
            existing callers are not affected.

    Returns:
        A DataFrame whose first column is ``unique_key``, followed by the
        other columns in their original order.
    """
    df = pd.DataFrame(dict)
    # Column-wise concatenation instead of a row-wise ``apply``: it yields the
    # same keys for string columns, avoids a Python call per row, and also
    # handles a frame with zero rows (as long as the columns are string-typed).
    df['unique_key'] = df['date'] + '-' + df['name']
    column_order = ['unique_key'] + [col for col in df.columns if col != 'unique_key']
    return df.reindex(columns=column_order)


# Path of the metric CSV, prefixed with the "<org>/<repo>" string held in `repo`.
file_path = repo + "/path/to/metric_name_13.csv"

# Initial dataset: three rows covering 2022-12 and 2023-01.
dataMonth1 = {
    "name": ["Alice", "Bob", "Charlie"],
    "date": ["2022-12", "2023-01", "2023-01"],
    "age": [25, 30, 35],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth1),
    filepath=file_path,
    assignees=["Sammy"],
)




Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP6T2ZTBBU4QM46QZG3FJUCU4
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
without_deleted <class 'pandas.core.frame.DataFrame'>
Change detected
Update: DRIFT Deletion


  with_added = without_deleted.append(after_drift.loc[added_keys]).sort_index()


https://github.com/Samox/data-history/commit/9db8218d42adc8493bb46d1e7f68402aa39f0290
Update: DRIFT Modification
https://github.com/Samox/data-history/commit/da8f15572ddfea214cc9b1982497ea51c0ff8a4e


In [3]:
# Second dataset: the three initial rows unchanged, plus three new rows for 2023-02.
dataMonth2 = {
    "name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine"],
    "date": ["2022-12", "2023-01", "2023-01", "2023-02", "2023-02", "2023-02"],
    "age": [25, 30, 35, 40, 40, 40],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth2),
    filepath=file_path,
    assignees=["Sammy"],
)



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFPYL2E6TT4CMHEPY7NLFJUBTO
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: NEW DATA


  with_added = without_deleted.append(after_drift.loc[added_keys]).sort_index()


https://github.com/Samox/data-history/commit/fbd9d8c4b2eba99b59b443b1cf076f55337ea251


In [4]:
# Re-import and reload the connector module, then rebind `store_metric` to the
# reloaded function (statement order matters here).
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

# Third dataset: three new rows for 2023-03, plus changes on rows already
# present in dataMonth2 (Charlie 35 -> 36 in 2023-01, Philipe 40 -> 42 in 2023-02).
dataMonth3 = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 36, 40, 42, 40, 45, 45, 46],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth3),
    filepath=file_path,
    assignees=["Sammy"],
)



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP4JB6YKDJ36M4QOMILFJUBTU
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: NEW DATA


  with_added = without_deleted.append(after_drift.loc[added_keys]).sort_index()


https://github.com/Samox/data-history/commit/ddef7ff694c304d3f9e31aa60a7f4d9eb3fdca20
Update: DRIFT Modification
https://github.com/Samox/data-history/commit/21df8ec6cf6bdaf24a8583a7c831ec6ea0d7a5b0


In [5]:

from datagit.dataframe_update_breakdown import dataframe_update_breakdown

# Break down the changes between the dataMonth2 and dataMonth3 datasets.
# The call is the last expression of the cell, so its return value is displayed.
dataframe_update_breakdown(
    initial_dataframe=formatDF(dataMonth2),
    final_dataframe=formatDF(dataMonth3),
)

  with_added = without_deleted.append(after_drift.loc[added_keys]).sort_index()


{'MIGRATION Column Deleted': {'df':                     name     date  age
  unique_key                            
  2022-12-Alice      Alice  2022-12   25
  2023-01-Bob          Bob  2023-01   30
  2023-01-Charlie  Charlie  2023-01   35
  2023-02-Didier    Didier  2023-02   40
  2023-02-Philipe  Philipe  2023-02   40
  2023-02-Antoine  Antoine  2023-02   40,
  'has_update': False,
  'type': <UpdateType.OTHER: 'other'>,
  'drift_context': None},
 'NEW DATA': {'df':                     name     date  age
  unique_key                            
  2022-12-Alice      Alice  2022-12   25
  2023-01-Bob          Bob  2023-01   30
  2023-01-Charlie  Charlie  2023-01   35
  2023-02-Didier    Didier  2023-02   40
  2023-02-Philipe  Philipe  2023-02   40
  2023-02-Antoine  Antoine  2023-02   40
  2023-03-Clement  Clement  2023-03   45
  2023-03-Cyril      Cyril  2023-03   45
  2023-03-Victor    Victor  2023-03   46,
  'has_update': True,
  'type': <UpdateType.OTHER: 'other'>,
  'drift_context':

In [7]:
from datagit.drift_evaluators import alert_drift

# Same rows as dataMonth3 (no new data): Charlie's age goes back from 36 to 35
# while Philipe stays at 42. `alert_drift` is passed as the drift evaluator.
dataMonth3And1Day = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 35, 40, 42, 40, 45, 45, 46],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth3And1Day),
    filepath=file_path,
    assignees=["Sammy"],
    drift_evaluator=alert_drift,
)


Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP3QGRVD36A6GEAAGLDFJUBMO
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT Modification
0      2022-12-Alice
1        2023-01-Bob
2    2023-01-Charlie
3    2023-02-Antoine
4     2023-02-Didier
5    2023-02-Philipe
6    2023-03-Clement
7      2023-03-Cyril
8     2023-03-Victor
Name: unique_key, dtype: string
0      2022-12-Alice
1        2023-01-Bob
2    2023-01-Charlie
3    2023-02-Antoine
4     2023-02-Didier
5    2023-02-Philipe
6    2023-03-Clement
7      2023-03-Cyril
8     2023-03-Victor
Name: unique_key, dtype: string


  with_added = without_deleted.append(after_drift.loc[added_keys]).sort_index()


Branch drift/2023-11-09-17-02-18/path-to-metric-name-13-csv doesn't exist, creating it...
Checkout branch: drift/2023-11-09-17-02-18/path-to-metric-name-13-csv  from default branch:main
https://github.com/Samox/data-history/commit/06b855eb154039e88dd181c69af4a71ba13a687a
Pull request created: https://github.com/Samox/data-history/pull/165
Assignee Sammy does not exist


In [8]:
# Adds one row for an already-covered period (Alixe, 2022-12); every other row
# is identical to dataMonth3And1Day. No drift evaluator is passed to this call.
dataMonth3And2Day = {
    "name": [
        "Alice", "Alixe", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 25, 30, 35, 40, 42, 40, 45, 45, 46],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth3And2Day),
    filepath=file_path,
    assignees=["Sammy"],
)


Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP4Y5BTNJ3W5YH4EEU3FJUBNU
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT Addition
0      2022-12-Alice
1        2023-01-Bob
2    2023-01-Charlie
3    2023-02-Antoine
4     2023-02-Didier
5    2023-02-Philipe
6    2023-03-Clement
7      2023-03-Cyril
8     2023-03-Victor
Name: unique_key, dtype: string
0      2022-12-Alice
1      2022-12-Alixe
2        2023-01-Bob
3    2023-01-Charlie
4    2023-02-Antoine
5     2023-02-Didier
6    2023-02-Philipe
7    2023-03-Clement
8      2023-03-Cyril
9     2023-03-Victor
Name: unique_key, dtype: object


  with_added = without_deleted.append(after_drift.loc[added_keys]).sort_index()


https://github.com/Samox/data-history/commit/ae1af8d95bc3b64cce0f5cc1929ea57be8d12854
Update: DRIFT Modification
0      2022-12-Alice
1      2022-12-Alixe
2        2023-01-Bob
3    2023-01-Charlie
4    2023-02-Antoine
5     2023-02-Didier
6    2023-02-Philipe
7    2023-03-Clement
8      2023-03-Cyril
9     2023-03-Victor
Name: unique_key, dtype: object
0      2022-12-Alice
1      2022-12-Alixe
2        2023-01-Bob
3    2023-01-Charlie
4    2023-02-Antoine
5     2023-02-Didier
6    2023-02-Philipe
7    2023-03-Clement
8      2023-03-Cyril
9     2023-03-Victor
Name: unique_key, dtype: object
https://github.com/Samox/data-history/commit/a5ececf8e8edfcf7e890c85dcd3804cd91c6a686


In [6]:
import pandas as pd
from faker import Faker
import random

# Seed both random sources so that re-running this cell on a fresh kernel
# regenerates the same ids and metric values. The dates are drawn from a range
# ending at 'today', so they can still differ between runs on different days.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Initialize Faker to generate random data
fake = Faker()

# Set the number of rows for the dataframe
num_rows = 600000

# Generate random IDs and dates (dates formatted as YYYY-MM-DD strings)
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-30y', end_date='today').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10, rounded to 2 decimals
metric_values = [round(random.uniform(0, 10),2) for _ in range(num_rows)]

# Create the dataframe
ultra_large_df = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values})

# Print the dataframe
print(ultra_large_df)

                                  unique_key        date  metric_value
0       28504be7-0150-41a4-95c5-747c4617ae11  2005-07-24          9.18
1       3a125331-e4c7-4087-bc5a-9982062f16ce  1993-11-01          8.61
2       6bec0b38-321a-4d66-a197-743e29fbc9b5  2016-01-15          7.18
3       f9b944c5-e212-4fcb-a348-f8420107d126  2010-06-21          7.79
4       789f37cb-36ba-4986-ad7a-27a514cad7bd  2016-04-21          4.46
...                                      ...         ...           ...
599995  ee5ab8f6-148c-44db-8934-2e810f023d5e  2007-11-15          5.46
599996  91742782-84cc-4222-abd5-7a87ba283317  2011-03-18          5.05
599997  8e8f2f9b-333c-46bd-822e-26f3de30c7ac  2006-04-20          9.34
599998  34fdf345-acbd-46aa-9b0f-9e35d2aa6af8  2014-03-18          7.97
599999  db96be9a-773e-455e-8bda-82eccd29d017  2008-09-20          8.54

[600000 rows x 3 columns]


In [7]:
# Re-import and reload the connector module, then rebind `store_metric` to the
# reloaded function (statement order matters here).
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

# Push the large generated dataframe to its own metric file; `repo` is the
# "<org>/<repo>" string used as the path prefix.
repo = os.environ.get("REPO") or "gh_org/repo"
store_metric(
    ghClient=gh_client,
    dataframe=ultra_large_df,
    filepath=repo + "/path/to/ultra_large_metric_name3.csv",
    assignees=["Sammy"],
)

Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/data-history/reported/path/to/ultra_large_metric_name3.csv?token=ABUWFP2DUDMBUHXB27WCPJLEVPPIQ
New data found
Commit: New data: path/to/ultra_large_metric_name3.csv
https://github.com/Samox/data-history/commit/176e28321049706248a4e3b7838bd777c15cdf98
https://github.com/Samox/data-history/commit/8bde95c4196be6c2cb9b42ead5ec26ab97610cf0
Drift detected
Could not display drift
Drift evaluator failed: 'dict' object has no attribute 'reported_dataframe'
Using default drift evaluator
Drift evaluation: {'should_alert': True, 'message': 'Drift detected:\n- 🆕 599732 additions\n- ~~♻️ 0 modification~~\n- 🗑️ 600000 deletions'}
Commit: Drift: path/to/ultra_large_metric_name3.csv
https://github.com/Samox/data-history/commit/d91e6e0e803a0781496dc7da0ab633384baf7e72
https://github.com/Samox/data-history/commit/70cfa8e100b4a55df39b87ce25986c32a1dd6a90
Drift pushed
Creating pull request
Pull r

In [20]:
# Drop the last row of the large dataframe. NOTE: this rebinds the same name,
# so every re-run of the cell removes one more row.
ultra_large_df = ultra_large_df.head(-1)
