In [3]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env with the following content:
# GH_TOKEN=your_github_token
# REPO=$gh_org/$repo

# Load environment variables from the .env file one directory above the notebook.
load_dotenv("../.env")


# Read the GitHub token from the environment. Later cells pass gh_client to
# store_metric, so stop this cell immediately when the token is missing or empty.
gh_token = os.getenv("GH_TOKEN")
if not gh_token:
    # Raise instead of print + exit(1): inside a Jupyter kernel, exit() asks the
    # kernel to shut down without raising, so the statements below would still
    # run with no token.
    raise RuntimeError(
        "GitHub token not found! Create a .env file at the root with a GH_TOKEN variable."
    )
gh_client = Github(gh_token, timeout=60)

# Resolve the target repository: REPO from the environment, or the placeholder
# "gh_org/repo" when it is unset or empty.
repo = gh_client.get_repo(os.getenv("REPO") or "gh_org/repo")

In [4]:
import sys
sys.path.append('..')
import importlib
import datagit.drift_evaluators
importlib.reload(datagit.drift_evaluators)

import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

## Test with file already existing and splitting new data and historical data
# Repository slug used to prefix metric paths: REPO from the environment, or a
# placeholder when it is unset or empty.
repo = os.environ.get("REPO") or "gh_org/repo"

def formatDF(dict):
    df = pd.DataFrame(dict)
    df['unique_key'] = df.apply(lambda row: row['date'] + '-' + row['name'], axis=1)
    column_order = ['unique_key'] + [col for col in df.columns if col != 'unique_key']
    df = df.reindex(columns=column_order)
    return df


# Metric file path, prefixed with the repository slug
file_path = f"{repo}/path/to/metric_name_13.csv"

# Store metric for the first time
dataMonth1 = {
    "name": ["Alice", "Bob", "Charlie"],
    "date": ["2022-12", "2023-01", "2023-01"],
    "age": [25, 30, 35],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth1),
    filepath=file_path,
    assignees=["Sammy"],
)




Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP2RHA2DRED27CB3G53FJZGOC
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Nothing to update


In [5]:
# ## Introduce new data for 2023-02
# Same three rows as dataMonth1, plus three new rows dated 2023-02.
dataMonth2 = {
    "name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine"],
    "date": ["2022-12", "2023-01", "2023-01", "2023-02", "2023-02", "2023-02"],
    "age": [25, 30, 35, 40, 40, 40],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth2),
    filepath=file_path,
    assignees=["Sammy"],
)



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP76W7WL7XLABILNWZTFJZGOE
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: NEW DATA
https://github.com/Samox/data-history/commit/6b0a38f1d2d1d6ed6aef2bb4558ee6db5391beae


In [6]:
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

# ## Introduce new data for 2023-03 and a drift on 2023-01 (Charlie) and 2023-02 (Philipe)
dataMonth3 = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 36, 40, 42, 40, 45, 45, 46],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth3),
    filepath=file_path,
    assignees=["Sammy"],
)



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP3NBQCMOHCF5OCLGNTFJZGOI
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: NEW DATA
https://github.com/Samox/data-history/commit/dd061b46ef26ef05c957ee3f77058ea6266929b7
Update: DRIFT Modification
https://github.com/Samox/data-history/commit/87b6cb6fd5e079429c29edb4cc8a92636d8553c6


In [7]:

from datagit.dataframe_update_breakdown import dataframe_update_breakdown


# Break down the differences between the dataMonth2 and dataMonth3 frames; the
# returned value is the last expression, so the notebook displays it.
dataframe_update_breakdown(
    initial_dataframe=formatDF(dataMonth2),
    final_dataframe=formatDF(dataMonth3),
)

{'MIGRATION Column Deleted': {'df':                     name     date  age
  unique_key                            
  2022-12-Alice      Alice  2022-12   25
  2023-01-Bob          Bob  2023-01   30
  2023-01-Charlie  Charlie  2023-01   35
  2023-02-Didier    Didier  2023-02   40
  2023-02-Philipe  Philipe  2023-02   40
  2023-02-Antoine  Antoine  2023-02   40,
  'has_update': False,
  'type': <UpdateType.OTHER: 'other'>,
  'drift_context': None},
 'NEW DATA': {'df':                     name     date  age
  unique_key                            
  2022-12-Alice      Alice  2022-12   25
  2023-01-Bob          Bob  2023-01   30
  2023-01-Charlie  Charlie  2023-01   35
  2023-02-Didier    Didier  2023-02   40
  2023-02-Philipe  Philipe  2023-02   40
  2023-02-Antoine  Antoine  2023-02   40
  2023-03-Clement  Clement  2023-03   45
  2023-03-Cyril      Cyril  2023-03   45
  2023-03-Victor    Victor  2023-03   46,
  'has_update': True,
  'type': <UpdateType.OTHER: 'other'>,
  'drift_context':

In [8]:
# ## No new data. Reverts Charlie's age to 35 (a drift on 2023-01); every other row, including Philipe, matches dataMonth3
from datagit.drift_evaluators import alert_drift


dataMonth3And1Day = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 35, 40, 42, 40, 45, 45, 46],
}
# alert_drift is passed as the drift evaluator for this update.
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth3And1Day),
    filepath=file_path,
    assignees=["Sammy"],
    drift_evaluator=alert_drift,
)


Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP3EC652K74YTXPDNGDFJZGOO
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT Modification
Branch drift/2023-11-10-16-30-50/path-to-metric-name-13-csv doesn't exist, creating it...
Checkout branch: drift/2023-11-10-16-30-50/path-to-metric-name-13-csv  from default branch:main
https://github.com/Samox/data-history/commit/a2fb50b405226829dd85652de3666809bfa6c7e5
Pull request created: https://github.com/Samox/data-history/pull/170
Assignee Sammy does not exist


In [9]:
# ## No new month of data. Adds a row for Alixe on 2022-12; all other rows match dataMonth3And1Day
dataMonth3And2Day = {
    "name": [
        "Alice", "Alixe",
        "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2022-12",
        "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 25, 30, 35, 40, 42, 40, 45, 45, 46],
}
# alert_drift is passed as the drift evaluator for this update.
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth3And2Day),
    filepath=file_path,
    assignees=["Sammy"],
    drift_evaluator=alert_drift,
)


Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP27I5T7GS3WYEWRUODFJZGO2
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT Addition
Branch drift/2023-11-10-16-30-56/path-to-metric-name-13-csv doesn't exist, creating it...
Checkout branch: drift/2023-11-10-16-30-56/path-to-metric-name-13-csv  from default branch:main
https://github.com/Samox/data-history/commit/a0cc0c4501ad179b67dbdc67a4902899646ae743
Update: DRIFT Modification
https://github.com/Samox/data-history/commit/50e38d8d195aa5f4c20e0780a7e6cf9fd61d407c
Pull request created: https://github.com/Samox/data-history/pull/171
Assignee Sammy does not exist


In [10]:
import pandas as pd
from faker import Faker
import random

# Seed both random sources so that re-running the notebook regenerates the same
# ids and metric values: Faker produces the ids and dates, `random` produces the
# metric values. Dates are still drawn relative to the day the cell runs
# (end_date='today'), so they can shift between days.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Initialize Faker to generate random data
fake = Faker()

# Set the number of rows for the dataframe
num_rows = 600000

# Generate random IDs and dates
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-30y', end_date='today').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10
metric_values = [round(random.uniform(0, 10),2) for _ in range(num_rows)]

# Create the dataframe
ultra_large_df = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values})

# Print the dataframe (pandas truncates the display to the first and last rows)
print(ultra_large_df)

                                  unique_key        date  metric_value
0       4bca7ccb-fe2b-4527-b054-8bfd30de39bb  2015-06-05          2.76
1       db6a8c43-dc5e-449f-84a3-c274225df761  2018-06-21          5.51
2       7fc9012f-195d-4b60-bdd8-66dcb1caa95d  2011-09-05          5.34
3       9146d08b-3c30-433f-9b6c-3db39ca59a70  2009-08-18          3.35
4       1a72cfe6-5e1c-4007-9994-f09f4d078854  2007-04-27          0.49
...                                      ...         ...           ...
599995  6afbeeac-e373-4bb1-a15f-efad4a6b067b  2015-03-15          7.96
599996  09d57bc7-dc12-4894-b81f-de69a2912c51  2000-10-06          1.77
599997  dc50e686-6be2-4f1b-99f1-6aca63125443  1997-11-25          1.42
599998  eea2bca6-c9d7-413e-8573-0a3f61a47f54  1995-03-04          9.13
599999  a5838e57-b327-416e-95e5-dfc83c007e78  1996-11-16          5.27

[600000 rows x 3 columns]


In [11]:
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

## Store the generated ultra_large_df under its own metric file
repo = os.environ.get("REPO") or "gh_org/repo"
store_metric(
    ghClient=gh_client,
    dataframe=ultra_large_df,
    filepath=f"{repo}/path/to/ultra_large_metric_name3.csv",
    assignees=["Sammy"],
)

Storing metric...
Metric not found, creating it on branch: main
Commit: New data: path/to/ultra_large_metric_name3.csv
Metric stored


In [12]:
ultra_large_df = ultra_large_df.iloc[:-1]
