In [1]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env file one directory above this notebook (the path passed to
# load_dotenv below) with the following content:
# GH_TOKEN=your_github_token
# REPO=$gh_org/$repo

# Load environment variables from the .env file
load_dotenv("../.env")


# Get the GitHub token from the environment. An empty value is as unusable as
# a missing one, so both are rejected.
gh_token = os.getenv("GH_TOKEN")
if not gh_token:
    # Raise rather than call exit(): an exception reliably stops this cell (and
    # a "Run All"), so the client below is never built without a token.
    raise RuntimeError(
        "GitHub token not found! Create a .env file at the root with a GH_TOKEN variable."
    )
gh_client = Github(gh_token, timeout=60)

# Keep the "org/repo" slug and the repository object under separate names;
# `repo` still ends up bound to the repository object, as before.
repo_name = os.getenv("REPO") or "gh_org/repo"
repo = gh_client.get_repo(repo_name)

In [2]:
import sys
sys.path.append('..')
import importlib
import datagit.drift_evaluators
importlib.reload(datagit.drift_evaluators)

import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

# Repository slug ("org/repo") read from the REPO environment variable; a
# placeholder is used when the variable is unset or empty.
repo = os.environ.get("REPO") or "gh_org/repo"

def formatDF(dict):
    df = pd.DataFrame(dict)
    df['unique_key'] = df.apply(lambda row: row['date'] + '-' + row['name'], axis=1)
    column_order = ['unique_key'] + [col for col in df.columns if col != 'unique_key']
    df = df.reindex(columns=column_order)
    return df


# Path of the metric file, prefixed with the repository slug.
file_path = "".join((repo, "/path/to/metric_name_13.csv"))

# Initial dataset: three people across 2022-12 and 2023-01.
dataMonth1 = {
    "name": ["Alice", "Bob", "Charlie"],
    "date": ["2022-12", "2023-01", "2023-01"],
    "age": [25, 30, 35],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth1),
    filepath=file_path,
    assignees=["Sammy"],
)




Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP5ODQO5XRDI23R5WYLFJZG5G
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT Deletion
https://github.com/Samox/data-history/commit/12f0970e5c23684ce5f51a32dac591e22e0c0a35
Update: DRIFT Modification
https://github.com/Samox/data-history/commit/ffac5b458e7aa7ed26f4f24becaf3dc6cbb77567


In [3]:
# Same rows as the first dataset plus three new rows dated 2023-02.
dataMonth2 = {
    "name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine"],
    "date": ["2022-12", "2023-01", "2023-01", "2023-02", "2023-02", "2023-02"],
    "age": [25, 30, 35, 40, 40, 40],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth2),
    filepath=file_path,
    assignees=["Sammy"],
)



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFPZWKSLFMQFZSANZ2CDFJZG5M
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: NEW DATA
https://github.com/Samox/data-history/commit/047c107b05949ae5cd37a0892ef8c21ffaebb3e5


In [4]:
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

# Adds three new rows dated 2023-03 and changes two existing values relative
# to dataMonth2: Charlie (2023-01) 35 -> 36 and Philipe (2023-02) 40 -> 42.
dataMonth3 = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 36, 40, 42, 40, 45, 45, 46],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth3),
    filepath=file_path,
    assignees=["Sammy"],
)



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP7T6SDOWG65ZOAOHXDFJZG5Q
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: NEW DATA
https://github.com/Samox/data-history/commit/1cf960e9536c0718bd8691beb2193956b6bdc763
Update: DRIFT Modification
https://github.com/Samox/data-history/commit/c394f3197a808e73fe88987ba3b60be6c0be3d65


In [5]:

from datagit.dataframe_update_breakdown import dataframe_update_breakdown


# Show the breakdown between the 2023-02 dataset and the 2023-03 dataset.
dataframe_update_breakdown(
    initial_dataframe=formatDF(dataMonth2),
    final_dataframe=formatDF(dataMonth3),
)

{'MIGRATION Column Deleted': {'df':                     name     date  age
  unique_key                            
  2022-12-Alice      Alice  2022-12   25
  2023-01-Bob          Bob  2023-01   30
  2023-01-Charlie  Charlie  2023-01   35
  2023-02-Didier    Didier  2023-02   40
  2023-02-Philipe  Philipe  2023-02   40
  2023-02-Antoine  Antoine  2023-02   40,
  'has_update': False,
  'type': <UpdateType.OTHER: 'other'>,
  'drift_context': None},
 'NEW DATA': {'df':                     name     date  age
  unique_key                            
  2022-12-Alice      Alice  2022-12   25
  2023-01-Bob          Bob  2023-01   30
  2023-01-Charlie  Charlie  2023-01   35
  2023-02-Didier    Didier  2023-02   40
  2023-02-Philipe  Philipe  2023-02   40
  2023-02-Antoine  Antoine  2023-02   40
  2023-03-Clement  Clement  2023-03   45
  2023-03-Cyril      Cyril  2023-03   45
  2023-03-Victor    Victor  2023-03   46,
  'has_update': True,
  'type': <UpdateType.OTHER: 'other'>,
  'drift_context':

In [6]:
# No new rows compared with dataMonth3: Charlie's age goes back to 35 while
# Philipe's stays at 42. alert_drift is passed as the drift evaluator.
from datagit.drift_evaluators import alert_drift


dataMonth3And1Day = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 35, 40, 42, 40, 45, 45, 46],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth3And1Day),
    filepath=file_path,
    assignees=["Sammy"],
    drift_evaluator=alert_drift,
)


Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP3NLEVVG26DMZFMLCLFJZG5Y
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT Modification
Branch drift/2023-11-10-16-34-55/path-to-metric-name-13-csv doesn't exist, creating it...
Checkout branch: drift/2023-11-10-16-34-55/path-to-metric-name-13-csv  from default branch:main
https://github.com/Samox/data-history/commit/297773c2c190639df3315b437927df10def1c39b
Pull request created: https://github.com/Samox/data-history/pull/172
Assignee Sammy does not exist


In [7]:
# Same values as dataMonth3And1Day plus one extra historical row:
# "Alixe", dated 2022-12, age 25.
dataMonth3And2Day = {
    "name": [
        "Alice", "Alixe", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 25, 30, 35, 40, 42, 40, 45, 45, 46],
}
store_metric(
    ghClient=gh_client,
    dataframe=formatDF(dataMonth3And2Day),
    filepath=file_path,
    assignees=["Sammy"],
    drift_evaluator=alert_drift,
)


Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP47GEKQRQRPHCF2ZZLFJZG6G
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT Addition
Branch drift/2023-11-10-16-35-02/path-to-metric-name-13-csv doesn't exist, creating it...
Checkout branch: drift/2023-11-10-16-35-02/path-to-metric-name-13-csv  from default branch:main
https://github.com/Samox/data-history/commit/546c63b8668296c28988e1da8a4b5c8c3dd49296
Update: DRIFT Modification
https://github.com/Samox/data-history/commit/f6249e33839d8d5c497414716e9a38760a64668a
Pull request created: https://github.com/Samox/data-history/pull/173
Assignee Sammy does not exist


In [8]:
import pandas as pd
from faker import Faker
import random

# Seed both random sources so the generated ids and metric values are the same
# on every run. Dates are still drawn relative to "today", so they depend on
# the day the cell is executed.
Faker.seed(42)
random.seed(42)

# Initialize Faker to generate random data
fake = Faker()

# Set the number of rows for the dataframe
num_rows = 600000

# Generate random IDs and dates (dates formatted as YYYY-MM-DD strings)
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-30y', end_date='today').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10, rounded to 2 decimals
metric_values = [round(random.uniform(0, 10),2) for _ in range(num_rows)]

# Create the dataframe
ultra_large_df = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values})

# Print the dataframe (pandas truncates the display of large frames)
print(ultra_large_df)

                                  unique_key        date  metric_value
0       2c5644f2-1cd1-4e97-8b28-8e89233732ac  2005-04-14          1.81
1       bd34730b-2200-40ef-9d65-1a74bc17a1da  2018-05-07          3.91
2       16f7798c-fa3a-41b2-b1e4-0e9e30d1aba2  1996-09-07          3.83
3       1ce4ce0a-cc0c-4c07-9acc-2ea898445dfd  1995-02-16          7.57
4       263f6c28-28c3-49c9-a621-971ee1c9118f  2018-04-25          9.33
...                                      ...         ...           ...
599995  3637aafb-8c56-45ce-a3d0-5a394eafa485  2016-09-26          8.26
599996  06a2320c-34c2-486a-a935-1fde764dd216  2012-08-16          9.59
599997  6e817a89-3c9e-49be-8bc8-67d36f6395fc  2016-01-26          2.21
599998  fb22e0ea-f44a-43d0-bd07-20bf915d504d  1993-11-10          2.98
599999  1b9e1cd6-16e7-413b-b80b-7ad3a6ce68cb  2005-05-28          2.62

[600000 rows x 3 columns]


In [9]:
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

# Store the large generated dataframe under its own metric path, prefixed with
# the repository slug (placeholder slug when REPO is unset or empty).
repo = os.environ.get("REPO") or "gh_org/repo"
store_metric(
    ghClient=gh_client,
    dataframe=ultra_large_df,
    filepath="".join((repo, "/path/to/ultra_large_metric_name3.csv")),
    assignees=["Sammy"],
)

Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/ultra_large_metric_name3.csv?token=ABUWFP422LZAKKOX6EPGRHTFJZG74
Dataframe dtypes {'date': string[python], 'metric_value': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python]}
Change detected
Update: DRIFT Deletion
https://github.com/Samox/data-history/commit/fe6291a88f2643b25ed39a7ae877615321e31235
Update: DRIFT Addition
https://github.com/Samox/data-history/commit/2f8c5a7ce3a065e737fb45a0f423d084020558c4


In [10]:
# Drop the last row of the generated dataframe. The same name is rebound, so
# every re-run of this cell removes one more row.
ultra_large_df = ultra_large_df.head(-1)
