In [1]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env with the following content:
# GH_TOKEN=your_github_token
# REPON=$gh_org/$repo

# Load environment variables from .env file
load_dotenv("../.env")


# Get GitHub token from environment variable
gh_token = os.getenv("GH_TOKEN")
if gh_token is None:
    print("GitHub token not found! Create a .env file a the root with a GH_TOKEN variable.")
    exit(1)
gh_client = Github(gh_token, timeout=60)
repo = os.getenv("REPO") or "gh_org/repo"
repo = gh_client.get_repo(repo)

In [3]:
import sys
sys.path.append('..')
import importlib
import datagit.drift_evaluators
importlib.reload(datagit.drift_evaluators)

import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_table
import pandas as pd

## Test with file already existing and splitting new data and historical data
repo = os.getenv("REPO") or "gh_org/repo"

def formatDF(dict):
    df = pd.DataFrame(dict)
    df['unique_key'] = df.apply(lambda row: row['date'] + '-' + row['name'], axis=1)
    column_order = ['unique_key'] + [col for col in df.columns if col != 'unique_key']
    df = df.reindex(columns=column_order)
    return df


file_path = "/path/to/metric_name_13.csv"

# Store metric for the first time
dataMonth1 = {"name": ["Alice", "Bob", "Charlie"], "date": ["2022-12","2023-01","2023-01"], "age": [25, 30, 35]}
store_table(
    github_client=gh_client,
    github_repository_name= repo,
    table_dataframe= formatDF(dataMonth1),
    table_name= file_path,
    assignees=["Sammy"]
)




Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFPYHI2WJRM4VAUF3MO3FKTL6O
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT


GithubException: 422 {"message": "path cannot start with a slash", "errors": [{"resource": "Commit", "field": "path", "code": "invalid"}], "documentation_url": "https://docs.github.com/rest/repos/contents#create-or-update-file-contents"}

In [3]:
# ## Introduce new data for 2023-02
dataMonth2 = {"name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine"], "date": ["2022-12","2023-01","2023-01","2023-02","2023-02","2023-02"], "age": [25, 30, 35, 40, 40, 40]}
store_metric(ghClient=gh_client,dataframe=  formatDF(dataMonth2),filepath= file_path, assignees=["Sammy"])



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP4DV3KQ7XRYD2DPVG3FKOTP6
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: NEW DATA
https://github.com/Samox/data-history/commit/057b8159c95bebe9164128e7508d2b77fafe00a2


In [4]:
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

# ## Introduce new data for 2023-03 and a drift on 2020-02
dataMonth3 = {"name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine", "Clement", "Cyril", "Victor"], "date": ["2022-12","2023-01","2023-01","2023-02","2023-02","2023-02","2023-03","2023-03","2023-03"], "age": [25, 30, 36, 40, 42, 40, 45, 45, 46]}
store_metric(ghClient=gh_client,dataframe=  formatDF(dataMonth3),filepath= file_path, assignees=["Sammy"])



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP77BUGFA77QUHHJY33FKOTQC
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: NEW DATA
https://github.com/Samox/data-history/commit/a2122ae8979b0349b3ea5d8415bfded1366b5b22
Update: DRIFT
https://github.com/Samox/data-history/commit/933403257a5098278d013a0fac0bf0fa2213837d


In [5]:

from datagit.dataframe_update_breakdown import dataframe_update_breakdown


dataframe_update_breakdown(initial_dataframe=formatDF(dataMonth2), final_dataframe=formatDF(dataMonth3))

{'MIGRATION Column Deleted': {'df':                     name     date  age
  unique_key                            
  2022-12-Alice      Alice  2022-12   25
  2023-01-Bob          Bob  2023-01   30
  2023-01-Charlie  Charlie  2023-01   35
  2023-02-Didier    Didier  2023-02   40
  2023-02-Philipe  Philipe  2023-02   40
  2023-02-Antoine  Antoine  2023-02   40,
  'has_update': False,
  'type': <UpdateType.OTHER: 'other'>,
  'drift_context': None,
  'drift_evaluation': None,
  'drift_summary': None},
 'NEW DATA': {'df':                     name     date  age
  unique_key                            
  2022-12-Alice      Alice  2022-12   25
  2023-01-Bob          Bob  2023-01   30
  2023-01-Charlie  Charlie  2023-01   35
  2023-02-Didier    Didier  2023-02   40
  2023-02-Philipe  Philipe  2023-02   40
  2023-02-Antoine  Antoine  2023-02   40
  2023-03-Clement  Clement  2023-03   45
  2023-03-Cyril      Cyril  2023-03   45
  2023-03-Victor    Victor  2023-03   46,
  'has_update': True,
  't

In [6]:
# ## No new data. Adds a drift for Philipe, and remove all other drifts
from datagit.drift_evaluators import alert_drift


dataMonth3And1Day = {"name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine", "Clement", "Cyril", "Victor"], "date": ["2022-12","2023-01","2023-01","2023-02","2023-02","2023-02","2023-03","2023-03","2023-03"], "age": [25, 30, 35, 40, 42, 40, 45, 45, 46]}
store_metric(ghClient=gh_client, dataframe= formatDF(dataMonth3And1Day),filepath= file_path, assignees=["Sammy"], drift_evaluator=alert_drift)


Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP32VEW5E7MJQOA5ZK3FKOTQK
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT
Branch drift/2023-11-14-17-56-39/path-to-metric-name-13-csv doesn't exist, creating it...
Checkout branch: drift/2023-11-14-17-56-39/path-to-metric-name-13-csv  from default branch:main
https://github.com/Samox/data-history/commit/a9183249ebb64a258d8fa074e0f22c607590d3e8
Pull request created: https://github.com/Samox/data-history/pull/178
Assignee Sammy does not exist


In [7]:
# ## No new data. Adds a drift for Philipe, and remove all other drifts
dataMonth3And2Day = {"name": ["Alice", "Alixe", "Bob", "Charlie", "Didier", "Philipe", "Antoine", "Clement", "Cyril", "Victor"], "date": ["2022-12","2022-12","2023-01","2023-01","2023-02","2023-02","2023-02","2023-03","2023-03","2023-03"], "age": [25, 25, 30, 35, 40, 42, 40, 45, 45, 46]}
store_metric(ghClient=gh_client, dataframe= formatDF(dataMonth3And2Day), filepath=file_path, assignees=["Sammy"], drift_evaluator=alert_drift)


Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP2X5GXLWGHLLFWOTRLFKOTQY
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT
Branch drift/2023-11-14-17-56-47/path-to-metric-name-13-csv doesn't exist, creating it...
Checkout branch: drift/2023-11-14-17-56-47/path-to-metric-name-13-csv  from default branch:main
https://github.com/Samox/data-history/commit/156b9a38f23cfa9c402eb66bc67f5d796d37d215
Pull request created: https://github.com/Samox/data-history/pull/179
Assignee Sammy does not exist


In [8]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker to generate random data
fake = Faker()

# Set the number of rows for the dataframe
num_rows = 600000

# Generate random IDs and dates
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-30y', end_date='today').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10
metric_values = [round(random.uniform(0, 10),2) for _ in range(num_rows)]

# Create the dataframe
ultra_large_df = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values})

# Print the dataframe
print(ultra_large_df)

                                  unique_key        date  metric_value
0       95463292-2130-4f6e-ba8e-68a3ec119203  2020-09-07          2.69
1       eca12da9-6693-4d5e-b3de-09fd34985b9a  2021-02-11          1.51
2       efcf9b75-dac0-4c36-a669-3682ad278554  2003-09-15          0.94
3       bb2701a9-162c-4ece-8a08-6cd9073e2847  2002-04-10          9.68
4       162bce39-b269-4f68-887b-148df90ea90d  1997-01-11          3.89
...                                      ...         ...           ...
599995  60244023-edcb-40b5-b18e-d59398ec66ca  2016-02-28          0.82
599996  62f84b8c-8a98-4c82-9cde-bcdda4216a8f  2013-01-18          0.34
599997  d89b7270-0ee9-4a29-9788-1fd540b0d6b3  1996-02-23          4.76
599998  e0c3da68-cee3-419d-afea-9ce4a27a50bf  2008-02-10          5.35
599999  5524904c-75c4-4e44-a9e7-5aab97b99d34  2009-06-22          6.51

[600000 rows x 3 columns]


In [9]:
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

## Test with file already existing and splitting new data and historical data
repo = os.getenv("REPO") or "gh_org/repo"
store_metric(ghClient=gh_client,  dataframe=ultra_large_df, filepath=repo+"/path/to/ultra_large_metric_name3.csv", assignees=["Sammy"])

Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/ultra_large_metric_name3.csv?token=ABUWFP4GI2QDFSR6JLAGBZLFKOTSG
Dataframe dtypes {'date': string[python], 'metric_value': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python]}
Change detected
Update: DRIFT
https://github.com/Samox/data-history/commit/2a661fa5e3f28996abcfce12eb50b95c785633ad


In [10]:
ultra_large_df = ultra_large_df.iloc[:-1]
