In [1]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env with the following content:
# GH_TOKEN=your_github_token
# REPON=$gh_org/$repo

# Load environment variables from .env file
load_dotenv("../.env")


# Get GitHub token from environment variable
gh_token = os.getenv("GH_TOKEN")
if gh_token is None:
    print("GitHub token not found! Create a .env file a the root with a GH_TOKEN variable.")
    exit(1)
gh_client = Github(gh_token, timeout=60)
repo = os.getenv("REPO") or "gh_org/repo"
repo = gh_client.get_repo(repo)

In [3]:
import sys
sys.path.append('..')
import importlib
import datagit.drift_evaluator.drift_evaluators
importlib.reload(datagit.drift_evaluator.drift_evaluators)

import datagit.connectors.github_connector
importlib.reload(datagit.connectors.github_connector)
from datagit.connectors.common import store_table
import pandas as pd

## Test with file already existing and splitting new data and historical data
repo = os.getenv("REPO") or "gh_org/repo"

def formatDF(dict):
    df = pd.DataFrame(dict)
    df['unique_key'] = df.apply(lambda row: row['date'] + '-' + row['name'], axis=1)
    column_order = ['unique_key'] + [col for col in df.columns if col != 'unique_key']
    df = df.reindex(columns=column_order)
    return df


file_path = "path/to/metric_name_13.csv"

# Store metric for the first time
dataMonth1 = {"name": ["Alice", "Bob", "Charlie"], "date": ["2022-12","2023-01","2023-01"], "age": [25, 30, 35]}
store_table(
    github_client=gh_client,
    github_repository_name= repo,
    table_dataframe= formatDF(dataMonth1),
    table_name= file_path,
    assignees=["Sammy"]
)




Storing table...
Table found, updating it on branch: main
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Nothing to update


In [3]:
# ## Introduce new data for 2023-02
dataMonth2 = {"name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine"], "date": ["2022-12","2023-01","2023-01","2023-02","2023-02","2023-02"], "age": [25, 30, 35, 40, 40, 40]}
store_table(github_client=gh_client,table_dataframe=  formatDF(dataMonth2) ,   github_repository_name= repo,table_name= file_path, assignees=["Sammy"])



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP54CD43BDTBMCRWRV3FKXVUM
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT
https://github.com/Samox/data-history/commit/58e726f5c132f95ea1676e23cec82e13b7330fe9


In [4]:
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_table
import pandas as pd

# ## Introduce new data for 2023-03 and a drift on 2020-02
dataMonth3 = {"name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine", "Clement", "Cyril", "Victor"], "date": ["2022-12","2023-01","2023-01","2023-02","2023-02","2023-02","2023-03","2023-03","2023-03"], "age": [25, 30, 36, 40, 42, 40, 45, 45, 46]}
store_table(github_client=gh_client,table_dataframe=  formatDF(dataMonth3) ,   github_repository_name= repo,table_name= file_path, assignees=["Sammy"])



Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFPYZKDRYBHYWT6V4O3LFKXVUS
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: NEW DATA
https://github.com/Samox/data-history/commit/42c80eb60b7a8930398c21480d49832cbd51144f
Update: DRIFT
https://github.com/Samox/data-history/commit/9cb75389b6acfd5beb463254ae4549751c09eb60


In [5]:

from datagit.dataframe_update_breakdown import dataframe_update_breakdown


dataframe_update_breakdown(initial_dataframe=formatDF(dataMonth2), final_dataframe=formatDF(dataMonth3))

{'MIGRATION Column Deleted': {'df':                     name     date  age
  unique_key                            
  2022-12-Alice      Alice  2022-12   25
  2023-01-Bob          Bob  2023-01   30
  2023-01-Charlie  Charlie  2023-01   35
  2023-02-Didier    Didier  2023-02   40
  2023-02-Philipe  Philipe  2023-02   40
  2023-02-Antoine  Antoine  2023-02   40,
  'has_update': False,
  'type': <UpdateType.OTHER: 'other'>,
  'drift_context': None,
  'drift_evaluation': None,
  'drift_summary': None},
 'NEW DATA': {'df':                     name     date  age
  unique_key                            
  2022-12-Alice      Alice  2022-12   25
  2023-01-Bob          Bob  2023-01   30
  2023-01-Charlie  Charlie  2023-01   35
  2023-02-Didier    Didier  2023-02   40
  2023-02-Philipe  Philipe  2023-02   40
  2023-02-Antoine  Antoine  2023-02   40
  2023-03-Clement  Clement  2023-03   45
  2023-03-Cyril      Cyril  2023-03   45
  2023-03-Victor    Victor  2023-03   46,
  'has_update': True,
  't

In [6]:
# ## No new data. Adds a drift for Philipe, and remove all other drifts
from datagit.drift_evaluators import AlertDriftEvaluator


dataMonth3And1Day = {"name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine", "Clement", "Cyril", "Victor"], "date": ["2022-12","2023-01","2023-01","2023-02","2023-02","2023-02","2023-03","2023-03","2023-03"], "age": [25, 30, 35, 40, 42, 40, 45, 45, 46]}
store_table(github_client=gh_client, table_dataframe= formatDF(dataMonth3And1Day), github_repository_name= repo,table_name= file_path, assignees=["Sammy"], drift_evaluator=AlertDriftEvaluator())


Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP7UDOQEJ276RAJHEZDFKXVU2
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT
Branch drift/2023-11-16-11-12-31/path-to-metric-name-13-csv doesn't exist, creating it...
Checkout branch: drift/2023-11-16-11-12-31/path-to-metric-name-13-csv  from branch:main
https://github.com/Samox/data-history/commit/6c060a9c345484a3e77f64ca3186b4b805c8f2ad
Pull request created: https://github.com/Samox/data-history/pull/180
Assignee Sammy does not exist


In [7]:
# ## No new data. Adds a drift for Philipe, and remove all other drifts
dataMonth3And2Day = {"name": ["Alice", "Alixe", "Bob", "Charlie", "Didier", "Philipe", "Antoine", "Clement", "Cyril", "Victor"], "date": ["2022-12","2022-12","2023-01","2023-01","2023-02","2023-02","2023-02","2023-03","2023-03","2023-03"], "age": [25, 25, 30, 35, 40, 42, 40, 45, 45, 46]}

store_table(github_client=gh_client, table_dataframe= formatDF(dataMonth3And2Day), github_repository_name= repo,table_name= file_path, assignees=["Sammy"], drift_evaluator=AlertDriftEvaluator())


Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/metric_name_13.csv?token=ABUWFP3F7AJFNNDR7NWTD7DFKXVVI
Dataframe dtypes {'name': string[python], 'date': string[python], 'age': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'name': string[python], 'date': string[python], 'age': string[python]}
Change detected
Update: DRIFT
Branch drift/2023-11-16-11-12-39/path-to-metric-name-13-csv doesn't exist, creating it...
Checkout branch: drift/2023-11-16-11-12-39/path-to-metric-name-13-csv  from branch:main
https://github.com/Samox/data-history/commit/62652b5a3b60e6e6752a5a5acb0be91d4f4e9798
Pull request created: https://github.com/Samox/data-history/pull/181
Assignee Sammy does not exist


In [8]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker to generate random data
fake = Faker()

# Set the number of rows for the dataframe
num_rows = 600000

# Generate random IDs and dates
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-30y', end_date='today').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10
metric_values = [round(random.uniform(0, 10),2) for _ in range(num_rows)]

# Create the dataframe
ultra_large_df = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values})

# Print the dataframe
print(ultra_large_df)

                                  unique_key        date  metric_value
0       03d1f3b5-81d4-48ba-a7b9-854c42321f6a  2004-02-06          2.87
1       50667f8d-ba05-491d-8343-2cced0a8e9c5  2015-12-14          2.84
2       30aff271-c00c-4eef-a544-d8f91285523f  2019-07-21          7.68
3       4b8a7e22-ba3a-453d-92dc-e4d2dc06411e  2003-04-19          8.29
4       da70e52d-0579-4ab5-ba61-0b5929d7dacd  2022-03-17          6.50
...                                      ...         ...           ...
599995  acb89dbd-ac1c-4471-924e-1b1136ee49be  2008-09-04          1.38
599996  5b120793-e534-425d-8450-c255fd770771  2015-04-28          8.86
599997  228a5b5b-416b-4c96-9ebb-1638f92678a8  2014-03-07          2.77
599998  aff2bba0-5935-49dc-bcc5-9fe830e15140  1995-07-31          9.70
599999  9aaf1ebe-5073-46a1-88e6-9e690acbf6ad  2018-03-25          8.33

[600000 rows x 3 columns]


In [9]:
import sys
sys.path.append('..')
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric
import pandas as pd

## Test with file already existing and splitting new data and historical data
repo = os.getenv("REPO") or "gh_org/repo"
store_metric(ghClient=gh_client,  dataframe=ultra_large_df, filepath=repo+"/path/to/ultra_large_metric_name3.csv", assignees=["Sammy"])

ImportError: cannot import name 'store_metric' from 'datagit.github_connector' (/Users/sammyteillet/Documents/Projects/DataDrift/data-drift/tools/datagit/datagit/github_connector.py)

In [None]:
ultra_large_df = ultra_large_df.iloc[:-1]
