In [15]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env with the following content:
# GH_TOKEN=your_github_token
# REPO=$gh_org/$repo

# Load environment variables from .env file
load_dotenv()


# Get the GitHub token from the GH_TOKEN environment variable.
gh_token = os.getenv("GH_TOKEN")
# `not gh_token` also rejects an empty value (e.g. a bare `GH_TOKEN=` line),
# which an `is None` check would let through to the client constructor.
if not gh_token:
    # Raise instead of print() + exit(1): exit() is an interactive-shell helper,
    # whereas an exception stops this cell with a visible error and leaves
    # `gh_client` undefined rather than built from a missing token.
    raise RuntimeError(
        "GitHub token not found! Create a .env file at the root with a GH_TOKEN variable."
    )
gh_client = Github(gh_token)


In [50]:
import sys
# Add the parent directory to the import path before importing `datagit`
# (presumably where the package lives relative to this notebook - confirm).
sys.path.append('..')
import importlib
import datagit.github_connector
# Reload the module so that re-running this cell re-executes its current source.
importlib.reload(datagit.github_connector)
# Import the name only after the reload so it is bound to the reloaded function.
from datagit.github_connector import store_metric
import pandas as pd

## Test with file already existing and splitting new data and historical data
# Target repository: the REPO environment variable, falling back to the
# placeholder "gh_org/repo" when the variable is unset or empty.
# NOTE: `os` is not imported in this cell; it must already be imported by an earlier cell.
repo = os.getenv("REPO") or "gh_org/repo"

def formatDF(dict):
    """Build a DataFrame from ``dict`` with a leading ``unique_key`` column.

    ``unique_key`` is ``"<date>-<name>"`` for every row, built from the
    ``date`` and ``name`` columns. It is placed first; all other columns keep
    their original relative order.

    Args:
        dict: Column-oriented data accepted by ``pd.DataFrame``; it must
            provide string ``date`` and ``name`` columns. The parameter name
            shadows the builtin but is kept so existing callers keep working.

    Returns:
        A new ``pd.DataFrame`` whose first column is ``unique_key``.

    Raises:
        KeyError: If the ``date`` or ``name`` column is missing.
    """
    df = pd.DataFrame(dict)
    # Column-wise concatenation instead of a row-wise ``apply(axis=1)``:
    # on an empty frame ``apply`` hands back a DataFrame rather than a Series,
    # which cannot be assigned to a single column. This form handles zero rows
    # and avoids one Python call per row.
    df['unique_key'] = df['date'] + '-' + df['name']
    column_order = ['unique_key'] + [col for col in df.columns if col != 'unique_key']
    return df.reindex(columns=column_order)


# Location of the metric CSV, prefixed with the repository name
file_path = repo + "/path/to/metric_name.csv"

# First snapshot: one row dated 2022-12 and two rows dated 2023-01
dataMonth1 = {
    "name": ["Alice", "Bob", "Charlie"],
    "date": ["2022-12", "2023-01", "2023-01"],
    "age": [25, 30, 35],
}
store_metric(
    gh_client,
    formatDF(dataMonth1),
    file_path,
    assignees=["Sammy"],
)




Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/copy-libeo-data-history/reported/path/to/metric_name.csv?token=<REDACTED>
Drift detected
Commit: Drift: path/to/metric_name.csv
Drift pushed
Creating pull request
Pull request created: https://github.com/Samox/copy-libeo-data-history/pull/61
Assignee Sammy does not exist


In [5]:
# Second snapshot: the three rows of dataMonth1 unchanged, plus three new rows dated 2023-02
dataMonth2 = {
    "name": ["Alice", "Bob", "Charlie", "Didier", "Philipe", "Antoine"],
    "date": ["2022-12", "2023-01", "2023-01", "2023-02", "2023-02", "2023-02"],
    "age": [25, 30, 35, 40, 40, 40],
}
store_metric(
    gh_client,
    formatDF(dataMonth2),
    file_path,
    assignees=["Sammy"],
)



Storing metric...
Metric found, updating it on branch: reported
New data found
Commit: New data: path/to/metric_name.csv
Storing metric...
Metric found, updating it on branch: reported
New data found
Commit: New data: path/to/metric_name.csv
Drift detected
Commit: Drift: path/to/metric_name.csv
Drift pushed
Creating pull request
Pull request created: https://github.com/Samox/copy-libeo-data-history/pull/53
Assignee Sammy does not exist


In [49]:
import sys
# Add the parent directory to the import path before importing `datagit`
# (presumably where the package lives relative to this notebook - confirm).
sys.path.append('..')
import importlib
import datagit.github_connector
# Reload the module so that re-running this cell re-executes its current source.
importlib.reload(datagit.github_connector)
# Import the name only after the reload so it is bound to the reloaded function.
from datagit.github_connector import store_metric
import pandas as pd

# Third snapshot: three new rows dated 2023-03, plus two changed ages on
# existing rows compared with dataMonth2 (Charlie, 2023-01: 35 -> 36 and
# Philipe, 2023-02: 40 -> 42)
dataMonth3 = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 36, 40, 42, 40, 45, 45, 46],
}
store_metric(
    gh_client,
    formatDF(dataMonth3),
    file_path,
    assignees=["Sammy"],
)



Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/copy-libeo-data-history/reported/path/to/metric_name.csv?token=<REDACTED>


In [12]:
# No new rows. Same data as dataMonth3 except Charlie's age, which is back to 35;
# Philipe's age stays at 42, so that is the only remaining changed value.
dataMonth3And1Day = {
    "name": [
        "Alice", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 30, 35, 40, 42, 40, 45, 45, 46],
}
store_metric(
    gh_client,
    formatDF(dataMonth3And1Day),
    file_path,
    assignees=["Sammy"],
)


Storing metric...
Metric found, updating it on branch: reported
        unique_key     name     date  age
0    2022-12-Alice    Alice  2022-12   25
1      2023-01-Bob      Bob  2023-01   30
2  2023-01-Charlie  Charlie  2023-01   36
3  2023-02-Antoine  Antoine  2023-02   40
4   2023-02-Didier   Didier  2023-02   40
5  2023-02-Philipe  Philipe  2023-02   42
6  2023-03-Clement  Clement  2023-03   45
7    2023-03-Cyril    Cyril  2023-03   45
8   2023-03-Victor   Victor  2023-03   46
        unique_key     name     date  age
0    2022-12-Alice    Alice  2022-12   25
1      2023-01-Bob      Bob  2023-01   30
2  2023-01-Charlie  Charlie  2023-01   35
3  2023-02-Antoine  Antoine  2023-02   40
4   2023-02-Didier   Didier  2023-02   40
5  2023-02-Philipe  Philipe  2023-02   42
6  2023-03-Clement  Clement  2023-03   45
7    2023-03-Cyril    Cyril  2023-03   45
8   2023-03-Victor   Victor  2023-03   46
Drift detected


ValueError: Can only compare identically-labeled DataFrame objects

In [8]:
# Same rows and values as dataMonth3And1Day, plus one extra row inserted
# after Alice: Alixe, dated 2022-12, age 25.
dataMonth3And2Day = {
    "name": [
        "Alice", "Alixe", "Bob", "Charlie",
        "Didier", "Philipe", "Antoine",
        "Clement", "Cyril", "Victor",
    ],
    "date": [
        "2022-12", "2022-12", "2023-01", "2023-01",
        "2023-02", "2023-02", "2023-02",
        "2023-03", "2023-03", "2023-03",
    ],
    "age": [25, 25, 30, 35, 40, 42, 40, 45, 45, 46],
}
store_metric(
    gh_client,
    formatDF(dataMonth3And2Day),
    file_path,
    assignees=["Sammy"],
)


Storing metric...
Metric found, updating it on branch: reported
Drift detected
Commit: Drift: path/to/metric_name.csv
Drift pushed
Creating pull request
Pull request created: https://github.com/Samox/copy-libeo-data-history/pull/50
Assignee Sammy does not exist
