In [1]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env file at the root (one directory above this notebook) with the
# following content:
# GH_TOKEN=your_github_token
# REPO=$gh_org/$repo

# Load environment variables from .env file
load_dotenv("../.env")


# Get GitHub token from environment variable.
# An empty value (e.g. a bare `GH_TOKEN=` line) is treated the same as a missing one.
gh_token = os.getenv("GH_TOKEN")
if not gh_token:
    # Raise instead of calling exit(): `exit` is a helper intended for the
    # interactive shell, whereas an exception stops this cell (and a
    # "Run All") with an explicit traceback.
    raise RuntimeError(
        "GitHub token not found! Create a .env file at the root with a GH_TOKEN variable."
    )

# Authenticated GitHub client reused by the cells below.
gh_client = Github(gh_token, timeout=60)


In [73]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker to generate random data
fake = Faker()

# Seed both random sources used below so that the generated dataset is the
# same on every run: Faker produces the ids, dates and country codes, while
# the stdlib `random` module produces the metric values and categories.
DATA_SEED = 42
Faker.seed(DATA_SEED)
random.seed(DATA_SEED)

# Set the number of rows for the dataframe
num_rows = 100000

# Generate random IDs and dates (dates are kept as 'YYYY-MM-DD' strings)
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-30y', end_date='today').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10, rounded to 2 decimals
metric_values = [round(random.uniform(0, 10), 2) for _ in range(num_rows)]

# Generate random country codes
country_codes = [fake.country_code() for _ in range(num_rows)]

# Generate random categories
categories = [random.choice(['Category A', 'Category B', 'Category C']) for _ in range(num_rows)]

# Create the dataframe
ultra_large_df = pd.DataFrame({
    'unique_key': ids,
    'date': dates,
    'metric_value': metric_values,
    'country_code': country_codes,
    'category': categories,
})

# Print the dataframe (pandas truncates the display to the first/last rows)
print(ultra_large_df)

# Write the frame to a local CSV and report the size of that file on disk.
local_file_path = 'ultra_large_df.csv'
ultra_large_df.to_csv(local_file_path, index=False)
print('Size of CSV file in bytes:', os.path.getsize(local_file_path))



                                 unique_key        date  metric_value  \
0      ff9f0f4c-1e5b-4229-9559-e300621ba05c  2006-12-02          6.88   
1      1374e72b-6d35-4156-8832-691a2650b25e  2004-06-15          7.89   
2      ad24dcf8-f9f0-4ca0-b730-99f67cffd540  1999-02-16          2.85   
3      2355cb2d-fd4e-40d1-a80f-2b1a1c7eb45a  2019-01-03          0.72   
4      a046cc3d-c4ac-454d-bcc3-9a228a97943b  2022-01-25          3.58   
...                                     ...         ...           ...   
99995  97ec909b-2dad-4503-8fac-2ee8e3f42c4b  2003-09-01          8.04   
99996  c3b8463c-7718-453f-8ebd-f47d3f06342f  2011-01-17          9.90   
99997  8d78aff3-e28d-4485-a967-5651774f2c46  2020-11-25          1.66   
99998  ef946185-adb5-4874-88a6-ee2bb0f2e354  2000-08-27          9.86   
99999  53a49f76-c1dc-4c48-ad3f-36f8925c9ad8  2011-05-29          4.78   

      country_code    category  
0               SZ  Category A  
1               GY  Category C  
2               ES  Cate

In [74]:
import sys

# Put the parent directory on the import path (presumably where the local
# `datagit` package lives). Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

import importlib
import datagit.github_connector

# Reload the module so the freshly imported `store_metric` reflects its
# current source without restarting the kernel.
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric

## Test with file already existing and splitting new data and historical data
# NOTE(review): the recorded output of this cell shows the metric being
# created, so the "already existing" path is presumably exercised by the
# later store of the modified frame; confirm.
repo = os.getenv("REPO") or "gh_org/repo"  # placeholder used when REPO is unset or empty
file_path = f"{repo}/path/to/ultra_large_metric_name12.csv"
store_metric(gh_client, ultra_large_df, file_path, assignees=["Sammy"])

Storing metric...
Metric not found, creating it on branch: reported
Commit: New data: path/to/ultra_large_metric_name12.csv
Metric stored


In [75]:
import numpy as np

# Set the seed for reproducibility. Both the row selection and the new values
# below are drawn from NumPy's global generator, so this one seed covers them.
np.random.seed(42)

# Number of rows whose metric value is overwritten
num_modified_rows = 10

# Work on a copy so the original frame stays untouched
ultra_large_df2 = ultra_large_df.copy()

# Select random, distinct indices for metric value update
random_indices_metric = np.random.choice(ultra_large_df2.index, size=num_modified_rows, replace=False)
print(random_indices_metric)

# Update metric value with random values between 0 and 10 (2 decimals)
ultra_large_df2.loc[random_indices_metric, 'metric_value'] = np.round(
    np.random.uniform(0, 10, size=num_modified_rows), 2
)


[75721 80184 19864 76699 92991 76434 84004 80917 60767 50074]


In [25]:
# Show the dtype of each column of the generated frame.
ultra_large_df.dtypes

unique_key       object
date             object
metric_value    float64
country_code     object
category         object
dtype: object

In [77]:
import sys

# Put the parent directory on the import path (presumably where the local
# `datagit` package lives). Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

import importlib
import datagit.github_connector

# Reload the module so the freshly imported `store_metric` reflects its
# current source without restarting the kernel.
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric

# Store the modified copy under the same path as the original frame.
store_metric(gh_client, ultra_large_df2, file_path, assignees=["Sammy"])

Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/data-history/reported/path/to/ultra_large_metric_name12.csv?token=<REDACTED>
Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python], 'country_code': string[python], 'category': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python], 'country_code': string[python], 'category': string[python]}
dataframe length 100000
new_dataframe len 0
old_data_with_freshdata len 100000
Drift detected
comparison Empty DataFrame
Columns: []
Index: []
Drift evaluator failed: 'dict' object has no attribute 'reported_dataframe'
Using default drift evaluator
Drift evaluation: {'should_alert': True, 'message': 'Drift detected:\n- ~~🆕 0 addition~~\n- ~~♻️ 0 modification~~\n- ~~🗑️ 0 deletion~~'}
Commit: Drift: path/to/ultra_large_metric_name12.csv
https://github.com/Sa