In [1]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env file one directory above this notebook (it is loaded from
# "../.env" below) with the following content:
# GH_TOKEN=your_github_token
# REPO=$gh_org/$repo

# Load environment variables from the .env file
load_dotenv("../.env")


# Get the GitHub token from the environment. A missing variable and an empty
# value (e.g. a bare `GH_TOKEN=` line) are both treated as "not configured".
gh_token = os.getenv("GH_TOKEN")
if not gh_token:
    # Raise instead of calling exit(): inside IPython/Jupyter, exit() requests a
    # kernel shutdown, whereas an exception just stops this cell (and "Run All")
    # with a readable message.
    raise RuntimeError(
        "GitHub token not found! Create a .env file at the root with a GH_TOKEN variable."
    )

# Authenticated client used by the store_metric calls further down; 60 s request timeout.
gh_client = Github(gh_token, timeout=60)


In [86]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker to generate random data
fake = Faker()

# Seed both random sources so the generated dataset is reproducible across
# kernel restarts. Faker.seed() seeds the generator shared by all Faker
# instances; random.seed() covers the stdlib calls below.
# NOTE: the dates are still anchored on the day the cell runs, because
# date_between() is given relative bounds ('-30y' .. 'today').
seed = 42
Faker.seed(seed)
random.seed(seed)

# Set the number of rows for the dataframe
num_rows = 600000

# Generate random IDs and dates (dates are kept as 'YYYY-MM-DD' strings)
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-30y', end_date='today').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10, rounded to 2 decimals
metric_values = [round(random.uniform(0, 10), 2) for _ in range(num_rows)]

# Generate random country codes
country_codes = [fake.country_code() for _ in range(num_rows)]

# Generate random categories
categories = [random.choice(['Category A', 'Category B', 'Category C']) for _ in range(num_rows)]

# Create the dataframe
ultra_large_df = pd.DataFrame({
    'unique_key': ids,
    'date': dates,
    'metric_value': metric_values,
    'country_code': country_codes,
    'category': categories,
})

# Print the dataframe (pandas truncates the display to the first/last rows)
print(ultra_large_df)

# Write the frame to disk and report the size of the resulting CSV file.
# The number printed is the on-disk file size, not the in-memory DataFrame size.
local_file_path = 'ultra_large_df.csv'
ultra_large_df.to_csv(local_file_path, index=False)
print('Size of CSV file in bytes:', os.path.getsize(local_file_path))



                                  unique_key        date  metric_value  \
0       9e4f5f12-fb6e-4016-871d-eb5eea6ff076  2016-01-17          0.29   
1       4bbe12d9-8fdb-48a1-9077-61689a048def  2000-12-01          6.02   
2       9dec416e-c033-4588-85e3-9648e9164327  2016-03-04          9.78   
3       8442e571-5a8f-477b-9b90-f85a64193d54  2008-10-22          4.79   
4       986f3ca8-6e12-4af3-80b7-69e0047ae84f  2000-05-26          1.15   
...                                      ...         ...           ...   
599995  b5898424-a220-4a9c-b338-f5a2dc6c7c71  2005-06-28          0.32   
599996  c6e33b7f-8072-4224-99f7-5c5c8d0e39b5  2006-02-20          7.40   
599997  66b9e8f0-6eb9-4a56-aa1e-2067896eb46c  1994-11-16          3.45   
599998  022ddcc2-3372-4708-b295-257a502b082b  2016-02-12          2.88   
599999  c256d003-db9b-4dad-8151-5593b3364116  2007-11-15          8.02   

       country_code    category  
0                AR  Category C  
1                LB  Category A  
2        

In [88]:
# Put the parent directory on the import path, then (re)load the local datagit
# connector so recent edits to it are picked up without restarting the kernel.
import importlib
import sys

sys.path.append('..')

import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric

## Scenario: the metric file already exists, so new data and historical data get split
repo = os.getenv("REPO")
if not repo:
    # Placeholder used when REPO is not configured in the environment
    repo = "gh_org/repo"
file_path = f"{repo}/path/to/ultra_large_metric_name13.csv"

store_metric(
    gh_client,
    ultra_large_df,
    file_path,
    assignees=["Sammy"],
)

Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/data-history/reported/path/to/ultra_large_metric_name13.csv?token=ABUWFPY65DKE55CIQTZN4CLEWVVVO
Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python], 'country_code': string[python], 'category': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python], 'country_code': string[python], 'category': string[python]}
Branch metric/path-to-ultra-large-metric-name13-csv doesn't exist, creating it...
comparison Empty DataFrame
Columns: []
Index: []
No drift detected


In [89]:
import numpy as np

# Set the seed for reproducibility. Both the row selection and the replacement
# values below are drawn from NumPy's seeded global RNG, so re-running this cell
# always produces the same modified frame.
np.random.seed(42)

# Work on a copy so the originally generated frame stays untouched
ultra_large_df2 = ultra_large_df.copy()

# Number of rows whose metric value gets changed
n_updates = 10

# Select random (distinct) indices for the metric value update
random_indices_metric = np.random.choice(ultra_large_df2.index, size=n_updates, replace=False)
print(random_indices_metric)

# Update metric value with random values between 0 and 10 (2 decimals).
# Drawn from np.random rather than the stdlib `random` module, which the seed
# above does not control.
ultra_large_df2.loc[random_indices_metric, 'metric_value'] = np.round(
    np.random.uniform(0, 10, size=n_updates), 2
)


[  4242  60608 392832  41643 464234 122681  10258 199077 303125 520722]


In [25]:
# Display the column dtypes of the generated frame (bare last expression, so the
# notebook renders it as the cell output).
# NOTE(review): metric_value is float64 here, while the store_metric log output
# reports every column as string[python] — presumably it casts internally;
# confirm in datagit.github_connector.
ultra_large_df.dtypes

unique_key       object
date             object
metric_value    float64
country_code     object
category         object
dtype: object

In [90]:
# Put the parent directory on the import path and refresh the local datagit
# modules so edits to them take effect without restarting the kernel.
import importlib
import sys

sys.path.append('..')

# Drift evaluators are reloaded first, then the GitHub connector.
import datagit.drift_evaluators
importlib.reload(datagit.drift_evaluators)
from datagit.drift_evaluators import default_drift_evaluator

import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric

# Store the modified copy under the same file path as the first store_metric call
store_metric(
    gh_client,
    ultra_large_df2,
    file_path,
    assignees=["Sammy"],
)

Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/data-history/reported/path/to/ultra_large_metric_name13.csv?token=ABUWFP6464EDFCKTVPPS7EDEWVVWY
Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python], 'country_code': string[python], 'category': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python], 'country_code': string[python], 'category': string[python]}
comparison                                      metric_value      
                                             self other
unique_key                                             
00eb7e35-d14e-498a-83f2-037990df6cde         7.61  6.07
01e0629f-fbec-4c91-9622-a80faf101ed7         3.83  3.81
557b60a5-3e1f-4f9e-aa66-826a34609977         7.58  8.41
60ca30ac-0160-4541-a286-d87420c0c89c          3.9  7.73
7a2699c5-7bef-4ebc-96ce-4d0b82ce9ecb         7.15  2.14
9