In [1]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env file one directory above this notebook with the following content:
# GH_TOKEN=your_github_token
# REPO=$gh_org/$repo

# Load environment variables from the .env file.
load_dotenv("../.env")


# Get the GitHub token from the environment and fail fast when it is missing.
gh_token = os.getenv("GH_TOKEN")
if not gh_token:
    # Raise instead of calling exit(1): inside IPython/Jupyter `exit` is the
    # interactive exit helper rather than sys.exit, so it is not a reliable way
    # to abort the cell, and execution could continue to build a client
    # without a token. `not gh_token` also rejects an empty `GH_TOKEN=` entry.
    raise RuntimeError(
        "GitHub token not found! Create a .env file at the root with a GH_TOKEN variable."
    )
gh_client = Github(gh_token, timeout=60)


In [19]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker to generate random data
fake = Faker()

# Seed both random sources so the synthetic dataset is the same on every
# Restart & Run All. Dates are still relative to the day the cell runs,
# because date_between uses '-30y' and 'today'.
DATA_SEED = 42
Faker.seed(DATA_SEED)
random.seed(DATA_SEED)

# Set the number of rows for the dataframe
num_rows = 100000

# Generate random IDs and dates
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-30y', end_date='today').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10, rounded to 2 decimals
metric_values = [round(random.uniform(0, 10), 2) for _ in range(num_rows)]

# Generate random country codes
country_codes = [fake.country_code() for _ in range(num_rows)]

# Generate random categories
categories = [random.choice(['Category A', 'Category B', 'Category C']) for _ in range(num_rows)]

# Create the dataframe
ultra_large_df = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values, 'country_code': country_codes, 'category': categories})

# Print the dataframe (pandas truncates the repr to the first/last rows)
print(ultra_large_df)

# Write the dataframe to a local CSV and report the size of that file on disk.
# The label says "CSV file" because os.path.getsize measures the written file,
# not the in-memory DataFrame.
local_file_path = 'ultra_large_df.csv'
ultra_large_df.to_csv(local_file_path, index=False)
print('Size of CSV file in bytes:', os.path.getsize(local_file_path))



                                 unique_key        date  metric_value  \
0      e648b9ea-7508-4128-8988-95c820aabbb9  2014-07-11          9.67   
1      0a6b375b-f430-40d6-a3bd-53062078b520  2002-03-23          0.81   
2      287de576-4e17-435e-a8e9-55975ab24204  2001-02-02          3.47   
3      38cb3fcc-8b1f-45aa-9825-ab461b43e627  1995-09-07          6.56   
4      ac0c645c-1ba5-476f-98f4-1ec3c925eef3  2002-05-28          7.33   
...                                     ...         ...           ...   
99995  8cc4e80e-b83c-48ad-b4f7-887512ca6dc1  2003-03-15          2.17   
99996  0b4fce83-1ee2-4211-82e5-3d6d52c3e82c  2007-05-21          6.96   
99997  31026d75-c9a4-4f89-b914-bf24a7275d40  2013-04-08          5.03   
99998  f6562254-0eb1-4112-ab45-d2ae564cec73  1999-01-03          3.22   
99999  c2454418-ba81-480a-bb95-148c77c613a4  2014-05-08          2.24   

      country_code    category  
0               GD  Category C  
1               ZA  Category A  
2               BR  Cate

In [41]:
# Add the parent directory to the import path before importing `datagit`
# (presumably where the local package lives — confirm against the repo layout).
import sys
sys.path.append('..')

# Reload the connector module before importing from it, so the freshly
# reloaded definition of store_metric is the one bound below.
import importlib
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric

## Test with file already existing and splitting new data and historical data
# The target repository comes from the REPO environment variable, falling back
# to a placeholder when it is unset or empty.
repo = os.environ.get("REPO") or "gh_org/repo"
file_path = f"{repo}/path/to/ultra_large_metric_name8.csv"
store_metric(gh_client, ultra_large_df, file_path, assignees=["Sammy"])

Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/data-history/reported/path/to/ultra_large_metric_name8.csv?token=<REDACTED>
dataframe.dtypes unique_key       object
date             object
metric_value    float64
country_code     object
category         object
dtype: object
new_dataframe Empty DataFrame
Columns: [unique_key, date, metric_value, country_code, category]
Index: []
old_data_with_freshdata.dtypes unique_key       object
date             object
metric_value    float64
country_code     object
category         object
dtype: object
Drift detected
                                     metric_value       country_code      
                                             self other         self other
unique_key                                                                
004d80bc-360b-40a8-b301-3319bc043136          NaN   NaN          NaN    NA
00d3dd9c-edeb-4417-9b32-16eee017378c          NaN   NaN 

In [22]:
import numpy as np

# Set the seed for reproducibility. Both the row selection and the new values
# below are drawn from NumPy's seeded global generator; the stdlib `random`
# module is not seeded by np.random.seed, so it is not used here.
np.random.seed(42)

# Number of rows whose metric value gets overwritten
num_updated_rows = 10

# Select random row labels (without replacement) for the metric value update.
# This is the first draw after seeding, so the selection is unchanged.
random_indices_metric = np.random.choice(ultra_large_df.index, size=num_updated_rows, replace=False)
print(random_indices_metric)

# Update metric value with random values between 0 and 10, rounded to 2 decimals.
# NOTE: this mutates ultra_large_df in place; with the seed above, re-running
# the cell selects the same rows and writes the same values.
ultra_large_df.loc[random_indices_metric, 'metric_value'] = np.round(
    np.random.uniform(0, 10, size=num_updated_rows), 2
)


[75721 80184 19864 76699 92991 76434 84004 80917 60767 50074]


In [25]:
# Show the column dtypes of ultra_large_df (bare last expression, so the notebook renders it).
ultra_large_df.dtypes

unique_key       object
date             object
metric_value    float64
country_code     object
category         object
dtype: object

In [26]:
# Store ultra_large_df at file_path again, with the same arguments as the earlier store_metric call.
# NOTE(review): presumably meant to run after the metric_value update so drift is detected — confirm
# the intended cell order, since the execution counts in this notebook are not sequential.
store_metric(gh_client,  ultra_large_df, file_path, assignees=["Sammy"])

Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/data-history/reported/path/to/ultra_large_metric_name8.csv?token=<REDACTED>
Drift detected
      metric_value       country_code      
              self other         self other
139            NaN   NaN          NaN    NA
342            NaN   NaN          NaN    NA
487            NaN   NaN          NaN    NA
528            NaN   NaN          NaN    NA
576            NaN   NaN          NaN    NA
...            ...   ...          ...   ...
98642          NaN   NaN          NaN    NA
98679          NaN   NaN          NaN    NA
98881          NaN   NaN          NaN    NA
99351          NaN   NaN          NaN    NA
99612          NaN   NaN          NaN    NA

[533 rows x 4 columns]
Drift evaluator failed: 'dict' object has no attribute 'reported_dataframe'
Using default drift evaluator
Drift evaluation: {'should_alert': True, 'message': 'Drift detected:\n- ~~🆕 