In [1]:
import sys
from dotenv import load_dotenv
from github import Github
import os

# Create a .env with the following content:
# GH_TOKEN=your_github_token
# REPO=$gh_org/$repo

# Load environment variables from the .env file located one directory above
# the notebook.
load_dotenv("../.env")


# Get GitHub token from environment variable
gh_token = os.getenv("GH_TOKEN")
if not gh_token:
    # Raise instead of calling exit(): inside a notebook exit() takes the kernel
    # down, whereas an exception stops the run at this cell with a readable
    # traceback. `not gh_token` also rejects an empty `GH_TOKEN=` entry.
    raise RuntimeError(
        "GitHub token not found! Create a .env file at the root with a GH_TOKEN variable."
    )

# Authenticated PyGithub client used by the store_metric calls further down;
# requests time out after 60 seconds.
gh_client = Github(gh_token, timeout=60)


In [3]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker to generate random data
fake = Faker()

# Set the number of rows for the dataframe
num_rows = 600000

# Generate random IDs and dates (dates are stored as 'YYYY-MM-DD' strings).
# Faker's relative date strings are case-sensitive: 'M' means months while 'm'
# means minutes, so the upper bound "one month ago" has to be spelled '-1M'.
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-1y', end_date='-1M').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10, rounded to 2 decimals
metric_values = [round(random.uniform(0, 10),2) for _ in range(num_rows)]
# Generate random country codes
country_codes = [fake.country_code() for _ in range(num_rows)]

# Generate random categories
categories = [random.choice(['Category A', 'Category B', 'Category C']) for _ in range(num_rows)]

# Create the dataframe (one column per generated list)
ultra_large_df = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values, 'country_code': country_codes, 'category': categories})

# Print the dataframe (pandas truncates the repr to the first/last rows)
print(ultra_large_df)

# Write the frame to a local CSV and report how large that file is on disk.
# os.path.getsize measures the CSV file, not the in-memory DataFrame.
local_file_path = 'ultra_large_df.csv'
ultra_large_df.to_csv(local_file_path, index=False)
print('Size of CSV file in bytes:', os.path.getsize(local_file_path))



                                  unique_key        date  metric_value  \
0       2d7c63e0-6064-4045-a2e4-f24f54e2ee39  2023-03-25          2.35   
1       d80ee206-0e5b-4f33-a464-fe4479bb1472  2023-06-20          8.37   
2       1bbabd1c-b512-4878-80ed-4cdbdc714b73  2022-12-04          6.39   
3       34c55387-769d-4657-8431-191f988993a7  2023-06-21          3.08   
4       6882e07d-66d5-4a91-9c4e-969c3a69391c  2023-04-25          9.81   
...                                      ...         ...           ...   
599995  ee0ac304-4e16-4102-b6a7-bb53a4ff6295  2022-10-07          4.28   
599996  e2054c3a-4aad-446d-a588-16d6dc084436  2022-11-25          4.92   
599997  af928223-d9f6-4d3e-b1b8-8347eb140169  2023-01-24          5.52   
599998  69b148b8-3ea9-4cee-961c-03b2f0c0f8ef  2023-04-02          4.13   
599999  f3d3279b-be6f-4fa3-8b38-2278f109f799  2022-10-29          2.99   

       country_code    category  
0                KZ  Category C  
1                FJ  Category A  
2        

In [23]:
import importlib
import sys

# Put the parent directory on the import path so the local `datagit` package
# can be imported from this notebook.
sys.path.append('..')

# Reload the connector module before importing from it, so the freshly
# re-executed module code is what `store_metric` refers to.
import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric

## Scenario: metric file already existing, with new data split from historical data
# Target repository comes from the REPO environment variable, with a
# placeholder fallback when it is unset or empty.
repo = os.environ.get("REPO") or "gh_org/repo"
file_path = f"{repo}/path/to/ultra_large_metric_name18.csv"
store_metric(
    gh_client,
    ultra_large_df,
    file_path,
    assignees=["Sammy"],
    store_json=False,
)

Storing metric...
Metric not found, creating it on branch: main
Commit: New data: path/to/ultra_large_metric_name18.csv
Metric stored


In [5]:
import numpy as np

# Set the seed for reproducibility (covers both the index selection and the
# replacement values below, which all come from NumPy's global generator)
np.random.seed(42)

# Work on a copy so the original frame stays untouched
ultra_large_df2 = ultra_large_df.copy()

# Number of rows whose metric value gets rewritten
num_updates = 100

# Select `num_updates` distinct random indices for metric value update
random_indices_metric = np.random.choice(ultra_large_df2.index, size=num_updates, replace=False)
print(random_indices_metric)
# Update metric value with random values between 0 and 10 (2 decimals).
# The values are drawn with np.random rather than the unseeded `random` module,
# otherwise the seed above would not make them reproducible.
ultra_large_df2.loc[random_indices_metric, 'metric_value'] = np.random.uniform(0, 10, size=num_updates).round(2)


[  4242  60608 392832  41643 464234 122681  10258 199077 303125 520722
 589338 285923 340236 339287 170118 520994 591772 550478 455232 123979
 354761 254889 470580 181624 274417 328726 504212 356040  23352 214898
 361248 277103 285786 356048  23550 236107 256942 509629 225513 564335
 184036 354122 408248 516338 367441 413740 222624 558355 350732  39333
 281914 445136 158145 571695  68129 364739 451960 551108 127251 528991
 179870 101684 296154 579076 578190 404752  22682 336015 431689 505472
 470965 267651 454329 439546 337991 326490 521320 586039 460422 513493
  29421 382580  56964  40928  17648 511244  31867 208736 367384 157223
 503369 483668  45568 248656 413335  14685 460849  72643 396418 483375]


In [25]:
# Display the dtype pandas assigned to each column of the generated frame.
ultra_large_df.dtypes

unique_key       object
date             object
metric_value    float64
country_code     object
category         object
dtype: object

In [8]:
import sys
# Put the parent directory on the import path so the local `datagit` package
# can be imported from this notebook.
sys.path.append('..')
import importlib

# Reload the local modules before importing from them so that the freshly
# re-executed module code is what the imported names refer to.
import datagit.drift_evaluators
importlib.reload(datagit.drift_evaluators)
from datagit.drift_evaluators import default_drift_evaluator


import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric


# Push the frame whose metric values were modified, to the same file path as
# the initial upload.
# store_json=False matches the other store_metric calls in this notebook.
# NOTE(review): the recorded run of this cell without it ended in a GitHub 502;
# presumably the additional JSON payload is what made the request fail for a
# frame of this size - confirm against datagit.github_connector.
store_metric(gh_client,  ultra_large_df2, file_path, assignees=["Sammy"], store_json=False)

Storing metric...
Metric found, updating it on branch: reported
Content https://raw.githubusercontent.com/Samox/data-history/reported/path/to/ultra_large_metric_name14.csv?token=ABUWFP6JIN2WYROOIDT4PDDEW2BPQ
Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python], 'country_code': string[python], 'category': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python], 'country_code': string[python], 'category': string[python]}
comparison                                      metric_value      
                                             self other
unique_key                                             
039d29a6-a84e-495f-89ab-9a64cc2f283a         6.37  1.89
0af6ef89-87c5-498f-b37c-6629d273f87c         6.45  0.73
0b6cc7f7-03c9-4e91-bc22-ba915a448529         0.95  9.86
0de03d6e-05ed-41b1-8b8f-e0ae64c0cbd8         3.45  2.92
0e1a763a-7498-4934-b836-ea4d6f08efc1         8.91  6.97
.

GithubException: 502 {"message": "Server Error"}

In [6]:
# Set the number of rows for the batch of new lines
num_rows = 6000

# Generate random IDs and dates (dates are stored as 'YYYY-MM-DD' strings).
# Faker's relative date strings are case-sensitive: 'M' means months while 'm'
# means minutes, so "from one month ago until today" has to be spelled '-1M'.
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-1M', end_date='today').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10, rounded to 2 decimals
metric_values = [round(random.uniform(0, 10),2) for _ in range(num_rows)]
# Generate random country codes
country_codes = [fake.country_code() for _ in range(num_rows)]

# Generate random categories
categories = [random.choice(['Category A', 'Category B', 'Category C']) for _ in range(num_rows)]

# Create the dataframe holding only the new lines
new_lines = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values, 'country_code': country_codes, 'category': categories})

# Append the new lines to the frame with updated metric values; the index is
# renumbered from 0 so it stays unique after concatenation.
ultra_large_df3 = pd.concat([ultra_large_df2, new_lines], ignore_index=True)

In [24]:
import importlib
import sys

# Put the parent directory on the import path so the local `datagit` package
# can be imported from this notebook.
sys.path.append('..')

# Reload both local modules before importing from them, so the imported names
# refer to the freshly re-executed module code.
import datagit.drift_evaluators
importlib.reload(datagit.drift_evaluators)
from datagit.drift_evaluators import default_drift_evaluator

import datagit.github_connector
importlib.reload(datagit.github_connector)
from datagit.github_connector import store_metric

# Push the frame that contains the appended rows to the same file path.
store_metric(
    gh_client,
    ultra_large_df3,
    file_path,
    assignees=["Sammy"],
    store_json=False,
)

Storing metric...
Metric found, updating it on branch: main
Content https://raw.githubusercontent.com/Samox/data-history/main/path/to/ultra_large_metric_name18.csv?token=ABUWFP5KQIF72QPM6CKZ6VTEW2IZK
Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python], 'country_code': string[python], 'category': string[python]}
Old Dataframe dtypes {'unique_key': string[python], 'date': string[python], 'metric_value': string[python], 'country_code': string[python], 'category': string[python]}
New data found
Commit: New data: path/to/ultra_large_metric_name18.csv
https://github.com/Samox/data-history/commit/510cc1a3978ec68cc439c5f19088d9626eb0db2f
Branch metric/path-to-ultra-large-metric-name18-csv doesn't exist, creating it...
comparison                                      metric_value      
                                             self other
unique_key                                             
03aeb072-4c9b-4900-a65f-0cb012459f50         5.28 

In [13]:
# Also dump the base frame as JSON, next to the CSV (CSV path + '.json').
ultra_large_df.to_json(f"{local_file_path}.json")
