In [21]:
import sys
import os
import pandas as pd
from datetime import datetime, timedelta
import time
import requests
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_dir)
from functions import fetch_github_data
from config import GITHUB_TOKEN
import numpy as np

In [22]:
# GitHub Personal Access Token, taken from the GITHUB_TOKEN value imported
# from the local `config` module rather than written inline in the notebook
TOKEN = GITHUB_TOKEN

In [3]:
# Request headers sent with every GitHub API call.
# The star+json media type in 'Accept' is required for the stargazers API
# to include the `starred_at` field in its response.
headers = {
    'Authorization': f'token {TOKEN}',
    'Accept': 'application/vnd.github.v3.star+json',
}

## Data Generation

In [4]:
total_iterations = 50 # should be 50
pause_time = 62  # seconds slept between two consecutive pages
fetch_time = 20  # assumed seconds per page fetch; only used for the time estimate
per_page = 20 # should be 20 (repos) - per_page * total_iterations = 1000 repo
request_timeout = 30  # seconds; without a timeout a stalled connection blocks the cell forever
total_estimated_time = total_iterations * (fetch_time + pause_time)
start_time = datetime.now()

query = 'AI OR "artificial intelligence"'
data = []


# Check Rate Limit Status
rate_limit_url = 'https://api.github.com/rate_limit'
response = requests.get(rate_limit_url, headers=headers, timeout=request_timeout)
# Fail fast with a clear HTTP error (e.g. bad/expired token) instead of an
# opaque KeyError when the error payload has no 'rate' entry
response.raise_for_status()
rate_limit_data = response.json()
print(f"Rate Limit: {rate_limit_data['rate']['limit']}")
print(f"Remaining: {rate_limit_data['rate']['remaining']}")
print(f"Reset Time: {datetime.fromtimestamp(rate_limit_data['rate']['reset'])}")

for i in range(1, total_iterations+1):
    print(f"Fetching page {i}")
    data.extend(fetch_github_data(headers, query, pages=i, per_page=per_page))

    elapsed_time = (datetime.now() - start_time).total_seconds()
    remaining_time = total_estimated_time - elapsed_time

    # Display both durations in HH:MM:SS format (whole seconds, so the two
    # lines are formatted consistently); clamp at 0 to avoid negative times
    remaining_time_formatted = str(timedelta(seconds=int(max(remaining_time, 0))))
    print(f"Elapsed time: {str(timedelta(seconds=int(elapsed_time)))}")
    print(f"Estimated remaining time: {remaining_time_formatted}")

    # Pause for the specified time
    if i < total_iterations:  # No need to pause after the last iteration
        print(f"Pausing for {pause_time} seconds...")
        time.sleep(pause_time)

Rate Limit: 5000
Remaining: 4969
Reset Time: 2024-09-21 23:29:11
Fetching page 1
Fetched 1 repositories
Fetched 2 repositories
Fetched 3 repositories
Fetched 4 repositories
Fetched 5 repositories
Fetched 6 repositories
Fetched 7 repositories
Fetched 8 repositories
Fetched 9 repositories
Fetched 10 repositories
Fetched 11 repositories
Fetched 12 repositories
Fetched 13 repositories
Fetched 14 repositories
Fetched 15 repositories
Fetched 16 repositories
Fetched 17 repositories
Fetched 18 repositories
Fetched 19 repositories
Fetched 20 repositories
Elapsed time: 0:00:41
Estimated remaining time: 1:07:38.252923
Pausing for 62 seconds...
Fetching page 2
Fetched 1 repositories
Fetched 2 repositories
Fetched 3 repositories
Fetched 4 repositories
Fetched 5 repositories
Fetched 6 repositories
Fetched 7 repositories
Fetched 8 repositories
Fetched 9 repositories
Fetched 10 repositories
Fetched 11 repositories
Fetched 12 repositories
Fetched 13 repositories
Fetched 14 repositories
Fetched 15 repos

In [11]:
# Build a DataFrame from the records accumulated in `data` by the fetch loop
df = pd.DataFrame(data)

In [107]:
# NOTE: `.size` is the number of cells (rows x columns), not the number of rows
print(df.size)

# Filter out repos that have 12 releases or fewer
has_enough_releases = df['num_releases'] > 12
df_filtered = df.loc[has_enough_releases]

print(df_filtered.size)

15000
5460


## Data Augmentation & Noise Addition

In [108]:
# One row per release: explode the parallel 'release_date' / 'release_tag'
# list columns, parse the dates, then order all rows chronologically.
df_exploded = (
    df_filtered
    .explode(['release_date', 'release_tag'])
    .reset_index(drop=True)
    .assign(release_date=lambda frame: pd.to_datetime(frame['release_date']))
    .sort_values(by='release_date')
    .reset_index(drop=True)
)

df_exploded

Unnamed: 0,org_name,repo_name,description,repo_url,topics,creation_date,update_date,release_tag,release_date,num_releases,num_open_issues,num_pull_requests,num_stars,num_forks,num_watchers
0,aio-libs,aiopg,aiopg is a library for accessing a PostgreSQL ...,https://github.com/aio-libs/aiopg,"asyncio, postgresql, sqlalchemy",2014-04-03T09:58:33Z,2024-09-17T10:57:37Z,v0.2.3,2014-06-12 15:29:59+00:00,24,76,697,1392,159,1392
1,aio-libs-abandoned,aioredis-py,asyncio (PEP 3156) Redis support,https://github.com/aio-libs-abandoned/aioredis-py,"python-3, asyncio, redis, redis-sentinel, pyth...",2014-05-25T19:29:57Z,2024-09-19T16:43:56Z,v0.1.0,2014-06-24 10:12:06+00:00,29,95,1041,2299,336,2299
2,aio-libs-abandoned,aioredis-py,asyncio (PEP 3156) Redis support,https://github.com/aio-libs-abandoned/aioredis-py,"python-3, asyncio, redis, redis-sentinel, pyth...",2014-05-25T19:29:57Z,2024-09-19T16:43:56Z,v0.1.1,2014-07-07 08:16:11+00:00,29,95,1041,2299,336,2299
3,aio-libs-abandoned,aioredis-py,asyncio (PEP 3156) Redis support,https://github.com/aio-libs-abandoned/aioredis-py,"python-3, asyncio, redis, redis-sentinel, pyth...",2014-05-25T19:29:57Z,2024-09-19T16:43:56Z,v0.1.2,2014-07-31 07:52:08+00:00,29,95,1041,2299,336,2299
4,aio-libs-abandoned,aioredis-py,asyncio (PEP 3156) Redis support,https://github.com/aio-libs-abandoned/aioredis-py,"python-3, asyncio, redis, redis-sentinel, pyth...",2014-05-25T19:29:57Z,2024-09-19T16:43:56Z,v0.1.3,2014-08-08 07:51:11+00:00,29,95,1041,2299,336,2299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9446,air-verse,air,☁️ Live reload for Go apps,https://github.com/air-verse/air,"tools, go, live-reload, gin, watcher, task-run...",2017-10-12T14:31:40Z,2024-09-22T02:54:15Z,v1.60.0,2024-09-21 15:00:46+00:00,30,146,259,17441,800,17441
9447,lobehub,lobe-chat,"🤯 Lobe Chat - an open-source, modern-design AI...",https://github.com/lobehub/lobe-chat,"chatgpt, nextjs, openai, ai, chat, function-ca...",2023-05-21T07:19:12Z,2024-09-22T02:42:16Z,v1.19.21,2024-09-21 17:08:51+00:00,30,421,1469,40966,9341,40966
9448,paul-gauthier,aider,aider is AI pair programming in your terminal,https://github.com/paul-gauthier/aider,"chatgpt, cli, command-line, gpt-4, openai, gpt...",2023-05-09T18:57:49Z,2024-09-22T02:03:50Z,v0.57.0,2024-09-21 20:35:49+00:00,30,174,235,18878,1744,18878
9449,mediar-ai,screenpipe,Library to build personalized AI powered by wh...,https://github.com/mediar-ai/screenpipe,"ai, computer-vision, llm, machine-learning, ml...",2024-06-19T13:23:56Z,2024-09-22T03:13:57Z,v0.1.88,2024-09-21 21:24:37+00:00,30,49,203,1485,109,1485


In [109]:
def distribute_values_across_releases(release_dates, total_value):
    """
    Distribute the total value across release dates in an increasing manner.

    The result starts at 0 and grows by a constant step (rounded to the nearest
    integer) so that the last entry is exactly ``total_value``.

    Parameters
    ----------
    release_dates : sized collection
        Only its length is used; one output value is produced per entry.
    total_value : number
        Value assigned to the last release.

    Returns
    -------
    list
        ``len(release_dates)`` values. An empty input yields an empty list and
        a single release yields ``[total_value]``.
    """
    n_releases = len(release_dates)

    # Nothing to distribute over: avoid indexing into an empty list below
    if n_releases == 0:
        return []
    # A single release carries the whole total (no step can be defined)
    if n_releases == 1:
        return [total_value]

    step = total_value / (n_releases - 1)

    # Create the values starting from 0 and incrementing by the calculated step
    values = [round(step * i) for i in range(n_releases)]
    values[-1] = total_value  # Ensure the last value matches the total count
    return values

In [110]:
def add_proportional_noise(series, factor=0.05, seed=None, min_value=1):
    """
    Adds proportional noise to a series, ensuring no value falls below ``min_value``.

    Each element is multiplied by ``1 + factor * z`` where ``z`` is a standard
    normal draw, then rounded to an integer and clamped from below.

    Parameters
    ----------
    series : pandas.Series or numpy.ndarray
        Values to perturb.
    factor : float, default 0.05
        Scale of the multiplicative noise.
    seed : int or None, default None
        When given, the draws come from a private generator seeded with this
        value, so the process-wide NumPy random state is left untouched.
        The numbers are the same ones ``np.random.seed(seed)`` would produce.
        When None, the global NumPy random state is used.
    min_value : int, default 1
        Lower bound applied to the rounded result.

    Returns
    -------
    Same container type as ``series`` with integer values >= ``min_value``.
    """
    # A local legacy RandomState reproduces the np.random.seed(seed) stream
    # without reseeding (and thereby disturbing) the global generator
    rng = np.random.RandomState(seed) if seed is not None else np.random
    noise = rng.normal(loc=0, scale=1, size=len(series))
    noisy_series = series * (1 + factor * noise)

    # Ensure values are not below the specified minimum value
    noisy_series = np.maximum(noisy_series.round().astype(int), min_value)

    return noisy_series

In [111]:
def add_time_based_noise(series, factor=0.02, seed=None, time_scale=5, amplitude=1.0):
    """
    Adds time-based noise to a series to simulate temporal variations.

    The perturbation added to each element is
    ``series.mean() * (amplitude * sin(t / time_scale) + e)`` where ``t`` is the
    element's position (0, 1, 2, ...) and ``e`` is Gaussian noise with standard
    deviation ``factor``.

    Note that with the default ``amplitude`` the sinusoid swings by up to
    ``+/- series.mean()``, so elements smaller than the mean can become negative.

    Parameters
    ----------
    series : pandas.Series
        Values to perturb, in positional (time) order.
    factor : float, default 0.02
        Standard deviation of the random component (relative to the mean).
    seed : int or None, default None
        When given, the draws come from a private generator seeded with this
        value, so the process-wide NumPy random state is left untouched.
        The numbers are the same ones ``np.random.seed(seed)`` would produce.
        When None, the global NumPy random state is used.
    time_scale : float, default 5
        Divisor applied to the position before taking the sine.
    amplitude : float, default 1.0
        Multiplier of the sine component (relative to the mean).

    Returns
    -------
    pandas.Series
        ``series`` plus the time-based perturbation.
    """
    # A local legacy RandomState reproduces the np.random.seed(seed) stream
    # without reseeding (and thereby disturbing) the global generator
    rng = np.random.RandomState(seed) if seed is not None else np.random

    # Simulate time-based noise as a sine wave with added random noise.
    # The positions are named `steps` to avoid shadowing the `time` module.
    steps = np.arange(len(series))
    temporal_variation = amplitude * np.sin(steps / time_scale)
    random_noise = rng.normal(loc=0, scale=factor, size=len(series))

    time_noise = temporal_variation + random_noise
    return series + series.mean() * time_noise

In [112]:
# For every repository, replace each (repeated) final counter with a series that
# ramps up to that final value across the repository's releases.
for _, repo_rows in df_exploded.groupby(['org_name', 'repo_name']):
    # Rows (and their index labels) belonging to the current repository
    release_dates = repo_rows['release_date']
    row_labels = release_dates.index

    for column in (
        'num_stars',
        'num_forks',
        'num_watchers',
        'num_pull_requests',
        'num_open_issues',
        'num_releases',
    ):
        # The first row of the group is read as the value to ramp up to
        final_total = repo_rows[column].iloc[0]
        df_exploded.loc[row_labels, column] = distribute_values_across_releases(release_dates, final_total)


In [113]:
# Apply the time-based (sine + Gaussian) perturbation to each counter column.
# No seed is passed, so the result differs from run to run.
for column in ('num_stars', 'num_forks', 'num_watchers', 'num_pull_requests', 'num_open_issues'):
    df_exploded[column] = add_time_based_noise(df_exploded[column])


In [114]:
# Apply multiplicative noise to each counter column; the helper also rounds to
# integers and clamps at its default minimum. No seed is passed, so the result
# differs from run to run.
for column in ('num_stars', 'num_forks', 'num_watchers', 'num_pull_requests', 'num_open_issues'):
    df_exploded[column] = add_proportional_noise(df_exploded[column])


In [115]:
# Drop the rows whose distributed 'num_releases' is zero, then renumber the
# remaining rows from 0.
has_releases = df_exploded['num_releases'] != 0
df_exploded = df_exploded.loc[has_releases].reset_index(drop=True)

df_exploded

Unnamed: 0,org_name,repo_name,description,repo_url,topics,creation_date,update_date,release_tag,release_date,num_releases,num_open_issues,num_pull_requests,num_stars,num_forks,num_watchers
0,aio-libs-abandoned,aioredis-py,asyncio (PEP 3156) Redis support,https://github.com/aio-libs-abandoned/aioredis-py,"python-3, asyncio, redis, redis-sentinel, pyth...",2014-05-25T19:29:57Z,2024-09-19T16:43:56Z,v0.1.1,2014-07-07 08:16:11+00:00,1,46,401,1851,244,1663
1,aio-libs-abandoned,aioredis-py,asyncio (PEP 3156) Redis support,https://github.com/aio-libs-abandoned/aioredis-py,"python-3, asyncio, redis, redis-sentinel, pyth...",2014-05-25T19:29:57Z,2024-09-19T16:43:56Z,v0.1.2,2014-07-31 07:52:08+00:00,2,72,661,2743,355,2665
2,aio-libs-abandoned,aioredis-py,asyncio (PEP 3156) Redis support,https://github.com/aio-libs-abandoned/aioredis-py,"python-3, asyncio, redis, redis-sentinel, pyth...",2014-05-25T19:29:57Z,2024-09-19T16:43:56Z,v0.1.3,2014-08-08 07:51:11+00:00,3,96,802,3631,489,3932
3,aio-libs-abandoned,aioredis-py,asyncio (PEP 3156) Redis support,https://github.com/aio-libs-abandoned/aioredis-py,"python-3, asyncio, redis, redis-sentinel, pyth...",2014-05-25T19:29:57Z,2024-09-19T16:43:56Z,v0.1.4,2014-09-22 19:50:09+00:00,4,99,1056,4013,620,4091
4,aio-libs,aiopg,aiopg is a library for accessing a PostgreSQL ...,https://github.com/aio-libs/aiopg,"asyncio, postgresql, sqlalchemy",2014-04-03T09:58:33Z,2024-09-17T10:57:37Z,v0.4.0,2014-10-02 09:33:16+00:00,1,95,922,3992,667,4414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9082,air-verse,air,☁️ Live reload for Go apps,https://github.com/air-verse/air,"tools, go, live-reload, gin, watcher, task-run...",2017-10-12T14:31:40Z,2024-09-22T02:54:15Z,v1.60.0,2024-09-21 15:00:46+00:00,30,50,1,12392,220,12969
9083,lobehub,lobe-chat,"🤯 Lobe Chat - an open-source, modern-design AI...",https://github.com/lobehub/lobe-chat,"chatgpt, nextjs, openai, ai, chat, function-ca...",2023-05-21T07:19:12Z,2024-09-22T02:42:16Z,v1.19.21,2024-09-21 17:08:51+00:00,30,304,520,36210,8813,39585
9084,paul-gauthier,aider,aider is AI pair programming in your terminal,https://github.com/paul-gauthier/aider,"chatgpt, cli, command-line, gpt-4, openai, gpt...",2023-05-09T18:57:49Z,2024-09-22T02:03:50Z,v0.57.0,2024-09-21 20:35:49+00:00,30,61,1,14419,1116,14104
9085,mediar-ai,screenpipe,Library to build personalized AI powered by wh...,https://github.com/mediar-ai/screenpipe,"ai, computer-vision, llm, machine-learning, ml...",2024-06-19T13:23:56Z,2024-09-22T03:13:57Z,v0.1.88,2024-09-21 21:24:37+00:00,29,1,1,1,1,1


In [116]:
# Save the generated data into a CSV file

print(df_exploded.size)
print(df_exploded.shape)

output_path = '../data/github_data.csv'
# Create the target directory if it is missing; otherwise to_csv raises on a
# checkout that does not yet contain the data folder
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_exploded.to_csv(output_path, index=False)

136305
(9087, 15)
