In [21]:
import sys
import os
import pandas as pd
from datetime import datetime, timedelta
import time
import requests


root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_dir)


from functions import fetch_github_data


from config import GITHUB_TOKEN


In [22]:
# Github's Personal Access Token
TOKEN = GITHUB_TOKEN

In [3]:
# The 'Accept': 'application/vnd.github.v3.star+json' is important to get the starred_at from the stargazers api
headers = {'Authorization': f'token {TOKEN}', 'Accept': 'application/vnd.github.v3.star+json'}

In [4]:
total_iterations = 50 # should be 50
pause_time = 62  
fetch_time = 20
per_page = 20 # should be 20 (repos) - per_page * total_iterations = 1000 repo
total_estimated_time = total_iterations * (fetch_time + pause_time)
start_time = datetime.now()

query = 'AI OR "artificial intelligence"'
data = []


# Check Rate Limit Status
rate_limit_url = 'https://api.github.com/rate_limit'
response = requests.get(rate_limit_url, headers=headers)
rate_limit_data = response.json()
print(f"Rate Limit: {rate_limit_data['rate']['limit']}")
print(f"Remaining: {rate_limit_data['rate']['remaining']}")
print(f"Reset Time: {datetime.fromtimestamp(rate_limit_data['rate']['reset'])}")

for i in range(1, total_iterations+1):
    print(f"Fetching page {i}")
    data.extend(fetch_github_data(headers, query, pages=i, per_page=per_page))
    
    elapsed_time = (datetime.now() - start_time).total_seconds()
    remaining_time = total_estimated_time - elapsed_time

    # Display remaining time in HH:MM:SS format
    remaining_time_formatted = str(timedelta(seconds=max(remaining_time, 0)))  # max with 0 to avoid negative times
    print(f"Elapsed time: {str(timedelta(seconds=int(elapsed_time)))}")
    print(f"Estimated remaining time: {remaining_time_formatted}")
    
    # Pause for the specified time
    if i < total_iterations:  # No need to pause after the last iteration
        print(f"Pausing for {pause_time} seconds...")
        time.sleep(pause_time)

Rate Limit: 5000
Remaining: 4969
Reset Time: 2024-09-21 23:29:11
Fetching page 1
Fetched 1 repositories
Fetched 2 repositories
Fetched 3 repositories
Fetched 4 repositories
Fetched 5 repositories
Fetched 6 repositories
Fetched 7 repositories
Fetched 8 repositories
Fetched 9 repositories
Fetched 10 repositories
Fetched 11 repositories
Fetched 12 repositories
Fetched 13 repositories
Fetched 14 repositories
Fetched 15 repositories
Fetched 16 repositories
Fetched 17 repositories
Fetched 18 repositories
Fetched 19 repositories
Fetched 20 repositories
Elapsed time: 0:00:41
Estimated remaining time: 1:07:38.252923
Pausing for 62 seconds...
Fetching page 2
Fetched 1 repositories
Fetched 2 repositories
Fetched 3 repositories
Fetched 4 repositories
Fetched 5 repositories
Fetched 6 repositories
Fetched 7 repositories
Fetched 8 repositories
Fetched 9 repositories
Fetched 10 repositories
Fetched 11 repositories
Fetched 12 repositories
Fetched 13 repositories
Fetched 14 repositories
Fetched 15 repos

In [11]:
df = pd.DataFrame(data)

In [86]:
# Explode the 'release_date' and 'tag_name' columns
df_exploded = df.explode(['release_date', 'release_tag']).reset_index(drop=True)

# Convert 'release_date' to datetime format
df_exploded['release_date'] = pd.to_datetime(df_exploded['release_date'])
df_exploded = df_exploded.sort_values(by='release_date', ascending=True).reset_index(drop=True)


df_exploded

Unnamed: 0,org_name,repo_name,description,repo_url,topics,creation_date,update_date,release_tag,release_date,num_releases,num_open_issues,num_pull_requests,num_stars,num_forks,num_watchers
0,vim-airline,vim-airline,lean & mean status/tabline for vim that's ligh...,https://github.com/vim-airline/vim-airline,"vim-airline, statusline, tabline, vim, vim-plugin",2013-06-30T18:49:56Z,2024-09-21T10:52:01Z,v0.2,2013-07-27 22:35:13+00:00,11,44,853,17754,1103,17754
1,vim-airline,vim-airline,lean & mean status/tabline for vim that's ligh...,https://github.com/vim-airline/vim-airline,"vim-airline, statusline, tabline, vim, vim-plugin",2013-06-30T18:49:56Z,2024-09-21T10:52:01Z,v0.1,2013-08-09 19:12:50+00:00,11,44,853,17754,1103,17754
2,vim-airline,vim-airline,lean & mean status/tabline for vim that's ligh...,https://github.com/vim-airline/vim-airline,"vim-airline, statusline, tabline, vim, vim-plugin",2013-06-30T18:49:56Z,2024-09-21T10:52:01Z,v0.3,2013-08-12 22:07:03+00:00,11,44,853,17754,1103,17754
3,vim-airline,vim-airline,lean & mean status/tabline for vim that's ligh...,https://github.com/vim-airline/vim-airline,"vim-airline, statusline, tabline, vim, vim-plugin",2013-06-30T18:49:56Z,2024-09-21T10:52:01Z,v0.4,2013-08-26 17:01:52+00:00,11,44,853,17754,1103,17754
4,vim-airline,vim-airline,lean & mean status/tabline for vim that's ligh...,https://github.com/vim-airline/vim-airline,"vim-airline, statusline, tabline, vim, vim-plugin",2013-06-30T18:49:56Z,2024-09-21T10:52:01Z,v0.5,2013-09-09 14:19:37+00:00,11,44,853,17754,1103,17754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10887,mathworks,MATLAB-Simulink-Challenge-Project-Hub,This MATLAB and Simulink Challenge Project Hub...,https://github.com/mathworks/MATLAB-Simulink-C...,"capstone-project, senior-design, senior-projec...",2021-02-05T16:03:41Z,2024-09-22T01:24:40Z,,NaT,0,0,5,1221,267,1221
10888,opendilab,DI-star,An artificial intelligence platform for the St...,https://github.com/opendilab/DI-star,"reinforcment-learning, starcraft2, self-play, ...",2021-07-04T09:58:25Z,2024-09-20T18:54:42Z,,NaT,0,0,12,1216,114,1216
10889,gcui-art,suno-api,Use API to call the music generation AI of sun...,https://github.com/gcui-art/suno-api,"ai, suno, suno-ai, suno-ai-api, typescript, music",2024-03-27T09:50:37Z,2024-09-21T18:54:51Z,,NaT,0,49,43,1213,277,1213
10890,Geniusay,ChopperBot,虎牙，斗鱼，抖音，BiliBili，TikTok，Twitch🔥热门🔥智能直播视频剪辑发布A...,https://github.com/Geniusay/ChopperBot,,2023-04-27T18:55:04Z,2024-09-22T03:46:31Z,,NaT,0,24,14,1213,200,1213


In [87]:
def distribute_values_across_releases(release_dates, total_value):
    """
    Distribute the total value across release dates in an increasing manner.
    Starts from 0 and increments up to the total_value.
    """
    n_releases = len(release_dates)
    step = total_value / (n_releases - 1) if n_releases > 1 else total_value

    # Create the values starting from 0 and incrementing by the calculated step
    values = [round(step * i) for i in range(n_releases)]
    values[-1] = total_value  # Ensure the last value matches the total count
    return values

In [88]:
import numpy as np

def add_proportional_noise(series, factor=0.05, seed=None, min_value=1):
    """
    Adds proportional noise to a series, ensuring no negative values.
    """
    if seed is not None:
        np.random.seed(seed)
    noise = np.random.normal(loc=0, scale=1, size=len(series))
    noisy_series = series * (1 + factor * noise)
    
    # Ensure values are not below the specified minimum value
    noisy_series = np.maximum(noisy_series.round().astype(int), min_value)

    return noisy_series

In [89]:
def add_time_based_noise(series, factor=0.02, seed=None):
    """
    Adds time-based noise to a series to simulate temporal variations.
    """
    if seed is not None:
        np.random.seed(seed)
    # Simulate time-based noise as a sine wave with added random noise
    time = np.arange(len(series))
    temporal_variation = np.sin(time / 5)  # You can adjust the period
    random_noise = np.random.normal(loc=0, scale=factor, size=len(series))
    
    time_noise = temporal_variation + random_noise
    return series + series.mean() * time_noise

In [90]:
for repo_index, repo_group in df_exploded.groupby(['org_name', 'repo_name']):
    # Get release dates for the current repository
    release_dates = repo_group['release_date']
    
    # Distribute each column across the release dates
    df_exploded.loc[release_dates.index, 'num_stars'] = distribute_values_across_releases(release_dates, repo_group['num_stars'].iloc[0])
    df_exploded.loc[release_dates.index, 'num_forks'] = distribute_values_across_releases(release_dates, repo_group['num_forks'].iloc[0])
    df_exploded.loc[release_dates.index, 'num_watchers'] = distribute_values_across_releases(release_dates, repo_group['num_watchers'].iloc[0])
    df_exploded.loc[release_dates.index, 'num_pull_requests'] = distribute_values_across_releases(release_dates, repo_group['num_pull_requests'].iloc[0])
    df_exploded.loc[release_dates.index, 'num_open_issues'] = distribute_values_across_releases(release_dates, repo_group['num_open_issues'].iloc[0])
    df_exploded.loc[release_dates.index, 'num_releases'] = distribute_values_across_releases(release_dates, repo_group['num_releases'].iloc[0])


In [91]:
df_exploded['num_stars'] = add_time_based_noise(df_exploded['num_stars'])
df_exploded['num_forks'] = add_time_based_noise(df_exploded['num_forks'])
df_exploded['num_watchers'] = add_time_based_noise(df_exploded['num_watchers'])
df_exploded['num_pull_requests'] = add_time_based_noise(df_exploded['num_pull_requests'])
df_exploded['num_open_issues'] = add_time_based_noise(df_exploded['num_open_issues'])


In [92]:
df_exploded['num_stars'] = add_proportional_noise(df_exploded['num_stars'])
df_exploded['num_forks'] = add_proportional_noise(df_exploded['num_forks'])
df_exploded['num_watchers'] = add_proportional_noise(df_exploded['num_watchers'])
df_exploded['num_pull_requests'] = add_proportional_noise(df_exploded['num_pull_requests'])
df_exploded['num_open_issues'] = add_proportional_noise(df_exploded['num_open_issues'])


In [93]:
# Remove rows where the initial value is zero
df_exploded = df_exploded[df_exploded['num_releases'] != 0]

# Reset the index after filtering
df_exploded.reset_index(drop=True, inplace=True)

df_exploded

Unnamed: 0,org_name,repo_name,description,repo_url,topics,creation_date,update_date,release_tag,release_date,num_releases,num_open_issues,num_pull_requests,num_stars,num_forks,num_watchers
0,vim-airline,vim-airline,lean & mean status/tabline for vim that's ligh...,https://github.com/vim-airline/vim-airline,"vim-airline, statusline, tabline, vim, vim-plugin",2013-06-30T18:49:56Z,2024-09-21T10:52:01Z,v0.1,2013-08-09 19:12:50+00:00,1,27,239,2743,247,2739
1,vim-airline,vim-airline,lean & mean status/tabline for vim that's ligh...,https://github.com/vim-airline/vim-airline,"vim-airline, statusline, tabline, vim, vim-plugin",2013-06-30T18:49:56Z,2024-09-21T10:52:01Z,v0.3,2013-08-12 22:07:03+00:00,2,49,479,5746,461,4636
2,vim-airline,vim-airline,lean & mean status/tabline for vim that's ligh...,https://github.com/vim-airline/vim-airline,"vim-airline, statusline, tabline, vim, vim-plugin",2013-06-30T18:49:56Z,2024-09-21T10:52:01Z,v0.4,2013-08-26 17:01:52+00:00,3,75,751,8073,640,8186
3,vim-airline,vim-airline,lean & mean status/tabline for vim that's ligh...,https://github.com/vim-airline/vim-airline,"vim-airline, statusline, tabline, vim, vim-plugin",2013-06-30T18:49:56Z,2024-09-21T10:52:01Z,v0.5,2013-09-09 14:19:37+00:00,4,93,1010,10384,861,11342
4,vim-airline,vim-airline,lean & mean status/tabline for vim that's ligh...,https://github.com/vim-airline/vim-airline,"vim-airline, statusline, tabline, vim, vim-plugin",2013-06-30T18:49:56Z,2024-09-21T10:52:01Z,v0.6,2013-10-10 17:59:55+00:00,6,113,1208,13060,1037,13454
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9933,air-verse,air,☁️ Live reload for Go apps,https://github.com/air-verse/air,"tools, go, live-reload, gin, watcher, task-run...",2017-10-12T14:31:40Z,2024-09-22T02:54:15Z,v1.60.0,2024-09-21 15:00:46+00:00,30,229,1082,21649,1460,21476
9934,lobehub,lobe-chat,"🤯 Lobe Chat - an open-source, modern-design AI...",https://github.com/lobehub/lobe-chat,"chatgpt, nextjs, openai, ai, chat, function-ca...",2023-05-21T07:19:12Z,2024-09-22T02:42:16Z,v1.19.21,2024-09-21 17:08:51+00:00,30,498,2350,43149,10145,49891
9935,paul-gauthier,aider,aider is AI pair programming in your terminal,https://github.com/paul-gauthier/aider,"chatgpt, cli, command-line, gpt-4, openai, gpt...",2023-05-09T18:57:49Z,2024-09-22T02:03:50Z,v0.57.0,2024-09-21 20:35:49+00:00,30,250,883,22792,2232,20367
9936,mediar-ai,screenpipe,Library to build personalized AI powered by wh...,https://github.com/mediar-ai/screenpipe,"ai, computer-vision, llm, machine-learning, ml...",2024-06-19T13:23:56Z,2024-09-22T03:13:57Z,v0.1.88,2024-09-21 21:24:37+00:00,29,112,727,3384,445,3815


In [94]:
# Display the DataFrame

print(df_exploded.size)
print(df_exploded.shape)
df_exploded.to_csv('../data/github_data.csv', index=False)

149070
(9938, 15)
