In [1]:
import sys
import os
import pandas as pd
from datetime import datetime, timedelta
import time
import requests


root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_dir)

module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)

from functions import fetch_github_data


from config import GITHUB_TOKEN


# @TODO 
# make the num_stars num_forks and num_releases dynamic
# regen the data
# fine-tuning or transfer learning

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Github's Personal Access Token
TOKEN = GITHUB_TOKEN

In [3]:
# The 'Accept': 'application/vnd.github.v3.star+json' is important to get the starred_at from the stargazers api
headers = {'Authorization': f'token {TOKEN}', 'Accept': 'application/vnd.github.v3.star+json'}

In [4]:
total_iterations = 50 # should be 50
pause_time = 62  
fetch_time = 20
per_page = 20 # should be 20 (repos) - per_page * total_iterations = 1000 repo
total_estimated_time = total_iterations * (fetch_time + pause_time)
start_time = datetime.now()

query = 'AI OR "artificial intelligence"'
data = []


# Check Rate Limit Status
rate_limit_url = 'https://api.github.com/rate_limit'
response = requests.get(rate_limit_url, headers=headers)
rate_limit_data = response.json()
print(f"Rate Limit: {rate_limit_data['rate']['limit']}")
print(f"Remaining: {rate_limit_data['rate']['remaining']}")
print(f"Reset Time: {datetime.fromtimestamp(rate_limit_data['rate']['reset'])}")

for i in range(1, total_iterations+1):
    print(f"Fetching page {i}")
    data.extend(fetch_github_data(headers, query, pages=i, per_page=per_page))
    
    elapsed_time = (datetime.now() - start_time).total_seconds()
    remaining_time = total_estimated_time - elapsed_time

    # Display remaining time in HH:MM:SS format
    remaining_time_formatted = str(timedelta(seconds=max(remaining_time, 0)))  # max with 0 to avoid negative times
    print(f"Elapsed time: {str(timedelta(seconds=int(elapsed_time)))}")
    print(f"Estimated remaining time: {remaining_time_formatted}")
    
    # Pause for the specified time
    if i < total_iterations:  # No need to pause after the last iteration
        print(f"Pausing for {pause_time} seconds...")
        time.sleep(pause_time)

Rate Limit: 5000
Remaining: 4986
Reset Time: 2024-09-21 23:29:11
Fetching page 1
Fetched 1 repositories
Fetched 2 repositories
Fetched 3 repositories
Fetched 4 repositories
Fetched 5 repositories
Elapsed time: 0:00:10
Estimated remaining time: 0:01:11.831909


In [39]:
df = pd.DataFrame(data)

In [40]:
# Step 1: Explode the 'release_date' and 'tag_name' columns
df_exploded = df.explode(['release_date', 'release_tag']).reset_index(drop=True)

# Convert 'release_date' to datetime format
df_exploded['release_date'] = pd.to_datetime(df_exploded['release_date'])

df_exploded

Unnamed: 0,org_name,repo_name,description,repo_url,topics,creation_date,update_date,release_tag,release_date,num_releases,num_open_issues,num_pull_requests,num_stars,num_forks,num_watchers
0,Significant-Gravitas,AutoGPT,AutoGPT is the vision of accessible AI for eve...,https://github.com/Significant-Gravitas/AutoGPT,"ai, gpt-4, openai, python, artificial-intellig...",2023-03-16T09:21:07Z,2024-09-22T02:28:37Z,autogpt-v0.5.1,2024-04-26 20:15:57+00:00,21,158,4271,166740,44110,166740
1,Significant-Gravitas,AutoGPT,AutoGPT is the vision of accessible AI for eve...,https://github.com/Significant-Gravitas/AutoGPT,"ai, gpt-4, openai, python, artificial-intellig...",2023-03-16T09:21:07Z,2024-09-22T02:28:37Z,autogpt-v0.5.0,2023-12-14 15:17:16+00:00,21,158,4271,166740,44110,166740
2,Significant-Gravitas,AutoGPT,AutoGPT is the vision of accessible AI for eve...,https://github.com/Significant-Gravitas/AutoGPT,"ai, gpt-4, openai, python, artificial-intellig...",2023-03-16T09:21:07Z,2024-09-22T02:28:37Z,agbenchmark-v0.0.10,2023-09-17 00:02:08+00:00,21,158,4271,166740,44110,166740
3,Significant-Gravitas,AutoGPT,AutoGPT is the vision of accessible AI for eve...,https://github.com/Significant-Gravitas/AutoGPT,"ai, gpt-4, openai, python, artificial-intellig...",2023-03-16T09:21:07Z,2024-09-22T02:28:37Z,v0.4.7,2023-08-11 17:55:06+00:00,21,158,4271,166740,44110,166740
4,Significant-Gravitas,AutoGPT,AutoGPT is the vision of accessible AI for eve...,https://github.com/Significant-Gravitas/AutoGPT,"ai, gpt-4, openai, python, artificial-intellig...",2023-03-16T09:21:07Z,2024-09-22T02:28:37Z,v0.4.6,2023-07-28 12:42:29+00:00,21,158,4271,166740,44110,166740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,supabase,supabase,The open source Firebase alternative. Supabase...,https://github.com/supabase/supabase,"firebase, supabase, realtime, postgrest, postg...",2019-10-12T05:56:49Z,2024-09-22T01:20:12Z,0.0.8,2020-09-07 04:02:43+00:00,16,471,11607,71838,6869,71838
61,supabase,supabase,The open source Firebase alternative. Supabase...,https://github.com/supabase/supabase,"firebase, supabase, realtime, postgrest, postg...",2019-10-12T05:56:49Z,2024-09-22T01:20:12Z,0.0.7,2020-08-05 10:38:19+00:00,16,471,11607,71838,6869,71838
62,supabase,supabase,The open source Firebase alternative. Supabase...,https://github.com/supabase/supabase,"firebase, supabase, realtime, postgrest, postg...",2019-10-12T05:56:49Z,2024-09-22T01:20:12Z,0.0.4,2020-07-02 02:08:15+00:00,16,471,11607,71838,6869,71838
63,fighting41love,funNLP,中英文敏感词、语言检测、中外手机/电话归属地/运营商查询、名字推断性别、手机号抽取、身份证抽...,https://github.com/fighting41love/funNLP,,2018-08-21T11:20:39Z,2024-09-21T18:37:04Z,,NaT,0,156,26,67809,14413,67809


In [41]:
def distribute_values_across_releases(release_dates, total_value):
    """
    Distribute the total value across release dates in an increasing manner.
    """
    n_releases = len(release_dates)
    step = (total_value - n_releases) / (n_releases - 1) if n_releases > 1 else total_value

    values = [1 + round(step * i) for i in range(n_releases)]
    values[-1] = total_value  # Ensure the last value matches the total count
    return values

In [42]:
for repo_index, repo_group in df_exploded.groupby(['org_name', 'repo_name']):
    # Get release dates for the current repository
    release_dates = repo_group['release_date']
    
    # Distribute each column across the release dates
    df_exploded.loc[release_dates.index, 'num_stars'] = distribute_values_across_releases(release_dates, repo_group['num_stars'].iloc[0])
    df_exploded.loc[release_dates.index, 'num_forks'] = distribute_values_across_releases(release_dates, repo_group['num_forks'].iloc[0])
    df_exploded.loc[release_dates.index, 'num_watchers'] = distribute_values_across_releases(release_dates, repo_group['num_watchers'].iloc[0])
    df_exploded.loc[release_dates.index, 'num_pull_requests'] = distribute_values_across_releases(release_dates, repo_group['num_pull_requests'].iloc[0])
    df_exploded.loc[release_dates.index, 'num_open_issues'] = distribute_values_across_releases(release_dates, repo_group['num_open_issues'].iloc[0])
    df_exploded.loc[release_dates.index, 'num_releases'] = distribute_values_across_releases(release_dates, repo_group['num_releases'].iloc[0])

In [43]:
df_exploded

Unnamed: 0,org_name,repo_name,description,repo_url,topics,creation_date,update_date,release_tag,release_date,num_releases,num_open_issues,num_pull_requests,num_stars,num_forks,num_watchers
0,Significant-Gravitas,AutoGPT,AutoGPT is the vision of accessible AI for eve...,https://github.com/Significant-Gravitas/AutoGPT,"ai, gpt-4, openai, python, artificial-intellig...",2023-03-16T09:21:07Z,2024-09-22T02:28:37Z,autogpt-v0.5.1,2024-04-26 20:15:57+00:00,1,1,1,1,1,1
1,Significant-Gravitas,AutoGPT,AutoGPT is the vision of accessible AI for eve...,https://github.com/Significant-Gravitas/AutoGPT,"ai, gpt-4, openai, python, artificial-intellig...",2023-03-16T09:21:07Z,2024-09-22T02:28:37Z,autogpt-v0.5.0,2023-12-14 15:17:16+00:00,1,8,213,8337,2205,8337
2,Significant-Gravitas,AutoGPT,AutoGPT is the vision of accessible AI for eve...,https://github.com/Significant-Gravitas/AutoGPT,"ai, gpt-4, openai, python, artificial-intellig...",2023-03-16T09:21:07Z,2024-09-22T02:28:37Z,agbenchmark-v0.0.10,2023-09-17 00:02:08+00:00,1,15,426,16673,4410,16673
3,Significant-Gravitas,AutoGPT,AutoGPT is the vision of accessible AI for eve...,https://github.com/Significant-Gravitas/AutoGPT,"ai, gpt-4, openai, python, artificial-intellig...",2023-03-16T09:21:07Z,2024-09-22T02:28:37Z,v0.4.7,2023-08-11 17:55:06+00:00,1,22,639,25009,6614,25009
4,Significant-Gravitas,AutoGPT,AutoGPT is the vision of accessible AI for eve...,https://github.com/Significant-Gravitas/AutoGPT,"ai, gpt-4, openai, python, artificial-intellig...",2023-03-16T09:21:07Z,2024-09-22T02:28:37Z,v0.4.6,2023-07-28 12:42:29+00:00,1,28,851,33345,8819,33345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,supabase,supabase,The open source Firebase alternative. Supabase...,https://github.com/supabase/supabase,"firebase, supabase, realtime, postgrest, postg...",2019-10-12T05:56:49Z,2024-09-22T01:20:12Z,0.0.8,2020-09-07 04:02:43+00:00,1,395,10047,62247,5940,62247
61,supabase,supabase,The open source Firebase alternative. Supabase...,https://github.com/supabase/supabase,"firebase, supabase, realtime, postgrest, postg...",2019-10-12T05:56:49Z,2024-09-22T01:20:12Z,0.0.7,2020-08-05 10:38:19+00:00,1,426,10819,67035,6397,67035
62,supabase,supabase,The open source Firebase alternative. Supabase...,https://github.com/supabase/supabase,"firebase, supabase, realtime, postgrest, postg...",2019-10-12T05:56:49Z,2024-09-22T01:20:12Z,0.0.4,2020-07-02 02:08:15+00:00,16,471,11607,71838,6869,71838
63,fighting41love,funNLP,中英文敏感词、语言检测、中外手机/电话归属地/运营商查询、名字推断性别、手机号抽取、身份证抽...,https://github.com/fighting41love/funNLP,,2018-08-21T11:20:39Z,2024-09-21T18:37:04Z,,NaT,0,156,26,67809,14413,67809


In [7]:
# Display the DataFrame

print(df_exploded.size)
print(df_exploded.shape)
df_exploded.to_csv('../data/github_data.csv', index=False)

75
(5, 15)
