In [1]:
import requests
import pandas as pd
from datetime import datetime
import sys
import os


root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_dir)

module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)

from functions import fetch_github_data


from config import GITHUB_TOKEN

  from tqdm.autonotebook import tqdm, trange


In [4]:
def fetch_stargazers_with_dates(org_name, repo_name, headers, timeout=30):
    """
    Fetches stargazer data for a repository, including the date each star was given.

    Pages through the GitHub stargazers endpoint 100 entries at a time until an
    empty page or a non-200 response comes back. On a non-200 response the status
    code is printed and whatever was collected so far is returned, so the result
    may be partial.

    Args:
        org_name: Owner (organisation or user) of the repository.
        repo_name: Name of the repository.
        headers: HTTP headers sent with every request. Every returned entry is
            expected to carry a 'starred_at' field (a KeyError is raised otherwise).
        timeout: Seconds to wait for each HTTP response before requests gives up.
            Defaults to 30.

    Returns:
        pandas.DataFrame with one row per star: a timezone-aware (UTC) 'date'
        column and a 'num_stars' column that is always 1.
    """
    url = f'https://api.github.com/repos/{org_name}/{repo_name}/stargazers'
    params = {'per_page': 100, 'page': 1}
    stars_data = []

    while True:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch stargazers: {response.status_code}")
            break

        stars = response.json()
        if not stars:
            # An empty page means every stargazer has been read.
            break

        # Keep only the timestamp at which each star was added
        stars_data.extend(star['starred_at'] for star in stars)
        params['page'] += 1

    # utc=True keeps the column timezone-aware even when nothing was fetched,
    # so downstream resampling/concat sees a consistent dtype.
    stars_df = pd.DataFrame({'date': pd.to_datetime(stars_data, utc=True)})
    stars_df['num_stars'] = 1  # Each row represents a single star
    return stars_df

# Example usage: pull every dated star for the AutoGPT repository
org_name = 'Significant-Gravitas'
repo_name = 'AutoGPT'
headers = {
    'Authorization': f'token {GITHUB_TOKEN}',
    # Media type requested alongside the token for the stargazers call
    'Accept': 'application/vnd.github.v3.star+json',
}
stars_df = fetch_stargazers_with_dates(org_name, repo_name, headers)


Failed to fetch stargazers: 422
                       date  num_stars
0 2023-03-16 19:57:53+00:00          1
1 2023-03-16 21:32:21+00:00          1
2 2023-03-17 07:46:00+00:00          1
3 2023-03-17 14:52:47+00:00          1
4 2023-03-17 15:16:39+00:00          1


In [5]:
# Print the stargazer DataFrame returned by fetch_stargazers_with_dates (one row per star).
print(stars_df)

                           date  num_stars
0     2023-03-16 19:57:53+00:00          1
1     2023-03-16 21:32:21+00:00          1
2     2023-03-17 07:46:00+00:00          1
3     2023-03-17 14:52:47+00:00          1
4     2023-03-17 15:16:39+00:00          1
...                         ...        ...
39995 2023-04-13 10:21:22+00:00          1
39996 2023-04-13 10:21:27+00:00          1
39997 2023-04-13 10:21:30+00:00          1
39998 2023-04-13 10:21:32+00:00          1
39999 2023-04-13 10:21:46+00:00          1

[40000 rows x 2 columns]


In [6]:
def fetch_fork_events_with_dates(org_name, repo_name, headers, timeout=30):
    """
    Fetches fork events for a repository, including the date each fork was created.

    Pages through the repository's events endpoint 100 entries at a time and
    keeps the 'created_at' timestamp of every event whose type is 'ForkEvent'.
    Paging stops on an empty page or a non-200 response; in the latter case the
    status code is printed and whatever was collected so far is returned.

    NOTE(review): the recorded run of this notebook yielded only 10 fork rows
    before a 422 response, so this endpoint appears to expose only recent
    activity. Confirm against the GitHub docs whether the repository's forks
    listing is a better source for a full history.

    Args:
        org_name: Owner (organisation or user) of the repository.
        repo_name: Name of the repository.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each HTTP response before requests gives up.
            Defaults to 30.

    Returns:
        pandas.DataFrame with one row per fork event: a timezone-aware (UTC)
        'date' column and a 'num_forks' column that is always 1.
    """
    url = f'https://api.github.com/repos/{org_name}/{repo_name}/events'
    params = {'per_page': 100, 'page': 1}
    fork_data = []

    while True:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch fork events: {response.status_code}")
            break

        events = response.json()
        if not events:
            # An empty page means the event feed has been exhausted.
            break

        # Keep only fork events; .get tolerates an entry without a 'type' key.
        fork_data.extend(
            event['created_at'] for event in events if event.get('type') == 'ForkEvent'
        )
        params['page'] += 1

    # utc=True keeps the column timezone-aware even when no fork was found,
    # so downstream resampling/concat sees a consistent dtype.
    forks_df = pd.DataFrame({'date': pd.to_datetime(fork_data, utc=True)})
    forks_df['num_forks'] = 1  # Each row represents a single fork
    return forks_df

# Example usage: reuses org_name, repo_name and headers defined in the stargazers example above
forks_df = fetch_fork_events_with_dates(org_name, repo_name, headers)


Failed to fetch fork events: 422
                       date  num_forks
0 2024-09-21 13:11:21+00:00          1
1 2024-09-21 09:41:10+00:00          1
2 2024-09-21 04:44:09+00:00          1
3 2024-09-21 01:22:59+00:00          1
4 2024-09-20 21:59:24+00:00          1


In [7]:
# Print the fork-event DataFrame returned by fetch_fork_events_with_dates (one row per fork event).
print(forks_df)

                       date  num_forks
0 2024-09-21 13:11:21+00:00          1
1 2024-09-21 09:41:10+00:00          1
2 2024-09-21 04:44:09+00:00          1
3 2024-09-21 01:22:59+00:00          1
4 2024-09-20 21:59:24+00:00          1
5 2024-09-20 20:44:00+00:00          1
6 2024-09-20 15:36:59+00:00          1
7 2024-09-20 15:02:19+00:00          1
8 2024-09-20 14:45:56+00:00          1
9 2024-09-20 12:18:21+00:00          1


In [9]:
# Bucket stars and forks into per-day totals
daily_stars = stars_df.set_index('date').resample('D').sum()
daily_forks = forks_df.set_index('date').resample('D').sum()

# Align both series on the date index; days present in only one series count as 0
daily_counts = pd.concat([daily_stars, daily_forks], axis=1).fillna(0)

# Add running totals, then turn the date index back into an ordinary column
combined_df = daily_counts.assign(
    num_stars_cumulative=daily_counts['num_stars'].cumsum(),
    num_forks_cumulative=daily_counts['num_forks'].cumsum(),
).reset_index()
print(combined_df)

                        date  num_stars  num_forks  num_stars_cumulative  \
0  2023-03-16 00:00:00+00:00        2.0        0.0                   2.0   
1  2023-03-17 00:00:00+00:00        8.0        0.0                  10.0   
2  2023-03-18 00:00:00+00:00        2.0        0.0                  12.0   
3  2023-03-19 00:00:00+00:00        4.0        0.0                  16.0   
4  2023-03-20 00:00:00+00:00        2.0        0.0                  18.0   
5  2023-03-21 00:00:00+00:00        2.0        0.0                  20.0   
6  2023-03-22 00:00:00+00:00        3.0        0.0                  23.0   
7  2023-03-23 00:00:00+00:00        0.0        0.0                  23.0   
8  2023-03-24 00:00:00+00:00        0.0        0.0                  23.0   
9  2023-03-25 00:00:00+00:00        2.0        0.0                  25.0   
10 2023-03-26 00:00:00+00:00        1.0        0.0                  26.0   
11 2023-03-27 00:00:00+00:00        3.0        0.0                  29.0   
12 2023-03-2