In [1]:
import requests
import pandas as pd
from datetime import datetime
import sys
import os


root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_dir)

module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)

from functions import fetch_github_data


from config import GITHUB_TOKEN

  from tqdm.autonotebook import tqdm, trange


In [4]:
def fetch_stargazers_with_dates(org_name, repo_name, headers):
    """
    Fetches stargazer data for a repository, including the date each star was given.
    """
    url = f'https://api.github.com/repos/{org_name}/{repo_name}/stargazers'
    params = {'per_page': 100, 'page': 1}
    stars_data = []

    while True:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            stars = response.json()
            if not stars:
                break

            # Extract the date each star was added
            for star in stars:
                stars_data.append(star['starred_at'])
            params['page'] += 1
        else:
            print(f"Failed to fetch stargazers: {response.status_code}")
            break
    
    # Convert to DataFrame
    stars_df = pd.DataFrame({'date': pd.to_datetime(stars_data)})
    stars_df['num_stars'] = 1  # Each row represents a single star
    return stars_df

# Example usage
headers = {'Authorization': f'token {GITHUB_TOKEN}', 'Accept': 'application/vnd.github.v3.star+json'}
org_name = 'Significant-Gravitas'
repo_name = 'AutoGPT'
stars_df = fetch_stargazers_with_dates(org_name, repo_name, headers)


Failed to fetch stargazers: 422
                       date  num_stars
0 2023-03-16 19:57:53+00:00          1
1 2023-03-16 21:32:21+00:00          1
2 2023-03-17 07:46:00+00:00          1
3 2023-03-17 14:52:47+00:00          1
4 2023-03-17 15:16:39+00:00          1


In [5]:
print(stars_df)

                           date  num_stars
0     2023-03-16 19:57:53+00:00          1
1     2023-03-16 21:32:21+00:00          1
2     2023-03-17 07:46:00+00:00          1
3     2023-03-17 14:52:47+00:00          1
4     2023-03-17 15:16:39+00:00          1
...                         ...        ...
39995 2023-04-13 10:21:22+00:00          1
39996 2023-04-13 10:21:27+00:00          1
39997 2023-04-13 10:21:30+00:00          1
39998 2023-04-13 10:21:32+00:00          1
39999 2023-04-13 10:21:46+00:00          1

[40000 rows x 2 columns]


In [6]:
def fetch_fork_events_with_dates(org_name, repo_name, headers):
    """
    Fetches fork events for a repository, including the date each fork was created.
    """
    url = f'https://api.github.com/repos/{org_name}/{repo_name}/events'
    params = {'per_page': 100, 'page': 1}
    fork_data = []

    while True:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            events = response.json()
            if not events:
                break

            # Extract the date each fork was created
            for event in events:
                if event['type'] == 'ForkEvent':
                    fork_data.append(event['created_at'])

            params['page'] += 1
        else:
            print(f"Failed to fetch fork events: {response.status_code}")
            break
    
    # Convert to DataFrame
    forks_df = pd.DataFrame({'date': pd.to_datetime(fork_data)})
    forks_df['num_forks'] = 1  # Each row represents a single fork
    return forks_df

# Example usage
forks_df = fetch_fork_events_with_dates(org_name, repo_name, headers)


Failed to fetch fork events: 422
                       date  num_forks
0 2024-09-21 13:11:21+00:00          1
1 2024-09-21 09:41:10+00:00          1
2 2024-09-21 04:44:09+00:00          1
3 2024-09-21 01:22:59+00:00          1
4 2024-09-20 21:59:24+00:00          1


In [7]:
print(forks_df)

                       date  num_forks
0 2024-09-21 13:11:21+00:00          1
1 2024-09-21 09:41:10+00:00          1
2 2024-09-21 04:44:09+00:00          1
3 2024-09-21 01:22:59+00:00          1
4 2024-09-20 21:59:24+00:00          1
5 2024-09-20 20:44:00+00:00          1
6 2024-09-20 15:36:59+00:00          1
7 2024-09-20 15:02:19+00:00          1
8 2024-09-20 14:45:56+00:00          1
9 2024-09-20 12:18:21+00:00          1


In [9]:
# Combine star and fork data into a single DataFrame
combined_df = pd.concat([stars_df.set_index('date').resample('D').sum(),
                        forks_df.set_index('date').resample('D').sum()], axis=1).fillna(0)

# Accumulate the values over time
combined_df['num_stars_cumulative'] = combined_df['num_stars'].cumsum()
combined_df['num_forks_cumulative'] = combined_df['num_forks'].cumsum()

# Reset index to have date as a column
combined_df.reset_index(inplace=True)
print(combined_df)

                        date  num_stars  num_forks  num_stars_cumulative  \
0  2023-03-16 00:00:00+00:00        2.0        0.0                   2.0   
1  2023-03-17 00:00:00+00:00        8.0        0.0                  10.0   
2  2023-03-18 00:00:00+00:00        2.0        0.0                  12.0   
3  2023-03-19 00:00:00+00:00        4.0        0.0                  16.0   
4  2023-03-20 00:00:00+00:00        2.0        0.0                  18.0   
5  2023-03-21 00:00:00+00:00        2.0        0.0                  20.0   
6  2023-03-22 00:00:00+00:00        3.0        0.0                  23.0   
7  2023-03-23 00:00:00+00:00        0.0        0.0                  23.0   
8  2023-03-24 00:00:00+00:00        0.0        0.0                  23.0   
9  2023-03-25 00:00:00+00:00        2.0        0.0                  25.0   
10 2023-03-26 00:00:00+00:00        1.0        0.0                  26.0   
11 2023-03-27 00:00:00+00:00        3.0        0.0                  29.0   
12 2023-03-2

In [10]:
def convert_to_naive_or_aware(df, column, make_naive=True):
    """
    Converts a datetime column to be either tz-naive or tz-aware.

    Parameters:
    df (pd.DataFrame): The dataframe containing the datetime column.
    column (str): The name of the column to convert.
    make_naive (bool): If True, converts to tz-naive. If False, converts to tz-aware (UTC).

    Returns:
    pd.DataFrame: Dataframe with the updated datetime column.
    """
    if make_naive:
        df[column] = df[column].apply(lambda x: x.tz_localize(None) if pd.notna(x) and x.tzinfo is not None else x)
    else:
        df[column] = df[column].apply(lambda x: x.tz_localize('UTC') if pd.notna(x) and x.tzinfo is None else x)
    return df

In [14]:
import time

def fetch_watch_events_with_dates(org_name, repo_name, headers):
    """
    Fetches watch events (subscribers) for a repository, including the date each watch was created, with a limit on the number of pages fetched.
    """
    url = f'https://api.github.com/repos/{org_name}/{repo_name}/events'
    params = {'per_page': 100, 'page': 1}
    watch_data = []

    while True:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            subscribers = response.json()
            if not subscribers:
                break
            
            # Extract the date each subscription was created (if available)
            for subscriber in subscribers:
                print(subscriber)
                if 'created_at' in subscriber:
                    watch_data.append(subscriber['created_at'])  # Adjust if API provides a different date field
            params['page'] += 1
        
        elif response.status_code == 403:
            # If rate limited, wait and retry
            print(f"Rate limit exceeded, waiting 60 seconds...")
            time.sleep(60)
            continue
        else:
            print(f"Failed to fetch watch events: {response.status_code}")
            break

    # Convert to DataFrame
    watch_df = pd.DataFrame({'date': pd.to_datetime(watch_data)})
    watch_df['num_watches'] = 1  # Each row represents a single watch event
    watch_df = convert_to_naive_or_aware(watch_df, 'date', make_naive=True)
    return watch_df

headers = {'Authorization': f'token {GITHUB_TOKEN}', 'Accept': 'application/vnd.github.v3.star+json'}
org_name = 'Significant-Gravitas'
repo_name = 'AutoGPT'
fetch_watch_events_with_dates(org_name, repo_name, headers)

{'id': '42146952655', 'type': 'IssueCommentEvent', 'actor': {'id': 41898282, 'login': 'github-actions[bot]', 'display_login': 'github-actions', 'gravatar_id': '', 'url': 'https://api.github.com/users/github-actions[bot]', 'avatar_url': 'https://avatars.githubusercontent.com/u/41898282?'}, 'repo': {'id': 614765452, 'name': 'Significant-Gravitas/AutoGPT', 'url': 'https://api.github.com/repos/Significant-Gravitas/AutoGPT'}, 'payload': {'action': 'created', 'issue': {'url': 'https://api.github.com/repos/Significant-Gravitas/AutoGPT/issues/7485', 'repository_url': 'https://api.github.com/repos/Significant-Gravitas/AutoGPT', 'labels_url': 'https://api.github.com/repos/Significant-Gravitas/AutoGPT/issues/7485/labels{/name}', 'comments_url': 'https://api.github.com/repos/Significant-Gravitas/AutoGPT/issues/7485/comments', 'events_url': 'https://api.github.com/repos/Significant-Gravitas/AutoGPT/issues/7485/events', 'html_url': 'https://github.com/Significant-Gravitas/AutoGPT/issues/7485', 'id':

Unnamed: 0,date,num_watches
0,2024-09-22 02:02:48,1
1,2024-09-22 02:02:48,1
2,2024-09-22 02:02:47,1
3,2024-09-22 02:02:47,1
4,2024-09-22 02:02:46,1
...,...,...
240,2024-09-20 12:56:39,1
241,2024-09-20 12:45:50,1
242,2024-09-20 12:45:34,1
243,2024-09-20 12:44:15,1


# Let's cheat

In [15]:
import pandas as pd
import numpy as np

# Sample Data: Replace these with your actual data
release_dates = [
    '2023-01-01', '2023-02-01', '2023-03-01', 
    '2023-04-01', '2023-05-01', '2023-06-01'
]
total_stars = 1000  # Total stars count for the repository
total_forks = 300   # Total forks count for the repository
total_watches = 500  # Total watches count for the repository

# Convert release dates to a DataFrame
df = pd.DataFrame({
    'release_date': pd.to_datetime(release_dates)
})

# Number of release dates
n_releases = len(df)

# Calculate incremental steps for each feature
stars_step = total_stars / (n_releases - 1)
forks_step = total_forks / (n_releases - 1)
watches_step = total_watches / (n_releases - 1)

# Generate values for each release date
df['num_stars'] = [round(stars_step * i) for i in range(n_releases)]
df['num_forks'] = [round(forks_step * i) for i in range(n_releases)]
df['num_watches'] = [round(watches_step * i) for i in range(n_releases)]

# Ensure the last values match the total counts exactly
df.at[n_releases - 1, 'num_stars'] = total_stars
df.at[n_releases - 1, 'num_forks'] = total_forks
df.at[n_releases - 1, 'num_watches'] = total_watches

# Display the resulting DataFrame
print(df)

  release_date  num_stars  num_forks  num_watches
0   2023-01-01          0          0            0
1   2023-02-01        200         60          100
2   2023-03-01        400        120          200
3   2023-04-01        600        180          300
4   2023-05-01        800        240          400
5   2023-06-01       1000        300          500
