In [1]:
import os
from datetime import datetime, timedelta, timezone

from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

# GH Archive day tables are keyed by UTC date, so compute "today" and
# "yesterday" in UTC. A naive local date can run ahead of UTC and name a
# table that has not been created yet.
today = datetime.now(timezone.utc)
yesterday = today - timedelta(days=1)

# Format both dates as the YYYYMMDD suffix used by the day tables.
today_str = today.strftime('%Y%m%d')
yesterday_str = yesterday.strftime('%Y%m%d')

# Build the query: collect WatchEvent rows from both day tables, then count
# them per repository and keep the 1000 repositories with the most events.
query = f"""
WITH watch_data AS (
  -- 查询最近一天的 WatchEvent 事件
  SELECT 
    repo.name AS repo_name
  FROM 
    `githubarchive.day.{today_str}`
  WHERE 
    type = 'WatchEvent'
  
  UNION ALL
  
  -- 查询前一天的 WatchEvent 事件
  SELECT 
    repo.name AS repo_name
  FROM 
    `githubarchive.day.{yesterday_str}`
  WHERE 
    type = 'WatchEvent'
)

SELECT 
  repo_name,
  COUNT(*) AS star_count
FROM 
  watch_data
GROUP BY 
  repo_name
ORDER BY 
  star_count DESC
LIMIT 1000
"""

# Submit the query; `rows` is the job handle returned by the client.
rows = client.query(query)
# Block until the job finishes and load the result set into a DataFrame.
results = rows.result().to_dataframe()






In [None]:
import requests
import pandas as pd
from tqdm import tqdm


# GitHub GraphQL API URL
GRAPHQL_URL = "https://api.github.com/graphql"

# GitHub token. Read it from the GITHUB_TOKEN environment variable so the
# secret never has to be written into the notebook; the original placeholder
# is kept as the fallback so existing usage keeps working.
TOKEN = os.environ.get("GITHUB_TOKEN", "your_token")

# Request headers sent with every GraphQL call.
HEADERS = {
    "Authorization": f"bearer {TOKEN}",
    "Content-Type": "application/json"
}

# GraphQL query template. Doubled braces are literal braces for str.format;
# {repo_owner} and {repo_name} are the placeholders filled in per request.
GRAPHQL_QUERY_TEMPLATE = """
query {{
  repository(owner: "{repo_owner}", name: "{repo_name}") {{
    createdAt
    stargazerCount
  }}
}}
"""

# Request helper: fills the query template via string formatting.
def fetch_repo_details(repo_name, timeout=10):
    """
    Fetch a repository's creation date and total star count via the GitHub GraphQL API.

    Args:
        repo_name: Full repository name in "owner/name" form.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(createdAt, stargazerCount)`` tuple taken from the API response,
        or ``(None, None)`` when the name has no "/", the request fails, the
        status is not 200, or the response carries no repository object.
    """
    # A name without a slash cannot be split into owner and repository.
    if "/" not in repo_name:
        return None, None

    repo_owner, repo_name_only = repo_name.split("/", 1)

    # Fill the placeholders of the query template.
    query = GRAPHQL_QUERY_TEMPLATE.format(repo_owner=repo_owner, repo_name=repo_name_only)

    try:
        response = requests.post(
            GRAPHQL_URL,
            json={"query": query},
            headers=HEADERS,
            # Without a timeout a stalled connection would block forever.
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"Failed to fetch data for {repo_name}: {response.status_code}, {response.text}")
            return None, None
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Request failed for {repo_name}: {str(e)}")
        return None, None

    # A 200 response may still have "data" missing or null (errors only);
    # guard against that instead of raising TypeError on a membership test.
    payload = data.get("data") if isinstance(data, dict) else None
    if not isinstance(payload, dict) or "repository" not in payload:
        print(f"Missing 'data' or 'repository' for {repo_name}")
        return None, None

    repo_data = payload["repository"]
    if repo_data is None:
        print(f"Repository data is None for {repo_name}")
        return None, None

    return repo_data.get("createdAt"), repo_data.get("stargazerCount")

# Start from the BigQuery result. Work on a copy so the raw frame in
# `results` is not mutated and this cell can be re-run safely.
df = results.copy()

# Add the two columns to be filled in: creation date and total star count.
df["created_at"] = None
df["current_star_count"] = None

# Look up every repository one at a time; only the repo_name column is
# needed, so iterate over it directly instead of building full rows.
for index, repo_name in tqdm(df["repo_name"].items(), total=df.shape[0], desc="Fetching repo details"):
    created_at, current_star_count = fetch_repo_details(repo_name)
    df.at[index, "created_at"] = created_at
    df.at[index, "current_star_count"] = current_star_count



Fetching repo details:  12%|█▏        | 119/1000 [00:41<04:45,  3.09it/s]

Repository data is None for thisiscindychou/roblox-solara-executors


Fetching repo details:  12%|█▏        | 122/1000 [00:42<04:39,  3.15it/s]

Repository data is None for timigsh2mos/fortnite-hack-external


Fetching repo details:  32%|███▏      | 323/1000 [01:50<03:40,  3.07it/s]

Repository data is None for Dilodova/Roblox-Executor-Xeno-v1.0.9


Fetching repo details:  43%|████▎     | 426/1000 [02:26<03:09,  3.02it/s]

Repository data is None for spear-blackseeker/Solara-Executor-Roblox


Fetching repo details:  47%|████▋     | 469/1000 [02:40<02:59,  2.96it/s]

Repository data is None for kotskirk852/verse-spoofer


Fetching repo details: 100%|██████████| 1000/1000 [05:43<00:00,  2.91it/s]


In [26]:
# Show the enriched DataFrame as this cell's output.
df

Unnamed: 0,repo_name,star_count,created_at,current_star_count
0,Tencent/HunyuanVideo,1972,2024-11-28T08:38:31Z,2174
1,LadybirdBrowser/ladybird,1457,2024-05-30T09:18:10Z,24702
2,huggingface/smol-course,1263,2024-11-25T19:22:43Z,1474
3,myhhub/stock,973,2023-03-21T01:23:26Z,5233
4,lobehub/lobe-chat,777,2023-05-21T07:19:12Z,46944
...,...,...,...,...
995,xoofx/ultra,25,2024-11-18T09:56:16Z,764
996,zenml-io/zenml,25,2020-11-19T09:25:46Z,4137
997,ollama/ollama-python,25,2023-12-09T09:27:18Z,5123
998,virattt/ai-hedge-fund,25,2024-11-29T16:30:01Z,363


In [None]:
import requests
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed


# GitHub GraphQL API URL
GRAPHQL_URL = "https://api.github.com/graphql"

# GitHub token. Read it from the GITHUB_TOKEN environment variable so the
# secret never has to be written into the notebook; the original placeholder
# is kept as the fallback so existing usage keeps working.
TOKEN = os.environ.get("GITHUB_TOKEN", "your_github_token")

# Request headers sent with every GraphQL call.
HEADERS = {
    "Authorization": f"bearer {TOKEN}",
    "Content-Type": "application/json"
}

# GraphQL query template. Doubled braces are literal braces for str.format;
# {repo_owner} and {repo_name} are the placeholders filled in per request.
GRAPHQL_QUERY_TEMPLATE = """
query {{
  repository(owner: "{repo_owner}", name: "{repo_name}") {{
    createdAt
    stargazerCount
  }}
}}
"""

# Request helper: fetches repository information.
def fetch_repo_details(repo_name, timeout=10):
    """
    Fetch a repository's creation date and total star count via the GitHub GraphQL API.

    Args:
        repo_name: Full repository name in "owner/name" form.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(createdAt, stargazerCount)`` tuple taken from the API response,
        or ``(None, None)`` when the name has no "/", the request fails, the
        status is not 200, or the response carries no repository object.
    """
    # A name without a slash cannot be split into owner and repository.
    if "/" not in repo_name:
        return None, None

    repo_owner, repo_name_only = repo_name.split("/", 1)

    # Fill the placeholders of the query template.
    query = GRAPHQL_QUERY_TEMPLATE.format(repo_owner=repo_owner, repo_name=repo_name_only)

    try:
        response = requests.post(
            GRAPHQL_URL,
            json={"query": query},
            headers=HEADERS,
            # Without a timeout a stalled connection would block its worker forever.
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"Failed to fetch data for {repo_name}: {response.status_code}, {response.text}")
            return None, None
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Request failed for {repo_name}: {str(e)}")
        return None, None

    # A 200 response may still have "data" missing or null (errors only);
    # guard against that instead of raising TypeError on a membership test.
    payload = data.get("data") if isinstance(data, dict) else None
    if not isinstance(payload, dict) or "repository" not in payload:
        print(f"Missing 'data' or 'repository' for {repo_name}")
        return None, None

    repo_data = payload["repository"]
    if repo_data is None:
        print(f"Repository data is None for {repo_name}")
        return None, None

    return repo_data.get("createdAt"), repo_data.get("stargazerCount")

# Run the per-repository lookups concurrently.
def fetch_repo_details_parallel(df, max_workers=10):
    """
    Fill ``created_at`` and ``current_star_count`` for every row of ``df``.

    One ``fetch_repo_details`` call is submitted per value of the
    ``repo_name`` column to a thread pool, and each result is written back
    to its row as soon as it completes.

    Args:
        df: DataFrame with a ``repo_name`` column. It is modified in place.
        max_workers: Number of worker threads used for the lookups.

    Returns:
        The same ``df`` object, with both columns filled in. Rows whose
        lookup failed hold ``None`` in both columns.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to the index label of the row it belongs to.
        futures = {
            executor.submit(fetch_repo_details, repo_name): index
            for index, repo_name in df["repo_name"].items()
        }

        # Results are consumed in completion order, with a progress bar.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching repo details"):
            index = futures[future]
            try:
                created_at, stargazer_count = future.result()
            except Exception as exc:
                # One unexpected failure must not abort the remaining lookups.
                print(f"Unexpected error for {df.at[index, 'repo_name']}: {exc}")
                created_at, stargazer_count = None, None
            df.at[index, "created_at"] = created_at
            df.at[index, "current_star_count"] = stargazer_count

    return df

# Start from the BigQuery result. Work on a copy so the raw frame in
# `results` is not mutated and this cell can be re-run safely.
df = results.copy()

# Add the two columns to be filled in: creation date and total star count.
df["created_at"] = None
df["current_star_count"] = None

# Fetch the repository details concurrently.
df = fetch_repo_details_parallel(df)


Fetching repo details:   0%|          | 0/1000 [00:00<?, ?it/s]

Fetching repo details:  13%|█▎        | 131/1000 [00:04<00:29, 29.93it/s]

Repository data is None for thisiscindychou/roblox-solara-executors
Repository data is None for timigsh2mos/fortnite-hack-external


Fetching repo details:  35%|███▌      | 354/1000 [00:12<00:21, 29.42it/s]

Repository data is None for Dilodova/Roblox-Executor-Xeno-v1.0.9


Fetching repo details:  46%|████▋     | 465/1000 [00:16<00:16, 31.85it/s]

Repository data is None for spear-blackseeker/Solara-Executor-Roblox


Fetching repo details:  51%|█████     | 506/1000 [00:17<00:16, 30.05it/s]

Repository data is None for kotskirk852/verse-spoofer
Repository data is None for beastamya8/Roblox-Synapse-X


Fetching repo details: 100%|██████████| 1000/1000 [00:34<00:00, 28.63it/s]


In [3]:
# Show the enriched DataFrame as this cell's output.
df

Unnamed: 0,repo_name,star_count,created_at,current_star_count
0,Tencent/HunyuanVideo,2037,2024-11-28T08:38:31Z,2249
1,LadybirdBrowser/ladybird,1622,2024-05-30T09:18:10Z,24852
2,huggingface/smol-course,1369,2024-11-25T19:22:43Z,1578
3,myhhub/stock,1015,2023-03-21T01:23:26Z,5254
4,lobehub/lobe-chat,868,2023-05-21T07:19:12Z,47045
...,...,...,...,...
995,StacklokLabs/promptwright,27,2024-10-25T10:59:05Z,288
996,Fansirsqi/Sesame-TK,27,2024-10-17T09:21:08Z,214
997,vuejs/vue,27,2013-07-29T03:24:51Z,208086
998,PanJiaChen/vue-element-admin,27,2017-04-17T03:35:49Z,88163


In [4]:
# Convert the created_at column to pandas datetimes.
df["created_at"] = df["created_at"].pipe(pd.to_datetime)

# Order rows by created_at in descending order (ascending=False).
df_sorted = df.sort_values("created_at", ascending=False)


In [5]:
# Write the sorted table to result.csv.
df_sorted.to_csv(path_or_buf='result.csv')