Scraping GitHub Repos from Multiple Pages

In [1]:
import requests
import pandas as pd
import time

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Module-level accumulators filled by get_projects(): entry k of each list
# describes the same repository. Re-run this cell before scraping again,
# otherwise new results are appended after the old ones.
project_name = list()
last_update = list()

In [3]:
def get_projects(stars_range, max_pages=100, max_retries=5, retry_delay=60, timeout=30):
    """Scrape GitHub repository-search result pages for one star range.

    Requests pages 1..``max_pages`` of ``https://github.com/search`` with the
    query ``stars:<stars_range>``. For every result, the repository's
    ``hl_name`` is appended to the module-level ``project_name`` list and its
    ``updated_at`` value to the module-level ``last_update`` list. Paging
    stops early at the first page that returns no results.

    A page that fails (HTTP error, network error, undecodable JSON, or a JSON
    body without ``payload.results``) is retried after ``retry_delay``
    seconds. After ``max_retries`` consecutive retries of the same page the
    function reports the problem and returns instead of looping forever.

    Args:
        stars_range: Value placed after ``stars:`` in the search query,
            e.g. ``"7500..7999"``.
        max_pages: Highest page number to request.
        max_retries: Number of retries allowed for a single page before
            giving up on this star range.
        retry_delay: Seconds to sleep before retrying a failed page.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        None. Results are accumulated in ``project_name`` and ``last_update``.
    """
    query = 'stars:' + str(stars_range)

    # The headers do not depend on the page number, so build them once.
    headers = {
        'authority': 'github.com',
        'accept': 'application/json',
        'accept-language': 'en-US,en;q=0.9',
        'referer': 'https://github.com/search?q=stars%3A'+str(stars_range)+'&type=repositories&p=1',
        'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'x-github-target': 'dotcom',
        'x-requested-with': 'XMLHttpRequest',
    }

    page = 1
    failures = 0  # consecutive failed attempts for the current page
    while page <= max_pages:
        params = {
            'q': query,
            'type': 'repositories',
            'p': str(page),
        }

        try:
            response = requests.get(
                'https://github.com/search',
                params=params,
                headers=headers,
                timeout=timeout,
            )
            response.raise_for_status()
            print(f"Page {page} - Status Code: {response.status_code}")

            results_json = response.json()
            project_list = results_json['payload']['results']
        except requests.exceptions.RequestException as e:
            problem = f"Request failed on page {page}: {e}"
        except ValueError as e:
            # JSON decoding errors are ValueError subclasses, so this works
            # without importing the json module (which this notebook doesn't).
            problem = f"JSONDecodeError on page {page}: {e}"
        except (KeyError, TypeError) as e:
            problem = f"Unexpected response format on page {page}: {e!r}"
        else:
            problem = None

        if problem is not None:
            print(problem)
            failures += 1
            if failures > max_retries:
                print(f"Giving up on stars:{stars_range} after {failures} failed attempts on page {page}")
                return
            time.sleep(retry_delay)
            continue  # retry the same page

        failures = 0
        if not project_list:
            break

        for project in project_list:
            # Read both fields before appending so the two lists can never
            # end up with different lengths if a key is missing.
            name = project['hl_name']
            updated_at = project['repo']['repository']['updated_at']
            project_name.append(name)
            last_update.append(updated_at)

        page += 1


In [4]:
def get_more_projects(low, high, step=500):
    """Collect projects whose star count lies in the half-open range [low, high).

    The range is walked from the top down in windows of at most ``step``
    stars. Each window is handed to get_projects() as a ``"<lo>..<hi>"``
    string whose upper end is one below the window's exclusive bound, so
    ``low`` is included and ``high`` is not.

    Args:
        low: Lowest star count to include.
        high: Exclusive upper bound on the star count.
        step: Maximum width of each window passed to get_projects().

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")

    # `high > low` (rather than `high != low`) guarantees termination when the
    # span is not a multiple of `step` or when high < low; the final window is
    # clamped so it never reaches below `low`.
    while high > low:
        window_low = max(low, high - step)
        get_projects(str(window_low) + '..' + str(high - 1))
        high = window_low

In [5]:
# Scrape repositories with at least 5000 and fewer than 8000 stars.
get_more_projects(low=5000, high=8000)

Page 1 - Status Code: 200
Page 2 - Status Code: 200
Page 3 - Status Code: 200
Page 4 - Status Code: 200
Page 5 - Status Code: 200
Page 6 - Status Code: 200
Page 7 - Status Code: 200
Page 8 - Status Code: 200
Page 9 - Status Code: 200
Request failed on page 10: 429 Client Error: Too Many Requests for url: https://github.com/search?q=stars%3A7500..7999&type=repositories&p=10
Page 10 - Status Code: 200
Page 11 - Status Code: 200
Page 12 - Status Code: 200
Page 13 - Status Code: 200
Page 14 - Status Code: 200
Page 15 - Status Code: 200
Page 16 - Status Code: 200
Page 17 - Status Code: 200
Page 18 - Status Code: 200
Request failed on page 19: 429 Client Error: Too Many Requests for url: https://github.com/search?q=stars%3A7500..7999&type=repositories&p=19
Page 19 - Status Code: 200
Page 20 - Status Code: 200
Page 21 - Status Code: 200
Page 22 - Status Code: 200
Page 23 - Status Code: 200
Page 24 - Status Code: 200
Page 25 - Status Code: 200
Page 26 - Status Code: 200
Page 27 - Status Code: 

In [6]:
# Combine the two parallel result lists into a two-column table.
projects_df = pd.DataFrame(
    {
        'Project': project_name,
        'Last Update': last_update,
    }
)

In [7]:
# Bare expression on the cell's last line: Jupyter renders the DataFrame as the cell output.
projects_df

Unnamed: 0,Project,Last Update
0,teddysun/shadowsocks_install,2023-09-23T08:43:20.806Z
1,imgproxy/imgproxy,2024-02-17T16:27:00.877Z
2,rectorphp/rector,2024-02-18T15:45:54.488Z
3,LeetCode-OpenSource/vscode-leetcode,2024-02-18T13:35:15.055Z
4,blue-yonder/tsfresh,2024-01-28T09:45:09.643Z
...,...,...
3666,therealsreehari/Learn-Data-Science-For-Free,2023-01-22T09:43:52.169Z
3667,nuxt/awesome,2024-01-20T21:15:04.203Z
3668,Codeusa/Borderless-Gaming,2024-01-23T23:42:43.827Z
3669,UnblockNeteaseMusic/server,2024-02-17T04:39:49.819Z


In [8]:
# Append the scraped rows below whatever "Sheet1" of the workbook already
# holds. If the workbook does not exist yet, opening it in append mode raises
# FileNotFoundError and the file is created from scratch, header row included.
try:
    with pd.ExcelWriter(
        "projects_9500AndUp.xlsx",
        engine="openpyxl",
        mode="a",
        if_sheet_exists="overlay",
    ) as writer:
        # max_row of the existing sheet is the zero-based index of the first
        # row after the current data, so new rows start right after it; the
        # header is omitted because the sheet is expected to have one already.
        projects_df.to_excel(
            writer,
            sheet_name="Sheet1",
            startrow=writer.sheets["Sheet1"].max_row,
            header=False,
            index=False,
        )
except FileNotFoundError:
    projects_df.to_excel("projects_9500AndUp.xlsx", index=False)