**Author:** Lisa Wallner  
**Description:** In this notebook, the proof-of-concept GitHub repositories are downloaded as ZIP files and added to the other repository data for further processing.  

In [None]:
# Hint: If a line was created with the support of a Large Language Model or the code was taken from another source, you will find the following hint at the end of the line:
#       (generated with Microsoft Copilot) or (source: link_to_source)

In [None]:
import requests # package for api requests
import json # package to work with .json
from datetime import datetime # package for timestamps
import time # package to set time ranges
import random # package to work with random numbers
from dotenv import load_dotenv
import os # package for using operating system

In [7]:
# read the variables of the .env file (override=True is handed to load_dotenv)
load_dotenv(override=True)

# credentials and shared parameters used by every GitHub api request
ACCESS_TOKEN = os.environ['GIT_TOKEN'] # token comes from .env; KeyError if GIT_TOKEN is not set
PAYLOAD = dict() # empty request body
HEADERS = dict([
    ('Accept', 'application/vnd.github+json'),
    ('X-GitHub-Api-Version', '2022-11-28'),
    ('Authorization', 'Bearer ' + ACCESS_TOKEN),
])


In [8]:
def set_sleeper(number=None):
    '''
    Function which pauses execution of script for a specified or random amount of time.

    Args:
        number (optional): The number of seconds to pause. If None (default),
            the pause is a random whole number of seconds between 3 and 8 (both inclusive).

    Return:
        None
    '''
    if number is not None: # a duration was given explicitly (0 also counts as given)
        time.sleep(number) # script pause for given number
        return # stop here, otherwise the random pause below would be added on top
    random_number = random.randint(3, 8) # no duration given --> draw random number of seconds
    time.sleep(random_number) # script pause for random number

In [10]:
def download_github_repo(repo_owner, repo_name, refs):
    '''
    Function which downloads a GitHub repository as ZIP file for a given branch.
    https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#download-a-repository-archive-zip
    The GitHub api has limit of 1000 files and 100 MB/file for a repository.
    https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28
    Hint: This function does not check these limits itself, it only reacts to the status code of the response.

    On status code 200 the archive is written in chunks to
    ../data/repo_data_zip/{repo_owner}_{repo_name}_{timestamp}.zip (the folder is created if it is missing).
    On every other status code the status code is printed and, if the response reports
    a remaining rate limit <= 1, set_sleeper(61) is called before the function returns.

    Args:
        repo_owner: Owner of the GitHub repository.
        repo_name: Name of the GitHub repository.
        refs: Name of branch.

    Return:
        None

    Raises:
        requests.exceptions.RequestException: If the request fails on network level
            (e.g. connection error or time out).
    '''
    url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/zipball/{refs}' # url to download repo a zip file for specific branch (refs)
    # send GET request (generated with Microsoft Copilot)
    # the with statement closes the streamed response and releases the connection;
    # timeout=(connect, read) in seconds keeps a stalled connection from blocking forever
    with requests.get(url, stream=True, headers=HEADERS, data=PAYLOAD, timeout=(10, 120)) as response:
        if response.status_code == 200: # check if request was successful --> status_code == 200
            timestamp = datetime.now(tz=None).strftime('%Y-%m-%d_%H-%M-%S') # set timestamp
            target_dir = "../data/repo_data_zip"
            os.makedirs(target_dir, exist_ok=True) # make sure the output folder exists
            with open(f"{target_dir}/{repo_owner}_{repo_name}_{timestamp}.zip", "wb") as file: # create zip file for repo
                for chunk in response.iter_content(chunk_size=8192): # iterate over each chunk (chunk_size=8192) in response data to prevent memory constraints
                    if chunk: # skip empty chunks
                        file.write(chunk) # write chunk to zip file
            print(f"Repository '{repo_name}' has been downloaded as a ZIP file.")
            return
        # request was not successful --> report it and check rate limit to prevent time outs and banning from api
        print(f'Error with response. Check out status_code {response.status_code}!')
        rate_limit_remaining = response.headers.get('X-RateLimit-Remaining') # None if header is not part of the response
    # pause only after the connection has been released; a missing or non numeric header is ignored
    if rate_limit_remaining is not None and rate_limit_remaining.isdigit() and int(rate_limit_remaining) <= 1:
        set_sleeper(61)

In [None]:
# proof of concept repositories whose ZIP files are needed;
# every sublist holds [owner, repository name, branch]
repos = [
    ['langchain-ai', 'local-deep-researcher', 'main'],
    ['Taniiishk', 'Rock-Paper-Scissors-Game', 'main'],
]

In [18]:
# download the ZIP file of every proof of concept repository
for repo in repos:
    # each sublist is ordered as [owner, name, branch], matching the parameter order of the func
    download_github_repo(*repo)

Repository 'local-deep-researcher' has been downloaded as a ZIP file.


Load metadata of the two repositories.