In [1]:
import pandas as pd
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests 
from bs4 import BeautifulSoup
from env_miatta import github_token, github_username
import time


# ACQUIRE

In [2]:
# Repositories to scrape, each written as "owner/name". The helper functions in
# this notebook interpolate these strings into
# https://api.github.com/repos/{repo} URLs.
REPOS = ['TheAlgorithms/Python',
 'apache/flink',
 'forezp/SpringCloudLearning',
 'learn-co-students/python-dictionaries-readme-data-science-intro-000',
 'angular/angular-phonecat',
 'bloominstituteoftechnology/github-usercard',
 'learn-co-students/javascript-arrays-lab-bootcamp-prep-000',
 'tastejs/todomvc',
 'learn-co-students/jupyter-notebook-introduction-data-science-intro-000',
 'hasura-imad/imad-2016-app',
 'josephmisiti/awesome-machine-learning',
 'RasaHQ/rasa',
 'learn-co-students/javascript-intro-to-functions-lab-bootcamp-prep-000',
 'CorentinJ/Real-Time-Voice-Cloning',
 'vivienzou1/DL-Notes-for-interview',
 'MakeSchool/FlappyBirdTemplate-Spritebuilder',
 'jtleek/datasharing',
 'jquery/jquery',
 'freefq/free',
 'moby/moby',
 'learn-co-students/js-deli-counter-bootcamp-prep-000',
 'bloominstituteoftechnology/React-Todo',
 '996icu/996.ICU',
 'modood/Administrative-divisions-of-China',
 'codefresh-contrib/gitops-certification-examples',
 'TheOdinProject/javascript-exercises',
 'ColorlibHQ/gentelella',
 'learn-co-students/python-variables-lab-data-science-intro-000',
 'vaxilu/x-ui',
 'linuxacademy/cicd-pipeline-train-schedule-dockerdeploy',
 'RedHatTRaining/DO288-apps',
 'luchihoratiu/debug-via-ssh',
 'deadlyvipers/dojo_rules',
 'jenkinsci/jenkins',
 'mqyqingfeng/Blog',
 'spring-projects/spring-framework',
 'apache/kafka',
 'learn-co-curriculum/react-hooks-lists-and-keys-lab',
 '233boy/v2ray',
 'typicode/json-server',
 'learn-co-students/js-beatles-loops-lab-bootcamp-prep-000',
 'Azure/azure-quickstart-templates',
 'learn-co-students/js-from-dom-to-node-bootcamp-prep-000',
 'trekhleb/homemade-machine-learning',
 'AtsushiSakai/PythonRobotics',
 'xitu/gold-miner',
 'xingshaocheng/architect-awesome',
 'celery/celery',
 'ibm-developer-skills-network/xzceb-flask_eng_fr',
 'lazyprogrammer/machine_learning_examples',
 'ripienaar/free-for-dev',
 'jeecgboot/jeecg-boot',
 'bloominstituteoftechnology/Preprocessing-II',
 'Turonk/infra_actions',
 'reduxjs/redux',
 'rapid7/metasploit-framework',
 'bloominstituteoftechnology/node-db1-project',
 'nightscout/cgm-remote-monitor',
 'alx-tools/your_first_code',
 'dcxy/learngit',
 'brentley/ecsdemo-nodejs',
 'yankouskia/additional_5',
 'altercation/solarized',
 'supabase/supabase',
 'Ebazhanov/linkedin-skill-assessments-quizzes',
 'travis-ci/docs-travis-ci-com',
 'learn-co-students/python-lists-lab-data-science-intro-000',
 'stacksimplify/azure-aks-kubernetes-masterclass',
 'mitmproxy/mitmproxy',
 'jumpserver/jumpserver',
 'scutan90/DeepLearning-500-questions',
 'ultralytics/yolov5',
 'forem/forem',
 'bloominstituteoftechnology/module-challenge-intro-to-git',
 'jackfrued/Python-100-Days',
 'docsifyjs/docsify',
 'heartcombo/devise',
 'linuxacademy/cicd-pipeline-train-schedule-gradle',
 'othneildrew/Best-README-Template',
 'developerforce/intro-to-heroku',
 'kodekloudhub/certified-kubernetes-administrator-course',
 'ceph/ceph',
 'bilibili/ijkplayer',
 'taizilongxu/interview_python',
 'ant-design/ant-design-pro',
 'progedu/adding-up',
 'micropython/micropython',
 'XX-net/XX-Net',
 'cyclic-software/starter-express-api',
 'bloominstituteoftechnology/team-builder',
 'bloominstituteoftechnology/User-Interface',
 'JetBrains/kotlin',
 'learn-co-students/javascript-logging-lab-bootcamp-prep-000',
 'barryclark/jekyll-now',
 'raulmur/ORB_SLAM2',
 'amjuarez/bytecoin',
 'trustwallet/assets',
 'Binaryify/NeteaseCloudMusicApi',
 'vuejs/v2.vuejs.org',
 'keycloak/keycloak',
 'thingsboard/thingsboard',
 'learn-co-curriculum/phase-0-html-issue-bot-9000-lab',
 'qmk/qmk_firmware',
 'learn-co-curriculum/phase-0-the-dom-editing-lab',
 'PowerShellMafia/PowerSploit',
 'devopshydclub/vprofile-project',
 'bloominstituteoftechnology/DOM-II',
 'zhisheng17/flink-learning',
 'protocolbuffers/protobuf',
 'GitbookIo/gitbook',
 'selectize/selectize.js',
 'kubernetes/kubernetes',
 'facebookresearch/fastText',
 'espressif/esp-idf',
 'lin-xin/vue-manage-system',
 'Significant-Gravitas/Auto-GPT',
 'torvalds/linux',
 'namndwebdev/tang-crush']


# Address of the GitHub website. The functions visible in this notebook build
# their own local `url` values and do not read this module-level one.
url = 'https://github.com/'

# Headers sent with every GitHub API request: the token authenticates the
# call and the username is used as the User-Agent.
headers = {
    "Authorization": f"token {github_token}",
    "User-Agent": github_username,
}

# Stop immediately when either credential imported from env_miatta is blank.
if any((headers["Authorization"] == "token ", headers["User-Agent"] == "")):
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )

In [3]:
def github_api_request(url: str, timeout: Optional[float] = 30) -> Union[List, Dict]:
    """
    Sends an authenticated GET request to the given github api url and returns
    the decoded JSON body (a list or a dictionary).

    `timeout` is the number of seconds handed to `requests.get`; pass None to
    wait indefinitely.

    Raises an Exception naming the status code and the response body whenever
    the status code is not 200.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. an HTML gateway
        # page), so fall back to the raw text rather than letting a decode
        # error hide the status code.
        try:
            details = json.dumps(response.json())
        except ValueError:
            details = response.text
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {details}"
        )
    return response.json()

def get_repo_language(repo: str) -> Optional[str]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns the value
    of the "language" field in the github api's description of that repo.

    Returns None when the response has no "language" value. Raises an
    Exception when the api response is not a dictionary.
    """
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if not isinstance(repo_info, dict):
        raise Exception(
            f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
        )
    # dict.get already yields None for a missing key, hence Optional[str].
    return repo_info.get("language")


def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script", requests that repo's
    /contents/ endpoint from the github api and returns the resulting list.

    Raises an Exception when the api response is not a list.
    """
    url = f"https://api.github.com/repos/{repo}/contents/"
    listing = github_api_request(url)
    # Guard clause: reject anything that is not exactly a list.
    if type(listing) is not list:
        raise Exception(
            f"Expecting a list response from {url}, instead got {json.dumps(listing)}"
        )
    return cast(List, listing)


def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.

    Entries are matched on a case-insensitive "readme" name prefix. A matching
    entry without a usable "download_url" is skipped so the search continues.
    Returns "" when no downloadable README is found.
    """
    for file in files:
        if not file["name"].lower().startswith("readme"):
            continue
        # The contents api reports a null download_url for directories, so a
        # folder named e.g. "readme-assets" must not end the search.
        download_url = file.get("download_url")
        if download_url:
            return download_url
    return ""


def process_repo(repo: str) -> Dict[str, Optional[str]]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns a
    dictionary with the language of the repo and the readme contents.

    The result has the keys "repo", "language" and "readme_contents".
    "readme_contents" is "" when no README download url could be found.
    """
    contents = get_repo_contents(repo)
    readme_url = get_readme_download_url(contents)
    if readme_url:
        # Bound the wait so one unresponsive download cannot stall the scrape.
        readme_contents = requests.get(readme_url, timeout=30).text
    else:
        # No README: requests.get("") would raise and abort the whole run,
        # so record an empty document instead.
        readme_contents = ""
    return {
        "repo": repo,
        "language": get_repo_language(repo),
        "readme_contents": readme_contents,
    }


# def scrape_github_data() -> List[Dict[str, str]]:
#     """
#     Loop through all of the repos and process them. Returns the processed data.
#     """
#     return [process_repo(repo) for repo in REPOS]


# if __name__ == "__main__":
#     data = scrape_github_data()
#     json.dump(data, open("data2.json", "w"), indent=1)
    
    



REQUEST_DELAY = 60  # Seconds to pause after each repository is processed
BATCH_SIZE = 4      # Number of repositories to process in each batch


def scrape_github_data(
    repo_list: Optional[List[str]] = None,
    request_delay: Optional[float] = None,
) -> List[Dict[str, str]]:
    """
    Loop through a list of repos and process them. Returns the processed data.

    `repo_list` defaults to every repo in REPOS, so the function can also be
    called with no arguments. `request_delay` is the number of seconds to
    sleep after each repo and defaults to REQUEST_DELAY.

    The pause also follows the last repo on purpose: callers that scrape in
    consecutive batches rely on it to space out the batches.
    """
    if repo_list is None:
        repo_list = REPOS
    if request_delay is None:
        request_delay = REQUEST_DELAY

    processed_data = []
    for repo in repo_list:
        processed_data.append(process_repo(repo))
        if request_delay > 0:
            time.sleep(request_delay)
    return processed_data

if __name__ == "__main__":
    # Scrape REPOS in batches of BATCH_SIZE, writing one JSON file per batch
    # plus a combined data2.json that accumulates every batch so far.
    total_repos = len(REPOS)
    all_data = []

    for start in range(0, total_repos, BATCH_SIZE):
        end = min(start + BATCH_SIZE, total_repos)
        repo_batch = REPOS[start:end]
        data_batch = scrape_github_data(repo_batch)
        all_data.extend(data_batch)

        # Per-batch checkpoint, written through the handle opened here.
        with open(f"data_batch{start}_{end}.json", "w") as f:
            json.dump(data_batch, f, indent=1)

        # Rewrite the combined file after every batch so an interrupted run
        # still leaves data2.json holding everything scraped so far, not
        # just the most recent batch.
        with open("data2.json", "w") as f:
            json.dump(all_data, f, indent=1)


KeyboardInterrupt: 

In [None]:
# Scrape every repo in REPOS and hold the records in a DataFrame.
# scrape_github_data pauses after each repo, so this cell is slow to run.
words_df = pd.DataFrame(scrape_github_data(REPOS))


In [None]:
# Acquire the data: load the scrape results stored in data2.json into a
# DataFrame and preview its first rows.
words_df = pd.read_json('data2.json')
words_df.head()

