In [None]:
import requests
import json
from getpass import getpass
import base64
import pandas as pd
from datetime import datetime


In [2]:
# set credentials
ACCESS_TOKEN = getpass()

In [3]:
payload = {}
headers = {
  'Accept': 'application/vnd.github+json',
  'X-GitHub-Api-Version': '2022-11-28',
  'Authorization': f'Bearer {ACCESS_TOKEN}'
}

### list with multiple repos

In [4]:
url_multiple_repos = 'https://api.github.com/search/repositories?q=language:python&page=3'

response = requests.request('GET', url=url_multiple_repos, headers=headers, data=payload)

In [5]:
data = response.json()

In [None]:
with open('../data/multiple_github_repos.json', 'w') as file:
    json.dump(data, file)

### load data from file for preprocessing

In [55]:
with open('../data/multiple_github_repos.json', 'r') as file:
    loaded_data = json.load(file)

In [56]:
data = loaded_data['items']

In [None]:
data

In [57]:
# get keys of loaded_data as list
keys = list(loaded_data['items'][0].keys())

In [67]:
print(keys)

['id', 'node_id', 'name', 'full_name', 'private', 'owner', 'html_url', 'description', 'fork', 'url', 'forks_url', 'keys_url', 'collaborators_url', 'teams_url', 'hooks_url', 'issue_events_url', 'events_url', 'assignees_url', 'branches_url', 'tags_url', 'blobs_url', 'git_tags_url', 'git_refs_url', 'trees_url', 'statuses_url', 'languages_url', 'stargazers_url', 'contributors_url', 'subscribers_url', 'subscription_url', 'commits_url', 'git_commits_url', 'comments_url', 'issue_comment_url', 'contents_url', 'compare_url', 'merges_url', 'archive_url', 'downloads_url', 'issues_url', 'pulls_url', 'milestones_url', 'notifications_url', 'labels_url', 'releases_url', 'deployments_url', 'created_at', 'updated_at', 'pushed_at', 'git_url', 'ssh_url', 'clone_url', 'svn_url', 'homepage', 'size', 'stargazers_count', 'watchers_count', 'language', 'has_issues', 'has_projects', 'has_downloads', 'has_wiki', 'has_pages', 'has_discussions', 'forks_count', 'mirror_url', 'archived', 'disabled', 'open_issues_cou

In [58]:
# create empty df with keys of loaded_data as columns
df_repos = pd.DataFrame(columns=keys)

In [59]:
df_repos

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,is_template,web_commit_signoff_required,topics,visibility,forks,open_issues,watchers,default_branch,permissions,score


In [60]:
#  iterate through subdictionary in data and concatenate the content of the subdictionary to df_repos
for repo in data:
    # create tmp df_repo for each repo
    df_repo = pd.DataFrame(data=[repo], columns=keys)
    # concatenate df_repos with df_repo
    df_repos = pd.concat([df_repos, df_repo], ignore_index=True)

  df_repos = pd.concat([df_repos, df_repo], ignore_index=True)


In [77]:
df_repos.head(1)

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,is_template,web_commit_signoff_required,topics,visibility,forks,open_issues,watchers,default_branch,permissions,score
0,646410686,R_kgDOJodxvg,LLaMA-Factory,hiyouga/LLaMA-Factory,False,"{'login': 'hiyouga', 'id': 16256802, 'node_id'...",https://github.com/hiyouga/LLaMA-Factory,Unified Efficient Fine-Tuning of 100+ LLMs & V...,False,https://api.github.com/repos/hiyouga/LLaMA-Fac...,...,False,False,"[agent, ai, chatglm, fine-tuning, gpt, instruc...",public,5638,431,46207,main,"{'admin': False, 'maintain': False, 'push': Fa...",1.0


In [78]:
# columns which are requiered for preprocessing and fruther steps
columns = ['id', 'name', 'full_name', 'html_url', 'description', 'url', 'labels_url', 'created_at', 'updated_at', 'pushed_at', 'size', 'stargazers_count', 'watchers_count', 'language', 'has_issues', 'has_projects', 'has_downloads', 'has_wiki', 'has_pages', 'has_discussions', 'forks_count', 'open_issues_count', 'license', 'allow_forking', 'topics', 'visibility', 'forks', 'open_issues', 'watchers', 'default_branch', 'score']

In [80]:
for k in keys:
    if k in columns:
        continue
    else:
        df_repos = df_repos.drop([k], axis=1)

In [89]:
df_repos[['forks', 'watchers', 'stargazers_count']]

Unnamed: 0,forks,watchers,stargazers_count
0,5638,46207,46207
1,8717,45652,45652
2,18413,45039,45039
3,6767,44187,44187
4,4866,43721,43721
5,6674,43677,43677
6,2063,43178,43178
7,5552,43089,43089
8,6406,42776,42776
9,7180,42154,42154


In [90]:
print(30*34)

1020


### list with repo names - endpoint search

In [None]:
url_list = "https://api.github.com/search/repositories?q=language:python"


response = requests.request("GET", url_list, headers=headers, data=payload)

print(response.text)

### repo content

In [None]:
# url for repo content
url = "https://api.github.com/repos/taniiishk/rock-paper-scissors-game/contents/"

In [4]:
response = requests.request("GET", url, headers=headers, data=payload)
if response.status_code == 200:
    print('Request successful')
else:
    raise Exception(f"Non-success status code: {response.status_code}")

Request successful


In [5]:
data = response.json()
data

[{'name': 'README.md',
  'path': 'README.md',
  'sha': 'd1b07bc9e30f12188a6be5575fd4a27114cbf874',
  'size': 3287,
  'url': 'https://api.github.com/repos/Taniiishk/Rock-Paper-Scissors-Game/contents/README.md?ref=main',
  'html_url': 'https://github.com/Taniiishk/Rock-Paper-Scissors-Game/blob/main/README.md',
  'git_url': 'https://api.github.com/repos/Taniiishk/Rock-Paper-Scissors-Game/git/blobs/d1b07bc9e30f12188a6be5575fd4a27114cbf874',
  'download_url': 'https://raw.githubusercontent.com/Taniiishk/Rock-Paper-Scissors-Game/main/README.md',
  'type': 'file',
  '_links': {'self': 'https://api.github.com/repos/Taniiishk/Rock-Paper-Scissors-Game/contents/README.md?ref=main',
   'git': 'https://api.github.com/repos/Taniiishk/Rock-Paper-Scissors-Game/git/blobs/d1b07bc9e30f12188a6be5575fd4a27114cbf874',
   'html': 'https://github.com/Taniiishk/Rock-Paper-Scissors-Game/blob/main/README.md'}},
 {'name': 'code_file.py',
  'path': 'code_file.py',
  'sha': 'ee15d676e980576746be8cbbe93779ddfd3c9187

### one file

In [6]:
url_file = "https://api.github.com/repos/taniiishk/rock-paper-scissors-game/contents/code_file.py"
response = requests.request("GET", url_file, headers=headers, data=payload)


In [7]:
data = response.json()
data

{'name': 'code_file.py',
 'path': 'code_file.py',
 'sha': 'ee15d676e980576746be8cbbe93779ddfd3c9187',
 'size': 2446,
 'url': 'https://api.github.com/repos/Taniiishk/Rock-Paper-Scissors-Game/contents/code_file.py?ref=main',
 'html_url': 'https://github.com/Taniiishk/Rock-Paper-Scissors-Game/blob/main/code_file.py',
 'git_url': 'https://api.github.com/repos/Taniiishk/Rock-Paper-Scissors-Game/git/blobs/ee15d676e980576746be8cbbe93779ddfd3c9187',
 'download_url': 'https://raw.githubusercontent.com/Taniiishk/Rock-Paper-Scissors-Game/main/code_file.py',
 'type': 'file',
 'content': 'aW1wb3J0IHRraW50ZXIgYXMgdGsKZnJvbSB0a2ludGVyIGltcG9ydCBQaG90\nb0ltYWdlCmltcG9ydCByYW5kb20KCnJvb3QgPSB0ay5UaygpCnJvb3QudGl0\nbGUoIlJvY2sgUGFwZXIgU2Npc3NvcnMgR2FtZSIpCnJvb3QuZ2VvbWV0cnko\nIjE5ODB4MTA4MCIpCnJvb3QuY29uZmlndXJlKGJnPSIjRjBGOEZGIikgICMg\nQWxpY2VCbHVlIGJhY2tncm91bmQKCiMgSGVhZGluZyBhdCB0aGUgdG9wIApo\nZWFkaW5nID0gdGsuTGFiZWwocm9vdCwgdGV4dD0iUm9jayBQYXBlciBTY2lz\nc29ycyBHYW1lIiwgZm9udD0oIkhlbHZldGljYSIsIDQwL

In [8]:
data['name']

'code_file.py'

### repo tree (not relevant?)

In [None]:
# tree_sha = 'd1b07bc9e30f12188a6be5575fd4a27114cbf874'
# url_tree = f"https://api.github.com/repos/taniiishk/rock-paper-scissors-game/git/trees/{tree_sha}"

# payload = {}
# headers = {
#   'Accept': 'application/vnd.github+json',
#   'X-GitHub-Api-Version': '2022-11-28',
#   'Authorization': f'Bearer {ACCESS_TOKEN}'
# }

In [None]:
# response = requests.request("GET", url_tree, headers=headers, data=payload)

In [None]:
# response.json()

{'message': 'Invalid object requested. SHA must identify a commit or a tree.',
 'documentation_url': 'https://docs.github.com/rest/git/trees#get-a-tree',
 'status': '422'}