# Extracting top repositories from GitHub by topic

This notebook extracts the most important repositories from GitHub, organized by topic.

Note: This scraping depends on the current HTML of the pages, which means it may fail if the structure of the web pages changes.

## importing required modules

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Some important functions

In [2]:
def scrape_page(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The response text when the status code is 200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection failures or when ``timeout`` expires.
    """
    res = requests.get(url, timeout=timeout)
    if res.status_code == 200:
        return res.text
    return None

In [3]:
def parse_topics(html_docs):
    """Collect the text of every topic-title paragraph in the given HTML.

    Args:
        html_docs: HTML source of the topics page, as a string.

    Returns:
        A list with the ``.text`` of each matching ``<p>`` element, in
        document order (the text is returned as-is, without stripping).
    """
    soup = BeautifulSoup(html_docs, "html.parser")
    # CSS class string used to select the <p> elements that hold topic names
    title_css = "f3 lh-condensed mb-0 mt-1 Link--primary"
    return [node.text for node in soup.find_all('p', title_css)]

In [4]:
def parse_description(html_docs):
    """Collect the text of every topic-description paragraph in the given HTML.

    Args:
        html_docs: HTML source of the topics page, as a string.

    Returns:
        A list with the whitespace-stripped text of each matching ``<p>``
        element, in document order.
    """
    soup = BeautifulSoup(html_docs, "html.parser")
    # CSS class string used to select the <p> elements that hold descriptions
    blurb_css = "f5 color-fg-muted mb-0 mt-1"
    return [node.text.strip() for node in soup.find_all('p', blurb_css)]

In [149]:
def parse_html(html_doc):
    """Extract owner, repository name and star count from a topic page.

    Args:
        html_doc: HTML source of a ``https://github.com/topics/<topic>`` page.

    Returns:
        A list of dicts with the keys ``"user_name"``, ``"repo_name"`` and
        ``"stars"`` (all strings, whitespace-stripped), one per ``<article>``
        element that has the expected structure. ``<article>`` elements
        without an ``<h3>`` holding two links, or without a star counter,
        are skipped instead of raising.
    """
    # Name the parser explicitly, like the other parse_* helpers: otherwise
    # bs4 picks whichever parser happens to be installed and emits a warning.
    soup = BeautifulSoup(html_doc, "html.parser")

    info = []
    for repo in soup.find_all('article'):
        heading = repo.find('h3')
        links = heading.find_all('a') if heading is not None else []
        stars_tag = repo.find("span", "Counter js-social-count")
        # Not every <article> is guaranteed to be a repository card.
        if len(links) < 2 or stars_tag is None:
            continue
        info.append({
            "user_name": links[0].text.strip(),  # first link: owner
            "repo_name": links[1].text.strip(),  # second link: repository
            "stars": stars_tag.text.strip(),
        })
    return info

In [151]:
def extract_info(topic):
    """Download a GitHub topic page and return its repositories.

    Args:
        topic: Topic identifier; it is appended verbatim to
            ``https://github.com/topics/``, so it must already be in the
            form GitHub uses in its URLs (e.g. ``"git"``).

    Returns:
        The list produced by ``parse_html`` for the page, or an empty list
        when ``scrape_page`` returns ``None`` (page could not be fetched).
    """
    url = "https://github.com/topics/" + topic
    page = scrape_page(url)
    # Identity check is the correct way to test for the None sentinel.
    if page is None:
        return []
    return parse_html(page)

## Scraping important repository topics from GitHub

In [6]:
# URL of the GitHub page that lists topics
topics_url = "https://github.com/topics"
# Raw HTML of that page; scrape_page returns None when the response status is not 200
topics_html_doc = scrape_page(topics_url)


In [14]:
# Load the saved copy of the full topics page from the working directory.
# The encoding is given explicitly so decoding does not depend on the
# platform default (assumes the page was saved as UTF-8 -- TODO confirm).
with open('topics.html', 'r', encoding='utf-8') as f:
    full_topics_html = f.read()

In [16]:
# Parse the saved topics page twice: once for the topic names ...
topic_lists = parse_topics(full_topics_html)
# ... and once for the topic descriptions (whitespace-stripped by parse_description)
topic_description_lists = parse_description(full_topics_html)



In [60]:
# Number of topics scraped. Printed explicitly because a notebook cell only
# displays its last expression, so a bare len(...) here would be discarded.
print(len(topic_lists))
# The scraped topic names themselves
topic_lists

['3D',
 'Ajax',
 'Algorithm',
 'Amp',
 'Android',
 'Angular',
 'Ansible',
 'API',
 'Arduino',
 'ASP.NET',
 'Atom',
 'Awesome Lists',
 'Amazon Web Services',
 'Azure',
 'Babel',
 'Bash',
 'Bitcoin',
 'Bootstrap',
 'Bot',
 'C',
 'Chrome',
 'Chrome extension',
 'Command line interface',
 'Clojure',
 'Code quality',
 'Code review',
 'Compiler',
 'Continuous integration',
 'COVID-19',
 'C++',
 'Cryptocurrency',
 'Crystal',
 'C#',
 'CSS',
 'Data structures',
 'Data visualization',
 'Database',
 'Deep learning',
 'Dependency management',
 'Deployment',
 'Django',
 'Docker',
 'Documentation',
 '.NET',
 'Electron',
 'Elixir',
 'Emacs',
 'Ember',
 'Emoji',
 'Emulator',
 'ESLint',
 'Ethereum',
 'Express',
 'Firebase',
 'Firefox',
 'Flask',
 'Font',
 'Framework',
 'Front end',
 'Game engine',
 'Git',
 'GitHub API',
 'Go',
 'Google',
 'Gradle',
 'GraphQL',
 'Gulp',
 'Hacktoberfest',
 'Haskell',
 'Homebrew',
 'Homebridge',
 'HTML',
 'HTTP',
 'Icon font',
 'iOS',
 'IPFS',
 'Java',
 'JavaScript',
 'Je

### converting scraped data into dataframes

In [19]:
# Combine topic names and descriptions into one table
# (assumes the two lists line up index-for-index)
data = dict(topics=topic_lists, description=topic_description_lists)
df = pd.DataFrame(data=data)

df

## Extracting top repositories

Here I am only extracting repositories for the `git` topic, but you can extract any of the topics above if you want.

In [156]:
# Topic to look up; extract_info appends it verbatim to "https://github.com/topics/"
topic = 'git'
# One dict per repository (user_name, repo_name, stars); an empty list if the page could not be fetched
repository_lists = extract_info(topic)
# Bare last expression so the notebook displays the result
repository_lists

[{'user_name': 'github', 'repo_name': 'gitignore', 'stars': '141k'},
 {'user_name': 'gogs', 'repo_name': 'gogs', 'stars': '41.3k'},
 {'user_name': 'sharkdp', 'repo_name': 'bat', 'stars': '38.2k'},
 {'user_name': 'tiimgreen',
  'repo_name': 'github-cheat-sheet',
  'stars': '37.9k'},
 {'user_name': 'bregman-arie',
  'repo_name': 'devops-exercises',
  'stars': '33.8k'},
 {'user_name': 'go-gitea', 'repo_name': 'gitea', 'stars': '33.4k'},
 {'user_name': 'jesseduffield', 'repo_name': 'lazygit', 'stars': '31.1k'},
 {'user_name': 'cli', 'repo_name': 'cli', 'stars': '30.5k'},
 {'user_name': 'typicode', 'repo_name': 'husky', 'stars': '28.2k'},
 {'user_name': 'github', 'repo_name': 'hub', 'stars': '22.2k'},
 {'user_name': 'git-tips', 'repo_name': 'tips', 'stars': '20.5k'},
 {'user_name': 'requarks', 'repo_name': 'wiki', 'stars': '19.2k'},
 {'user_name': 'logseq', 'repo_name': 'logseq', 'stars': '18.5k'},
 {'user_name': 'desktop', 'repo_name': 'desktop', 'stars': '16.2k'},
 {'user_name': 'so-fancy

### Converting into a DataFrame

In [157]:
# Pull each field out of the scraped records into its own column list
user_name = [record['user_name'] for record in repository_lists]
repo_name = [record['repo_name'] for record in repository_lists]
stars = [record['stars'] for record in repository_lists]

# Assemble the columns into a table, one row per repository
df = pd.DataFrame({"user_name":user_name, "repo_name":repo_name, "stars":stars})
df

Unnamed: 0,user_name,repo_name,stars
0,github,gitignore,141k
1,gogs,gogs,41.3k
2,sharkdp,bat,38.2k
3,tiimgreen,github-cheat-sheet,37.9k
4,bregman-arie,devops-exercises,33.8k
5,go-gitea,gitea,33.4k
6,jesseduffield,lazygit,31.1k
7,cli,cli,30.5k
8,typicode,husky,28.2k
9,github,hub,22.2k
