In [1]:
import json
import os
from pprint import pprint

import pandas as pd
from github import Github
from tqdm.notebook import tqdm

In [2]:
def get_repo_dict(repo):
    """Flatten a repository object into a dict of report columns.

    Plain attributes are copied from ``repo`` as-is. Topics come from
    ``repo.get_topics()``. For contributors, subscribers and watchers only the
    ``totalCount`` of the returned listing is stored, not the entries.

    Args:
        repo: object exposing the attributes and ``get_*`` methods read below.

    Returns:
        dict keyed by human-readable column name, in a fixed column order.
    """
    # (column name, attribute on ``repo``) -- order defines the column order.
    attribute_columns = (
        ("Full name", "full_name"),
        ("Description", "description"),
        ("Date created", "created_at"),
        ("Date of last push", "pushed_at"),
        ("Home Page", "homepage"),
        ("Language", "language"),
        ("Number of forks", "forks"),
        ("Number of stars", "stargazers_count"),
    )
    # (column name, listing method on ``repo``) -- only totalCount is kept.
    count_columns = (
        ("Contributors Count", "get_contributors"),
        ("Subscribers Count", "get_subscribers"),
        ("Watchers Count", "get_watchers"),
    )

    summary = {}
    for column, attribute in attribute_columns:
        summary[column] = getattr(repo, attribute)
    summary["Topics"] = repo.get_topics()
    for column, method_name in count_columns:
        listing = getattr(repo, method_name)()
        summary[column] = listing.totalCount
    return summary

In [3]:
# Use a personal access token when GITHUB_TOKEN is set in the environment and
# fall back to an anonymous client otherwise. Anonymous access is subject to
# much lower GitHub API rate limits, and the per-repository detail step in
# this notebook issues several requests for every repository.
# An empty variable is treated as "not set" so no blank token is sent.
g = Github(os.environ.get("GITHUB_TOKEN") or None)
TOPIC = "osint"
# Search repositories tagged with TOPIC and report how many matched.
all_repo = g.search_repositories(f'topic:{TOPIC}')
print(all_repo.totalCount)

938


In [4]:
# Number of search results to keep. The detail step calls get_repo_dict once
# per kept repository, so this bounds how many API requests the notebook makes.
MAX_REPOS = 26

# Take the first MAX_REPOS results in the order the search yields them.
# zip() stops as soon as range() is exhausted, so nothing beyond the
# MAX_REPOS-th result is pulled from the search listing.
top_repo = [repo for _, repo in zip(range(MAX_REPOS), all_repo)]

In [5]:
# Build one summary dict per selected repository, with a progress bar.
repo_list = []
progress = tqdm(top_repo, total=len(top_repo))
# extend() consumes the generator item by item, so records are appended as
# each repository is processed.
repo_list.extend(get_repo_dict(repo) for repo in progress)

  0%|          | 0/26 [00:00<?, ?it/s]

In [6]:
# Save the collected records as UTF-8 JSON named after the topic.
# default=str stringifies any value the json module cannot encode natively.
json_path = f'{TOPIC}_top.json'
with open(json_path, 'w', encoding='utf-8') as json_file:
    json.dump(
        repo_list,
        json_file,
        ensure_ascii=False,
        indent=4,
        default=str,
    )

# Tabular view of the same records; the bare expression renders the frame.
repos_df = pd.DataFrame(repo_list)
repos_df

Unnamed: 0,Full name,Description,Date created,Date of last push,Home Page,Language,Number of forks,Number of stars,Topics,Contributors Count,Subscribers Count,Watchers Count
0,sherlock-project/sherlock,🔎 Hunt down social media accounts by username ...,2018-12-24 14:30:48,2021-11-21 22:13:27,http://sherlock-project.github.io,Python,3040,28231,"[osint, reconnaissance, linux, macos, cli, she...",136,894,28231
1,twintproject/twint,An advanced Twitter scraping & OSINT tool writ...,2017-06-10 15:16:49,2021-11-03 19:29:47,,Python,1812,11939,"[osint, twitter, python, scrape, tweets, elast...",60,292,11939
2,mxrch/GHunt,🕵️‍♂️ Investigate Google emails and documents.,2020-10-02 11:26:03,2021-11-22 20:31:32,,Python,833,10375,"[osint, google, hideandsec]",18,487,10375
3,blaCCkHatHacEEkr/PENTESTING-BIBLE,Learn ethical hacking.Learn about reconnaissan...,2019-06-28 11:26:57,2021-04-14 14:38:06,https://twitter.com/cry__pto,,1855,8887,"[pentesting, resources, redteam, hacking, osin...",2,645,8887
4,qeeqbox/social-analyzer,"API, CLI & Web App for analyzing & finding a ...",2020-11-30 19:04:26,2021-11-22 03:27:18,,JavaScript,579,8387,"[osint, social-media, analyzer, username, prof...",12,357,8387
5,s0md3v/Photon,Incredibly fast crawler designed for OSINT.,2018-03-30 19:38:22,2021-05-02 18:16:28,,Python,1215,8292,"[crawler, spider, python, osint, information-g...",19,315,8292
6,jivoi/awesome-osint,:scream: A curated list of amazingly awesome O...,2016-11-30 13:26:11,2021-11-17 15:34:28,,,1682,7695,"[awesome-list, osint, website]",73,534,7695
7,smicallef/spiderfoot,SpiderFoot automates OSINT for threat intellig...,2012-04-28 07:10:13,2021-11-22 15:02:09,http://www.spiderfoot.net,Python,1347,6776,"[reconnaissance, footprinting, osint, threatin...",39,303,6776
8,jofpin/trape,People tracker on the Internet: OSINT analysis...,2017-10-31 14:03:57,2021-06-20 16:33:43,https://twitter.com/jofpin,Python,1186,6703,"[tracking, osint, footprint, hacking-tool, rec...",16,332,6703
9,OWASP/Amass,In-depth Attack Surface Mapping and Asset Disc...,2018-07-10 16:05:08,2021-11-22 23:42:41,https://owasp.org/www-project-amass/,Go,1149,6122,"[go, dns, subdomain, enumeration, recon, osint...",52,167,6122


In [7]:
from collections import Counter

# Collect every repository's topic list, then flatten into one sequence.
topics_list = [record['Topics'] for record in repo_list]
topics_list_flatten = []
for repo_topics in topics_list:
    topics_list_flatten.extend(repo_topics)

# Count topic occurrences and keep only topics that occur at least twice.
topics_counter = Counter(topics_list_flatten)
topics_counter = Counter(
    {name: count for name, count in topics_counter.items() if count >= 2}
)

# One node per retained topic, from most to least frequent.
nodes = [
    {"Id": name, "Size": count * 1000, "Type": "topic", "Label": name}
    for name, count in topics_counter.most_common()
]
edges = []
print(len(nodes))

# One node per repository, plus an edge to each of its retained topics.
for record in repo_list:
    full_name = record['Full name']
    nodes.append({
        "Id": full_name,
        "Size": record['Number of stars'],
        "Type": "repo",
        "Label": full_name,
    })
    edges.extend(
        {"Source": full_name, "Target": topic}
        for topic in record['Topics']
        if topic in topics_counter
    )

print(len(nodes))
print(len(edges))

43
69
160


In [8]:
from pyvis.network import Network
edges_df = pd.DataFrame(edges)
# Keep only edges whose topic name matches one of these keywords
# (case-insensitive regex). .copy() detaches the filtered frame so the column
# assignments below write to it directly instead of to a slice view.
topic_pattern = "osint|hacker|whitehat|blackhat|penetration|testing"
edges_df = edges_df[edges_df['Target'].str.contains(topic_pattern, case=False)].copy()

# Repository weight: its star count, looked up by full name.
stars_by_repo = repos_df.set_index('Full name')['Number of stars']
edges_df['Source_Weight'] = edges_df['Source'].map(stars_by_repo)
print(topics_counter)
# Topic weight: the topic's occurrence count from topics_counter, scaled up.
edges_df['Target_Weight'] = edges_df['Target'].map(topics_counter) * 100000

# font_color is a colour string; white labels stay readable on the dark
# '#222222' background.
github_net = Network(height='1000px', width='100%', bgcolor='#222222', directed=False, font_color='white', notebook=True)
github_net.show_buttons(filter_=['physics','selection','renderer','interaction','manipulation'])
# 'dynamic' is the smooth-type name accepted for edges.
github_net.set_edge_smooth('dynamic')

github_net.force_atlas_2based(overlap= 0.5)
github_data = edges_df

sources = github_data['Source']
targets = github_data['Target']
source_weights = github_data['Source_Weight']
target_weights = github_data['Target_Weight']

edge_data = zip(sources, targets, source_weights, target_weights)

# Add both endpoints of every edge, then the edge itself. Repositories go in
# group 1 and topics in group 2.
for src, dst, src_w, dst_w in edge_data:
    github_net.add_node(src, src, title=src, size=src_w, group=1)
    github_net.add_node(dst, dst, title=dst, size=dst_w, group=2)
    github_net.add_edge(src, dst)

neighbor_map = github_net.get_adj_list()

# add neighbor data to node hover data and set each node's value to its
# neighbor count
for node in github_net.nodes:
    neighbors = neighbor_map[node['id']]
    node['title'] += ' Neighbors:<br>' + '<br>'.join(neighbors)
    node['value'] = len(neighbors)

github_net.show('github_repo_topic.html')

Counter({'osint': 26, 'information-gathering': 13, 'reconnaissance': 10, 'python': 8, 'hacking': 6, 'recon': 5, 'pentesting': 4, 'bugbounty': 4, 'security-tools': 4, 'security': 4, 'penetration-testing': 4, 'cli': 3, 'redteam': 3, 'hacktoberfest': 3, 'hacking-tool': 3, 'subdomain': 3, 'subdomain-enumeration': 3, 'golang': 3, 'scanning': 3, 'linux': 2, 'python3': 2, 'windows': 2, 'osint-resources': 2, 'kali-linux': 2, 'awesome-list': 2, 'pentest': 2, 'javascript': 2, 'nodejs': 2, 'analysis': 2, 'footprinting': 2, 'infosec': 2, 'intelligence-gathering': 2, 'osint-reconnaissance': 2, 'threat-intelligence': 2, 'osint-framework': 2, 'footprint': 2, 'dns': 2, 'github-api': 2, 'bug-bounty': 2, 'pentest-tool': 2, 'instagram': 2, 'osint-python': 2, 'scanner': 2})
