In [1]:
import pandas as pd 
import numpy as np
from functools import partial, reduce
import networkx as nx 
import glob 
import os
import itertools

In [2]:
## Change to local path of where your CSV files are located
path = r"C:\Users\erica\OneDrive - University of Southern California\Projects_at_USC\CKIDS_Fa2020_Social_Graph_Analysis\code\test_data"

all_files = glob.glob(os.path.join(path, "*.csv"))  

## Read all CSV files in `data` folder into Pandas df

In [3]:
df_list = []

print(f"Processing {len(all_files)} repos data:\n")

for f in all_files:
    path_list = f.split("\\")
    f_name = path_list[-1]
    f_name2 = f_name.split("stargazers_")[1]
    repo_name = f_name2.split(".csv")[0]   # final repo name
    print(repo_name) 
    df_from_file = pd.read_csv(f)
    df_from_file['repo_name'] = repo_name
    df_list.append(df_from_file)

Processing 25 repos data:

aapanel
BlackCatCMS
bludit
BugReport
cabot
cJSON
Codiad
contiki-ng
cryptacular
CSS-injection-in-Swagger-UI
CVE-2020-11579
CVE-2020-25270
CVE-2020-25272
CVE-2020-25273
CVE-2020-25487
cve-pocs
CVEnotes
CVEs
devise
Disclosures
dotplant2
envoy
FA
Fluid
fosite


In [4]:
df_concat = pd.concat(df_list, ignore_index=True)
print(df_concat.shape)

(31675, 4)


In [5]:
df_concat.head(3)

Unnamed: 0,username,name,numberOfReposStarred,repo_name
0,AnthonyH45,UCR,1954,aapanel
1,webbird,,61,BlackCatCMS
2,schoensee,,1,BlackCatCMS


In [6]:
df_names = df_concat[['username','repo_name']]

In [7]:
print(df_names.shape)
# df_names['username'].nunique()

(31675, 2)


# Format the data

In [8]:
## drop all usernames which are NOT duplicates (e.g. they only starred one repo)
df0 = df_names[df_names.duplicated(subset='username', keep=False)]
print(df0.shape)

(2270, 2)


In [10]:
# df0.head(5)
df0['repo_name'].nunique()

21

In [11]:
df = df0.copy()
df.columns = ['user', 'repo']

# Create Edge List
Code contributed by Erin

In [12]:
users = list(df["user"].unique()) # get unique users
dic = {}

for user in users:
    repos = list(df.loc[df["user"] == user]["repo"]) # for each user, get list of repos for that user
    dic[user] = repos

In [13]:
rows = []
for k1, k2 in itertools.combinations(dic, 2): # get all pair combinations of keys (users)
   # print(k1, k2) # can comment these print statements out
   # print(set(dic[k1]) & set(dic[k2]))
    count = len(set(dic[k1]) & set(dic[k2])) # find number of repos in common between the two users
    rows.append([k1, k2, count])

In [14]:
df_user = pd.DataFrame(rows, columns = ["user1", "user2", "count"]) # create dataframe with pair of users and num of repos in common

In [15]:
df_user.shape

(571915, 3)

In [16]:
df_user.head()

Unnamed: 0,user1,user2,count
0,pbuzdin,iheanyi,1
1,pbuzdin,badcat,2
2,pbuzdin,coldwinds,2
3,pbuzdin,oudommeas,1
4,pbuzdin,hosseinfs,1


In [17]:
print(f"There are {df_user['user2'].nunique()} unique users")

There are 1069 unique users


In [18]:
df_user.sort_values('count', ascending=False)

Unnamed: 0,user1,user2,count
93535,myfreeweb,edersohe,4
389330,denji,mrluanma,4
152218,filipeoliveiraa,edersohe,4
375701,edersohe,dobestan,4
151995,filipeoliveiraa,sbusso,4
...,...,...,...
231655,charliek,AlexShiLucky,0
231656,charliek,at86,0
231657,charliek,armink,0
157132,Falc,iDings,0


In [21]:
## drop where count is <= 1
df_user2 = df_user[df_user['count'] >= 2]
df_user2.columns = ['source', 'target', 'weight']
df_user2.shape

(99316, 3)

## Create User User Graph

In [22]:
G = nx.from_pandas_edgelist(df_user2, edge_attr=True)

In [23]:
print(nx.info(G))
## Num of nodes = num of unique Github users
## Num of edges = num of starred repo commonalities between users

Name: 
Type: Graph
Number of nodes: 1062
Number of edges: 99316
Average degree: 187.0358


In [24]:
nx.write_gexf(G, 'user_user_graph_1.gexf')