In [3]:
import pandas as pd 
import numpy as np
from functools import partial, reduce
import networkx as nx 
import glob 
import os
import itertools as it

In [4]:
# Folder holding the per-repo stargazer CSV exports. The default is the original
# local folder; set the STARGAZERS_DATA_DIR environment variable to point the
# notebook at another location without editing this cell.
path = os.environ.get(
    "STARGAZERS_DATA_DIR",
    r"C:\Users\erica\OneDrive - University of Southern California\Projects_at_USC\CKIDS_Fa2020_Social_Graph_Analysis\code\data",
)

# glob returns matches in arbitrary order; sort them so the file order (and
# therefore the position of each repo in anything built from this list) is
# the same on every run.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

## Read all CSV files in `data` folder into Pandas df

In [5]:
# Load every stargazer CSV into its own DataFrame and tag each row with the
# repo the file belongs to. The repo name is taken from the file name, which
# is expected to look like "stargazers_<repo>.csv".
df_list = []

print(f"Processing {len(all_files)} repos data:\n")

for csv_path in all_files:
    # os.path understands the platform's separators, so the file name is
    # extracted correctly regardless of how the path was spelled (splitting
    # on "\\" only worked for Windows-style paths).
    file_stem, _ext = os.path.splitext(os.path.basename(csv_path))
    _before, marker, repo_name = file_stem.partition("stargazers_")
    if not marker:
        # The glob matches every *.csv; a file without the marker has no
        # repo name to derive, so report it and move on instead of crashing.
        print(f"Skipping {csv_path}: file name does not contain 'stargazers_'")
        continue
    print(repo_name)
    df_list.append(pd.read_csv(csv_path).assign(repo_name=repo_name))

Processing 3 repos data:

microweber
nvm-windows
react-admin


In [6]:
# Stack the per-repo frames on top of each other and renumber the rows 0..n-1.
df_concat = pd.concat(objs=df_list, axis=0, ignore_index=True)
print(df_concat.shape)
# Preview the first three rows.
df_concat.iloc[:3]

(18692, 13)


Unnamed: 0,username,name,blog,company,bio,location,avatar_url,hireable,num_followers,num_following,created_at,star_time,repo_name
0,skopp,Rashaad Essop,http://skopp.skuda.net,SKUDA,,"Johannesburg, South Africa",https://avatars0.githubusercontent.com/u/16529...,True,86,531,2012-04-17 13:35:23,2013-06-10 10:16:13,microweber
1,doolab,Marcel Berger,https://doolab.io,@sharenowTech,,"Berlin, Germany",https://avatars0.githubusercontent.com/u/15897...,True,16,28,2012-03-30 03:14:39,2013-06-24 04:27:40,microweber
2,Xeoncross,David Pennington,http://davidpennington.me,News & Advertising,"Full stack Go, Typescript, and Python develope...","Dallas, Texas",https://avatars0.githubusercontent.com/u/56460...,False,459,63,2009-02-20 18:18:44,2013-07-18 16:46:29,microweber


In [7]:
## Keep only the user name and repo name columns
name_columns = ['username', 'repo_name']
df_names = df_concat.loc[:, name_columns]
print(df_names.shape)

(18692, 2)


In [8]:
# Preview the first three (username, repo_name) rows.
df_names.iloc[:3]

Unnamed: 0,username,repo_name
0,skopp,microweber
1,doolab,microweber
2,Xeoncross,microweber


In [9]:
# Optional sanity check (currently disabled): how many users starred each repo?
# df_names.groupby('repo_name')['username'].count()

# Format the data

In [11]:
## Keep only users who starred more than one repo (drop everyone who appears for a single repo).
# Collapse repeated (username, repo_name) rows first, so a user listed twice
# for the same repo is not mistaken for a multi-repo stargazer.
unique_stars = df_names.drop_duplicates(subset=['username', 'repo_name'])
# keep=False flags every occurrence of a repeated username, so all rows of a
# multi-repo user are retained (one row per repo they starred).
df = unique_stars[unique_stars.duplicated(subset='username', keep=False)]
print(df.shape)
df.head(3)

(579, 2)


Unnamed: 0,username,repo_name
28,mattonik,microweber
34,thevasya,microweber
35,boris-chervenkov,microweber


# First Graph: Both Repos and Users as nodes
Two types of nodes (representing repos and users). Users are connected to repos that they starred.

In [13]:
# Edge list for the user/repo graph: one row per star, rows renumbered from 0,
# with the user as 'source' and the repo as 'target'.
edge_df = (
    df.rename(columns={'username': 'source', 'repo_name': 'target'})
      .reset_index(drop=True)
)
edge_df

Unnamed: 0,source,target
0,mattonik,microweber
1,thevasya,microweber
2,boris-chervenkov,microweber
3,kublaj,microweber
4,k8n,microweber
...,...,...
574,skadimoolam,react-admin
575,mofelee,react-admin
576,dahoba,react-admin
577,AnotherGenZ,react-admin


In [14]:
## Create the networkx graph: an undirected edge joins each user to a repo they starred.
G = nx.from_pandas_edgelist(edge_df, source='source', target='target')

# nx.info() was removed in NetworkX 3.0, so build the same summary from the
# graph's own accessors; these exist in every NetworkX version.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print(f"Name: {G.name}")
print(f"Type: {type(G).__name__}")
print(f"Number of nodes: {n_nodes}")
print(f"Number of edges: {n_edges}")
if n_nodes:
    # Every undirected edge adds one to the degree of each of its two endpoints.
    print(f"Average degree: {2 * n_edges / n_nodes:8.4f}")
## num nodes = # repos + # users
## num edges = # stars

Name: 
Type: Graph
Number of nodes: 290
Number of edges: 579
Average degree:   3.9931


In [None]:
## Save the user/repo graph as a .gexf file so it can be opened in Gephi
gexf_path = 'repos_users_data.gexf'
nx.write_gexf(G, gexf_path)

# Second Graph: User-to-User graph
Only users have nodes. Users should be connected to other users based on whether they starred the same repo. Ideally, the edge weight is determined by how many starred repos each pair of users has in common.

### Creating Edge List

In [None]:
# TODO: Create Edge List

In [16]:
## Disabled approach, kept for reference: for each repo, list every 2-user combination of its stargazers.
## It only gives within-repo user connections, and the same connection can appear more than once
## across the lists (e.g. (user-a, user-b) and (user-b, user-a) both in the lists).

# df1 = df.groupby('repo_name')['username'].apply(lambda x : list(it.combinations(x,2)))  
# print(df1.shape)
# df1.head()

(3,)


repo_name
microweber     [(mattonik, thevasya), (mattonik, boris-cherve...
nvm-windows    [(vvasilev-, savage69kr), (vvasilev-, sjonner)...
react-admin    [(Vishal-Isharani, lucasbento), (Vishal-Ishara...
Name: username, dtype: object

In [None]:
## Stargazer sets per repo and their overlaps.
# (A stray "s" in front of the comment on the first line used to be evaluated
# as a bare name and raised NameError before anything in this cell ran.)
# df_list holds one frame per CSV in the order the files were read, so
# index 0/1/2 refer to the first/second/third file loaded.
set1 = set(df_list[0]['username'])
set2 = set(df_list[1]['username'])
set3 = set(df_list[2]['username'])

# Users who starred both repos of each pair ...
df1_df2 = set1 & set2
df1_df3 = set1 & set3
df2_df3 = set2 & set3
# ... and users who starred all three repos.
all_three = set1 & set2 & set3

In [None]:
## Two repos' overlap

edge_list = []

for u,v in it.combinations(df1_df2,2):
    edge_list.append((u,v))

for u,v in it.combinations(df1_df3,2):
    edge_list.append((u,v))

for u,v in it.combinations(df2_df3,2):
    edge_list.append((u,v))

## Three repos' overlap
for u,v in it.combinations(all_three,2):
    edge_list.append((u,v))

pairs = set(tuple(sorted(t)) for t in edge_list)
