In [1]:
import pandas as pd
import networkx as nx
from tqdm import tqdm

tqdm.pandas()

In [2]:
LP = pd.read_csv("../data/gen/lender_project.csv")
LP.rename(columns={":END_ID(Project-ID)": "project_id", ":START_ID(Lender-ID)": "lender_id"}, inplace=True)
LP.drop(columns=[":TYPE", "loan_date"], inplace=True)
LP.head()

Unnamed: 0,project_id,lender_id,lender_publicId,loan_shareAmount
0,2627672,38245,sqtruong3619,25.0
1,2627672,55957,davidandsusan9466,25.0
2,2627672,150690,futureteaming,25.0
3,2627672,386557,peregrin,5.0
4,2627672,538923,michael6729,25.0


In [3]:
PT = pd.read_csv("../data/gen/project_tags.csv")
PT.rename(columns={":END_ID": "tag"}, inplace=True)
PT["tag"] = PT["tag"].astype("category")
PT.drop(columns=[":TYPE"], inplace=True)
PT.head()

Unnamed: 0,project_id,tag
0,2627672,#Woman-Owned Business
1,2627672,#Animals
2,2627672,#Parent
3,2063069,#Woman-Owned Business
4,2063069,#Repeat Borrower


In [4]:
merged = pd.merge(LP, PT, on="project_id")
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 569505 entries, 0 to 569504
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   project_id        569505 non-null  int64   
 1   lender_id         569505 non-null  int64   
 2   lender_publicId   569505 non-null  object  
 3   loan_shareAmount  569505 non-null  float64 
 4   tag               569505 non-null  category
dtypes: category(1), float64(1), int64(2), object(1)
memory usage: 22.3+ MB


In [5]:
assert 0 == merged.duplicated().sum()
merged.drop(columns=["project_id"], inplace=True)
merged

Unnamed: 0,lender_id,lender_publicId,loan_shareAmount,tag
0,38245,sqtruong3619,25.0,#Woman-Owned Business
1,38245,sqtruong3619,25.0,#Animals
2,38245,sqtruong3619,25.0,#Parent
3,55957,davidandsusan9466,25.0,#Woman-Owned Business
4,55957,davidandsusan9466,25.0,#Animals
...,...,...,...,...
569500,2780157,christopher69594587,50.0,#Repeat Borrower
569501,933559,chris8011,50.0,#Health and Sanitation
569502,933559,chris8011,50.0,#Repeat Borrower
569503,2843468,robert28662239,25.0,#Health and Sanitation


In [9]:
LT = merged.groupby(["lender_id", "tag"]).agg({"lender_publicId": "first", "loan_shareAmount": "sum"}).reset_index()
LT.rename(columns={"loan_shareAmount": "loan_amount"}, inplace=True)
LT = LT[LT["loan_amount"] > 0]
LT

Unnamed: 0,lender_id,tag,lender_publicId,loan_amount
0,184,#Animals,subbarao,150.0
7,184,#Health and Sanitation,subbarao,50.0
25,250,#Animals,chrisprew,25.0
35,250,#Parent,chrisprew,50.0
38,250,#Repeat Borrower,chrisprew,50.0
...,...,...,...,...
1598339,6117917,#Schooling,mars9183,250.0
1598363,6117983,#Repeat Borrower,lauren5972,100.0
1598364,6117983,#Schooling,lauren5972,100.0
1598365,6117983,#Single,lauren5972,100.0


In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
LT["V1"] = le.fit_transform(LT["lender_id"])
LT

Unnamed: 0,lender_id,tag,lender_publicId,loan_amount,V1
0,184,#Animals,subbarao,150.0,0
7,184,#Health and Sanitation,subbarao,50.0,0
25,250,#Animals,chrisprew,25.0,1
35,250,#Parent,chrisprew,50.0,1
38,250,#Repeat Borrower,chrisprew,50.0,1
...,...,...,...,...,...
1598339,6117917,#Schooling,mars9183,250.0,63933
1598363,6117983,#Repeat Borrower,lauren5972,100.0,63934
1598364,6117983,#Schooling,lauren5972,100.0,63934
1598365,6117983,#Single,lauren5972,100.0,63934


In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
LT["V2"] = le.fit_transform(LT["tag"]) + LT["V1"].max() + 1
LT

Unnamed: 0,lender_id,tag,lender_publicId,loan_amount,V1,V2
0,184,#Animals,subbarao,150.0,0,63935
7,184,#Health and Sanitation,subbarao,50.0,0,63942
25,250,#Animals,chrisprew,25.0,1,63935
35,250,#Parent,chrisprew,50.0,1,63945
38,250,#Repeat Borrower,chrisprew,50.0,1,63948
...,...,...,...,...,...,...
1598339,6117917,#Schooling,mars9183,250.0,63933,63949
1598363,6117983,#Repeat Borrower,lauren5972,100.0,63934,63948
1598364,6117983,#Schooling,lauren5972,100.0,63934,63949
1598365,6117983,#Single,lauren5972,100.0,63934,63950


In [12]:
# total number of vertex
vertex_count = LT["lender_id"].nunique() + LT["tag"].nunique()
assert vertex_count == LT["V1"].nunique() + LT["V2"].nunique()
print(vertex_count)

63960


In [13]:
# create a dataframe with V1, V2, loan_amount columns
LT[["V1", "V2", "loan_amount"]].to_csv("checkpoints/lender_tag_bipartite.csv", sep="\t", header=False, index=False)

In [14]:
dictionary = LT[["V1", "lender_publicId"]].drop_duplicates()
dictionary.rename(columns={"lender_publicId": "name", "V1": "id"}, inplace=True)
dictionary.head()

Unnamed: 0,id,name
0,0,subbarao
25,1,chrisprew
53,2,brian5281
88,3,alexandre
102,4,dina


In [15]:
dictionary2 = LT[["V2", "tag"]].drop_duplicates()
dictionary2.rename(columns={"tag": "name", "V2": "id"}, inplace=True)
dictionary2.head()

Unnamed: 0,id,name
0,63935,#Animals
7,63942,#Health and Sanitation
35,63945,#Parent
38,63948,#Repeat Borrower
49,63959,#Woman-Owned Business


In [16]:
# concat two dictionaries
dictionary = pd.concat([dictionary, dictionary2])
print(len(dictionary))
assert len(dictionary) == vertex_count
dictionary.tail()

63960


Unnamed: 0,id,name
2669,63954,#Technology
3204,63939,#Fabrics
3221,63956,#Unique
4334,63944,#Orphan
6161,63946,#Refugee


In [None]:
# write the dictionary to file checkpoints/lender_tag_bipartite_Dictionary.txt, without header and index. The first column should be id
dictionary[["id", "name"]].to_csv(
    "checkpoints/lender_tag_bipartite_Dictionary.txt", sep="\t", header=False, index=False
)

# Run `biLouvain`

```bash
./biLouvain -d "\t" -i ../src/checkpoints/lender_tag_bipartite.csv  
```

Reference: https://github.com/paolapesantez/biLouvain.git

# export to Gephi

parse result file. The result file looks like this

```text
Community 0[V1]: 0
Community 1[V1]: 1
Community 2[V2]: 2,3
Community 3[V2]: 4

Singletons Partition V1: 2
Singletons Partition V2: 1
0,1,2,2,3
```

In [17]:
# read the last line of the file
with open("checkpoints/lender_tag_bipartite_ResultsCommunities.txt", "r") as f:
    result_text = f.read()

In [18]:
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility

import re

regex = r"^Community (?P<community_id>\d+)\[V(?P<vertex_type>\d+)\]: (?P<vertexes>.*)$"
matches = re.finditer(regex, result_text, re.MULTILINE)

community_result = []

for matchNum, match in enumerate(matches, start=1):
    community_id = match.group("community_id")
    vertex_type = match.group("vertex_type")
    vertexes = match.group("vertexes").split(", ")[0]
    vertexes = vertexes.split(",")
    for vertex in vertexes:
        community_result.append([community_id, vertex_type, vertex])

community_result = pd.DataFrame(community_result, columns=["community_id", "vertex_type", "vertex"])
community_result["vertex"] = community_result["vertex"].astype(int)
community_result

Unnamed: 0,community_id,vertex_type,vertex
0,0,1,4
1,0,1,5
2,0,1,6
3,0,1,11
4,0,1,12
...,...,...,...
63955,3,2,63955
63956,3,2,63956
63957,3,2,63957
63958,3,2,63958


In [26]:
dictionary = dictionary.merge(community_result, left_on="id", right_on="vertex").drop(columns=["id"])
dictionary.head()

Unnamed: 0,name,community_id,vertex_type,vertex
0,subbarao,1,1,0
1,chrisprew,1,1,1
2,brian5281,1,1,2
3,alexandre,1,1,3
4,dina,0,1,4


# Create networkx graph and show the result

In [28]:
G = nx.from_pandas_edgelist(LT, "lender_publicId", "tag", "loan_amount")
G.add_nodes_from(LT["lender_publicId"].drop_duplicates(), type="Lender")
G.add_nodes_from(LT["tag"].drop_duplicates(), type="Tag")

# refine node attributes with community id
for row in dictionary.itertuples():
    G.nodes[row.name]["community_id"] = row.community_id

G.number_of_nodes(), G.number_of_edges()
nx.write_gexf(G, "checkpoints/lender_tag_bipartite_community.gexf")