In [1]:
import pandas as pd
import networkx as nx
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

tqdm.pandas()

In [2]:
PT = pd.read_csv("../data/gen/project_tags.csv")
PT.rename(columns={":END_ID": "tag"}, inplace=True)
PT["project_id"] = PT["project_id"].astype(str)
PT["tag"] = PT["tag"].astype("category")
PT.drop(columns=[":TYPE"], inplace=True)
PT.head()

Unnamed: 0,project_id,tag
0,2627672,#Woman-Owned Business
1,2627672,#Animals
2,2627672,#Parent
3,2063069,#Woman-Owned Business
4,2063069,#Repeat Borrower


In [3]:
PT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14478 entries, 0 to 14477
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   project_id  14478 non-null  object  
 1   tag         14478 non-null  category
dtypes: category(1), object(1)
memory usage: 128.1+ KB


In [4]:
le = LabelEncoder()
PT["V1"] = le.fit_transform(PT["tag"])
PT

Unnamed: 0,project_id,tag,V1
0,2627672,#Woman-Owned Business,24
1,2627672,#Animals,0
2,2627672,#Parent,10
3,2063069,#Woman-Owned Business,24
4,2063069,#Repeat Borrower,13
...,...,...,...
14473,2370520,#Repeat Borrower,13
14474,2370520,#Repair Renew Replace,12
14475,1903071,#Trees,20
14476,2309580,#Health and Sanitation,7


In [5]:
le = LabelEncoder()
PT["V2"] = le.fit_transform(PT["project_id"]) + PT["V1"].max() + 1
PT

Unnamed: 0,project_id,tag,V1,V2
0,2627672,#Woman-Owned Business,24,4747
1,2627672,#Animals,0,4747
2,2627672,#Parent,10,4747
3,2063069,#Woman-Owned Business,24,1436
4,2063069,#Repeat Borrower,13,1436
...,...,...,...,...
14473,2370520,#Repeat Borrower,13,2707
14474,2370520,#Repair Renew Replace,12,2707
14475,1903071,#Trees,20,143
14476,2309580,#Health and Sanitation,7,2397


In [6]:
# total number of vertex
vertex_count = PT["project_id"].nunique() + PT["tag"].nunique()
assert PT["V1"].nunique() == PT["tag"].nunique()
assert PT["V2"].nunique() == PT["project_id"].nunique()
print(vertex_count)

4795


In [7]:
# create a dataframe with V1, V2, loan_amount columns
PT["weight"] = 1
PT[["V1", "V2", "weight"]].to_csv("checkpoints/project_tag_bipartite.csv", sep="\t", header=False, index=False)

In [8]:
dictionary = PT[["V1", "tag"]].drop_duplicates()
dictionary.rename(columns={"tag": "name", "V1": "id"}, inplace=True)
dictionary.head()

Unnamed: 0,id,name
0,24,#Woman-Owned Business
1,0,#Animals
2,10,#Parent
4,13,#Repeat Borrower
9,3,#Elderly


In [9]:
dictionary2 = PT[["V2", "project_id"]].drop_duplicates()
dictionary2.rename(columns={"project_id": "name", "V2": "id"}, inplace=True)
dictionary2.head()

Unnamed: 0,id,name
0,4747,2627672
3,1436,2063069
5,1408,2057142
8,446,1934495
10,3245,2429114


In [10]:
# concat two dictionaries
dictionary = pd.concat([dictionary, dictionary2])
dictionary["name"] = dictionary["name"].astype(str)
print(len(dictionary))
assert len(dictionary) == vertex_count
dictionary.tail()

4795


Unnamed: 0,id,name
14461,2144,2273221
14464,1369,2049980
14470,2707,2370520
14475,143,1903071
14476,2397,2309580


In [11]:
dictionary.sort_values(by=["id"], inplace=True)

In [12]:
# write the dictionary to file checkpoints/lender_tag_bipartite_Dictionary.txt, without header and index. The first column should be id
dictionary[["id", "name"]].to_csv(
    "checkpoints/project_tag_bipartite_Dictionary.txt", sep="\t", header=False, index=False
)

In [None]:
dictionary

# Run `biLouvain`

```bash
./biLouvain -d "\t" -i ../src/checkpoints/project_tag_bipartite.csv -order 2
```

Reference: https://github.com/paolapesantez/biLouvain.git

# export to Gephi

parse result file. The result file looks like this

```text
Community 0[V1]: 0
Community 1[V1]: 1
Community 2[V2]: 2,3
Community 3[V2]: 4

Singletons Partition V1: 2
Singletons Partition V2: 1
0,1,2,2,3
```

In [13]:
# read the last line of the file
with open("checkpoints/project_tag_bipartite_ResultsCommunities.txt", "r") as f:
    result_text = f.read()

In [14]:
import re

regex = r"^Community (?P<community_id>\d+)\[V(?P<vertex_type>\d+)\]: (?P<vertexes>.*)$"
matches = re.finditer(regex, result_text, re.MULTILINE)

community_result = []

for matchNum, match in enumerate(matches, start=1):
    community_id = match.group("community_id")
    vertex_type = match.group("vertex_type")
    vertexes = match.group("vertexes").split(", ")[0]
    vertexes = vertexes.split(",")
    print(community_id, vertex_type, vertexes)
    for vertex in vertexes:
        community_result.append([community_id, vertex_type, vertex])

community_result = pd.DataFrame(community_result, columns=["community_id", "vertex_type", "vertex_name"])
community_result.head()

0 1 ['#Elderly']
1 1 ['#Female Education', '#Schooling']
2 1 ['#Eco-friendly', '#First Loan', '#Health and Sanitation', '#Job Creator', '#Parent', '#Repair Renew Replace']
3 1 ['#Refugee']
4 1 ['#Animals', '#Biz Durable Asset', '#Fabrics', '#Repeat Borrower', '#Unique', '#Vegan', '#Woman-Owned Business']
5 1 ['#Single']
6 1 ['#Supporting Family']
7 1 ['#Technology']
8 1 ['#Sustainable Ag', '#Trees']
9 1 ['#Orphan', '#Single Parent', '#Widowed']
10 2 ['1893260', '1893284', '1893294', '1893722', '1893728', '1893740', '1893762', '1895089', '1902589', '1903449', '1903777', '1903778', '1903986', '1904237', '1904320', '1904447', '1905072', '1905388', '1906132', '1906254', '1907618', '1907678', '1907686', '1920071', '1921548', '1924828', '1925336', '1926648', '1926833', '1926834', '1929408', '1929426', '1929470', '1929472', '1933062', '1934053', '1934488', '1936449', '1940546', '1943458', '1960045', '1960048', '1960049', '1960050', '1960053', '1960261', '1960271', '1960682', '1960708', '19661

Unnamed: 0,community_id,vertex_type,vertex_name
0,0,1,#Elderly
1,1,1,#Female Education
2,1,1,#Schooling
3,2,1,#Eco-friendly
4,2,1,#First Loan


In [15]:
nodes = dictionary.merge(community_result, left_on="name", right_on="vertex_name")
len(nodes)

4795

# Create networkx graph and show the result

In [16]:
G = nx.from_pandas_edgelist(PT, "project_id", "tag")
G.add_nodes_from(PT["project_id"].drop_duplicates(), type="Project")
G.add_nodes_from(PT["tag"].drop_duplicates(), type="Tag")

# refine node attributes with community id
for row in nodes.itertuples():
    G.nodes[row.name]["community_id"] = row.community_id

print(G.number_of_nodes(), G.number_of_edges())
nx.write_gexf(G, "checkpoints/project_tag_bipartite_community.gexf")

4795 14478
