# JN07 - Build a Bipartite Network
---
In this notebook, we developed the bipartite network from a .csv file. 

In [29]:
### Import libraries

import pandas as pd
import igraph as ig
import auxiliar_path
import numpy as np

In [30]:
### Global variables
### Global variables

DATASET = "AMZ" # AMZ, HC, PM, UN, TOY
NODE_TYPE = False

PATH_DATASET = auxiliar_path.get_path_dataset(DATASET)
PATH_NODETYPE = auxiliar_path.get_path_topbot(NODE_TYPE)

GLOBAL_PATH = "/Users/ddiaz/Documents/code/phd-thesis-lab/"

# File CSV
FILENAME = GLOBAL_PATH + "12-third_year/00-Data/"+PATH_DATASET+"/01-DistributionsCSV/"+DATASET+"-Rw.csv"

In [31]:
### Read CSV

df = pd.read_csv(FILENAME)

# Remove noisy column
df = df.drop(columns=["Unnamed: 0"])
print(df.info()) # Info
print()

# Obtener identificadores únicos por tipo
unique_source = sorted(df['uname'].unique())  # Nodos tipo 1
unique_target = sorted(df['rname'].unique())  # Nodos tipo 2

# Crear nuevo mapeo de IDs
source_mapping = {old_id: new_id for new_id, old_id in enumerate(unique_source)}
start_target_id = len(unique_source)  # El primer ID del target será el último del source + 1
target_mapping = {old_id: new_id for new_id, old_id in enumerate(unique_target, start=start_target_id)}

# Aplicar el mapeo al dataframe
df_mapped = df.replace({'uname': source_mapping, 'rname': target_mapping})

# Contar la frecuencia correcta después del mapeo
source_counts = df_mapped['uname'].value_counts().to_dict()
target_counts = df_mapped['rname'].value_counts().to_dict()

# Fusionar ambas frecuencias.
node_frequencies = {node: source_counts.get(node, 0) + target_counts.get(node, 0) for node in range(len(source_mapping) + len(target_mapping))}

# Contar la frecuencia de cada arista después del mapeo
edge_counts = df_mapped.groupby(['uname', 'rname']).size().to_dict()

# Some information about access requests
n_user = len(df_mapped.uname.drop_duplicates())
n_rscs = len(df_mapped.rname.drop_duplicates())
print(f"|U| = {n_user}")
print(f"|R| = {n_rscs}")
print(f"|U+R| = {n_user+n_rscs}")
print()

# Possible edges
n_acc_res = len(df_mapped.drop_duplicates(["uname", "rname"]))
df_pos = df_mapped[df_mapped.ACTION == 1]
n_ar_pos = len(df_pos.drop_duplicates())
n_ar_neg = len(df_mapped[df_mapped.ACTION == 0].drop_duplicates())

print(f"|L| = {n_acc_res}")
print(f"|L+| = {n_ar_pos}")
print(f"|L-| = {n_ar_neg}")
print()

if n_acc_res == n_ar_pos+n_ar_neg:
    print("*"*43)
    print("** CORRECT FLAG: Same number L = L+ + L- **")
    print("*"*43)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65538 entries, 0 to 65537
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   ACTION            65538 non-null  int64
 1   rname             65538 non-null  int64
 2   MGR_ID            65538 non-null  int64
 3   ROLE_ROLLUP_1     65538 non-null  int64
 4   ROLE_ROLLUP_2     65538 non-null  int64
 5   ROLE_DEPTNAME     65538 non-null  int64
 6   ROLE_TITLE        65538 non-null  int64
 7   ROLE_FAMILY_DESC  65538 non-null  int64
 8   ROLE_FAMILY       65538 non-null  int64
 9   ROLE_CODE         65538 non-null  int64
 10  uname             65538 non-null  int64
dtypes: int64(11)
memory usage: 5.5 MB
None

|U| = 9561
|R| = 7518
|U+R| = 17079

|L| = 32769
|L+| = 30872
|L-| = 1897

*******************************************
** CORRECT FLAG: Same number L = L+ + L- **
*******************************************


In [4]:
### Generate bipartite graph

# Crear el grafo bipartito en igraph
edges = list(edge_counts.keys())  # Lista de aristas sin duplicados
g = ig.Graph(edges=edges, directed=False)

# Etiquetar los nodos con su tipo
g.vs['type'] = [0] * len(source_mapping) + [1] * len(target_mapping)  # 0 para tipo 1, 1 para tipo 2

# Agregar el atributo de frecuencia de los nodos
g.vs['frequency'] = [node_frequencies[node] for node in range(len(g.vs))]

# Agregar el atributo de peso a las aristas
g.es['weight'] = [edge_counts[edge] for edge in edges]

# Number of nodes
print(g.summary())
print(f"|V| = {g.vcount()}")
print(f"|U| = {len(g.vs.select(type_eq=0))}")
print(f"|R| = {len(g.vs.select(type_eq=1))}")
print(f"|E| = {g.ecount()}")
print(f"Is bipartite = {g.is_bipartite()}")

IGRAPH U-WT 17079 32769 -- 
+ attr: frequency (v), type (v), weight (e)
|V| = 17079
|U| = 9561
|R| = 7518
|E| = 32769
Is bipartite = True


In [18]:
### Save the graph

FILE_GRAPH = GLOBAL_PATH + "12-third_year/00-Data/"+PATH_DATASET+"/02-Graphs/binet-"+DATASET+"-Rw.graphml"
g.write_graphml(FILE_GRAPH  )

In [5]:
df

Unnamed: 0,ACTION,rname,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE,uname
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908,1
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539,2
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880,3
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322,4
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325,5
...,...,...,...,...,...,...,...,...,...,...,...
65533,1,23497,16971,117961,118300,119993,118321,240983,290919,118322,4025
65534,1,25139,311198,91261,118026,122392,121143,173805,249618,121145,4798
65535,1,34924,28805,117961,118327,120299,124922,152038,118612,124924,6808
65536,1,80574,55643,118256,118257,117945,280788,280788,292795,119082,1265


In [20]:
### GLOBAL variables

CSV_PATH = "../00-Data/01-AMZ/01-DistributionsCSV/AMZ-Rw.csv"
GRAPH_PATH = "../00-Data/01-AMZ/02-Graphs/02-bot/AMZ_bot_resall_noise_alpha1.graphml"

In [21]:
### Read GRAPH

g = ig.read(GRAPH_PATH)
print(g.summary(), "\n")

print("Graph info:")
print("\t|V| =", g.vcount())
print("\t|E| =", g.ecount())
print("\t d  =", g.density())

IGRAPH U-W- 7226 59753 -- 
+ attr: id (v), weight (e) 

Graph info:
	|V| = 7226
	|E| = 59753
	 d  = 0.0022890427397412455


In [3]:
### K-modes algorithm
from kmodes import KModes

# Select the number of clusters###
num_clusters = 20

#DO NOT CHANGE THIS CODE
# seed = 29

#Compute centroids and labels
# num_init = 5
centroids = []
kmodes_huang = KModes(n_clusters=num_clusters, init='Huang', verbose=0)
cluster_labels = kmodes_huang.fit_predict(df_pos.drop(columns=["rname", "uname"]))
centroids = kmodes_huang.cluster_centroids_
df_pos["cls"] = cluster_labels
print('Ready!')   

NameError: name 'df_pos' is not defined

In [2]:
df_pos["cls"].value_counts()

NameError: name 'df_pos' is not defined

In [8]:
### Implementar la función para obtener todos los vecinos

def get_neis(data):
    """Get the nodes in the same cluster."""
    dict_user = dict()
    dict_res = dict()
    for user in data.uname.drop_duplicates():
        all_clusters = set(data[data.uname==user]["cls"])
        neis_user = set()
        for cls in all_clusters:
            neis_user = neis_user.union(set(data[data.cls==cls]["uname"]))
        dict_user[user] = neis_user
    
    for res in data.rname.drop_duplicates():
        all_clusters = set(data[data.rname==res]["cls"])
        neis_user = set()
        for cls in all_clusters:
            neis_user = neis_user.union(set(data[data.cls==cls]["rname"]))
        dict_res[res] = neis_user
    
    return dict_user, dict_res

dict_user, dict_res = get_neis(df_pos)

In [32]:
### Community Detection

comms = g.community_multilevel(weights=g.es["weight"])
g.vs["cls"] = comms.membership
print(comms.summary(), "\n")

Clustering with 7226 elements and 342 clusters 



In [4]:
comms_with_one = [i for i, subgraph in enumerate(comms.subgraphs()) if len(subgraph.vs) > 1]
comms_with_one

NameError: name 'comms' is not defined

In [38]:
### Implementar función obtener vecinos en el mismo formato pero el el clustering.

def get_neis_comms(graph, not_in):
    dict_user = dict()
    for user in graph.vs():
        cluster_ = user["cls"]
        if not cluster_ in not_in:
            neis_user = list(graph.vs.select(cls_eq=cluster_)["id"])
            neis_user = set([int(x[1:]) for x in neis_user])
            dict_user[int(user["id"][1:])] = neis_user
    return dict_user

In [40]:
dict_res_comms = get_neis_comms(g, comms_with_one)

In [43]:
def jaccard_sim(dict_kmodes, dict_louvain):
    dict_resul = {}
    total_mean = []
    for key_, item_ in dict_kmodes.items():
        if key_ in dict_louvain.keys():
            intersec = item_.intersection(dict_louvain[key_])
            set_union = item_.union(dict_louvain[key_])
            jacc_index = len(intersec) / len(set_union)
            dict_resul[key_] = jacc_index
            total_mean.append(jacc_index)
        else:
            continue
    return dict_resul, total_mean

In [44]:
dict_result, jacc_list = jaccard_sim(dict_user, dict_res_comms)
dict_result

{1: 0.05448567402536402,
 2: 0.012940330697340043,
 3: 0.048616305160807775,
 4: 0.07402282947077136,
 6: 0.05448567402536402,
 7: 0.01725625539257981,
 8: 0.01984126984126984,
 9: 0.05166586190246258,
 10: 0.03497942386831276,
 11: 0.013089005235602094,
 13: 0.010224948875255624,
 14: 0.03315994798439532,
 15: 0.027777777777777776,
 16: 0.06621556307394877,
 17: 0.048616305160807775,
 18: 0.026143790849673203,
 19: 0.06128133704735376,
 20: 0.06621556307394877,
 21: 0.008152173913043478,
 22: 0.013210039630118891,
 23: 0.03868103994927077,
 24: 0.06621556307394877,
 25: 0.03653250773993808,
 26: 0.017297297297297298,
 27: 0.023132848645076007,
 28: 0.04124263524370648,
 29: 0.06621556307394877,
 30: 0.0296220633299285,
 31: 0.013210039630118891,
 32: 0.07402282947077136,
 33: 0.0015503875968992248,
 34: 0.027777777777777776,
 35: 0.047054322876817135,
 36: 0.07402282947077136,
 37: 0.05448567402536402,
 38: 0.03265666372462489,
 39: 0.018502202643171806,
 40: 0.015078821110349555,
 42

In [45]:

# Arreglo de enteros de ejemplo
data = jacc_list

# Media
mean = np.mean(data)
print(f"Media: {mean}")

# Mediana
median = np.median(data)
print(f"Mediana: {median}")

# Moda
#mode = stats.mode(data)
#print(f"Moda: {mode.mode[0]} (aparece {mode.count[0]} veces)")

# Desviación estándar
std_dev = np.std(data)
print(f"Desviación estándar: {std_dev}")

# Varianza
variance = np.var(data)
print(f"Varianza: {variance}")

# Mínimo
min_value = np.min(data)
print(f"Mínimo: {min_value}")

# Máximo
max_value = np.max(data)
print(f"Máximo: {max_value}")

# Rango (diferencia entre el máximo y el mínimo)
range_value = np.ptp(data)  # ptp = "peak to peak"
print(f"Rango: {range_value}")

# Suma de todos los elementos
sum_value = np.sum(data)
print(f"Suma: {sum_value}")

# Cuartiles (25%, 50%, 75%)
quartiles = np.percentile(data, [25, 50, 75])
print(f"Cuartiles (25%, 50%, 75%): {quartiles}")


Media: 0.032961461798923154
Mediana: 0.030833333333333334
Desviación estándar: 0.01779123389729731
Varianza: 0.0003165280035883408
Mínimo: 0.00046040515653775324
Máximo: 0.07402282947077136
Rango: 0.07356242431423361
Suma: 224.69828508325912
Cuartiles (25%, 50%, 75%): [0.01818182 0.03083333 0.04692082]


In [35]:
comms_with_one = [i for i in comms. if i == 1]
comms_with_one

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
