# Clustering comparation between Louvain and Karimi
----
In this notebook, we developed the comparation between the clustering in Louvain \
and Kmodes (from Karimi proposal).

The objective of this experiment is to analize if the same users and the same resources \
are clustered togheter. For instance, if the node $i$ is grouped with the same nodes.

We have two posible sitations:
1. If the same users/resources are grouped with the same users/resources in both \
cases  (kmodes and louvain).
    - In this case, our proposal have the same information that Karimi Proposal. \
    We don't want this case.
2. If the same users/resources are NOT grouped with the same users/resources in both \
    cases (kmodes and louvain).
    - In this other case, our proposal have different information than Karimi so our \
    mined rules are different. We want this case.

Based on the results, the conclusión is:


In [5]:
### Import libraries

import pandas as pd
import os
import igraph as ig
import matplotlib.pyplot as plt
from kmodes import KModes

In [39]:
### GLOBAL variables

CSV_PATH = "../00-Data/01-AMZ/01-DistributionsCSV/AMZ-Rw.csv"
GRAPH_PATH = "../00-Data/01-AMZ/02-Graphs/01-Top/AMZ_top_resall_noise_alpha1.graphml"

In [16]:
### import CSV
df = pd.read_csv(CSV_PATH)

# Remove noisy column
df = df.drop(columns=["Unnamed: 0"])
df = df[df.ACTION==1].drop_duplicates()
print(df.info()) # Info
print()

data = df.values.tolist()

<class 'pandas.core.frame.DataFrame'>
Index: 30872 entries, 0 to 32768
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   ACTION            30872 non-null  int64
 1   rname             30872 non-null  int64
 2   MGR_ID            30872 non-null  int64
 3   ROLE_ROLLUP_1     30872 non-null  int64
 4   ROLE_ROLLUP_2     30872 non-null  int64
 5   ROLE_DEPTNAME     30872 non-null  int64
 6   ROLE_TITLE        30872 non-null  int64
 7   ROLE_FAMILY_DESC  30872 non-null  int64
 8   ROLE_FAMILY       30872 non-null  int64
 9   ROLE_CODE         30872 non-null  int64
 10  uname             30872 non-null  int64
dtypes: int64(11)
memory usage: 2.8 MB
None



In [40]:
### Read GRAPH

g = ig.read(GRAPH_PATH)
print(g.summary(), "\n")

print("Graph info:")
print("\t|V| =", g.vcount())
print("\t|E| =", g.ecount())
print("\t d  =", g.density())

IGRAPH U-WT 7518 72277 -- 
+ attr: frequency (v), id (v), type (v), weight (e) 

Graph info:
	|V| = 7518
	|E| = 72277
	 d  = 0.002557898116048246


  return reader(f, *args, **kwds)


In [None]:
### K-modes algorithm

# Select the number of clusters###
num_clusters = 20

#DO NOT CHANGE THIS CODE
# seed = 29

#Compute centroids and labels
# num_init = 5
centroids = []
kmodes_huang = KModes(n_clusters=num_clusters, init='Huang', verbose=0)
cluster_labels = kmodes_huang.fit_predict(df.drop(columns=["rname", "uname"]))
centroids = kmodes_huang.cluster_centroids_
df["cls"] = cluster_labels
print('Ready!')   

Ready!


In [38]:
### Implementar la función para obtener todos los vecinos

def get_neis(data):
    """Get the nodes in the same cluster."""
    dict_user = dict()
    dict_res = dict()
    for user in data.uname.drop_duplicates():
        all_clusters = set(data[data.uname==user]["cls"])
        neis_user = set()
        for cls in all_clusters:
            neis_user = neis_user.union(set(data[data.cls==cls]["uname"]))
        dict_user[user] = neis_user
    
    for res in data.rname.drop_duplicates():
        all_clusters = set(data[data.rname==user]["cls"])
        neis_user = set()
        for cls in all_clusters:
            neis_user = neis_user.union(set(data[data.cls==cls]["rname"]))
        dict_res[user] = neis_user
    
    return dict_user, dict_res

dict_user, dict_res = get_neis(df)

In [47]:
### Community Detection

comms = g.community_multilevel(weights=g.es["weight"])
g.vs["cls"] = comms.membership
print(comms.summary(), "\n")

Clustering with 7518 elements and 5609 clusters 



In [55]:
### Implementar función obtener vecinos en el mismo formato pero el el clustering.

def get_neis_comms(graph):
    dict_user = dict()
    for user in graph.vs():
        all_clusters = set(data[data.uname==user]["cls"])
        neis_user = set()
        for cls in all_clusters:
            neis_user = neis_user.union(set(data[data.cls==cls]["uname"]))
        dict_user[user] = neis_user

In [54]:
list(g.vs()["id"])

['n9561',
 'n9562',
 'n9563',
 'n9564',
 'n9565',
 'n9566',
 'n9567',
 'n9568',
 'n9569',
 'n9570',
 'n9571',
 'n9572',
 'n9573',
 'n9574',
 'n9575',
 'n9576',
 'n9577',
 'n9578',
 'n9579',
 'n9580',
 'n9581',
 'n9582',
 'n9583',
 'n9584',
 'n9585',
 'n9586',
 'n9587',
 'n9588',
 'n9589',
 'n9590',
 'n9591',
 'n9592',
 'n9593',
 'n9594',
 'n9595',
 'n9596',
 'n9597',
 'n9598',
 'n9599',
 'n9600',
 'n9601',
 'n9602',
 'n9603',
 'n9604',
 'n9605',
 'n9606',
 'n9607',
 'n9608',
 'n9609',
 'n9610',
 'n9611',
 'n9612',
 'n9613',
 'n9614',
 'n9615',
 'n9616',
 'n9617',
 'n9618',
 'n9619',
 'n9620',
 'n9621',
 'n9622',
 'n9623',
 'n9624',
 'n9625',
 'n9626',
 'n9627',
 'n9628',
 'n9629',
 'n9630',
 'n9631',
 'n9632',
 'n9633',
 'n9634',
 'n9635',
 'n9636',
 'n9637',
 'n9638',
 'n9639',
 'n9640',
 'n9641',
 'n9642',
 'n9643',
 'n9644',
 'n9645',
 'n9646',
 'n9647',
 'n9648',
 'n9649',
 'n9650',
 'n9651',
 'n9652',
 'n9653',
 'n9654',
 'n9655',
 'n9656',
 'n9657',
 'n9658',
 'n9659',
 'n9660',
