# Clustering for RDD

In [3]:
# Silence all warnings issued through ``warnings.warn`` by replacing that
# function with a no-op for the rest of the session.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for ``warnings.warn``)."""
    return None


warnings.warn = warn

# Add the previous path for libraries
from sys import path
path.append("..")

import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc

from rdd import measures
from rdd import RDD
from rdd import ascos
from rdd.RDD import get_rdd_matrix
from rdd import visualize
from rdd import other_sims
from rdd.utils import df_to_cluster_list
from rdd.other_sims import kmedoid_clustering2


# Two hand-written six-node toy graphs, each built directly from its edge list.
G1 = nx.Graph([(1, 2), (2, 3), (2, 4), (3, 4), (3, 6), (4, 5)])
G2 = nx.Graph([(1, 2), (2, 4), (2, 3), (3, 5), (4, 5), (5, 6)])

# G2.add_edges_from([
#     (5, 2),
#     (2, 3),
#     (2, 4),
#     (3, 4),
#     (3, 6),
#     (4, 1),
# ])


# --- Candidate input graphs, read from ../data ---
G3 = nx.read_adjlist('../data/facebook_combined.txt', nodetype=int)
G4 = nx.read_gml('../data/karate.gml', label='id')

# G5 is derived from G3: take the shortest paths of length <= 2 that start at
# node 1, turn them into RDD node objects, and hand their names to
# RDD.nodes_to_graph.
real_paths1 = nx.single_source_shortest_path(G3, 1, cutoff=2)
node_list1 = RDD.populate_node_list(real_paths1)
list_of_nodes = [entry.name for entry in node_list1]
G5 = RDD.nodes_to_graph(G3, list_of_nodes)

G6 = nx.read_gml('../data/lesmiserables.gml')
G7 = nx.read_adjlist('../data/Cattle_protein_interactions_(IntAct).txt')
G8 = nx.read_adjlist('../data/rat_protein_interactions_(IntAct).txt')

# --- Experiment configuration used by the cells below ---
target_G = G7                                         # graph under study
target_rad = 6                                        # radius passed to the RDD routines
target_measure = measures.global_graph_morgan_index   # measure passed to the RDD routines
target_clusters = 5                                   # cluster count for the single-run cells

# Node positions handed to the visualisation helpers.
pos = nx.spring_layout(target_G)

## Hierarchical Clustering

### Dendrogram:

In [4]:
# Build the RDD matrix for the configured graph, radius and measure, then draw
# the dendrogram of its Ward-linkage hierarchical clustering.
# NOTE(review): linkage() treats each row of `data` as an observation vector;
# confirm that is the intended reading of the RDD matrix.
data = get_rdd_matrix(target_G, target_rad, target_measure)
plt.figure(figsize=(10, 7))
# The title names what is actually plotted (RDD-based clustering of graph nodes).
plt.title("RDD Dendrogram")
dend = shc.dendrogram(shc.linkage(data, method='ward'))

KeyboardInterrupt: 

### Agglomerative Hierarchical Clustering:

In [None]:
# %%timeit -n1 -r1
# Agglomerative hierarchical clustering of target_G into target_clusters groups.
# The clustering uses target_measure, the same measure passed to the
# visualisation call, so the printed modularity refers to a partition built
# with the measure the figure is drawn for.
df = other_sims.agglomerative_hierarchical_clustering(target_G, target_rad, target_measure, target_clusters)
fig = visualize.visualize_rdd_agglomerative_hierarchical_clustering(target_G, target_rad, target_measure, pos, target_clusters)
fig.show()
df = df.sort_values('cluster')
# print(df)

# Convert the cluster assignment into a list of node groups and score it.
clusters = df_to_cluster_list(df)
result = nx.algorithms.community.modularity(target_G, clusters)
print(result)

## K-Medoids Clustering

In [None]:
%%timeit -n1 -r1
# K-medoids clustering of target_G into target_clusters groups on the RDD matrix.
# The matrix is built with target_measure, the same measure passed to the
# visualisation call, so the printed modularity refers to a partition built
# with the measure the figure is drawn for.
df = other_sims.kmedoid_clustering2(target_G, RDD.get_rdd_matrix(target_G, target_rad, target_measure), target_clusters)
fig = visualize.visualize_rdd_kmedoid(target_G, target_rad, target_measure, pos, target_clusters)
fig.show()
df = df.sort_values('cluster')
# print(df)

# Convert the cluster assignment into a list of node groups and score it.
clusters = df_to_cluster_list(df)
result = nx.algorithms.community.modularity(target_G, clusters)
print(result)

# Modularity

Modularity is the fraction of the edges that fall within the given groups minus the expected fraction if edges were distributed at random. The value of the modularity for unweighted and undirected graphs lies in the range $[-1/2, 1]$. It is positive if the number of edges within groups exceeds the number expected on the basis of chance. For a given division of the network's vertices into some modules, modularity reflects the concentration of edges within modules compared with a random distribution of links between all nodes regardless of modules.

https://en.wikipedia.org/wiki/Modularity_(networks)

Here our goal is to test the modularity after clustering based on RDD values and compare that to clustering based on other algorithms, such as ASCOS.

## Max modularity

In [None]:
# Reference score: modularity of the partition produced by networkx's greedy
# modularity-maximisation routine on the target graph.
nx_community = nx.algorithms.community
greedy_partition = nx_community.modularity_max.greedy_modularity_communities(target_G)
print("Max modularity:", nx_community.modularity(target_G, greedy_partition))

# One row per candidate cluster count, 1 through 19.
all_results = pd.DataFrame({'clusters': range(1, 20)})

# KMedoid clustering Modularity

In [None]:
# Empty table; each later cell adds one column of modularity scores
# (one row per tested cluster count) for a given similarity method.
kmedoid_results = pd.DataFrame(data=None)

## RDD: Global Graph Degree

In [None]:
# Modularity of k-medoid partitions for k = 1..9, clustering on the RDD matrix
# built with the global graph degree measure.
# The matrix does not depend on k, so it is computed once rather than once per
# iteration.
# NOTE(review): assumes kmedoid_clustering2 does not modify the matrix it is
# given -- confirm.
rdd_matrix = RDD.get_rdd_matrix(target_G, target_rad, measures.global_graph_degree)
results = []
for k in range(1, 10):
    cluster_data = kmedoid_clustering2(target_G, rdd_matrix, k)
    clusters = df_to_cluster_list(cluster_data)
    result = nx.algorithms.community.modularity(target_G, clusters)
    print(f"Modularity {k} clusters:", result)
    results.append(result)
kmedoid_results['RDD_GGD'] = results

## Ascos

In [None]:
# Modularity of k-medoid partitions for k = 1..9, clustering on the ASCOS
# matrix of the target graph.
# The matrix does not depend on k, so it is computed once rather than once per
# iteration.
# NOTE(review): assumes kmedoid_clustering2 does not modify the matrix it is
# given -- confirm.
ascos_matrix = ascos.get_ascos_matrix(target_G)
results = []
for k in range(1, 10):
    cluster_data = kmedoid_clustering2(target_G, ascos_matrix, k)
    clusters = df_to_cluster_list(cluster_data)
    result = nx.algorithms.community.modularity(target_G, clusters)
    print(f"Modularity {k} clusters:", result)
    results.append(result)
kmedoid_results['ascos'] = results

## SimRank

In [None]:
# Modularity of k-medoid partitions for k = 1..9, clustering on the SimRank
# matrix of the target graph.
# The matrix does not depend on k, so it is computed once rather than once per
# iteration.
# NOTE(review): assumes kmedoid_clustering2 does not modify the matrix it is
# given -- confirm.
simrank_matrix = other_sims.get_simrank_matrix(target_G)
results = []
for k in range(1, 10):
    cluster_data = kmedoid_clustering2(target_G, simrank_matrix, k)
    clusters = df_to_cluster_list(cluster_data)
    result = nx.algorithms.community.modularity(target_G, clusters)
    print(f"Modularity {k} clusters:", result)
    results.append(result)
kmedoid_results['simrank'] = results

## Modularity Results: So Far

In [None]:
# Bare expression as the last statement of a notebook cell: renders the
# modularity table collected so far as the cell's output.
kmedoid_results

## Improving Modularity

### We will try to improve clustering modularity by using the Morgan Index (MI) as a measure.

In [None]:
# Modularity of k-medoid partitions for k = 1..9, clustering on the RDD matrix
# built with the Morgan index measure.
# The matrix does not depend on k, so it is computed once rather than once per
# iteration.
# NOTE(review): assumes kmedoid_clustering2 does not modify the matrix it is
# given -- confirm.
rdd_matrix = RDD.get_rdd_matrix(target_G, target_rad, measures.global_graph_morgan_index)
results = []
for k in range(1, 10):
    cluster_data = kmedoid_clustering2(target_G, rdd_matrix, k)
    clusters = df_to_cluster_list(cluster_data)
    result = nx.algorithms.community.modularity(target_G, clusters)
    print(f"Modularity {k} clusters:", result)
    results.append(result)
kmedoid_results['RDD_MI'] = results

In [None]:
# Bare expression as the last statement of a notebook cell: renders the
# modularity table, now including the RDD_MI column, as the cell's output.
kmedoid_results