In [1]:
import numpy as np
import random
import igraph
import networkx as nx
import csv
import json
import pandas as pd
import time
import matplotlib as plt

### Data extraction

In [2]:
path = '..\data'

In [3]:
def load_data(path, filename):
    with open(path + filename) as json_file:
        data = json.load(json_file)
    print(len(data))
    return data

In [4]:
#load data
data_HR = load_data(path,'\HR_genres.json')
data_HU = load_data(path,'\HU_genres.json')
data_RO = load_data(path,'\RO_genres.json')

54573
47538
41773


In [5]:
with open(path + 'HR_edges.csv', "r") as f:
    reader = csv.reader(f)
    edges  = list(reader)[1:] 

In [6]:
def create_graph(path, filename):
    with open(path + filename, "r") as f:
        reader = csv.reader(f)
        edges  = list(reader)[1:]
    edges = [(int(edge[0]),int(edge[1])) for edge in edges]
    Nb_nodes = max([max(nodes) for nodes in edges])+1
    print(Nb_nodes)
    g = igraph.Graph()
    g_dir = igraph.Graph(directed=True)
    g.add_vertices(Nb_nodes)
    g_dir.add_vertices(Nb_nodes)
    g.add_edges(edges)
    g_dir.add_edges(edges)
    return g, g_dir

In [7]:
g_HR, g_dir_HR = create_graph(path,'HR_edges.csv')
g_HU, g_dir_HU = create_graph(path,'HU_edges.csv')
g_RO, g_dir_RO = create_graph(path,'RO_edges.csv')

54573
47538
41773


### Communities detection

Useful link : https://yoyoinwanderland.github.io/2017/08/08/Community-Detection-in-Python/

In [8]:
def naive_compute_error(clusters, data):
    #first genre of each user in regard to the main genre of its cluster
    error = 0
    for cluster in clusters:
        genre_cluster = []
        for val in cluster:
            genre_cluster.append(data[str(val)][0])
        main_genre = max(set(genre_cluster), key = genre_cluster.count)
        error += sum(main_genre == genre for genre in genre_cluster)/len(data)
    return error

In [9]:
#fast_greedy
t = time.time()
fast_greedy = g_HR.community_fastgreedy(weights = None)
print("Time : {}".format(time.time() - t))
print("Nb of clusters : {}".format(fast_greedy.optimal_count))
#converting to communities
communities_fs = fast_greedy.as_clustering()
#To access the communities
#print(communities_fs[0])
#computation of error
print("Error : {}".format(naive_compute_error(communities_fs, data_HR)))

Time : 254.3667449951172
Nb of clusters : 147
Error : 0.18620929763802552


In [10]:
#walktrap
t = time.time()
walktrap = g_HR.community_walktrap(weights = None)
print("Time : {}".format(time.time() - t))
print("Nb of clusters : {}".format(walktrap.optimal_count))
#converting to communities
communities_wt = walktrap.as_clustering()
#To access the communities
#print(communities_wt[0])
print("Error : {}".format(naive_compute_error(communities_wt, data_HR)))

Time : 1090.7635185718536
Nb of clusters : 1073
Error : 0.20323236765432648


In [11]:
#leading eigenvector
t = time.time()
communities_eg = g_HR.community_leading_eigenvector(weights = None)
print("Time : {}".format(time.time() - t))
print("Nb of clusters : {}".format(len(communities_eg)))
#To access the communities
#print(communities_wt[0])
print("Error : {}".format(naive_compute_error(communities_eg, data_HR)))

Time : 56.470285415649414
Nb of clusters : 21
Error : 0.18996573397101127


In [12]:
#label_propagation
t = time.time()
communities_lp = g_HR.community_label_propagation(weights = None)
print("Time : {}".format(time.time() - t))
print("Nb of clusters : {}".format(len(communities_lp)))
#To access the communities
#print(communities_wt[0])
print("Error : {}".format(naive_compute_error(communities_lp, data_HR)))

Time : 2.6703810691833496
Nb of clusters : 106
Error : 0.19544463379326785


In [13]:
#multilevel
t = time.time()
communities_ml = g_HR.community_multilevel(weights = None)
print("Time : {}".format(time.time() - t))
print("Nb of clusters : {}".format(len(communities_ml)))
#To access the communities
#print(communities_wt[0])
print("Error : {}".format(naive_compute_error(communities_ml, data_HR)))

Time : 3.840144157409668
Nb of clusters : 26
Error : 0.18882964103127925


In [14]:
#infomap
t = time.time()
communities_im = g_dir_HR.community_infomap()
print("Time : {}".format(time.time() - t))
print("Nb of clusters : {}".format(len(communities_im)))
#To access the communities
#print(communities_wt[0])
print("Error : {}".format(naive_compute_error(communities_im, data_HR)))

Time : 417.592973947525
Nb of clusters : 2897
Error : 0.24746669598519228


In [15]:
#edge_betweeness = Girvan Newman
t = time.time()
edge_betweeness = g_dir_HR.community_walktrap()
print("Time : {}".format(time.time() - t))
print("Nb of clusters : {}".format(edge_betweeness.optimal_count))
#converting to communities
communities_eb = edge_betweeness.as_clustering()
#To access the communities
#print(communities_wt[0])
print("Error : {}".format(naive_compute_error(communities_eb, data_HR)))

Time : 889.0161607265472
Nb of clusters : 1073
Error : 0.18996573397101127


In [None]:
# #optimal_modularity : problem because of GLPK
# t = time.time()
# communities_om = g_HR.community_optimal_modularity()
# print(len(communities_om))
# print(time.time() - t)