In [1]:
import csv
import json
import random
import time
from collections import Counter

import igraph
import matplotlib as plt
import networkx as nx
import numpy as np
import pandas as pd

### Data extraction

In [2]:
# Folder holding the *_genres.json and *_edges.csv files. Raw string: the
# backslash is a literal path separator here, not the start of an escape
# sequence ('\d' is an invalid escape and triggers a SyntaxWarning).
path = r'..\data'

In [3]:
def load_data(path, filename):
    """Load a JSON file and return its parsed content.

    Parameters
    ----------
    path : str
        Directory prefix; it is concatenated with ``filename`` as-is, so the
        separator must be part of one of the two strings.
    filename : str
        File name (including any leading separator) appended to ``path``.

    Returns
    -------
    The object decoded by ``json.load``. Its length is printed as a quick
    sanity check before it is returned.
    """
    # Be explicit about the encoding: without it open() falls back to the
    # platform default (e.g. cp1252 on Windows), which can mis-decode or fail
    # on non-ASCII content.
    with open(path + filename, encoding='utf-8') as json_file:
        data = json.load(json_file)
    print(len(data))
    return data

In [4]:
#load data
# One genre file per country (HR / HU / RO). Raw strings keep the leading
# backslash literal; '\H' and '\R' are invalid escape sequences otherwise.
data_HR = load_data(path, r'\HR_genres.json')
data_HU = load_data(path, r'\HU_genres.json')
data_RO = load_data(path, r'\RO_genres.json')

54573
47538
41773


In [5]:
# Read the HR edge list as raw CSV rows (lists of strings), header excluded.
# newline='' is the mode the csv module documents for files passed to
# csv.reader; the raw string keeps '\H' from being parsed as an escape.
with open(path + r'\HR_edges.csv', "r", newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (no error on an empty file)
    edges  = list(reader)

In [6]:
def create_graph(path, filename):
    """Build an undirected and a directed igraph graph from a CSV edge list.

    The first CSV row is treated as a header and skipped. The first two
    columns of every remaining row are parsed as integer vertex ids.

    Parameters
    ----------
    path : str
        Directory prefix, concatenated with ``filename`` as-is.
    filename : str
        File name (including any leading separator) appended to ``path``.

    Returns
    -------
    (g, g_dir) : tuple of igraph.Graph
        The same vertices and edges, once undirected and once directed.
        The vertex count is ``largest id + 1`` (printed as a sanity check),
        so ids are presumably 0-based -- TODO confirm against the data files.
    """
    # newline='' is the mode the csv module documents for csv.reader input.
    with open(path + filename, "r", newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (no error on an empty file)
        # `if row` drops blank lines, which csv.reader yields as empty lists.
        edges = [(int(row[0]), int(row[1])) for row in reader if row]

    # max() raises on an empty sequence, so a file without data rows maps to
    # an empty graph instead of crashing.
    Nb_nodes = (max(max(edge) for edge in edges) + 1) if edges else 0
    print(Nb_nodes)

    g = igraph.Graph()
    g_dir = igraph.Graph(directed=True)
    g.add_vertices(Nb_nodes)
    g_dir.add_vertices(Nb_nodes)
    g.add_edges(edges)
    g_dir.add_edges(edges)
    return g, g_dir

In [7]:
# One (undirected, directed) graph pair per country. Raw strings keep the
# leading backslash literal; '\H' and '\R' are invalid escape sequences.
g_HR, g_dir_HR = create_graph(path, r'\HR_edges.csv')
g_HU, g_dir_HU = create_graph(path, r'\HU_edges.csv')
g_RO, g_dir_RO = create_graph(path, r'\RO_edges.csv')

54573
47538
41773


### Community detection

Useful link : https://yoyoinwanderland.github.io/2017/08/08/Community-Detection-in-Python/

In [9]:
def naive_compute_error(clusters, data):
    """Score a clustering by how well clusters agree on users' first genre.

    For every cluster, the most frequent *first* genre among its members is
    taken as the cluster's main genre, and the members whose first genre
    equals it are counted. The result is that count, summed over all
    clusters, divided by ``len(data)``.

    NOTE: despite the name, this is the fraction of users that MATCH their
    cluster's main genre (higher means more homogeneous clusters), not a
    misclassification rate. The name and meaning are kept so existing
    callers and stored results stay comparable.

    Parameters
    ----------
    clusters : iterable of iterables of int
        Each inner iterable holds the vertex ids of one cluster.
    data : dict
        Maps ``str(vertex_id)`` to a sequence of genres; only element 0 is used.

    Returns
    -------
    float
        Matching fraction; ``0.0`` when ``data`` is empty.
    """
    matched = 0
    for cluster in clusters:
        # Counter tallies each genre in a single pass, instead of calling
        # list.count once per distinct genre.
        genre_counts = Counter(data[str(val)][0] for val in cluster)
        if not genre_counts:
            # An empty cluster has no main genre and contributes nothing.
            continue
        # Ties between genres are irrelevant: only the top count is used.
        matched += max(genre_counts.values())
    if not data:
        return 0.0
    # Single division at the end avoids accumulating per-cluster rounding.
    return matched / len(data)

In [33]:
def print_all(names, algorithms, times, nb_clusters, errors, modularities):
    """Print the most recent entry of each result list as a short report.

    Only the last element of every list is read. The report is a blank line,
    a "<name> - <algorithm>" header, then one "<label> : <value>" line per
    metric.
    """
    print('')
    header = names[-1] + ' - ' + algorithms[-1]
    print(header)
    # (label, series) pairs, in the order the metrics are reported.
    report = (
        ("Time", times),
        ("Nb of clusters", nb_clusters),
        ("Error", errors),
        ("Modularity score", modularities),
    )
    for label, series in report:
        print("{} : {}".format(label, series[-1]))

## All algorithms

In [34]:
#multilevel = louvain algorithm
# Several of these algorithms are randomised (label propagation in particular),
# so results change from run to run unless the generator is seeded.
# python-igraph draws its random numbers from Python's `random` module by
# default, so seeding that module makes the benchmark reproducible.
SEED = 42
random.seed(SEED)

# (label shown in the results, name of the igraph.Graph method to call).
# A single table replaces the two near-identical loops: the only difference
# between the groups is the return type, handled below.
COMMUNITY_METHODS = [
    ('Louvain', 'community_multilevel'),
    ('Label propagation', 'community_label_propagation'),
    ('Spectral Clustering', 'community_leading_eigenvector'),
    ('FastGreedy', 'community_fastgreedy'),
    ('Walktrap', 'community_walktrap'),
]

algorithms, names, times, nb_clusters, errors, modularities = [], [], [], [], [], []
for name, graph, data in zip(['Croatia', 'Hungary', 'Romania'], [g_HR, g_HU, g_RO], [data_HR, data_HU, data_RO]):
    for algorithm, method_name in COMMUNITY_METHODS:
        function = getattr(graph, method_name)

        # perf_counter is a monotonic, high-resolution clock meant for
        # intervals; the elapsed time is taken right after the call so only
        # the detection itself is measured.
        t = time.perf_counter()
        result = function(weights = None)
        elapsed = time.perf_counter() - t

        # FastGreedy and Walktrap return a dendrogram; cut it at its optimal
        # count to obtain a flat clustering comparable with the other methods.
        if isinstance(result, igraph.VertexDendrogram):
            nb_found = result.optimal_count
            communities = result.as_clustering()
        else:
            nb_found = len(result)
            communities = result

        # list & print
        algorithms.append(algorithm)
        names.append(name)
        times.append(elapsed)
        nb_clusters.append(nb_found)
        errors.append(naive_compute_error(communities, data))
        modularities.append(graph.modularity(communities))
        print_all(names, algorithms, times, nb_clusters, errors, modularities)

# One row per (country, algorithm) run.
df = pd.DataFrame({'algorithm':algorithms,
                   'name':names,
                   'time': times,
                   'nb_clusters': nb_clusters,
                   'error': errors,
                   'modularity': modularities})
display(df)

df.to_csv('results.csv', index = False)


Croatia - Louvain
Time : 2.876326084136963
Nb of clusters : 26
Error : 0.18882964103127925
Modularity score : 0.7398027639791998

Croatia - Label propagation
Time : 2.0629217624664307
Nb of clusters : 122
Error : 0.1973136899199238
Modularity score : 0.6784772465634586

Hungary - Louvain
Time : 2.0156548023223877
Nb of clusters : 25
Error : 0.21668980604989696
Modularity score : 0.6791371560455277

Hungary - Label propagation
Time : 1.7884762287139893
Nb of clusters : 32
Error : 0.2171736295174388
Modularity score : 0.1354255515231916

Romania - Louvain
Time : 1.812486171722412
Nb of clusters : 45
Error : 0.21638378857156537
Modularity score : 0.7538882403213177

Romania - Label propagation
Time : 2.5115818977355957
Nb of clusters : 552
Error : 0.2250736121418121
Modularity score : 0.6045274962194309


Unnamed: 0,algorithm,name,time,nb_clusters,error,modularity
0,Louvain,Croatia,2.876326,26,0.18883,0.739803
1,Label propagation,Croatia,2.062922,122,0.197314,0.678477
2,Louvain,Hungary,2.015655,25,0.21669,0.679137
3,Label propagation,Hungary,1.788476,32,0.217174,0.135426
4,Louvain,Romania,1.812486,45,0.216384,0.753888
5,Label propagation,Romania,2.511582,552,0.225074,0.604527


In [None]:
#infomap
# Run Infomap on the undirected Croatian graph and report timing, number of
# clusters and the genre-agreement score from naive_compute_error.
# perf_counter is monotonic and intended for measuring intervals, unlike
# time.time(), which follows the (adjustable) wall clock.
t = time.perf_counter()
communities_im = g_HR.community_infomap()
print("Time : {}".format(time.perf_counter() - t))
print("Nb of clusters : {}".format(len(communities_im)))
#To access the communities
#print(communities_im[0])
print("Error : {}".format(naive_compute_error(communities_im, data_HR)))
# NOTE(review): modularity is left disabled as in the original cell; the
# commented line uses g_dir_HR although the clustering comes from g_HR --
# confirm which graph is intended before enabling it.
#print("Modularity score : {}".format(g_dir_HR.modularity(communities_im)))

In [15]:
#walktrap on the directed Croatian graph
# NOTE(review): this cell was labelled "edge_betweeness = Girvan Newman" and
# its variables are still named that way, but the call below is
# community_walktrap, not an edge-betweenness method. The variable names are
# kept so any later cell using them keeps working -- confirm which algorithm
# is intended.
# perf_counter is monotonic and intended for measuring intervals.
t = time.perf_counter()
edge_betweeness = g_dir_HR.community_walktrap()
print("Time : {}".format(time.perf_counter() - t))
print("Nb of clusters : {}".format(edge_betweeness.optimal_count))
#converting to communities
communities_eb = edge_betweeness.as_clustering()
#To access the communities
#print(communities_eb[0])
print("Error : {}".format(naive_compute_error(communities_eb, data_HR)))

Time : 1119.9673688411713
Nb of clusters : 1073
Error : 0.20323236765432648
