## Imports and utils

In [1]:
import networkx as nx
import csv
import time
import pandas as pd
from cdlib import algorithms, readwrite, evaluation
import infomap

In [2]:
def datafileToGraph(fileName):
    """Build a graph from a whitespace-separated edge-list text file.

    Every non-empty line is expected to start with a source node id followed
    by a target node id. Any further tokens on the line are ignored. Node
    ids are kept as strings.

    Parameters
    ----------
    fileName : str or path-like
        Path of the edge-list file.

    Returns
    -------
    networkx.Graph
        Graph holding one edge per (source, target) pair read from the file.
    """
    # Assumes the lines contain no commas, so each whole line ends up in
    # column 0. dtype=str guarantees the .str accessor is usable on it.
    rawLines = pd.read_csv(fileName, header=None, dtype=str)
    # Split on whitespace and keep only the first two tokens, so the number of
    # columns always matches the two names assigned below.
    edgeTable = rawLines[0].str.split(expand=True).iloc[:, :2]
    edgeTable.columns = ['Source', 'Target']
    return nx.from_pandas_edgelist(edgeTable, source='Source', target='Target', edge_attr=None)
# Build the e-mail graph once; the cells below reference it as `emailNet`.
emailNet = datafileToGraph('emailNet.txt')
# Printing the graph shows its one-line summary (node and edge counts).
print(emailNet)

Graph with 1005 nodes and 16706 edges


In [39]:
def averageDegree(networkx):
    """Return the mean degree over all nodes of a graph.

    Parameters
    ----------
    networkx : graph-like
        Any object whose ``degree()`` call yields ``(node, degree)`` pairs.
        The parameter name is kept as-is for backward compatibility.

    Returns
    -------
    float
        Sum of all node degrees divided by the number of nodes, or ``0.0``
        when the graph has no nodes (instead of raising ZeroDivisionError).
    """
    degrees = [degree for _node, degree in networkx.degree()]
    if not degrees:
        return 0.0
    # Use the builtin sum rather than shadowing it with a local accumulator.
    return sum(degrees) / len(degrees)

## Email network properties 

In [40]:
# Report the mean node degree of the e-mail graph.
print("Average degree:", averageDegree(emailNet))

Average degree: 33.245771144278606


## Community finding methods for the email network

In [None]:
def _timeIt(label, func, *args, **kwargs):
    """Call ``func(*args, **kwargs)``, print its wall-clock duration, return its result.

    ``time.perf_counter`` is used because it is the clock intended for measuring
    short intervals. The duration is printed with ``%.4f`` so that very small
    values are not mangled (``%.6s`` truncated the text form of the float and
    silently dropped any exponent such as ``e-05``).
    """
    started = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - started
    print("Execution time for %s in email net: %.4f seconds" % (label, elapsed))
    return result


emailLouvain = _timeIt("Louvain", algorithms.louvain, emailNet)

# NOTE(review): the recorded output of this cell only contains the Louvain line,
# so the Girvan-Newman step apparently never ran to completion here.
emailGN = _timeIt("Girvan-Newman", algorithms.girvan_newman, emailNet, level=3)

# k-clique detection is disabled, so no timing is reported for it (the previous
# version printed a meaningless near-zero duration for the commented-out call).
#emailKC = algorithms.kclique(emailNet, k=10)

# Infomap reads the edge list straight from the file; only run() is timed.
emailInfomap = infomap.Infomap(silent=True, num_trials=10)
emailInfomap.read_file("emailNet.txt")
_timeIt("Infomap", emailInfomap.run)

Execution time for Louvain in email net: 1.4616 seconds


## Louvain method

In [3]:
# Louvain partition of the e-mail graph.
# NOTE(review): randomize=False is presumably there to make the partition
# reproducible between runs — confirm against the cdlib documentation.
louvain = algorithms.louvain(emailNet, randomize=False)

# Caption -> cdlib fitness function, listed in reporting order.
louvainFitness = {
    "Louvain Size:": evaluation.size,
    "Louvain Average Path Length:": evaluation.avg_distance,
    "Louvain Average Internal Degree:": evaluation.average_internal_degree,
    "Louvain Average Embeddedness:": evaluation.avg_embeddedness,
    "Louvain Average Transitivity:": evaluation.avg_transitivity,
    "Louvain Hub Dominance:": evaluation.hub_dominance,
    "Louvain Significance:": evaluation.significance,
}

# Evaluate every measure first, then print, so nothing is reported if one fails.
size, ad, aid, ae, at, hd, s = [
    fitness(emailNet, louvain) for fitness in louvainFitness.values()
]

for caption, result in zip(louvainFitness, (size, ad, aid, ae, at, hd, s)):
    print(caption, result)

Louvain Size: FitnessResult(min=1, max=261, score=37.22222222222222, std=64.32978305306432)
Louvain Average Path Length: FitnessResult(min=0, max=2.3716180371352786, score=0.6392902534004595, std=0.989179262730051)
Louvain Average Internal Degree: FitnessResult(min=2.0, max=25.22222222222222, score=7.18398189864131, std=8.232092674228479)
Louvain Average Embeddedness: FitnessResult(min=0.5466830894380645, max=1.0, score=0.9175635952607045, std=0.1340199174233298)
Louvain Average Transitivity: FitnessResult(min=0.0, max=0.7350972398842094, score=0.15928893592655843, std=0.24983574801483868)
Louvain Hub Dominance: FitnessResult(min=0.4115384615384615, max=0.7014925373134329, score=0.5614501513977875, std=0.0882550992567745)
Louvain Significance: FitnessResult(min=None, max=None, score=61927.644126528896, std=None)


In [10]:
# Per-node view of the Louvain clustering; presumably a mapping from node to the
# list of community ids it belongs to — TODO confirm against the cdlib docs.
louvainDict = louvain.to_node_community_map()
#print(louvainDict)
#louvain.to_json()

## Leiden method 

In [5]:
# Leiden partition of the e-mail graph, scored with the same fitness measures
# that are reported for Louvain.
leiden = algorithms.leiden(emailNet)

# Caption -> cdlib fitness function, listed in reporting order.
leidenFitness = {
    "Leiden Size:": evaluation.size,
    "Leiden Average Path Length:": evaluation.avg_distance,
    "Leiden Average Internal Degree:": evaluation.average_internal_degree,
    "Leiden Average Embeddedness:": evaluation.avg_embeddedness,
    "Leiden Average Transitivity:": evaluation.avg_transitivity,
    "Leiden Hub Dominance:": evaluation.hub_dominance,
    "Leiden Significance:": evaluation.significance,
}

# Evaluate every measure first, then print, so nothing is reported if one fails.
size, ad, aid, ae, at, hd, s = [
    fitness(emailNet, leiden) for fitness in leidenFitness.values()
]

for caption, result in zip(leidenFitness, (size, ad, aid, ae, at, hd, s)):
    print(caption, result)

Leiden Size: FitnessResult(min=1, max=304, score=30.454545454545453, std=66.48904713131142)
Leiden Average Path Length: FitnessResult(min=0, max=2.353051948051948, score=0.508191201499132, std=0.9065447028670119)
Leiden Average Internal Degree: FitnessResult(min=2.0, max=24.761061946902654, score=6.0269345157410354, std=7.733023565260327)
Leiden Average Embeddedness: FitnessResult(min=0.5356463675213675, max=1.0, score=0.873095564148379, std=0.15405771690515224)
Leiden Average Transitivity: FitnessResult(min=0.0, max=0.7327577583712487, score=0.1336083284039451, std=0.23987844077413756)
Leiden Hub Dominance: FitnessResult(min=0.42857142857142855, max=1.1428571428571428, score=0.6164170022951139, std=0.21582328997876116)
Leiden Significance: FitnessResult(min=None, max=None, score=64292.9327862404, std=None)


## Modularity

In [6]:
# The three modularity variants computed for each partition, in reporting order.
modularityVariants = (
    evaluation.newman_girvan_modularity,
    evaluation.erdos_renyi_modularity,
    evaluation.z_modularity,
)

louvainGN, louvainER, louvainZ = [
    variant(emailNet, louvain) for variant in modularityVariants
]
leidenGN, leidenER, leidenZ = [
    variant(emailNet, leiden) for variant in modularityVariants
]

for caption, score in (
    ("Louvain Girvan-Newman mod:", louvainGN),
    ("Louvain Erdos-Renyi mod:", louvainER),
    ("Louvain Z-mod:", louvainZ),
):
    print(caption, score)

# Blank line separating the two methods in the output.
print()

# NOTE(review): the last caption reads "Z-mod mod"; it is kept verbatim so the
# code still matches the output recorded below this cell.
for caption, score in (
    ("Leiden Girvan-Newman mod:", leidenGN),
    ("Leiden Erdos-Renyi mod:", leidenER),
    ("Leiden Z-mod mod:", leidenZ),
):
    print(caption, score)

Louvain Girvan-Newman mod: FitnessResult(min=None, max=None, score=0.43215837750885905, std=None)
Louvain Erdos-Renyi mod: FitnessResult(min=None, max=None, score=0.45267147482823566, std=None)
Louvain Z-mod: FitnessResult(min=None, max=None, score=1.1577714911318295, std=None)

Leiden Girvan-Newman mod: FitnessResult(min=None, max=None, score=0.4328989259954435, std=None)
Leiden Erdos-Renyi mod: FitnessResult(min=None, max=None, score=0.459745303278471, std=None)
Leiden Z-mod mod: FitnessResult(min=None, max=None, score=1.0806990083303127, std=None)


## External Evaluation

In [7]:
# Caption -> cdlib comparison function, each applied to (louvain, leiden).
agreementMeasures = {
    "Normalized Mutual Information between Louvain and Leiden:": evaluation.normalized_mutual_information,
    "Adjusted Mutual Information between Louvain and Leiden:": evaluation.adjusted_mutual_information,
    "Adjusted Rand Index between Louvain and Leiden:": evaluation.adjusted_rand_index,
    "F1 measure between Louvain and Leiden:": evaluation.f1,
    "Variation of information between Louvain and Leiden:": evaluation.variation_of_information,
}

# Compute all comparison scores before printing any of them.
nmi, ami, ari, f1, voi = [
    measure(louvain, leiden) for measure in agreementMeasures.values()
]

for caption, result in zip(agreementMeasures, (nmi, ami, ari, f1, voi)):
    print(caption, result)

Normalized Mutual Information between Louvain and Leiden: MatchingResult(score=0.8102393361458828, std=None)
Adjusted Mutual Information between Louvain and Leiden: MatchingResult(score=0.7992288953958384, std=None)
Adjusted Rand Index between Louvain and Leiden: MatchingResult(score=0.7235333319811447, std=None)
F1 measure between Louvain and Leiden: MatchingResult(score=0.9462962962962962, std=0.14000391920832067)
Variation of information between Louvain and Leiden: MatchingResult(score=1.1175518597870175, std=None)


## Email Labels import 

In [8]:
def convertTextDatasetToCSV(txtFile, csvFile):
    """Regroup a "node label" text file into a CSV with one row per label.

    Each non-blank input line holds a node id followed by its label, separated
    by whitespace. The output contains one row per distinct label, in order of
    first appearance, listing the node ids that carry that label.

    Parameters
    ----------
    txtFile : str or path-like
        Path of the input text file.
    csvFile : str or path-like
        Path of the CSV file to (over)write.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than two whitespace-separated fields.
    """
    membersByLabel = {}

    # The context manager closes the input file (it was previously left open).
    with open(txtFile, 'r') as labelFile:
        for lineNumber, line in enumerate(labelFile, start=1):
            # split() discards the newline if there is one, so a final line
            # without a trailing newline no longer loses its last character.
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            if len(fields) < 2:
                raise ValueError(
                    "line %d of %s: expected '<node> <label>', got %r"
                    % (lineNumber, txtFile, line)
                )
            node, label = fields[0], fields[1]
            membersByLabel.setdefault(label, []).append(node)

    with open(csvFile, 'w', newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(membersByLabel.values())
        
# Regroup the label file into one CSV row per label, then load that CSV through
# cdlib's community reader ("," delimiter; the str argument presumably sets the
# node type so ids stay strings — TODO confirm against the cdlib docs).
convertTextDatasetToCSV('emailLabels.txt', 'emailLabels.csv')
emailLabels = readwrite.read_community_csv("emailLabels.csv", ",", str)

In [9]:
# Score each detected partition against the imported label communities.
nmiLouvainLabels, nmiLeidenLabels = [
    partition.normalized_mutual_information(emailLabels)
    for partition in (louvain, leiden)
]

for caption, score in (
    ("NMI for Louvain and labels:", nmiLouvainLabels),
    ("NMI for Leiden and labels:", nmiLeidenLabels),
):
    print(caption, score)

NMI for Louvain and labels: MatchingResult(score=0.6005609123420333, std=None)
NMI for Leiden and labels: MatchingResult(score=0.5702335748478711, std=None)
