In [15]:
import time
import matplotlib.pyplot as plt
import pandas as pd

import networkx as nx
import graph_tool.all as gt

import cc_graph_ops

In [16]:
INPUT_FILE = 'fdg_input_file.json'

In [17]:
g = cc_graph_ops.create_graph_from_file(INPUT_FILE, format='gt')

In [18]:
# Overflow for 32-bit ints
weight_long = g.ep['weight'].copy(value_type='long')

In [19]:
coef, stdev = gt.global_clustering(g, weight=weight_long)

In [20]:
coefs, stdevs = dict(), dict()

coefs['all'] = coef
stdevs['all'] = stdev

In [21]:
quota = 10
proportion = 0.3
min_subgraph_size = 100

In [22]:
licenses = list(cc_graph_ops.get_licenses(g))
subgraphs = cc_graph_ops.all_license_subgraphs(g, licenses, quota=quota, proportion=proportion)

In [23]:
for license, subg in subgraphs.items():
    if subg.num_vertices() < min_subgraph_size:
        continue
    coef, stdev = gt.global_clustering(subg, weight=weight_long)
    coefs[license] = coef
    stdevs[license] = stdev

In [24]:
size = {license: subg.num_vertices() for license, subg in subgraphs.items()}
size['all'] = g.num_vertices()

In [25]:
keys = list(coefs.keys())
keys.sort(key=lambda x: -coefs[x])

In [26]:
table = [[k, size[k], coefs[k]] for k in keys]
global_clustering_df = pd.DataFrame(table, columns=['License', 'Size', 'Clustering'])
global_clustering_df = global_clustering_df.set_index('License')
display(global_clustering_df)

Unnamed: 0_level_0,Size,Clustering
License,Unnamed: 1_level_1,Unnamed: 2_level_1
"('by-nc-sa', '2.5')",1118,0.793559
"('by-sa', '3.0')",6514,0.601791
all,235863,0.440094
"('cc0', '1.0')",893,0.410368
"('by-nc-nd', '4.0')",7387,0.40435
"('by', '3.0')",5722,0.403254
"('by-nd', '4.0')",629,0.328749
"('by-sa', '4.0')",7066,0.323638
"('by', '4.0')",9085,0.278524
"('by-nc-nd', '2.5')",1491,0.199153


In [27]:
local_coefs = dict()
for license in coefs:
    if license != 'all':
        local_clusters = gt.local_clustering(subgraphs[license], weight=weight_long)
        local_clusters = {v: local_clusters[v] for v in subgraphs[license].vertices()}
        local_clusters = sorted(list(local_clusters.items()), key=lambda x: -x[1])
        local_coefs[license] = local_clusters

In [28]:
for license in global_clustering_df.index:
    if license != 'all':
        print(license, global_clustering_df.loc[license, 'Clustering'])
        data = []
        for v, coef in local_coefs[license][:10]:
            data.append([subgraphs[license].vp['provider_domain'][v], coef])
        display(pd.DataFrame(data, columns=['Domain', 'Clustering']))
        print()

('by-nc-sa', '2.5') 0.7935592444906805


Unnamed: 0,Domain,Clustering
0,shun.im,218.0
1,www.qiusi.me,72.666667
2,www.surveillanceincanada.org,36.0
3,pochistvane.com,21.619565
4,www.xj123.info,16.769231
5,www.danielgarciaperis.cat,15.912334
6,praxis.tw,12.285714
7,xn--czq75pvv1aj5c.org,9.75
8,felixc.at,9.333333
9,blog.imprenditore.me,6.914286



('by-sa', '3.0') 0.6017905751196228


Unnamed: 0,Domain,Clustering
0,contentedness.net,20815000.0
1,50days.org,20815000.0
2,www.linke-piraten-hd.de,20815000.0
3,hejlt.org,10407500.0
4,enskedebilder.com,10407500.0
5,www.infoturia.com,6938334.0
6,www.daemmen-und-sanieren.de,6938334.0
7,fortunengineering.com,6938334.0
8,www.adhugger.net,6938334.0
9,geschichtskasten.kastenwesen.com,5203764.0



('cc0', '1.0') 0.4103683368256483


Unnamed: 0,Domain,Clustering
0,wiki.ms.agh.edu.pl,25790.333333
1,ariadne.uni-greifswald.de,3975.766667
2,data.nola.gov,2316.8
3,da.dl.itc.u-tokyo.ac.jp,2183.380102
4,data.southampton.ac.uk,1490.361111
5,ehsanakhgari.org,1435.0
6,risdmuseum.org,1125.065574
7,tudigit.ulb.tu-darmstadt.de,648.8005
8,parismuseescollections.paris.fr,487.120477
9,robustlybeneficial.org,450.992369



('by-nc-nd', '4.0') 0.404350249377385


Unnamed: 0,Domain,Clustering
0,www.elphis.or.kr,912178.0
1,www.chefstyle.co.kr,912178.0
2,www.meistercom.com,456089.0
3,openeg.co.kr,456089.0
4,wacomkoreablog.com,456089.0
5,nerd.kr,304161.333333
6,enternews1.com,182435.6
7,innanum.org,182435.6
8,blog.hwenc.co.kr,136867.7
9,volunteeringculture.or.kr,114022.25



('by', '3.0') 0.40325441864045214


Unnamed: 0,Domain,Clustering
0,hronlineph.com,36658.333333
1,africommons.com,8729.657143
2,desculpeanossafalha.com.br,4982.216162
3,dallas.libertarianleft.org,3111.0
4,www.stembook.org,1989.0
5,thenetmonitor.org,1814.871029
6,advox.it,1533.711297
7,www.socialbrite.org,1423.75526
8,www.periodicos.ufn.edu.br,1136.944444
9,www.fmwic.com,979.548387



('by-nd', '4.0') 0.3287485837530737


Unnamed: 0,Domain,Clustering
0,journals.squ.edu.om,101.2
1,quoimedia.com,38.0
2,yellowfever.com.au,36.066667
3,bizkai.eus,22.705552
4,travelhealthpolicy.com.au,20.415094
5,derbreitenbacher.de,12.809524
6,www.siegen-wittgenstein.info,7.915916
7,singularityhub.com,7.83871
8,www.pnvnafarroa.eus,3.534815
9,sitio.fdsl.org.do,3.438403



('by-sa', '4.0') 0.32363769772268114


Unnamed: 0,Domain,Clustering
0,blog.tommyku.com,40000480.0
1,www.swalladge.net,13333900.0
2,awtfy.com,10000300.0
3,guillaume-chevalier.com,10000220.0
4,blog.mitsuruog.info,10000120.0
5,smlpoints.com,8000118.0
6,www.8u.co.kr,8000096.0
7,www.chunyangwen.com,8000096.0
8,lucisferre.net,8000096.0
9,www.kyleondy.com,6667246.0



('by', '4.0') 0.27852419549229096


Unnamed: 0,Domain,Clustering
0,radonjournal.net,439516.066667
1,tianqi.name,131824.0
2,cryo.org.ua,127449.2
3,felixwong.com,93291.212308
4,habricentral.org,93237.733333
5,journals.scholarpublishing.org,85599.963235
6,www.mozello.co.uk,84804.725146
7,www.cargill.school.nz,80746.952381
8,www.saferoptions.org,74745.787879
9,nachhaltig-beleuchten.de,74623.190476



('by-nc-nd', '2.5') 0.1991532284616921


Unnamed: 0,Domain,Clustering
0,www.mindsfree.info,3385.666667
1,www.a6fanzine.it,2067.333333
2,www.francescouboldi.com,1977.333333
3,www.emergenzautismo.org,1157.464286
4,www.valentano.net,1050.233333
5,max510.com,988.7
6,www.ildottoredeicomputer.com,985.606061
7,www.scienzenoetiche.it,791.0
8,metalskunk.com,619.403333
9,www.aetnanet.org,545.085714



('by-nc-sa', '2.0') 0.17113967154967388


Unnamed: 0,Domain,Clustering
0,rdl.de,1814.333333
1,talkingheads.net,536.517241
2,secant.cs.purdue.edu,420.866667
3,journalatelier.com,318.0
4,www.lafibala.org,202.361111
5,notremetier.se-unsa.org,161.890909
6,www.cse.cuhk.edu.hk,105.380952
7,nicolas.picand.fr,93.0
8,awiki.theseed.io,91.691168
9,www.marcguidoni.fr,91.085714



('by-nc', '4.0') 0.17038576946281686


Unnamed: 0,Domain,Clustering
0,smeru.or.id,24626.333333
1,ccdemocraticas.net,4750.333333
2,www.oneducation.net,4702.672727
3,www.iberoamericana.se,4397.7
4,rcm.insmet.cu,4397.666667
5,ojs.gayanabotanica.cl,3878.0
6,revistamedicasinergia.com,2250.8
7,erevistas.uacj.mx,2199.357143
8,sustain.pata.org,1522.435435
9,seer.ufsj.edu.br,1296.410256



('by-nc-sa', '3.0') 0.16198905644013484


Unnamed: 0,Domain,Clustering
0,www.progettodighe.it,24293.47619
1,michaeljcripps.com,2080.0
2,www.windowswiki.info,1692.0
3,kenliu.name,830.033333
4,biblioteca.fundaciopalau.cat,521.866667
5,www.csadepistaggio.org,486.0
6,voyez.ca,457.481476
7,www.konjunktion.info,375.153846
8,beziaud.org,357.734127
9,wikimerda.org,353.115516



('by-nd', '3.0') 0.04323249151810079


Unnamed: 0,Domain,Clustering
0,www.ladikvetvicka.cz,2391.0
1,www.ufac.br,516.978022
2,acervo.ufvjm.edu.br,81.205263
3,radek-velicka.cz,79.7
4,www.novo.justica.gov.br,30.764977
5,www.ibametro.ba.gov.br,13.791395
6,www.tecnopolo.it,13.6
7,seplan.ba.gov.br,8.130824
8,www.fpc.pt,8.026863
9,revista.ibict.br,6.384384



('by-nc', '2.0') 0.022298638973056988


Unnamed: 0,Domain,Clustering
0,aspb-rta.secure-platform.com,153.325216
1,phenome2020.org,111.419612
2,www.timbres-bordeaux.fr,7.443192
3,album-timbres.fr,4.930547
4,docteurhtml5.com,2.177742
5,lasikuu.net,1.473684
6,kasvukipuja.net,0.875
7,marcophilie.france-timbres.net,0.599999
8,www.marcophilie.e-timbres.net,0.586445
9,www.planchage-timbres.fr,0.362091



('gpl', '2.0') 0.014264716486049621


Unnamed: 0,Domain,Clustering
0,www.iwp.jku.at,1044.182698
1,www.right2info.org,378.166667
2,www.cae.tntech.edu,281.070875
3,portal.metodista.br,162.545455
4,libraryguides.griffith.edu.au,143.305263
5,www.ictp.tv,96.818182
6,www2.rpgfund.org,90.245355
7,cmstest.ifac-control.org,87.277092
8,www2.spokaneasl.com,76.766513
9,download.zope.org,73.026169



('by-nc', '3.0') 0.013891561750649074


Unnamed: 0,Domain,Clustering
0,heritage.nzei.org.nz,12378.666667
1,e-pub.uni-weimar.de,1642.533333
2,www.nzonscreen.com,96.157986
3,seguridad.observatoriofundapro.com,94.4
4,imagesdutravail.edel.univ-poitiers.fr,87.522876
5,ajme.aut.ac.ir,67.100613
6,www.smarty.com.es,42.966667
7,boundforsouthaustralia.com.au,42.285714
8,bryophyteportal.org,23.39774
9,desitkapomaha.cz,21.916337



('by-nc-nd', '3.0') 0.01158709239008921


Unnamed: 0,Domain,Clustering
0,ayumihorie.com,42647.666667
1,www.albrecht-durer.org,42619.0
2,godinci.org,42619.0
3,www.askasu.idv.tw,38766.666667
4,www.recreathing.com,31964.2
5,www.farflungfamilies.net,31964.2
6,www.yesbutnobutyes.com,28533.7
7,pinkskulldesign.com,28533.7
8,www.alma-tadema.org,25589.4
9,www.plantprotection.pl,21325.333333



('by-nc-nd', '2.0') 0.010806048844987766


Unnamed: 0,Domain,Clustering
0,www.aureliablogmode.com,997.0
1,www.linaudible.com,544.0
2,www.sistoeurs.net,498.333333
3,meiz.me,226.883092
4,www.aquarelle-en-liberte.fr,199.4
5,koinai.net,135.357143
6,www.gasengi.com,93.922481
7,tournevis.net,92.8
8,www.estceque.org,76.692308
9,www.fredorando.fr,52.473684



('by-sa', '2.0') 0.010649647971143237


Unnamed: 0,Domain,Clustering
0,bb.mehr-demokratie.de,271.0
1,geektionnerd.net,78.372549
2,grisebouille.net,77.666667
3,ourjourneypeterborough.org,44.553258
4,irfm.regardscitoyens.org,43.528455
5,peirce.gis-lab.info,33.014259
6,www.nanterrux.org,24.362637
7,www.migazin.de,21.363636
8,www.closedpubs.co.uk,19.966079
9,blog.roozeec.fr,17.111111



('by-nc', '2.5') 0.008382192822623676


Unnamed: 0,Domain,Clustering
0,playstationblast.forumbrasil.net,61.393443
1,www.corridorkitchen.com,2.606557
2,onebitemore.com,0.317365
3,www.raspberricupcakes.com,0.042654
4,media.mcgill.ca,0.012821
5,www.icebreaker.com.br,0.010621
6,forum.nintendoblast.com.br,0.005793
7,www.infoarena.ro,0.000557
8,xkcd.com,6e-06
9,thinkdo.se,0.0



('by-sa', '2.5') 0.007628377551355449


Unnamed: 0,Domain,Clustering
0,www.stubbornmule.net,119.181818
1,blog.fogus.me,19.5
2,hlds.pl,6.905526
3,buildingaudio.com,4.447619
4,secondlifeshrink.com,3.333333
5,webnovedad.com,3.0
6,ar.globedia.com,2.160173
7,openfonts.hagilda.com,1.941176
8,devenirgenial.fr,1.466946
9,pourquoidieu.fr,1.445655



('by', '2.5') 0.004017385348822739


Unnamed: 0,Domain,Clustering
0,www.mp.se,2598.545455
1,maicelular.com,317.0
2,www.saltedhash.co.il,317.0
3,coffee3.org,127.833333
4,blog.syafril.com,63.4
5,www.circ-ien-andolsheim.ac-strasbourg.fr,29.0
6,tasglann.reefnet.co.uk,20.0
7,www.stylius.net,19.208791
8,svtcobergher.fr,13.071429
9,blog.foolip.org,12.97076



('by', '2.0') 0.0035473412391762186


Unnamed: 0,Domain,Clustering
0,szwinsurance.com,1823.0
1,www.ukauthority.com,1217.7
2,www.netfrag.org,719.0
3,rfdinsurance.com,607.666667
4,accentfginsurance.com,607.666667
5,terryins.com,455.7
6,rlsullivaninsurance.com,455.7
7,mummainsurance.com,455.7
8,omega-financial.com,455.7
9,cantonbecker.com,439.0



('by-nc-sa', '4.0') 0.003095672064132717


Unnamed: 0,Domain,Clustering
0,www.pcihispano.com,15005.892857
1,volatilesystems.org,8753.125
2,textilegeschichten.net,5016.109091
3,botons.eu,3654.0
4,www.lapsusmentis.com,3153.666667
5,ojs.correspondenciasyanalisis.com,1759.294872
6,etherboot.org,1480.283333
7,stinpriza.org,1227.259259
8,www.pedagomosaique.com,1042.357143
9,www.teawiki.net,915.418776



('pdm', '1.0') 0.001155464633078091


Unnamed: 0,Domain,Clustering
0,www.europeana.eu,6.426225
1,www.jugendhilfeportal.de,2.385621
2,dija.faktor-e.domainfactory-kunde.de,0.166358
3,biblioteca.galiciana.gal,0.098662
4,www.edu.xunta.gal,0.092572
5,m.dija.de,0.081716
6,fdanj.nlm.nih.gov,0.039724
7,cartotecadigital.icgc.cat,0.000739
8,cartotecadigital.icc.cat,0.000554
9,openimages.eu,0.000438



('by-nd', '2.0') 0.0


Unnamed: 0,Domain,Clustering
0,freerepublic.com,0.0
1,www.teemingbrain.com,0.0
2,clubdumillenaire.fr,0.0
3,www.zakon-pritazlivosti.cz,0.0
4,uncontrollinglove.com,0.0
5,davidcantone.com,0.0
6,www.livewritethrive.com,0.0
7,www.reha-trans.fr,0.0
8,semeunacte.com,0.0
9,www.empire-of-books.com,0.0



