https://methodmatters.github.io/network-community-detection/

In [1]:
import pandas as pd
import itertools
import numpy as np

cols = ["id", "parent_id", "title", "cited", "publication_data", "snippet"]
df = pd.read_csv("../flask-backend/data/output4.csv", usecols=cols)
df = df.set_index("id")

In [2]:
def copy_df(input_df):
    return input_df.copy()

def format_numeric_types(df):
    df.cited = df.cited.fillna(0)
    df = df.astype({"parent_id":"float", "cited":"float"})
    return df
    
def format_str_types(df):
    df.publication_data = df.publication_data.str.replace(u"\xa0", " ").str.replace('\u2026', '') # remove ellipsis and non-breaking space
    df[ ["authors", "journal_date"] ] = df.publication_data.str.split(" - ", n=1, expand=True)
    df = df.drop(columns='publication_data')
    df["title"] = df["title"].str.title()
    df["authors"] = df["authors"].str.title()
    df["authors"] = df["authors"].str.split(", ")
    return df
    
def identify_same_articles_under_different_ids(df):
    df = df.reset_index()
    df["id"] = df.groupby('title')["id"].transform(lambda x: min(x)) # donner le même id pour les articles ayant le même nom
    df = df.set_index("id")
    return df

def add_parents_data(df):
    df["authors_of_parent_article"] = df.parent_id.apply(lambda parent_id: df.loc[parent_id, "authors"] if parent_id != -1 else [])
    df["title_of_parent_article"] = df.parent_id.apply(lambda parent_id: df.loc[parent_id, "title"] if parent_id != -1 else [])
    return df

df = (
      df.pipe(copy_df)
        .pipe(format_str_types)
        .pipe(format_numeric_types)
        .pipe(add_parents_data)
        .pipe(identify_same_articles_under_different_ids)
     )

In [3]:
df.sample(3)

Unnamed: 0_level_0,parent_id,title,cited,snippet,authors,journal_date,authors_of_parent_article,title_of_parent_article
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
942,69.0,Relationship Between Container Ship Underwater...,194.0,Low-frequency ocean ambient noise is dominated...,"[Mf Mckenna, Sm Wiggins, Ja Hildebrand]","Scientific reports, 2013 - nature.com","[Sm Wiggins, Ja Hildebrand]",High-Frequency Acoustic Recording Package (Har...
575,136.0,How To Set Sound Exposure Criteria For Fishes,19.0,Underwater sounds from human sources can have ...,"[Ad Hawkins, C Johnson, An Popper]","The Journal of the , 2020 - asa.scitation.org","[An Popper, Ad Hawkins]",An Overview Of Fish Bioacoustics And The Impac...
1365,1350.0,Automatic Rain And Cicada Chorus Filtering Of ...,16.0,Recording and analysing environmental audio re...,"[A Brown, S Garg, J Montgomery]","Applied Soft Computing, 2019 - Elsevier",[T Ganchev],Computational Bioacoustics: Biodiversity Monit...


In [4]:
def create_nodes_from_influential_authors(df, NUMBER_OF_NODES=100):
    total_num_citation_per_author = (df.loc[:, ["title", "authors", "cited"] ]
                                       .drop_duplicates(subset="title")
                                       .explode("authors")
                                       .groupby("authors")
                                       .sum()
                                       .loc[:, 'cited']
                                       .sort_values(ascending=False)
                                       .iloc[:NUMBER_OF_NODES]
                                    )
    
    titles = (df.reset_index()
              .loc[:, ["title", "authors"]]
              .drop_duplicates(subset="title")
              .explode("authors")
           )
    titles = titles.groupby("authors")["title"].apply('<br>'.join)
    titles = titles[total_num_citation_per_author.index]

    return pd.DataFrame(data = {
            "id": range(NUMBER_OF_NODES),
            "author": total_num_citation_per_author.index,
            "#times_cited": total_num_citation_per_author.values, 
            "titles": titles.values
    })

def get_working_relation_between_authors(df, nodes):
    coauthors = df.authors.apply(lambda team: list(itertools.combinations(team, r=2)))
    mask = coauthors.apply(len) > 0
    coauthors = coauthors[mask]
    coauthors = coauthors.explode()
    coauthorship = coauthors.map(lambda t: tuple(sorted(t)))
    edges = coauthorship.value_counts()
    return pd.DataFrame( data = {
            'author1': edges.index.map(lambda t: t[0]),
            'author2': edges.index.map(lambda t: t[1]),
            'number': edges.values 
    })

def names_to_nodes_ids(coauthors_edges, nodes):
    nodes_ids = coauthors_edges[ ["author1", "author2"] ].applymap(lambda name: nodes[nodes.author==name].id.values[0]).reset_index(drop=True)
    return pd.DataFrame( data = { 
            "id_author1": nodes_ids.author1,
            "id_author2": nodes_ids.author2,
            "number": coauthors_edges["number"].values
    })


def get_citing_relation_between_authors(df):
    edges = df.explode('authors').explode("authors_of_parent_article")[["authors", "authors_of_parent_article"]]
    edges = edges.dropna(axis="index")
    edges = edges.reset_index().groupby(["authors", "authors_of_parent_article"]).count()
    edges = edges.reset_index()
    edges.columns = ["author1", "author2", "number"]
    return edges

def select_subset_edges(edges, subset_authors):
    coauthors_to_keep = edges.query('author1 in @subset_authors and author2 in @subset_authors')
    return coauthors_to_keep


def create_coauthor_edges(df, nodes):
    return (df.pipe(lambda df: df.copy(deep=True))
              .pipe(get_working_relation_between_authors, nodes=nodes)
              .pipe(select_subset_edges, subset_authors=nodes.author.values)
              .pipe(names_to_nodes_ids, nodes=nodes)
   )

def create_citing_edges(df, nodes):
    return (df.pipe(lambda df: df.copy(deep=True))
              .pipe(get_citing_relation_between_authors)
              .pipe(select_subset_edges, subset_authors=nodes.author.values)
              .pipe(names_to_nodes_ids, nodes=nodes)
    )

NUMBER_OF_NODES = 100
nodes = create_nodes_from_influential_authors(df, NUMBER_OF_NODES=100)
coauthors_edges = create_coauthor_edges(df, nodes)
citing_edges = create_citing_edges(df, nodes)

In [43]:
nodes

Unnamed: 0,id,author,#times_cited,titles,group
0,0,Jw Bradbury,4395.0,Principles Of Animal Communication,3.0
1,1,Sl Vehrencamp,4395.0,Principles Of Animal Communication,3.0
2,2,H Slabbekoorn,3107.0,Population‐Level Consequences Of Seismic Surve...,17.0
3,3,Wj Richardson,3080.0,Marine Mammals And Noise<br>Influences Of Man‐...,4.0
4,4,Ci Malme,2842.0,Marine Mammals And Noise,4.0
...,...,...,...,...,...
95,95,Rm Rolland,775.0,Overcoming The Challenges Of Studying Conserva...,8.0
96,96,F Ladich,745.0,Diversity In Fish Auditory Systems: One Of The...,-1.0
97,97,A Frantzis,723.0,A Review Of The Effects Of Seismic Surveys On ...,-1.0
98,98,B Møhl,720.0,Estimating Source Position Accuracy Of A Large...,8.0


In [5]:
import json
data = {
    "nodes": json.loads(nodes.to_json(orient = "records")),
    "coauthors_edges": json.loads(coauthors_edges.to_json(orient = "records")),
    "citing_edges": json.loads(citing_edges.to_json(orient = "records"))
}

data

{'nodes': [{'id': 0,
   'author': 'Jw Bradbury',
   '#times_cited': 4395.0,
   'titles': 'Principles Of Animal Communication'},
  {'id': 1,
   'author': 'Sl Vehrencamp',
   '#times_cited': 4395.0,
   'titles': 'Principles Of Animal Communication'},
  {'id': 2,
   'author': 'H Slabbekoorn',
   '#times_cited': 3107.0,
   'titles': 'Population‐Level Consequences Of Seismic Surveys On Fishes: An Interdisciplinary Challenge<br>Son Et Lumiere: Sound And Light Effects On Spatial Distribution And Swimming Behavior In Captive Zebrafish<br>Birdsong And Anthropogenic Noise: Implications And Applications For Conservation<br>Cities Change The Songs Of Birds<br>Acoustic Communication In Noise<br>A Noisy Spring: The Impact Of Globally Rising Underwater Sound Levels On Fish'},
  {'id': 3,
   'author': 'Wj Richardson',
   '#times_cited': 3080.0,
   'titles': 'Marine Mammals And Noise<br>Influences Of Man‐Made Noise And Other Human Actions On Cetacean Behaviour'},
  {'id': 4,
   'author': 'Ci Malme',
  

In [161]:
from pyvis.network import Network

def init_network(nodes, coauthors_edges, citing_edges):
    g = Network(notebook=True, 
            height='1000px', 
            width='1500px', 
            directed=True,
            bgcolor='#212121', 
            font_color='white')
    
    g.add_nodes(nodes.authors, 
                value=nodes["#times_cited"],
                label=nodes.authors, 
                title=nodes.titles
    )
    
    g.add_edges(list(coauthors_edges.itertuples(index=False, name=None)))
    
    edges_to_add = list(citing_edges.itertuples(index=False, name=None))
    for edge in edges_to_add:
        g.add_edge(edge[0], edge[1], value=edge[2], color="#4d4d00")
    
    return g


g = init_network(nodes, coauthors_edges, citing_edges)

options = """
var options = {
    "nodes": {
        "color": "#ffc900",
        "scaling": {
          "min": 5,
          "max": 50
        }
     },
     "edges": {
         "color": "#ec7e2f", 
         "opacity": "0.1"
     }
}
"""
g.set_options(options)

g.show("test.html")

In [35]:
from pyvis.network import Network

NUMBER_OF_NODES = 300

g = Network(notebook=True, 
            height='1000px', 
            width='1500px', 
            directed=True,
            bgcolor='#212121', 
            font_color='white')

# nodes
authors_to_keep = total_num_citation_per_author.index.tolist()[:NUMBER_OF_NODES]

# value of nodes (influence on community)
authors_weight = total_num_citation_per_author.values.tolist()[:NUMBER_OF_NODES]

# titles
titles_to_keep = titles.loc[authors_to_keep].to_list()

# edges
froms = test.a1.to_list()
tos = test.a2.to_list()
weights = test.authors.to_list()

g.add_nodes(authors_to_keep, 
            value=authors_weight,
            label=authors_to_keep, 
            title=titles_to_keep
)

for edge in edges.items():
    g.add_edge(authors[0], authors[1], value=val, color="green")

for authors, val in pairs_to_keep.items():
    g.add_edge(authors[0], authors[1], value=val, color="green")

options = """
var options = {
    "nodes": {
        "color": "#ffc900",
        "scaling": {
          "min": 5,
          "max": 70
        }
     },
     "edges": {
         "color": "#ec7e2f", 
         "opacity": "0.1"
     }
}
"""
g.set_options(options)
g.show("ex.html")

NameError: name 'authors' is not defined

In [13]:
from pyvis.network import Network

NUMBER_OF_NODES = 100

g = Network(notebook=True, 
            height='1000px', 
            width='1500px', 
            directed=True,
            bgcolor='#212121', 
            font_color='white')

# nodes
authors_to_keep = total_num_citation_per_author.index.tolist()[:NUMBER_OF_NODES]

# value of nodes (influence on community)
authors_weight = total_num_citation_per_author.values.tolist()[:NUMBER_OF_NODES]

# titles
titles_to_keep = titles.loc[authors_to_keep].to_list()

# edges
edges_to_keep = edges.loc[ edges.author.isin(total_num_citation_per_author[:NUMBER_OF_NODES].index), :]
edges_to_keep = edges_to_keep.loc[ edges.cited_author.isin(total_num_citation_per_author[:NUMBER_OF_NODES].index), :]

froms = edges_to_keep.author.to_list()
tos = edges_to_keep.cited_author.to_list()
weights = map(int, edges_to_keep["# times citing"])

g.add_nodes(authors_to_keep, 
            value=authors_weight,
            label=authors_to_keep, 
            title=titles_to_keep
)

g.add_edges(zip(froms, tos, weights))

options = """
var options = {
    "nodes": {
        "color": "#ffc900",
        "scaling": {
          "min": 5,
          "max": 70
        }
     },
     "edges": {
         "color": "#ec7e2f", 
         "opacity": "0.1"
     }
}
"""
g.set_options(options)
g.show("ex.html")

In [None]:
prolific_authors = (df.loc[:, ["title", "authors"]]
                      .drop_duplicates(subset="title")
                      .explode("authors")
                      .groupby("authors")
                      .count()
                      .sort_values("title", ascending=False)
                   )

prolific_authors[:10]

In [15]:
from pyvis.network import Network

NUMBER_OF_NODES = 100

g = Network(notebook=True, 
            height='1000px', 
            width='1500px', 
            directed=True,
            bgcolor='#212121', 
            font_color='white')

# nodes
authors_to_keep = total_num_citation_per_author.index.tolist()[:NUMBER_OF_NODES]

# value of nodes (influence on community)
authors_weight = total_num_citation_per_author.values.tolist()[:NUMBER_OF_NODES]

# titles
titles_to_keep = titles.loc[authors_to_keep].to_list()

# edges
edges_to_keep = edges.loc[ edges.author.isin(total_num_citation_per_author[:NUMBER_OF_NODES].index), :]
edges_to_keep = edges_to_keep.loc[ edges.cited_author.isin(total_num_citation_per_author[:NUMBER_OF_NODES].index), :]

froms = edges_to_keep.author.to_list()
tos = edges_to_keep.cited_author.to_list()
weights = map(int, edges_to_keep["# times citing"])

g.add_nodes(authors_to_keep, 
            value=authors_weight,
            label=authors_to_keep, 
            title=titles_to_keep
)

g.add_edges(zip(froms, tos, weights))

for authors, val in pairs_to_keep.items():
    g.add_edge(authors[0], authors[1], value=val, color="green", arrowStrikethrough=False)
    
options = """
var options = {
    "nodes": {
        "color": "#ffc900",
        "scaling": {
          "min": 5,
          "max": 70
        }
     },
     "edges": {
         "color": "#ec7e2f", 
         "opacity": "0.1"
     }
}
"""
g.set_options(options)
g.show("ex.html")

## Communities 

In [40]:
import community as community_louvain
import matplotlib.cm as cm
G = nx.from_pandas_edgelist(coauthors_edges, source="id_author1", target="id_author2")
partition = community_louvain.best_partition(G, weight='number')
nodes['group'] = nodes['id'].map(partition)
nodes.fillna(-1, inplace=True)

In [41]:
nodes

Unnamed: 0,id,author,#times_cited,titles,group
0,0,Jw Bradbury,4395.0,Principles Of Animal Communication,3.0
1,1,Sl Vehrencamp,4395.0,Principles Of Animal Communication,3.0
2,2,H Slabbekoorn,3107.0,Population‐Level Consequences Of Seismic Surve...,17.0
3,3,Wj Richardson,3080.0,Marine Mammals And Noise<br>Influences Of Man‐...,4.0
4,4,Ci Malme,2842.0,Marine Mammals And Noise,4.0
...,...,...,...,...,...
95,95,Rm Rolland,775.0,Overcoming The Challenges Of Studying Conserva...,8.0
96,96,F Ladich,745.0,Diversity In Fish Auditory Systems: One Of The...,-1.0
97,97,A Frantzis,723.0,A Review Of The Effects Of Seismic Surveys On ...,-1.0
98,98,B Møhl,720.0,Estimating Source Position Accuracy Of A Large...,8.0
