In [1238]:
from bs4 import BeautifulSoup
from Bio import Entrez
from Bio import Medline
from collections import defaultdict

import json
import numpy as np
import pandas as pd
import re
import requests
import time

In [1240]:
# NCBI asks every Entrez client to identify itself with a contact e-mail.
Entrez.email = "dn070017@gmail.com"

# Smoke test: repeat one small PubMed search until it succeeds.
while True:
    try:
        handle = Entrez.esearch(db="pubmed", retmax=10, term="Kazi Alam")
        try:
            result = Entrez.read(handle)
        finally:
            # Release the HTTP handle whether or not parsing succeeded.
            handle.close()
        break
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt can still
        # break out of this otherwise endless retry loop.
        print('connection failed')
        time.sleep(1)

In [1241]:
# Registry of every person collected by the scraping cells, keyed by the person's name.
# Each value is a plain dict of attributes (the cells in this notebook write keys such as
# 'Name', 'Title', 'Group', 'Institution' and 'Publications').
roster = defaultdict(dict)

# Unified titles seen so far. The scraping cells call `titles.add(...)`, so the set has to
# exist before they run; without this definition a fresh kernel fails with NameError.
titles = set()

In [1242]:
def order_title(title):
    """Return the seniority rank (0-4) of a unified title.

    Titles that belong to none of the ranked groups yield ``None``.
    """
    # Position in this tuple is the rank returned for any title in that group.
    ranked_groups = (
        ("Student",),
        ("Engineer", "Laboratory", "Others", "Researcher"),
        ("PhD",),
        ("Postdoc",),
        ("Professor", "Head of Group"),
    )
    for rank, members in enumerate(ranked_groups):
        if title in members:
            return rank
    return None

In [1243]:
def unify_title(title):
    """Map a free-text job title onto one of the notebook's coarse categories.

    The rules below are tried in order and the first one whose pattern is found
    anywhere in the title wins, so earlier rules take precedence (e.g. a title
    containing both "postdoctoral" and "research" is a 'Postdoc').

    Parameters
    ----------
    title : str
        Raw title text. Anything that is not a string (``None``, a pandas NaN
        from an empty CSV cell, ...) is treated as having no title.

    Returns
    -------
    str
        One of 'Head of Group', 'Professor', 'Postdoc', 'PhD', 'Student',
        'Administration', 'Consultant', 'Coordinator', 'Laboratory', 'IT',
        'Engineer', 'Researcher', or 'Others' when no rule matches.
    """
    if not isinstance(title, str):
        # An empty string already falls through to 'Others'; treat a missing
        # title the same way instead of failing on `.lower()`.
        return 'Others'

    # (pattern, category, case_sensitive). Patterns are searched in the
    # lower-cased title unless case_sensitive is True.
    rules = (
        ('group leader|head of group', 'Head of Group', False),
        ('professor|lecturer', 'Professor', False),
        ('postdoctoral|post-doctoral|postdoc|post', 'Postdoc', False),
        # Raw string: '\.' inside a normal string literal is an invalid escape sequence.
        (r'doctoral|phd|ph\.d', 'PhD', False),
        ('msc|bsc|master|bachelor|erasmus|undvp|trainee|student|junior fellow', 'Student', False),
        ('administrat|office|director|personal assistant', 'Administration', False),
        ('advis|consult', 'Consultant', False),
        ('coordinator|project assistant|project manager', 'Coordinator', False),
        ('laboratory|lab assistant', 'Laboratory', False),
        # Upper-case "IT" only, matched against the original (not lower-cased) text.
        ('IT', 'IT', True),
        ('information', 'IT', False),
        ('engineer|bioinformatician|technician', 'Engineer', False),
        ('research|academic employee|scientist|computational biologist', 'Researcher', False),
    )

    lowered = title.lower()
    for pattern, category, case_sensitive in rules:
        haystack = title if case_sensitive else lowered
        if re.search(pattern, haystack):
            return category
    return 'Others'

# NCMM

In [1244]:
# Walk the paginated NCMM staff listing and register every person in `roster`.
# NOTE(review): pages 1-4 are hard-coded; confirm the listing still has four pages.
for p in range(1, 5):
    url = "https://www.med.uio.no/ncmm/english/people/?page={}&u-page={}".format(p, p)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    resp = requests.get(url, timeout=30)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')
    for td in soup.find_all("td", class_="vrtx-person-listing-name"):
        children = td.find_all(True)
        # The name is read from the first child tag when the cell has exactly two
        # child tags, and from the third child tag otherwise.
        raw_name = children[0 if len(children) == 2 else 2].text
        # The listing gives "Last, First"; roster keys are "First Last".
        parts = raw_name.split(',')
        name = parts[1][1:] + ' ' + parts[0]
        # Check the converted name: roster keys are "First Last", so comparing the
        # raw "Last, First" text against them could never detect a duplicate.
        if name in roster:
            print('Duplicated names in roster of NCMM')
            continue
        unified_title = unify_title(td.span.text)
        roster[name]['Name'] = name
        roster[name]['Title'] = unified_title
        roster[name]['Institution'] = 'NCMM'
        titles.add(unified_title)

In [1245]:
# URL slugs of the NCMM research-group pages.
groups = ["esguerra", "mathelier", "gozen", "morth", 'haapaniemi', 'kuijjer', 'sekulic', 'lopez-aviles', 'staerk', 'luecke']

# For every group page, attach the group leader's name to each listed member that is
# already present in the roster.
for g in groups:
    url = "https://www.med.uio.no/ncmm/english/groups/{}-group".format(g)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    resp = requests.get(url, timeout=30)
    resp.encoding = 'utf-8'
    page = BeautifulSoup(resp.text, 'html.parser')
    boxes = page.find_all("div", class_="vrtx-box-content")
    a_list = BeautifulSoup(str(boxes), 'html.parser').find_all("a")
    last_index = len(a_list) - 1
    for i, a in enumerate(a_list):
        if i == 0:
            # The first link's text is used as the group label for every member.
            group_leader = a.text
        if i == last_index:
            # The final link is skipped.
            # NOTE(review): presumably a navigation link rather than a person; confirm.
            continue
        # Membership test first: indexing the defaultdict would create an empty entry.
        if a.text in roster:
            roster[a.text].setdefault('Group', set()).add(group_leader)
        else:
            print(a.text + ' is not in NCMM dictionary')

In [1246]:
# Manual corrections applied on top of the scraped NCMM data.

# Group membership overrides. 'Group' is a set everywhere else in this notebook, and the
# FIMM / DANDRITE / MIMS cells call `.add(...)` on it, so sets (not lists) are used here.
roster['Rafael Riudavets Puig']['Group'] = {'Anthony Mathelier'}
roster['Kazi Alam']['Group'] = {'Johannes Landskron'}
roster['Alexandra Gade']['Group'] = {'Johannes Landskron'}
roster['Silvia Espada Burriel']['Group'] = {'Sandra Lopez Aviles'}
roster['Flore Kersten']['Group'] = {'Hartmut Luecke'}
roster['Shixiong Wang']['Group'] = {'Antoni Hurtado Rodriguez'}

# Title overrides.
roster['Johannes Landskron']['Title'] = 'Head of Group'
roster['Emma Haapaniemi']['Title'] = 'Head of Group'
roster['Janna Saarela']['Title'] = 'Head of Group'
roster['Hartmut Luecke']['Title'] = 'Head of Group'

# Shortened display names; the roster keys keep the full names.
roster['Ahmad Ali Ahmad']['Name'] = 'Ali Ahmad'
roster['Jaime Abraham Castro Mondragón']['Name'] = 'Jaime Castro Mondragón'
roster['Camila Vicencio Esguerra']['Name'] = 'Camila Esguerra'
roster['Kinga Aurelia Gawel']['Name'] = 'Kinga Gawel'
roster['João Paulo Ribeiro Proença Santana']['Name'] = 'João Santana'
roster['Antoni Hurtado Rodriguez']['Name'] = 'Antoni Rodriguez'

# FIMM

In [1247]:
# Names used to build the FIMM group-page URLs, five per row.
groups = [
    'Ganna', 'Groop', 'Hennah', 'Kaprio', 'Latvala',
    'Ollikainen', 'Palotie', 'Pirinen', 'Ripatti', 'Saarela',
    'Tukiainen', 'Vuoksimaa', 'Widen', 'Aittokallio', 'Heckman',
    'Horvath', 'Kallioniemi', 'Lundin', 'Verschuren', 'Wennerberg',
]

In [1248]:
# Scrape every FIMM group page: register the group leader, then every listed member.
for g in groups:
    if g == "Vuoksimaa":
        # This group's page does not follow the /research/groups/<name> URL pattern.
        url = "https://www.fimm.fi/en/research/human-genomics/cognitive-and-brain-aging"
    else:
        url = "https://www.fimm.fi/en/research/groups/{}".format(g.lower())
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    resp = requests.get(url, timeout=30)
    resp.encoding = 'utf-8'
    page = BeautifulSoup(resp.text, 'html.parser')
    intro = page.find_all("div", class_="intro")
    tbody = page.find_all("tbody")

    # The leader's name is the first <h2> inside the "intro" blocks.
    leader = BeautifulSoup(str(intro), 'html.parser').find_all("h2")[0].text

    if leader in roster:
        # The roster already holds FIMM members by now, so the hit is not necessarily
        # an NCMM person; report it neutrally.
        print(leader + " is already in the roster")
    else:
        roster[leader]['Name'] = leader
        roster[leader]['Title'] = 'Head of Group'
        roster[leader]['Group'] = set()
        roster[leader]['Group'].add(leader)
        roster[leader]['Institution'] = 'FIMM'

    members = BeautifulSoup(str(tbody), 'html.parser')
    name = members.find_all("div", class_="show-info--name")
    position = members.find_all("div", class_="field-job-title")
    # NOTE(review): names and job titles are paired purely by position; a member
    # without a job-title element would shift every later pairing - confirm.
    for n, p in zip(name, position):
        # Only substring matches inside unify_title consume the title, so trimming
        # the surrounding whitespace is all the clean-up it needs.
        title = p.text.strip()
        member = n.text
        if member == '':
            continue
        if member in roster:
            print(member + " belongs to multiple groups in FIMM")
        else:
            roster[member]['Name'] = member
            roster[member]['Title'] = unify_title(title)
            roster[member]['Institution'] = 'FIMM'
        # An existing entry may have no 'Group' yet (or a list from a manual override),
        # so rebuild it as a set instead of calling .add() on whatever is stored.
        roster[member]['Group'] = set(roster[member].get('Group', ())) | {leader}
        titles.add(unify_title(title))

Sailalitha Bollepalli belongs to multiple groups in FIMM
Milla Kibble belongs to multiple groups in FIMM
Mia Urjansson belongs to multiple groups in FIMM
Christian Benner  belongs to multiple groups in FIMM
Sini Kerminen belongs to multiple groups in FIMM
Janna Saarelacan be found in NCMM
Eero Vuoksimaacan be found in NCMM
Anna Cichonska belongs to multiple groups in FIMM
Heidi Arling-Tripepi belongs to multiple groups in FIMM
Piia  Mikkonen belongs to multiple groups in FIMM
Heidi Arling-Tripepi belongs to multiple groups in FIMM
Jie Bao belongs to multiple groups in FIMM
Nora Nordström belongs to multiple groups in FIMM
Sarang Talwelkar belongs to multiple groups in FIMM


# DANDRITE

In [1249]:
# Slugs used to build the DANDRITE group / team member-page URLs.
groups = [
    "nissen",
    "nykjaer",
    "Jensen",
    "philipsborn",
    "denham",
    "kvitsiani",
    "yonehara",
    "nabavi",
    "tomonori-takeuch",
    "hanne-poulsen",
    "magnus-kjaergaard",
]

In [1250]:
# Scrape every DANDRITE group/team member page. The first person card on a page is
# taken as the group leader; every card on the page joins that leader's group.
for g in groups:
    if g in ["tomonori-takeuch", "hanne-poulsen", "magnus-kjaergaard"]:
        # Team leaders live under a different URL scheme than group leaders.
        url = "http://dandrite.au.dk/people/team-leaders/{}/team-members/".format(g.lower())
    else:
        url = "http://dandrite.au.dk/people/group-leaders/{}-group/group-members/".format(g.lower())
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    resp = requests.get(url, timeout=30)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')
    divs = soup.find_all("div", class_="vcard pure-simple-person-single")
    leader = ""
    for i, div in enumerate(divs):
        # Search inside the card directly instead of re-parsing its HTML.
        first_name = div.find("span", class_="given-name").text
        last_name = div.find("span", class_="family-name").text
        title_tag = div.find("span", class_="title")
        title = title_tag.text if title_tag is not None else ""
        name = first_name + " " + last_name
        if i == 0:
            leader = name

        if name in roster:
            print(name + " belongs to multiple groups in DANDRITE")
        else:
            roster[name]['Name'] = name
            # Store the unified category, as the other institutions do. unify_title
            # maps each of its own outputs to itself, so later unify_title(Title)
            # calls see the same value as before.
            roster[name]['Title'] = "Head of Group" if i == 0 else unify_title(title)
            roster[name]['Institution'] = 'DANDRITE'
        # An existing entry may have no 'Group' yet (or a list from a manual override),
        # so rebuild it as a set instead of calling .add() on whatever is stored.
        roster[name]['Group'] = set(roster[name].get('Group', ())) | {leader}

        titles.add(unify_title(title))

Hanne Poulsen belongs to multiple groups in DANDRITE
Monica Dahlstrup Sietam belongs to multiple groups in DANDRITE
Magnus Kjærgaard belongs to multiple groups in DANDRITE


# MIMS

In [1251]:
# Load the MIMS roster from a local, header-less CSV (comma is read_csv's default separator).
# The next cell reads columns 0, 1 and 2 as name, title and group leader.
mims = pd.read_csv('./MIMS.csv', header=None)

In [1252]:
# Register every MIMS row: column 0 is the person's name, column 1 the title,
# column 2 the group leader.
for name, title, leader in mims[[0, 1, 2]].itertuples(index=False, name=None):
    if not isinstance(title, str):
        # An empty CSV cell is read as NaN; treat it as "no title" rather than
        # letting unify_title fail on a float.
        title = ''
    if name in roster:
        print(name + " belongs to multiple groups in MIMS")
    else:
        roster[name]['Name'] = name
        roster[name]['Title'] = unify_title(title)
        roster[name]['Institution'] = 'MIMS'
        titles.add(unify_title(title))
    # An existing entry may have no 'Group' yet (or a list from a manual override),
    # so rebuild it as a set instead of calling .add() on whatever is stored.
    roster[name]['Group'] = set(roster[name].get('Group', ())) | {leader}

Yngve Östberg belongs to multiple groups in MIMS
Lalitha Tadala belongs to multiple groups in MIMS


# Fetch Publication

In [1254]:
# Look up each roster member on PubMed (author search, at most 50 hits) and store the
# resulting PubMed IDs as a set under 'Publications'.
for n in roster.keys():
    while True:
        try:
            handle = Entrez.esearch(db="pubmed", retmax=50, term=n + "[AUTH]")
            try:
                result = Entrez.read(handle)
            finally:
                # Release the HTTP handle whether or not parsing succeeded.
                handle.close()
            roster[n]['Publications'] = set(result["IdList"])
            # Pause between requests so the search service is not hammered.
            time.sleep(0.5)
            break
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt can still
            # stop this otherwise endless retry loop; show why the attempt failed.
            time.sleep(0.5)
            print('retry ' + n + ' (' + repr(err) + ')')

In [1255]:
# Snapshot of every person's publication set, keyed by roster name.
publications = {person: roster[person]['Publications'] for person in roster}

In [1189]:
# Copy the publication sets held in `publications` back onto the roster entries.
for person, record in roster.items():
    record['Publications'] = publications[person]

# Build Connection

In [1277]:
# Symmetric link-strength matrix between roster members:
#   links[i, j] = number of shared PubMed IDs
#                 + 1 if either person's name appears in the other's 'Group' set.
# Each unordered pair is evaluated once and mirrored. This yields the same matrix as
# visiting both (i, j) and (j, i) and keeping the larger value, with half the work.
names = list(roster.keys())
n_people = len(names)
links = np.zeros((n_people, n_people))
for i in range(n_people):
    a = names[i]
    for j in range(i + 1, n_people):
        b = names[j]
        shared = len(roster[a]['Publications'] & roster[b]['Publications'])
        # Entries without a 'Group' key simply contribute no group link.
        same_group = a in roster[b].get('Group', ()) or b in roster[a].get('Group', ())
        links[i, j] = links[j, i] = shared + int(same_group)

# Write Output

In [1278]:
# Build the "links" section of the output: one record per linked pair (upper triangle
# only), skipping pairs in which either person has a non-research title.
excluded_titles = ["IT", "Administration", "Coordinator", "Consultant"]
is_node = set()
json_obj = defaultdict(list)
n_rows = links.shape[0]
for src in range(n_rows):
    for dst in range(src + 1, n_rows):
        weight = links[src, dst]
        if weight == 0:
            continue
        src_record = roster[names[src]]
        dst_record = roster[names[dst]]
        if (unify_title(src_record["Title"]) in excluded_titles
                or unify_title(dst_record["Title"]) in excluded_titles):
            continue
        # Remember both endpoints so the node list can be limited to linked people.
        is_node.update((src_record['Name'], dst_record['Name']))
        json_obj["links"].append({
            "source": src_record['Name'],
            "target": dst_record['Name'],
            "value": weight,
        })

In [1279]:
# Build the "nodes" section of the output: one record per person who takes part in at
# least one exported link and does not hold an excluded (non-research) title.
for i, n in enumerate(names):
    if np.sum(links[i, :]) + np.sum(links[:, i]) == 0:
        continue
    record = roster[n]
    unified_title = unify_title(record["Title"])
    if unified_title in ["IT", "Administration", "Coordinator", "Consultant"]:
        continue
    if record['Name'] not in is_node:
        continue
    # Prefer the correctly spelled key and fall back to the misspelled "Insitution"
    # key the old code looked for. With neither present, use a placeholder instead of
    # raising KeyError half-way through the export.
    ins = record.get("Institution", record.get("Insitution", "Unknown"))
    titles.add(unified_title)
    json_obj["nodes"].append({
        "id": record["Name"],
        "title": order_title(unified_title),
        "institution": ins
    })

In [1280]:
# Serialise the node/link structure and write it to the dataset folder.
serialised = json.dumps(json_obj)
with open('./dataset/roster.json', 'w') as out_handle:
    out_handle.write(serialised)