In [1]:
import networkx as nx
import matplotlib.pyplot as plt
from pymongo import MongoClient
import  numpy as np
import datetime as dt
from joblib import Parallel, delayed
import os.path
from bson import ObjectId
from math import log, exp
import spacy

In [2]:
# Connection to the MongoDB server, using pymongo's default settings.
client = MongoClient()
# Database queried for affiliations, people and works.
colombia = client["kahi_test"]
# Database that receives the computed networks and top words.
impactu = client["kahi_impactu"]

In [3]:
# Collect the ids of the institutions (affiliations that are not a faculty,
# department or group) that appear in at least one work.
institutions_ids=[]
for aff in colombia["affiliations"].find({"types.type":{"$nin":["faculty","department","group"]}},{"_id":1}):
    # Only existence matters, so limit=1 lets the count stop at the first match.
    if colombia["works"].count_documents({"authors.affiliations.id":aff["_id"]},limit=1):
        institutions_ids.append(aff["_id"])

In [4]:
# Number of institutions that have at least one work.
len(institutions_ids)

13

In [13]:
def network_creation(idx):
    """Build and store the coauthorship network centred on one institution.

    Nodes are the institution ``idx`` plus every affiliation that shares with it
    at least one work of at most 10 authors. An edge links two affiliations that
    share works and its ``coauthorships`` value is the number of shared works
    (each work counts once per pair). The network is written to
    ``impactu["affiliations"]`` under the ``coauthorship_network`` field of the
    document whose ``_id`` is ``idx``.

    Parameters
    ----------
    idx
        ``_id`` of the institution in ``colombia["affiliations"]``.

    Returns
    -------
    None
        Always. Nothing is computed when the target document already holds a
        ``coauthorship_network``.
    """
    target=impactu["affiliations"]
    # Skip only when the network itself is stored: the document may already exist
    # holding nothing but "top_words".
    if target.find_one({"_id":idx,"coauthorship_network":{"$exists":1}},{"_id":1}):
        return None
    aff_info=colombia["affiliations"].find_one({"_id":idx})
    # Label: the first Spanish name wins; otherwise the last English name seen;
    # otherwise the first name listed.
    name=aff_info["names"][0]["name"]
    for n in aff_info["names"]:
        if n["lang"]=="es":
            name=n["name"]
            break
        elif n["lang"]=="en":
            name=n["name"]
    nodes=[idx]
    nodes_labels=[name]
    # node id -> position in `nodes`, for constant-time membership tests.
    node_pos={idx:0}
    # `edges` keeps insertion order and orientation; the weights are keyed by the
    # unordered pair so (a,b) and (b,a) are the same edge.
    edges=[]
    edges_coauthorships={}

    def add_coauthorship(nodea,nodeb):
        # Register one shared work between two nodes, creating the edge if needed.
        key=frozenset((nodea,nodeb))
        if key not in edges_coauthorships:
            edges_coauthorships[key]=0
            edges.append((nodea,nodeb))
        edges_coauthorships[key]+=1

    # First pass: works of the institution itself link it to every co-affiliation.
    for work in colombia["works"].find({"authors.affiliations.id":idx,"author_count":{"$lte":10}}):
        partners={}  # distinct co-affiliations of this work, in order of appearance
        for author in work["authors"]:
            for aff in author["affiliations"]:
                aff_id=aff.get("id")
                if not aff_id or aff_id==idx:
                    continue
                if aff_id not in node_pos:
                    node_pos[aff_id]=len(nodes)
                    nodes.append(aff_id)
                    # The affiliation entries embedded in works carry a single
                    # "name" field, without language information.
                    nodes_labels.append(aff["name"])
                partners[aff_id]=None
        for aff_id in partners:
            add_coauthorship(idx,aff_id)
    # Second pass: links among the coauthoring institutions, taken from the works
    # they share without the central institution.
    for node in nodes:
        if node==idx:
            continue
        for work in colombia["works"].find({"$and":[{"authors.affiliations.id":node},{"authors.affiliations.id":{"$ne":idx}}],"author_count":{"$lte":10}}):
            partners={}
            for author in work["authors"]:
                for aff in author["affiliations"]:
                    aff_id=aff.get("id")
                    if aff_id==idx:
                        print("Problem found")
                        continue
                    # The same work is returned again when the loop reaches the
                    # other endpoint, so a pair is only counted from its earlier
                    # node; ids outside the network get position -1 and are skipped.
                    if node_pos.get(aff_id,-1)>node_pos[node]:
                        partners[aff_id]=None
            for aff_id in partners:
                add_coauthorship(node,aff_id)
    # Constructing the actual format to insert in db
    degrees=dict.fromkeys(nodes,0)
    for nodea,nodeb in edges:
        degrees[nodea]+=1
        degrees[nodeb]+=1
    num_nodes=len(nodes)
    nodes_db=[]
    for node,label in zip(nodes,nodes_labels):
        degree=degrees[node]
        # Log-scaled size: 50 for a node linked to every other node.
        size=50*log(1+degree/(num_nodes-1),2) if num_nodes>1 else 1
        nodes_db.append(
            {
                "id":str(node),
                "label":label,
                "degree":degree,
                "size":size
            }
        )
    edges_db=[]
    for nodea,nodeb in edges:
        coauthorships=edges_coauthorships[frozenset((nodea,nodeb))]
        edges_db.append({
            "source":str(nodea),
            "sourceName":nodes_labels[node_pos[nodea]],
            "target":str(nodeb),
            "targetName":nodes_labels[node_pos[nodeb]],
            "coauthorships":coauthorships,
            "size":coauthorships,
        })
    counts=[e["coauthorships"] for e in edges_db]
    top=max(counts) if counts else 1
    bot=min(counts) if counts else 1
    for edge in edges_db:
        # Heaviest edges get size 10, lightest get 1, the rest follow a logistic
        # curve of their weight relative to the heaviest, floored at 1.
        if edge["coauthorships"]==top:
            edge["size"]=10
        elif edge["coauthorships"]==bot:
            edge["size"]=1
        else:
            size=10/(1+exp(6-10*edge["coauthorships"]/top))
            edge["size"]=size if size>=1 else 1
    # upsert=True creates the document when missing and otherwise keeps the
    # fields it already has.
    target.update_one(
        {"_id":idx},
        {"$set":{"coauthorship_network":{"nodes":nodes_db,"edges":edges_db}}},
        upsert=True
    )

In [14]:
# Run the builder on the first institution only.
network_creation(institutions_ids[0])

In [15]:
# Build the network of every institution through joblib, with a single job.
Parallel(n_jobs=1,backend="multiprocessing",verbose=10)(delayed(network_creation)(oaid) for oaid in institutions_ids)

[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   7 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  12 tasks      | elapsed:    0.0s


[None, None, None, None, None, None, None, None, None, None, None, None, None]

In [17]:
# Collect the ids of the people that appear as author of at least one work.
authors_ids=[]
for author in colombia["person"].find({},{"_id":1}):
    # Only existence matters, so limit=1 lets the count stop at the first match.
    if colombia["works"].count_documents({"authors.id":author["_id"]},limit=1):
        authors_ids.append(author["_id"])
print(len(authors_ids))

20


In [18]:
def network_creation(idx):
    """Build and store the coauthorship network centred on one person.

    Nodes are the person ``idx`` plus every author that shares with them at least
    one work of at most 10 authors. An edge links two authors that share works
    and its ``coauthorships`` value is the number of shared works (each work
    counts once per pair). The network is written to ``impactu["person"]`` under
    the ``coauthorship_network`` field of the document whose ``_id`` is ``idx``.

    Parameters
    ----------
    idx
        ``_id`` of the person in ``colombia["person"]``.

    Returns
    -------
    None
        Always. Nothing is computed when the target document already holds a
        ``coauthorship_network``.
    """
    target=impactu["person"]
    # Skip only when the network itself is stored: the document may already exist
    # holding nothing but "top_words".
    if target.find_one({"_id":idx,"coauthorship_network":{"$exists":1}},{"_id":1}):
        return None
    person_info=colombia["person"].find_one({"_id":idx})
    nodes=[idx]
    nodes_labels=[person_info["full_name"]]
    # node id -> position in `nodes`, for constant-time membership tests.
    node_pos={idx:0}
    # `edges` keeps insertion order and orientation; the weights are keyed by the
    # unordered pair so (a,b) and (b,a) are the same edge.
    edges=[]
    edges_coauthorships={}

    def add_coauthorship(nodea,nodeb):
        # Register one shared work between two nodes, creating the edge if needed.
        key=frozenset((nodea,nodeb))
        if key not in edges_coauthorships:
            edges_coauthorships[key]=0
            edges.append((nodea,nodeb))
        edges_coauthorships[key]+=1

    # First pass: works of the person link them to every coauthor.
    for work in colombia["works"].find({"authors.id":idx,"author_count":{"$lte":10}}):
        partners={}  # distinct coauthors of this work, in order of appearance
        for author in work["authors"]:
            author_id=author.get("id")
            if not author_id or author_id==idx:
                continue
            if author_id not in node_pos:
                node_pos[author_id]=len(nodes)
                nodes.append(author_id)
                nodes_labels.append(author["full_name"])
            partners[author_id]=None
        for author_id in partners:
            add_coauthorship(idx,author_id)
    # Second pass: links among the coauthors, taken from the works they share
    # without the central person.
    for node in nodes:
        if node==idx:
            continue
        for work in colombia["works"].find({"$and":[{"authors.id":node},{"authors.id":{"$ne":idx}}],"author_count":{"$lte":10}}):
            partners={}
            for author in work["authors"]:
                author_id=author.get("id")
                if author_id==idx:
                    print("Problem found")
                    continue
                # The same work is returned again when the loop reaches the other
                # endpoint, so a pair is only counted from its earlier node; ids
                # outside the network get position -1 and are skipped.
                if node_pos.get(author_id,-1)>node_pos[node]:
                    partners[author_id]=None
            for author_id in partners:
                add_coauthorship(node,author_id)
    # Constructing the actual format to insert in db
    degrees=dict.fromkeys(nodes,0)
    for nodea,nodeb in edges:
        degrees[nodea]+=1
        degrees[nodeb]+=1
    num_nodes=len(nodes)
    nodes_db=[]
    for node,label in zip(nodes,nodes_labels):
        degree=degrees[node]
        # Log-scaled size: 50 for a node linked to every other node.
        size=50*log(1+degree/(num_nodes-1),2) if num_nodes>1 else 1
        nodes_db.append(
            {
                "id":str(node),
                "label":label,
                "degree":degree,
                "size":size
            }
        )
    edges_db=[]
    for nodea,nodeb in edges:
        coauthorships=edges_coauthorships[frozenset((nodea,nodeb))]
        edges_db.append({
            "source":str(nodea),
            "sourceName":nodes_labels[node_pos[nodea]],
            "target":str(nodeb),
            "targetName":nodes_labels[node_pos[nodeb]],
            "coauthorships":coauthorships,
            "size":coauthorships,
        })
    counts=[e["coauthorships"] for e in edges_db]
    top=max(counts) if counts else 1
    bot=min(counts) if counts else 1
    for edge in edges_db:
        # Heaviest edges get size 10, lightest get 1, the rest follow a logistic
        # curve of their weight relative to the heaviest, floored at 1.
        if edge["coauthorships"]==top:
            edge["size"]=10
        elif edge["coauthorships"]==bot:
            edge["size"]=1
        else:
            size=10/(1+exp(6-10*edge["coauthorships"]/top))
            edge["size"]=size if size>=1 else 1
    # upsert=True creates the document when missing and otherwise keeps the
    # fields it already has.
    target.update_one(
        {"_id":idx},
        {"$set":{"coauthorship_network":{"nodes":nodes_db,"edges":edges_db}}},
        upsert=True
    )

In [19]:
# Run the builder on the first author only.
network_creation(authors_ids[0])

In [None]:
# Build the network of every author through joblib, with 20 jobs.
# NOTE(review): network_creation reads the module-level `colombia`/`impactu`
# handles, which come from a MongoClient created before the worker processes
# exist; PyMongo documents MongoClient as not fork-safe — confirm whether each
# worker should open its own client.
Parallel(n_jobs=20,backend="multiprocessing",verbose=10)(delayed(network_creation)(oaid) for oaid in authors_ids)

### Words

In [22]:
%%bash
# Download the spaCy pipeline that is loaded further down as 'en_core_web_sm'.
python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [25]:
%%bash
# Download the spaCy pipeline that is loaded further down as 'es_core_news_sm'.
python -m spacy download es_core_news_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [26]:
# English and Spanish pipelines; their stop-word sets are pooled into one.
en = spacy.load('en_core_web_sm')
es = spacy.load('es_core_news_sm')
stopwords = en.Defaults.stop_words | es.Defaults.stop_words

In [27]:
# Store in impactu["affiliations"] the 20 most frequent title lemmas of every
# affiliation, under "top_words". Stored shape:
# top_words: [
#   { name: 'bind', value: 2 },
#   { name: 'state', value: 2 },
#   { name: 'dark', value: 2 },
#   { name: 'matter', value: 2 },
#   { name: 'dirac', value: 1 },
#   { name: 'neutrino', value: 1 },
#   { name: 'masse', value: 1 },
#   { name: 'majorana', value: 1 },
#   { name: 'neutrinos', value: 1 }
# ]
# TODO: this could be turned into a function named top_words.
for aff in colombia["affiliations"].find({},{"_id":1}):
    # Affiliations that already have their top words are left untouched.
    if impactu["affiliations"].find_one({"_id":aff["_id"],"top_words":{"$exists":1}},{"_id":1}):
        continue
    word_counts={}
    for work in colombia["works"].find({"authors.affiliations.id":aff["_id"],"titles":{"$exists":1}},{"titles":1}):
        # "titles" can exist and still be empty; such works contribute no words.
        if not work["titles"]:
            continue
        first_title=work["titles"][0]
        # Spanish titles go through the Spanish pipeline, anything else through
        # the English one.
        model=es if first_title["lang"]=="es" else en
        for token in model(first_title["title"].lower()):
            lemma=token.lemma_
            # Numbers, stop words and lemmas shorter than 4 characters are ignored.
            if lemma.isnumeric() or lemma in stopwords or len(lemma)<4:
                continue
            word_counts[lemma]=word_counts.get(lemma,0)+1
    topN=sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:20]
    results=[{"name":word,"value":count} for word,count in topN]
    # upsert=True inserts the document when the affiliation has none yet and
    # otherwise only sets "top_words" on the existing one.
    impactu["affiliations"].update_one({"_id":aff["_id"]},{"$set":{"top_words":results}},upsert=True)

In [28]:
# Affiliations other than institutions (faculties, departments and groups):
# recompute "top_words" from the titles of the works of their affiliated people.
for aff in colombia["affiliations"].find({"types.type":{"$in":["faculty","department","group"]}},{"_id":1}):
    # Only documents that already carry "top_words" are rewritten.
    if not impactu["affiliations"].find_one({"_id":aff["_id"],"top_words":{"$exists":1}},{"_id":1}):
        continue
    word_counts={}
    # Works already processed for this affiliation: a work signed by several of
    # its members must contribute its title only once.
    seen_works=set()
    for author in colombia["person"].find({"affiliations.id":aff["_id"]},{"_id":1}):
        for work in colombia["works"].find({"authors.id":author["_id"],"titles":{"$exists":1}},{"titles":1}):
            if work["_id"] in seen_works:
                continue
            seen_works.add(work["_id"])
            # "titles" can exist and still be empty; such works contribute no words.
            if not work["titles"]:
                continue
            first_title=work["titles"][0]
            # Spanish titles go through the Spanish pipeline, anything else
            # through the English one.
            model=es if first_title["lang"]=="es" else en
            for token in model(first_title["title"].lower()):
                lemma=token.lemma_
                # Numbers, stop words and lemmas shorter than 4 characters are ignored.
                if lemma.isnumeric() or lemma in stopwords or len(lemma)<4:
                    continue
                word_counts[lemma]=word_counts.get(lemma,0)+1
    topN=sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:20]
    results=[{"name":word,"value":count} for word,count in topN]
    impactu["affiliations"].update_one({"_id":aff["_id"]},{"$set":{"top_words":results}})

In [29]:
# Ids of the people found to already have "top_words"; the next cell excludes
# them from its query and appends to this list as it finds more.
words_inserted_ids=[]

In [None]:
# Store in impactu["person"] the 20 most frequent title lemmas ("top_words") of
# every person that does not have them yet.
with client.start_session() as session:
    old=dt.datetime.now()
    # The person cursor is opened inside the explicit session, so refreshing
    # that session below applies to the session this long-lived cursor uses.
    for person in colombia["person"].find({"_id":{"$nin":words_inserted_ids}},{"_id":1},session=session):
        # Refresh the session once more than 240 seconds have passed, checked on
        # every iteration (including the ones that are skipped below).
        if (dt.datetime.now()-old).total_seconds()>240:
            client.admin.command('refreshSessions', [session.session_id], session=session)
            old=dt.datetime.now()
        if impactu["person"].find_one({"_id":person["_id"],"top_words":{"$exists":1}},{"_id":1}):
            words_inserted_ids.append(person["_id"])
            continue
        word_counts={}
        for work in colombia["works"].find({"authors.id":person["_id"],"titles":{"$exists":1}},{"titles":1}):
            # "titles" can exist and still be empty; such works contribute no words.
            if not work["titles"]:
                continue
            first_title=work["titles"][0]
            # Spanish titles go through the Spanish pipeline, anything else
            # through the English one.
            model=es if first_title["lang"]=="es" else en
            for token in model(first_title["title"].lower()):
                lemma=token.lemma_
                # Numbers, stop words and lemmas shorter than 4 characters are ignored.
                if lemma.isnumeric() or lemma in stopwords or len(lemma)<4:
                    continue
                word_counts[lemma]=word_counts.get(lemma,0)+1
        topN=sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:20]
        results=[{"name":word,"value":count} for word,count in topN]
        # upsert=True inserts the document when the person has none yet and
        # otherwise only sets "top_words" on the existing one.
        impactu["person"].update_one({"_id":person["_id"]},{"$set":{"top_words":results}},upsert=True)

In [9]:
# For each listed affiliation, add up the number of works of every person
# affiliated to it (a work signed by several of them is counted once per person).
ids=[ObjectId("63935abf71459ce0bcb933a8")]
for aff in ids:
    sum_docs=0
    # People reference their affiliations through the "affiliations.id" field.
    for author in colombia["person"].find({"affiliations.id":aff},{"_id":1}):
        # "authors.id" is a field of the works collection, not of person.
        sum_docs+=colombia["works"].count_documents({"authors.id":author["_id"]})
sum_docs

0