In [3]:
from pymongo import MongoClient
from bson.objectid import ObjectId
from bson import json_util
from time import time
import datetime as dt
from dateutil.relativedelta import relativedelta
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz, process
import requests
from urllib.parse import unquote
import unidecode
import re
from re import split,UNICODE
import requests
from bs4 import BeautifulSoup
import json
import pickle
from langid import classify
from geotext import GeoText

In [4]:
class JSONEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, ObjectId):
            return str(o)
        return json.JSONEncoder.default(self, o)

In [5]:
def split_names(s,exceptions=['GIL','LEW','LIZ','PAZ','REY','RIO','ROA','RUA','SUS','ZEA']):
    """
    Extract the parts of the full name `s` in the format ([] → optional):
    
    [SMALL_CONECTORS] FIRST_LAST_NAME [SMALL_CONECTORS] [SECOND_LAST_NAME] NAMES
    
    * If len(s) == 2 → Foreign name assumed with single last name on it
    * If len(s) == 3 → Colombian name assumed two last mames and one first name
    
    Add short last names to `exceptions` list if necessary
    
    Works with:
    ----
        s='LA ROTTA FORERO DANIEL ANDRES'
        s='MONTES RAMIREZ MARIA DEL CONSUELO'
        s='CALLEJAS POSADA RICARDO DE LA MERCED'
        s='DE LA CUESTA BENJUMEA MARIA DEL CARMEN'
        s='JARAMILLO OCAMPO NICOLAS CARLOS MARTI'
        s='RESTREPO QUINTERO DIEGO ALEJANDRO'
        s='RESTREPO ZEA JAIRO HUMBERTO'
        s='JIMENEZ DEL RIO MARLEN'        
        s='RESTREPO FERNÁNDEZ SARA' # Colombian: two LAST_NAMES NAME
        s='NARDI ENRICO' # Foreing
    Fails:
    ----
        s='RANGEL MARTINEZ VILLAL ANDRES MAURICIO' # more than 2 last names
        s='ROMANO ANTONIO ENEA' # Foreing → LAST_NAME NAMES
    """
    s=s.title()
    exceptions=[e.title() for e in exceptions]
    sl=re.sub('(\s\w{1,3})\s',r'\1-',s,re.UNICODE)
    sl=re.sub('(\s\w{1,3}\-\w{1,3})\s',r'\1-',sl,re.UNICODE)
    sl=re.sub('^(\w{1,3})\s',r'\1-' ,sl,re.UNICODE)
    #Clean exceptions
    #Extract short names list
    lst=[s for s in re.split( '(\w{1,3})\-',sl ) if len(s)>=1 and len(s)<=3 ]
    #intersection with exceptions list
    exc=[value for value in exceptions if value in lst]
    if exc:
        for e in exc:
            sl=sl.replace('{}-'.format(e),'{} '.format(e))
            
    #if sl.find('-')>-1:
    #print(sl)
    sll=[s.replace('-',' ') for s in sl.split()]
    if len(s.split())==2:
        sll=[s.split()[0]]+['']+[s.split()[1]]
    #
    d={'NOMBRE COMPLETO' : ' '.join(sll[2:]+sll[:2]),
     'PRIMER APELLIDO' : sll[0], 
     'SEGUNDO APELLIDO': sll[1], 
     'NOMBRES'         :' '.join(sll[2:]), 
     'INICIALES'       :' '.join( [i[0]+'.' for i in ' '.join(sll[2:]).split() ] )
    }
    return d

In [46]:
client=MongoClient()
colombia=client["colombia_udea"]
openalex=client["openalex"]
openalexco=client["openalexco"]
wos = client["wos_colombia"]["stage"]
scopus = client["scopus_colombia"]["stage"]
scholar = client["scholar_colombia"]["stage"]
oadoi = client["oadoi"]["stage"]

scienti = client["scienti_111"]["products"]

puntaje = pd.read_excel("/current/data/colombia/udea/produccion 2018-2022 al 27 oct 2022.xlsx",
                        dtype={"cedula":str}
                       )

In [11]:
puntaje.columns

Index(['cedula', 'nombre', 'tipo mat', 'capdescrip', 'tipo concep', 'tipo mov',
       'ptos', 'numero autores', 'año realiz', 'fecha vig', 'fecha aplica',
       'fecha pres', 'codigo', 'titulo', 'codigo mat', 'nombre rev o premio',
       'issn', 'numero acta', 'pais', 'idioma', 'fecha registro', 'DOI',
       'URL'],
      dtype='object')

In [50]:
def empty_work():
    return {
        "titles" : [],
        "updated" : [],
        "subtitle" : "",
        "abstract" : "",
        "keywords" : [],
        "types" : [],
        "external_ids" : [],
        "external_urls" : [],
        "date_published" : None,
        "year_published" : None,
        "bibliographic_info" : {},
        "references_count" : None,
        "references" : [],
        "citations_count" : [],
        "citations" : [],
        "author_count" : None,
        "authors" : [],
        "source" : {},
        "ranking":[],
        "subjects" : []
    }

In [13]:
affiliations_db={}
affiliations_db["udea"]=colombia["affiliations"].find_one({"names.name":"Universidad de Antioquia"})

## Parsers from each db

In [14]:
doi="10.1142/S0217732318500244"
#scholar and oadoi in lowercase, puntaje, wos and scopus as is... user regex to query mongo (implement later)

#### OpenAlex

In [79]:
def parse_openalex(reg):
    entry=empty_work()
    entry["updated"]=[{"source":"openalex","time":int(time())}]
    lang=classify(reg["title"])[0]
    entry["titles"].append({"title":reg["title"],"lang":lang})
    for source,idx in reg["ids"].items():
        if "doi" in source:
            idx=idx.replace("https://doi.org/","").lower()
        entry["external_ids"].append({"source":source,"id":idx})
    entry["year_published"]=reg["publication_year"]
    entry["date_published"]=int(dt.datetime.strptime(reg["publication_date"],"%Y-%m-%d").timestamp())
    entry["types"].append({"source":"openalex","type":reg["type"]})
    
    entry["source"]={
        "name":reg["host_venue"]["display_name"],
        "external_ids":[{"source":"openalex","id":reg["host_venue"]["id"]}]
    }
    
    if "issn_l" in reg["host_venue"].keys():
        entry["source"]["external_ids"].append({"source":"issn_l","id":reg["host_venue"]["issn_l"]})
    if "issn" in reg["host_venue"].keys():
        entry["source"]["external_ids"].append({"source":"issn","id":reg["host_venue"]["issn"][0]})
        
    entry["citations_count"].append({"source":"openalex","count":reg["cited_by_count"]})
    
    if "volume" in reg["biblio"]:
        if reg["biblio"]["volume"]:
            entry["bibliographic_info"]["volume"]=int(reg["biblio"]["volume"])
    if "issue" in reg["biblio"]:
        if reg["biblio"]["issue"]:
            entry["bibliographic_info"]["issue"]=int(reg["biblio"]["issue"])
    if "first_page" in reg["biblio"]:
        if reg["biblio"]["first_page"]:
            entry["bibliographic_info"]["start_page"]=int(reg["biblio"]["first_page"])
    if "last_page" in reg["biblio"]:
        if reg["biblio"]["last_page"]:
            entry["bibliographic_info"]["end_page"]=int(reg["biblio"]["last_page"])
    if "openaccess" in reg.keys():
        if "is_oa" in reg["openaccess"].keys():
            entry["bibliographic_info"]["is_open_acess"]=True if reg["openaccess"]["is_oa"]=="true" else False
        if "oa_status" in reg["openaccess"].keys():
            entry["bibliographic_info"]["open_access_status"]=reg["openaccess"]["oa_status"]
    #authors section
    for author in reg["authorships"]:
        affs=[]
        for inst in author["institutions"]:
            aff_entry={
                "external_ids":[{"source":"openalex","id":inst["id"]}],
                "name":inst["display_name"]
            }
            if "ror" in inst.keys():
                aff_entry["external_ids"].append({"source":"ror","id":inst["ror"]})
            affs.append(aff_entry)
        author=author["author"]
        author_entry={
            "external_ids":[{"source":"openalex","id":author["id"]}],
            "full_name":author["display_name"],
            "types":[],
            "affiliations":affs
        }
        if author["orcid"]:
            author_entry["external_ids"].append({"source":"orcid","id":author["orcid"].replace("https://orcid.org/","")})
        entry["authors"].append(author_entry)
    #concepts section
    subjects=[]
    for concept in reg["concepts"]:
        sub_entry={
            "external_ids":[{"source":"openalex","id":concept["id"]}],
            "name":concept["display_name"],
            "level":concept["level"]
        }
        subjects.append(sub_entry)
    entry["subjects"].append({"source":"openalex","subjects":subjects})
    
    return entry

In [80]:
openalex_reg=openalexco["works"].find_one({"doi":"https://doi.org/"+doi.lower()})
iu_openalex=parse_openalex(openalex_reg)


#### Puntaje

In [81]:
#puntaje
def parse_puntaje(regs,affiliation="udea"):
    entry=empty_work()
    reg=regs[0]
    entry["updated"]=[{"source":affiliation,"time":int(time())}]
    lang=classify(reg["titulo"])[0]
    entry["titles"].append({"title":reg["titulo"],"lang":lang})
    if reg["DOI"]:
        entry["external_ids"].append({"source":"doi","id":reg["DOI"].lower()})
    if reg["issn"]:
        entry["source"]={"name":reg["nombre rev o premio"],"external_ids":{"source":"issn","id":reg["issn"]}}
    else:
        entry["source"]={"name":reg["nombre rev o premio"],"external_ids":None}
    entry["year_published"]=int(reg["año realiz"])
    for reg in regs:
        name=split_names(reg["nombre"])
        aff={
            "id":affiliations_db[affiliation]["_id"],
            "name":affiliations_db[affiliation]["names"][0]["name"],
            "types":affiliations_db[affiliation]["types"]
        }
        for affname in affiliations_db[affiliation]["names"]:
            if affname["lang"]=="es":
                aff["name"]=affname["name"]
                break
            elif affname["lang"]=="en":
                aff["name"]=affname["name"]
        entry["authors"].append({
            "external_ids":{"source":"Cédula de Ciudadanía","id":reg["cedula"]},
            "full_name":name["NOMBRE COMPLETO"],
            "types":[],
            "affiliations":[aff]
        })
    return entry

In [18]:
iu_puntaje=parse_puntaje(puntaje[puntaje["DOI"]==doi].to_dict(orient="records"))

#### wos

In [19]:
def parse_wos(reg):
    entry=empty_work()
    entry["updated"]=[{"source":"wos","time":int(time())}]
    if "TI" in reg.keys():
        lang=classify(reg["TI"])[0]
        entry["titles"].append({"title":reg["TI"],"lang":lang})
    if "AB" in reg.keys():
        if reg["AB"] and reg["AB"]==reg["AB"]:
            entry["abstract"]=reg["AB"].strip()
    if "DT" in reg.keys():
        if reg["DT"] and reg["DT"]==reg["DT"]:
            entry["types"].append({"source":"wos","type":reg["DT"].strip().lower()})
    if "PY" in reg.keys():
        if reg["PY"] and reg["PY"]==reg["PY"]:
            entry["year_published"]=int(reg["PY"].strip())
    if "BP" in reg.keys():
        if reg["BP"] and reg["BP"]==reg["BP"]:
            try:
                entry["bibliographic_info"]["start_page"]=int(reg["BP"])
            except:
                pass
    if "EP" in reg.keys():
        if reg["EP"] and reg["EP"]==reg["EP"]:
            try:
                entry["bibliographic_info"]["end_page"]=int(reg["EP"])
            except:
                pass
    if "VL" in reg.keys():
        if reg["VL"] and reg["VL"]==reg["VL"]:
            try:
                entry["bibliographic_info"]["volume"]=int(reg["VL"].strip())
            except:
                pass
    if "IS" in reg.keys():
        if reg["IS"] and reg["IS"]==reg["IS"]:
            try:
                entry["bibliographic_info"]["issue"]=int(reg["IS"].strip())
            except:
                pass
    
    count=None
    if "Z9" in reg.keys():
        if reg["Z9"] and reg["Z9"]==reg["Z9"]:
            try:
                count=int(reg["Z9"].replace("\n",""))
            except:
                count=None
            entry["citations_count"].append({"source":"wos","count":count})

    if "DI" in reg.keys():
        if reg["DI"]:
            ext={"source":"doi","id":reg["DI"].lower()}
            entry["external_ids"].append(ext)
    if "UT" in reg.keys():
        if reg["UT"]:
            ext={"source":"wos","id":reg["UT"].strip().split(":")[1]}
            entry["external_ids"].append(ext)
            
    source={"external_ids":[]}
    if "SO" in reg.keys():
        if reg["SO"]:
            source["name"]=reg["SO"].rstrip()
    if "SN" in reg.keys():
        if reg["SN"]:
            source["external_ids"].append({"source":"pissn","id":reg["SN"].rstrip()})
    if "EI" in reg.keys():
        if reg["EI"]:
            source["external_ids"].append({"source":"eissn","id":reg["EI"].rstrip()})
    if "BN" in reg.keys():
        if reg["BN"]:
            source["external_ids"].append({"type":"isbn","value":reg["BN"].rstrip()})
    entry["source"]=source
    
    #authors_section
    author_count=len(reg["AU"].strip().split("\n"))
    if "C1" in reg.keys():
        if reg["C1"]:
            orcid_list=[]
            researcherid_list=[]
            if "RI" in reg.keys():
                if reg["RI"] and reg["RI"]==reg["RI"]:
                    researcherid_list=reg["RI"].rstrip().replace("; ",";").split(";")
            if "OI" in reg.keys():
                if reg["OI"] and reg["OI"]==reg["OI"]:
                    orcid_list=reg["OI"].rstrip().replace("; ",";").split(";")
            for auwaf in reg["C1"].strip().replace(".","").split("\n"):
                aulen=len(auwaf.split(";"))
                if aulen==1:
                    auaff=auwaf.split("] ")
                    if len(auaff)==1:
                        aff=auwaf
                        authors=[""]
                        if "AF" in reg.keys():
                            if len(reg["AF"].rstrip().split("\n"))==1:
                                authors=reg["AF"].rstrip()
                    else:
                        aff=auaff[1]
                        authors=[auaff[0][1:]]
                else:
                    aff=auwaf.split("] ")[1]
                    authors=auwaf.split("] ")[0][1:].split("; ")
                try:
                    instname="".join(aff.split(", ")[0])
                except:
                    instname=""
                for author in authors:
                    entry_ext=[]
                    for res in researcherid_list:
                        try:
                            name,rid=res.split("/")[-2:]
                        except Exception as e:
                            pass
                        ratio=fuzz.partial_ratio(name,author)
                        if ratio>90:
                            entry_ext.append({"source":"researcherid","id":rid})
                            break
                        elif ratio>50:
                            ratio=fuzz.token_set_ratio(name,author)
                            if ratio>90:
                                entry_ext.append({"source":"researcherid","id":rid})
                                break
                            elif ratio>50:
                                ratio=fuzz.partial_token_set_ratio(name,author)
                                if ratio>95:
                                    entry_ext.append({"source":"researcherid","id":rid})
                                    break
                    for res in orcid_list:
                        try:
                            name,rid=res.split("/")[-2:]
                        except Exception as e:
                            pass
                        ratio=fuzz.partial_ratio(name,author)
                        if ratio>90:
                            entry_ext.append({"source":"orcid","id":rid})
                            break
                        elif ratio>50:
                            ratio=fuzz.token_set_ratio(name,author)
                            if ratio>90:
                                entry_ext.append({"source":"orcid","id":rid})
                                break
                            elif ratio>50:
                                ratio=fuzz.partial_token_set_ratio(name,author)
                                if ratio>95:
                                    entry_ext.append({"source":"orcid","id":rid})
                                    break
                    author_entry={
                        "full_name":author,
                        "types":[],
                        "affiliations":[{
                            "name":instname
                        }]
                    }
                    entry["authors"].append(author_entry)
    
    return entry

In [20]:
wos_reg=wos.find_one({"DI":doi})
iu_wos=parse_wos(wos_reg)
iu_wos

{'titles': [{'title': 'Anomalous leptonic U(1) symmetry: Syndetic origin of the QCD axion, weak-scale dark matter, and radiative neutrino mass',
   'lang': 'en'}],
 'updated': [{'source': 'wos', 'time': 1675269513}],
 'subtitle': '',
 'abstract': 'The well-known leptonic U(1) symmetry of the Standard Model (SM) of quarks and leptons is extended to include a number of new fermions and scalars. The resulting theory has an invisible QCD axion (thereby solving the strong CP problem), a candidate for weak-scale dark matter (DM), as well as radiative neutrino masses. A possible key connection is a color-triplet scalar, which may be produced and detected at the Large Hadron Collider.',
 'keywords': [],
 'types': [{'source': 'wos', 'type': 'article'}],
 'external_ids': [{'source': 'doi', 'id': '10.1142/s0217732318500244'},
  {'source': 'wos', 'id': '000423488200007'}],
 'external_urls': [],
 'date_published': None,
 'year_published': 2018,
 'bibliographic_info': {'volume': 33, 'issue': 3},
 'r

#### scopus

In [30]:
def parse_scopus(reg):
    entry=empty_work()
    entry["updated"]=[{"source":"scopus","time":int(time())}]
    lang=classify(reg["Title"])[0]
    entry["titles"].append({"title":reg["Title"],"lang":lang})
    if "Abstract" in reg.keys():
        if reg["Abstract"] and reg["Abstract"]==reg["Abstract"]:
            entry["abstract"]=reg["Abstract"]
    if "Year" in reg.keys(): entry["year_published"]=reg["Year"]
    
    if "Document Type" in reg.keys():
        entry["types"].append({"source":"scopus","type":reg["Document Type"]})
    if "Index Keywords" in reg.keys():
        if reg["Index Keywords"] and reg["Index Keywords"] == reg["Index Keywords"]:
            entry["keywords"].extend(reg["Index Keywords"].lower().split("; "))
    if "Author Keywords" in reg.keys():
        if reg["Author Keywords"] and reg["Author Keywords"] == reg["Author Keywords"]:
            entry["keywords"].extend(reg["Author Keywords"].lower().split("; "))
            
    if "DOI" in reg.keys(): entry["external_ids"].append({"source":"doi","id":reg["DOI"].lower()})
    if "EID" in reg.keys(): entry["external_ids"].append({"source":"scopus","id":reg["EID"]})
    if "Pubmed ID" in reg.keys(): entry["external_ids"].append({"source":"pubmed","id":reg["Pubmed ID"]})
    
    if "Link" in reg.keys(): entry["external_urls"].append({"source":"scopus","url":reg["Link"]})
    
    if "Volume" in reg.keys(): 
        if reg["Volume"] and reg["Volume"]==reg["Volume"]:
            entry["bibliographic_info"]["volume"]=reg["Volume"]
    if "Issue" in reg.keys():
        if reg["Issue"] and reg["Issue"]==reg["Issue"]:
            entry["bibliographic_info"]["issue"]=reg["Issue"]
    if "Page start" in reg.keys():
        if reg["Page start"] and reg["Page start"] == reg["Page start"]: #checking for NaN in the second criteria
            try:
                entry["bibliographic_info"]["start_page"]=int(reg["Page start"])
            except:
                pass
    if "Page end" in reg.keys():
        if reg["Page end"] and reg["Page end"] == reg["Page end"]:
            try:
                entry["bibliographic_info"]["end_page"]=int(reg["Page end"])
            except:
                pass
            
    if "Cited by" in reg.keys():
        entry["citations_count"].append({"source":"scopus","count":int(reg["Cited by"])})
    
    source={"external_ids":[]}
    if "Source title" in reg.keys():
        if reg["Source title"] and reg["Source title"]==reg["Source title"]:
            source["title"]=reg["Source title"]
    if "ISSN" in reg.keys():
        if reg["ISSN"] and reg["ISSN"]==reg["ISSN"]:
            source["external_ids"].append({"source":"issn","id":reg["ISSN"][:4]+"-"+reg["ISSN"][4:]})
        if reg["ISBN"] and reg["ISBN"]==reg["ISBN"]:
            source["external_ids"].append({"source":"isbn","id":reg["ISBN"]})
        if  reg["CODEN"] and reg["CODEN"]==reg["CODEN"]:
            source["external_ids"].append({"source":"coden","id":reg["CODEN"]})
    entry["source"]=source
    
    #authors section
    ids=None
    if "Author(s) ID" in reg.keys(): ids=reg["Author(s) ID"].split(";")

    if "Authors with affiliations" in reg.keys():
        if reg["Authors with affiliations"] and reg["Authors with affiliations"]==reg["Authors with affiliations"]:
            if "Author(s) ID" in reg.keys():
                ids=reg["Author(s) ID"].split(";")
            auwaf_list=reg["Authors with affiliations"].split("; ")
            for i in range(len(auwaf_list)):
                auaf=split('(^[\w\-\s\.]+,\s+[\w\s\.\-]+,\s)',auwaf_list[i],UNICODE)
                if len(auaf)==1:
                    author=auaf[0]
                    affiliations=""
                else:
                    author=auaf[1]
                    affiliations=auaf[-1]
                author_entry={
                    "full_name":author.replace("-"," ").strip(),
                    "types":[],
                    "affiliations":[{"name":affiliations}]
                }
                if ids:
                    try:
                        author_entry["external_ids"]=[{"source":"scopus","id":ids[i]}]
                    except Exception as e:
                        pass
                entry["authors"].append(author_entry)
                
                
    return entry

In [31]:
scopus_reg=scopus.find_one({"DOI":doi})
iu_scopus=parse_scopus(scopus_reg)
iu_scopus

{'titles': [{'title': 'Anomalous leptonic U(1) symmetry: Syndetic origin of the QCD axion, weak-scale dark matter, and radiative neutrino mass',
   'lang': 'en'}],
 'updated': [{'source': 'scopus', 'time': 1675269635}],
 'subtitle': '',
 'abstract': 'The well-known leptonic U(1) symmetry of the Standard Model (SM) of quarks and leptons is extended to include a number of new fermions and scalars. The resulting theory has an invisible QCD axion (thereby solving the strong CP problem), a candidate for weak-scale dark matter (DM), as well as radiative neutrino masses. A possible key connection is a color-triplet scalar, which may be produced and detected at the Large Hadron Collider. © 2018 World Scientific Publishing Company.',
 'keywords': ['dark matter', 'radiative neutrino masses', 'strong cp problem'],
 'types': [{'source': 'scopus', 'type': 'Article'}],
 'external_ids': [{'source': 'doi', 'id': '10.1142/s0217732318500244'},
  {'source': 'scopus', 'id': '2-s2.0-85040862223'}],
 'exter

#### scholar

In [23]:
def parse_scholar(reg):
    entry = empty_work()
    entry["updated"]=[{"source":"scholar","time":int(time())}]
    lang=classify(reg["title"])[0]
    entry["titles"].append({"title":reg["title"],"lang":lang})
    if "year" in reg.keys():
        year=""
        try:
            if reg["year"][-1]=="\n":
                reg["year"]=reg["year"][:-1]
            year=int(reg["year"])
        except:
            pass
        entry["year_published"]=year
    if "doi" in reg.keys():
        entry["external_ids"].append({"source":"doi","id":reg["doi"].lower()})
    if "cid" in reg.keys():
        entry["external_ids"]=[{"source":"scholar","id":reg["cid"]}]
    if "abstract" in reg.keys():
        entry["abstract"]=reg["abstract"]
    if "volume" in reg.keys():
        volume=""
        try:
            if reg["volume"][-1]=="\n":
                reg["volume"]=reg["volume"][:-1]
            volume=int(reg["volume"])
            entry["bibliographic_info"]["volume"]=volume
        except:
            pass
    if "issue" in reg.keys():
        issue=""
        try:
            if reg["issue"][-1]=="\n":
                reg["issue"]=reg["issue"][:-1]
            issue=int(reg["issue"])
            entry["bibliographic_info"]["issue"]=issue 
        except:
            pass
    if "pages" in reg.keys():
        pages=""
        try:
            if reg["pages"][-1]=="\n":
                reg["pages"]=reg["pages"][:-1]
            pages=int(reg["pages"])
            entry["bibliographic_info"]["pages"]=pages
        except:
            pass
    if "bibtex" in reg.keys():
        entry["bibliographic_info"]["bibtex"]=reg["bibtex"]
    if "cites" in reg.keys():
        entry["citations_count"].append({"source":"scholar","count":int(reg["cites"])})
    if "cites_link" in reg.keys():
        entry["external_urls"].append({"source":"scholar citations","url":reg["cites_link"]})
    if "pdf" in reg.keys():
        entry["external_urls"].append({"source":"pdf","url":reg["pdf"]})
        
    if "journal" in reg.keys():
        entry["source"]={"name":reg["journal"]}
        
    #authors section
    full_name_list=[]
    if "author" in reg.keys():
        for author in reg["author"].strip().split(" and "):
            author_entry={}
            names_list=author.split(", ")
            if len(names_list)>0: last_names=names_list[0].strip()
            if len(names_list)>1: first_names=names_list[1].strip()
            full_name=first_names+" "+last_names
            author_entry["full_name"]=full_name
            
            entry["authors"].append(author_entry)
            full_name_list.append(full_name)
    if "profiles" in reg.keys():
        if reg["profiles"]:
            for name in reg["profiles"].keys():
                for i,author in enumerate(full_name_list):
                    score=fuzz.ratio(name,author)
                    if score>=80:
                        entry["authors"][i]["external_ids"]=[{"source":"scholar","id":reg["profiles"][name]}]
                        break
                    elif score>70:
                        score=fuzz.partial_ratio(name,author)
                        if score>=90:
                            entry["authors"][i]["external_ids"]=[{"source":"scholar","id":reg["profiles"][name]}]
                            break
        
    return entry

In [24]:
scholar_reg=scholar.find_one({"doi":doi.lower()})
iu_scholar=parse_scholar(scholar_reg)
iu_scholar["authors"]

[{'full_name': 'Ernest Ma'},
 {'full_name': 'Diego Restrepo',
  'external_ids': [{'source': 'scholar', 'id': '1sKULCoAAAAJ'}]},
 {'full_name': 'Óscar Zapata',
  'external_ids': [{'source': 'scholar', 'id': '0SEOpKoAAAAJ'}]}]

#### OADOI

In [25]:
def parse_oadoi(reg):
    entry = empty_work()
    entry["updated"]=[{"source":"oadoi","time":int(time())}]
    
    entry["bibliographic_info"]["is_open_access"]=reg["is_oa"] if "is_oa" in reg.keys() else 0
    entry["bibliographic_info"]["open_access_status"]=reg["oa_status"] if "oa_status" in reg.keys() else ""
    
    return entry

In [26]:
oadoi_reg=oadoi.find_one({"doi":doi.lower()})
iu_oadoi=parse_oadoi(oadoi_reg)

#### scienti

In [85]:
def parse_scienti(reg):
    entry=empty_work()
    entry["updated"]=[{"source":"scienti","time":int(time())}]
    lang=classify(reg["TXT_NME_PROD"])[0]
    entry["titles"].append({"title":reg["TXT_NME_PROD"],"lang":lang})
    entry["external_ids"].append({"source":"scienti","id":reg["COD_RH"]})
    if "TXT_DOI" in reg.keys():
        entry["external_ids"].append({"source":"doi","id":reg["TXT_DOI"].lower()})
    entry["external_urls"].append({"source":"scienti","url":reg["TXT_WEB_PRODUCTO"]})
    entry["ranking"].append({"date":"","rank":reg["SGL_CATEGORIA"],"source":"scienti"})
    entry["types"].append({"source":"scienti","type":reg["product_type"][0]["TXT_NME_TIPO_PRODUCTO"]})
    if "product_type" in reg["product_type"][0].keys():
        typ=reg["product_type"][0]["product_type"][0]["TXT_NME_TIPO_PRODUCTO"]
        entry["types"].append({"source":"scienti","type":typ})
    
    details=reg["details"][0]["article"][0]
    try:
        entry["bibliographic_info"]["start_page"]=details["TXT_PAGINA_INICIAL"]
    except:
        pass
    try:
        entry["bibliographic_info"]["end_page"]=details["TXT_PAGINA_FINAL"]
    except:
        pass
    try:
        entry["bibliographic_info"]["volume"]=details["TXT_VOLUMEN_REVISTA"]
    except:
        pass
    try:
        entry["bibliographic_info"]["issue"]=details["TXT_FASCICULO_REVISTA"]
    except:
        pass
    
    #source section
    source={"external_ids":[]}
    if "journal" in details.keys():
        journal=details["journal"][0]
        source["title"]=journal["TXT_NME_REVISTA"]
        if "TXT_ISSN_REF_SEP" in journal.keys():
            source["external_ids"].append({"source":"issn","id":journal["TXT_ISSN_REF_SEP"]})
        if "COD_REVISTA" in journal.keys():
            source["external_ids"].append({"source":"scienti","id":journal["COD_REVISTA"]})
    
    #authors section
    affiliations=[]
    if "group" in reg.keys():
        group=reg["group"][0]
        affiliations.append({
            "external_ids":[{"source":"scienti","id":group["COD_ID_GRUPO"]}],
            "name":group["NME_GRUPO"]
        })
        if "institution" in group.keys():
            inst=group["institution"][0]
            affiliations.append({
                    "external_ids":[{"source":"scienti","id":inst["COD_INST"]}],
                    "name":inst["NME_INST"]
                })
    author=reg["author"][0]
    author_entry={
        "full_name":author["TXT_TOTAL_NAMES"],
        "types":[],
        "affiliations":[{"name":affiliations}],
        "external_ids":[{"source":"scienti","id":author["COD_RH"]}]
    }
    if author["TPO_DOCUMENTO_IDENT"]=="P":
        author_entry["external_ids"].append({"source":"Passport","id":author["NRO_DOCUMENTO_IDENT"]})
    if author["TPO_DOCUMENTO_IDENT"]=="C":
        author_entry["external_ids"].append({"source":"Cédula de Ciudadanía","id":author["NRO_DOCUMENTO_IDENT"]})
    if author["TPO_DOCUMENTO_IDENT"]=="E":
        author_entry["external_ids"].append({"source":"Cédula de Extranjería","id":author["NRO_DOCUMENTO_IDENT"]})
    
    return entry

In [86]:
reg_scienti=scienti.find_one({"TXT_DOI":doi})
iu_scienti=parse_scienti(reg_scienti)

### Processing different papers from different sources to evaluate the missing information on each one of them

In [104]:
full_data=[]

In [105]:
#openalex, wos, scopus, scholar, oadoi, scienti, and puntaje
openalex_reg=openalexco["works"].find_one({"doi":"https://doi.org/"+doi.lower()})
iu_openalex=parse_openalex(openalex_reg)

entry=iu_openalex.copy()
entry["updated"].extend(iu_wos["updated"])
entry["updated"].extend(iu_scopus["updated"])
entry["updated"].extend(iu_scholar["updated"])
entry["updated"].extend(iu_puntaje["updated"])
entry["updated"].extend(iu_oadoi["updated"])
entry["updated"].extend(iu_scienti["updated"])
entry["abstract"]=iu_scopus["abstract"]
entry["keywords"]=iu_scopus["keywords"]

if iu_scholar["citations_count"]:
    entry["citations_count"].extend(iu_scholar["citations_count"])
if iu_wos["citations_count"]:
    entry["citations_count"].extend(iu_wos["citations_count"])
if iu_scopus["citations_count"]:
    entry["citations_count"].extend(iu_scopus["citations_count"])

sources=[ext["source"] for ext in entry["external_ids"]]
ids=[ext["id"] for ext in entry["external_ids"]]
for ext in iu_wos["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scopus["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scholar["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_puntaje["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_oadoi["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scienti["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
        
sources=[ext["source"] for ext in entry["external_urls"]]
urls=[ext["url"] for ext in entry["external_urls"]]
for ext in iu_wos["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scopus["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scholar["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_puntaje["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_oadoi["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scienti["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
        
sources=[ext["source"] for ext in entry["types"]]
types=[ext["type"] for ext in entry["types"]]
for ext in iu_wos["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scopus["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scholar["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_puntaje["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_oadoi["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scienti["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
        
entry["ranking"]=iu_scienti["ranking"]
        
#bibliographic info
for key,val in iu_wos["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scopus["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_oadoi["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scholar["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_puntaje["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scienti["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
        
#source searching
source_db=None
for ext in entry["source"]["external_ids"]:
    source_db=colombia["sources"].find_one({"external_ids.id":ext["id"]})
    if source_db:
        break
if source_db:
    entry["source"]=source_db.copy()
    entry["source"]["id"]=str(source_db["_id"])
    del(entry["source"]["_id"])

del(entry["source"]["external_ids"])

#subjects searching
for subjects in entry["subjects"]:
    for i,sub in enumerate(subjects["subjects"]):
        for ext in sub["external_ids"]:
            sub_db=colombia["subjects"].find_one({"external_ids.id":ext["id"]})
            if sub_db:
                entry["subjects"][0]["subjects"][i]=sub_db.copy()
                entry["subjects"][0]["subjects"][i]["id"]=str(sub_db["_id"])
                del(entry["subjects"][0]["subjects"][i]["relations"])
                del(entry["subjects"][0]["subjects"][i]["_id"])
                break
        

#improve authors and affiliations info
au_name_list=[au["full_name"] for au in entry["authors"]]
for j,data in enumerate([iu_wos,iu_scholar,iu_scienti,iu_puntaje,iu_scopus]):
    for author in data["authors"]:
        idx=None
        match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.ratio)
        print("Ratio: ",score,author["full_name"],match)
        if score>=70:
            idx=au_name_list.index(match)
        elif score>50:
            match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.partial_ratio)
            print("Partial ratio: ",score,author["full_name"],match)
            if score>=80:
                idx=au_name_list.index(match)
            elif score>60:
                match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.token_sort_ratio)
                print("Token sort ratio: ",score,author["full_name"],match)
                if score>=99:
                    idx=au_name_list.index(match)
        if idx:
            sources=[ext["source"] for ext in entry["authors"][idx]["external_ids"]]
            ids=[ext["id"] for ext in entry["authors"][idx]["external_ids"]]
            for ext in author["external_ids"]:
                if not ext["id"] in ids:
                    entry["authors"][idx]["external_ids"].append(ext)
                    sources.append(ext["source"])
                    ids.append(ext["id"])
        #Create the same loop as above to improve affiliations
        
#search authors and affiliations in db
for i,author in enumerate(entry["authors"]):
    author_db=None
    for ext in author["external_ids"]:
        author_db=colombia["person"].find_one({"external_ids.id":ext["id"]})
        if author_db:
            break
    if author_db:
        sources=[ext["source"] for ext in author_db["external_ids"]]
        ids=[ext["id"] for ext in author_db["external_ids"]]
        for ext in author["external_ids"]:
            if not ext["id"] in ids:
                author_db["external_ids"].append(ext)
                sources.append(ext["source"])
                ids.append(ext["id"])
        entry["authors"][i]=author_db
        entry["authors"][i]["id"]=str(author_db["_id"])
        del(entry["authors"][i]["_id"])
    else:
        for j,aff in enumerate(author["affiliations"]):
            aff_db=None
            for ext in aff["external_ids"]:
                aff_db=colombia["affiliations"].find_one({"external_ids.id":ext["id"]})
                if aff_db:
                    break
            if aff_db:
                entry["authors"][i]["affiliations"][j]=aff_db
                entry["authors"][i]["affiliations"][j]["id"]=str(aff_db["_id"])
                del(entry["authors"][i]["affiliations"][j]["_id"])

entry["author_count"]=len(entry["authors"])
full_data.append(entry)
#entry["authors"]

Ratio:  63 Ma, Ernest Ernest Ma
Partial ratio:  80 Ma, Ernest Ernest Ma
Ratio:  55 Restrepo, Diego Diego Restrepo
Partial ratio:  50 Restrepo, Diego Diego Restrepo
Ratio:  48 Zapata, Oscar Óscar Zapata
Ratio:  100 Ernest Ma Ernest Ma
Ratio:  100 Diego Restrepo Diego Restrepo
Ratio:  100 Óscar Zapata Óscar Zapata
Ratio:  56 Oscar Alberto Zapata Noreña Óscar Zapata
Partial ratio:  67 Oscar Alberto Zapata Noreña Óscar Zapata
Token sort ratio:  59 Oscar Alberto Zapata Noreña Óscar Zapata
Ratio:  60 Diego Alejandro Restrepo Quintero Diego Restrepo
Partial ratio:  79 Diego Alejandro Restrepo Quintero Diego Restrepo
Token sort ratio:  60 Diego Alejandro Restrepo Quintero Diego Restrepo
Ratio:  29 Ma, E., Ernest Ma
Ratio:  64 Restrepo, D., Diego Restrepo
Partial ratio:  84 Restrepo, D., Diego Restrepo
Ratio:  57 Zapata, Ó., Óscar Zapata
Partial ratio:  80 Zapata, Ó., Óscar Zapata


In [None]:
entry

In [97]:
#openalex, wos, scopus, scholar, oadoi, scienti
openalex_reg=openalexco["works"].find_one({"doi":"https://doi.org/"+doi.lower()})
iu_openalex=parse_openalex(openalex_reg)

entry=iu_openalex.copy()
entry["updated"].extend(iu_wos["updated"])
entry["updated"].extend(iu_scopus["updated"])
entry["updated"].extend(iu_scholar["updated"])
entry["updated"].extend(iu_oadoi["updated"])
entry["updated"].extend(iu_scienti["updated"])
entry["abstract"]=iu_scopus["abstract"]
entry["keywords"]=iu_scopus["keywords"]

if iu_scholar["citations_count"]:
    entry["citations_count"].extend(iu_scholar["citations_count"])
if iu_wos["citations_count"]:
    entry["citations_count"].extend(iu_wos["citations_count"])
if iu_scopus["citations_count"]:
    entry["citations_count"].extend(iu_scopus["citations_count"])

sources=[ext["source"] for ext in entry["external_ids"]]
ids=[ext["id"] for ext in entry["external_ids"]]
for ext in iu_wos["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scopus["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scholar["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_oadoi["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scienti["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
        
sources=[ext["source"] for ext in entry["external_urls"]]
urls=[ext["url"] for ext in entry["external_urls"]]
for ext in iu_wos["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scopus["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scholar["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_oadoi["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scienti["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
        
sources=[ext["source"] for ext in entry["types"]]
types=[ext["type"] for ext in entry["types"]]
for ext in iu_wos["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scopus["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scholar["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_oadoi["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scienti["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
        
entry["ranking"]=iu_scienti["ranking"]

#bibliographic info
for key,val in iu_wos["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scopus["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_oadoi["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scholar["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scienti["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
        
#source searching
source_db=None
for ext in entry["source"]["external_ids"]:
    source_db=colombia["sources"].find_one({"external_ids.id":ext["id"]})
    if source_db:
        break
if source_db:
    entry["source"]=source_db.copy()
    entry["source"]["id"]=str(source_db["_id"])
    del(entry["source"]["_id"])

del(entry["source"]["external_ids"])

#subjects searching
for subjects in entry["subjects"]:
    for i,sub in enumerate(subjects["subjects"]):
        for ext in sub["external_ids"]:
            sub_db=colombia["subjects"].find_one({"external_ids.id":ext["id"]})
            if sub_db:
                entry["subjects"][0]["subjects"][i]=sub_db.copy()
                entry["subjects"][0]["subjects"][i]["id"]=str(sub_db["_id"])
                del(entry["subjects"][0]["subjects"][i]["relations"])
                del(entry["subjects"][0]["subjects"][i]["_id"])
                break
        

#improve authors and affiliations info
au_name_list=[au["full_name"] for au in entry["authors"]]
for j,data in enumerate([iu_wos,iu_scholar,iu_scienti,iu_scopus]):
    for author in data["authors"]:
        idx=None
        match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.ratio)
        print("Ratio: ",score,author["full_name"],match)
        if score>=70:
            idx=au_name_list.index(match)
        elif score>50:
            match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.partial_ratio)
            print("Partial ratio: ",score,author["full_name"],match)
            if score>=80:
                idx=au_name_list.index(match)
            elif score>60:
                match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.token_sort_ratio)
                print("Token sort ratio: ",score,author["full_name"],match)
                if score>=99:
                    idx=au_name_list.index(match)
        if idx:
            sources=[ext["source"] for ext in entry["authors"][idx]["external_ids"]]
            ids=[ext["id"] for ext in entry["authors"][idx]["external_ids"]]
            for ext in author["external_ids"]:
                if not ext["id"] in ids:
                    entry["authors"][idx]["external_ids"].append(ext)
                    sources.append(ext["source"])
                    ids.append(ext["id"])
        #Create the same loop as above to improve affiliations
        
#search authors and affiliations in db
for i,author in enumerate(entry["authors"]):
    author_db=None
    for ext in author["external_ids"]:
        author_db=colombia["person"].find_one({"external_ids.id":ext["id"]})
        if author_db:
            break
    if author_db:
        sources=[ext["source"] for ext in author_db["external_ids"]]
        ids=[ext["id"] for ext in author_db["external_ids"]]
        for ext in author["external_ids"]:
            if not ext["id"] in ids:
                author_db["external_ids"].append(ext)
                sources.append(ext["source"])
                ids.append(ext["id"])
        entry["authors"][i]=author_db
        entry["authors"][i]["id"]=str(author_db["_id"])
        del(entry["authors"][i]["_id"])
    else:
        for j,aff in enumerate(author["affiliations"]):
            aff_db=None
            for ext in aff["external_ids"]:
                aff_db=colombia["affiliations"].find_one({"external_ids.id":ext["id"]})
                if aff_db:
                    break
            if aff_db:
                entry["authors"][i]["affiliations"][j]=aff_db
                entry["authors"][i]["affiliations"][j]["id"]=str(aff_db["_id"])
                del(entry["authors"][i]["affiliations"][j]["_id"])

entry["author_count"]=len(entry["authors"])
full_data.append(entry)
#entry["authors"]

Ratio:  63 Ma, Ernest Ernest Ma
Partial ratio:  80 Ma, Ernest Ernest Ma
Ratio:  55 Restrepo, Diego Diego Restrepo
Partial ratio:  50 Restrepo, Diego Diego Restrepo
Ratio:  48 Zapata, Oscar Óscar Zapata
Ratio:  100 Ernest Ma Ernest Ma
Ratio:  100 Diego Restrepo Diego Restrepo
Ratio:  100 Óscar Zapata Óscar Zapata
Ratio:  29 Ma, E., Ernest Ma
Ratio:  64 Restrepo, D., Diego Restrepo
Partial ratio:  84 Restrepo, D., Diego Restrepo
Ratio:  57 Zapata, Ó., Óscar Zapata
Partial ratio:  80 Zapata, Ó., Óscar Zapata


In [103]:
#openalex, wos, scopus, scholar, scienti
openalex_reg=openalexco["works"].find_one({"doi":"https://doi.org/"+doi.lower()})
iu_openalex=parse_openalex(openalex_reg)

entry=iu_openalex.copy()
entry["updated"].extend(iu_wos["updated"])
entry["updated"].extend(iu_scopus["updated"])
entry["updated"].extend(iu_scholar["updated"])
entry["updated"].extend(iu_scienti["updated"])
entry["abstract"]=iu_scopus["abstract"]
entry["keywords"]=iu_scopus["keywords"]

if iu_scholar["citations_count"]:
    entry["citations_count"].extend(iu_scholar["citations_count"])
if iu_wos["citations_count"]:
    entry["citations_count"].extend(iu_wos["citations_count"])
if iu_scopus["citations_count"]:
    entry["citations_count"].extend(iu_scopus["citations_count"])

sources=[ext["source"] for ext in entry["external_ids"]]
ids=[ext["id"] for ext in entry["external_ids"]]
for ext in iu_wos["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scopus["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scholar["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scienti["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])

        
sources=[ext["source"] for ext in entry["external_urls"]]
urls=[ext["url"] for ext in entry["external_urls"]]
for ext in iu_wos["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scopus["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scholar["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scienti["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])

        
sources=[ext["source"] for ext in entry["types"]]
types=[ext["type"] for ext in entry["types"]]
for ext in iu_wos["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scopus["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scholar["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scienti["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
        
entry["ranking"]=iu_scienti["ranking"]

#bibliographic info
for key,val in iu_wos["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scopus["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scholar["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scienti["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
        
#source searching
source_db=None
for ext in entry["source"]["external_ids"]:
    source_db=colombia["sources"].find_one({"external_ids.id":ext["id"]})
    if source_db:
        break
if source_db:
    entry["source"]["id"]=str(source_db["_id"])

del(entry["source"]["external_ids"])

#subjects searching
for subjects in entry["subjects"]:
    for i,sub in enumerate(subjects["subjects"]):
        for ext in sub["external_ids"]:
            sub_db=colombia["subjects"].find_one({"external_ids.id":ext["id"]})
            if sub_db:
                entry["subjects"][0]["subjects"][i]=sub_db.copy()
                entry["subjects"][0]["subjects"][i]["id"]=str(sub_db["_id"])
                del(entry["subjects"][0]["subjects"][i]["relations"])
                del(entry["subjects"][0]["subjects"][i]["_id"])
                break
        

#improve authors and affiliations info
au_name_list=[au["full_name"] for au in entry["authors"]]
for j,data in enumerate([iu_wos,iu_scholar,iu_scienti,iu_scopus]):
    for author in data["authors"]:
        idx=None
        match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.ratio)
        print("Ratio: ",score,author["full_name"],match)
        if score>=70:
            idx=au_name_list.index(match)
        elif score>50:
            match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.partial_ratio)
            print("Partial ratio: ",score,author["full_name"],match)
            if score>=80:
                idx=au_name_list.index(match)
            elif score>60:
                match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.token_sort_ratio)
                print("Token sort ratio: ",score,author["full_name"],match)
                if score>=99:
                    idx=au_name_list.index(match)
        if idx:
            sources=[ext["source"] for ext in entry["authors"][idx]["external_ids"]]
            ids=[ext["id"] for ext in entry["authors"][idx]["external_ids"]]
            for ext in author["external_ids"]:
                if not ext["id"] in ids:
                    entry["authors"][idx]["external_ids"].append(ext)
                    sources.append(ext["source"])
                    ids.append(ext["id"])
        #Create the same loop as above to improve affiliations
        
#search authors and affiliations in db
for i,author in enumerate(entry["authors"]):
    author_db=None
    for ext in author["external_ids"]:
        author_db=colombia["person"].find_one({"external_ids.id":ext["id"]})
        if author_db:
            break
    if author_db:
        sources=[ext["source"] for ext in author_db["external_ids"]]
        ids=[ext["id"] for ext in author_db["external_ids"]]
        for ext in author["external_ids"]:
            if not ext["id"] in ids:
                author_db["external_ids"].append(ext)
                sources.append(ext["source"])
                ids.append(ext["id"])
        entry["authors"][i]=author_db
        entry["authors"][i]["id"]=str(author_db["_id"])
        del(entry["authors"][i]["_id"])
    else:
        for j,aff in enumerate(author["affiliations"]):
            aff_db=None
            for ext in aff["external_ids"]:
                aff_db=colombia["affiliations"].find_one({"external_ids.id":ext["id"]})
                if aff_db:
                    break
            if aff_db:
                entry["authors"][i]["affiliations"][j]=aff_db
                entry["authors"][i]["affiliations"][j]["id"]=str(aff_db["_id"])
                del(entry["authors"][i]["affiliations"][j]["_id"])

entry["author_count"]=len(entry["authors"])
full_data.append(entry)
#entry["authors"]

Ratio:  63 Ma, Ernest Ernest Ma
Partial ratio:  80 Ma, Ernest Ernest Ma
Ratio:  55 Restrepo, Diego Diego Restrepo
Partial ratio:  50 Restrepo, Diego Diego Restrepo
Ratio:  48 Zapata, Oscar Óscar Zapata
Ratio:  100 Ernest Ma Ernest Ma
Ratio:  100 Diego Restrepo Diego Restrepo
Ratio:  100 Óscar Zapata Óscar Zapata
Ratio:  29 Ma, E., Ernest Ma
Ratio:  64 Restrepo, D., Diego Restrepo
Partial ratio:  84 Restrepo, D., Diego Restrepo
Ratio:  57 Zapata, Ó., Óscar Zapata
Partial ratio:  80 Zapata, Ó., Óscar Zapata


In [99]:
#openalex, wos, scopus
openalex_reg=openalexco["works"].find_one({"doi":"https://doi.org/"+doi.lower()})
iu_openalex=parse_openalex(openalex_reg)

entry=iu_openalex.copy()
entry["updated"].extend(iu_wos["updated"])
entry["updated"].extend(iu_scopus["updated"])
entry["updated"].extend(iu_scienti["updated"])
entry["abstract"]=iu_scopus["abstract"]
entry["keywords"]=iu_scopus["keywords"]

if iu_wos["citations_count"]:
    entry["citations_count"].extend(iu_wos["citations_count"])
if iu_scopus["citations_count"]:
    entry["citations_count"].extend(iu_scopus["citations_count"])


sources=[ext["source"] for ext in entry["external_ids"]]
ids=[ext["id"] for ext in entry["external_ids"]]
for ext in iu_wos["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scopus["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scienti["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])

        
sources=[ext["source"] for ext in entry["external_urls"]]
urls=[ext["url"] for ext in entry["external_urls"]]
for ext in iu_wos["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scopus["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scienti["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])

        
sources=[ext["source"] for ext in entry["types"]]
types=[ext["type"] for ext in entry["types"]]
for ext in iu_wos["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scopus["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scienti["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
        
entry["ranking"]=iu_scienti["ranking"]
        
#bibliographic info
for key,val in iu_wos["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scopus["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scienti["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
        
#source searching
source_db=None
for ext in entry["source"]["external_ids"]:
    source_db=colombia["sources"].find_one({"external_ids.id":ext["id"]})
    if source_db:
        break
if source_db:
    entry["source"]["id"]=str(source_db["_id"])

del(entry["source"]["external_ids"])

#subjects searching
for subjects in entry["subjects"]:
    for i,sub in enumerate(subjects["subjects"]):
        for ext in sub["external_ids"]:
            sub_db=colombia["subjects"].find_one({"external_ids.id":ext["id"]})
            if sub_db:
                entry["subjects"][0]["subjects"][i]=sub_db.copy()
                entry["subjects"][0]["subjects"][i]["id"]=str(sub_db["_id"])
                del(entry["subjects"][0]["subjects"][i]["relations"])
                del(entry["subjects"][0]["subjects"][i]["_id"])
                break

        

#improve authors and affiliations info
au_name_list=[au["full_name"] for au in entry["authors"]]
for j,data in enumerate([iu_wos,iu_scienti,iu_scopus]):
    for author in data["authors"]:
        idx=None
        match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.ratio)
        print("Ratio: ",score,author["full_name"],match)
        if score>=70:
            idx=au_name_list.index(match)
        elif score>50:
            match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.partial_ratio)
            print("Partial ratio: ",score,author["full_name"],match)
            if score>=80:
                idx=au_name_list.index(match)
            elif score>60:
                match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.token_sort_ratio)
                print("Token sort ratio: ",score,author["full_name"],match)
                if score>=99:
                    idx=au_name_list.index(match)
        if idx:
            sources=[ext["source"] for ext in entry["authors"][idx]["external_ids"]]
            ids=[ext["id"] for ext in entry["authors"][idx]["external_ids"]]
            for ext in author["external_ids"]:
                if not ext["id"] in ids:
                    entry["authors"][idx]["external_ids"].append(ext)
                    sources.append(ext["source"])
                    ids.append(ext["id"])
        #Create the same loop as above to improve affiliations
        
#search authors and affiliations in db
for i,author in enumerate(entry["authors"]):
    author_db=None
    for ext in author["external_ids"]:
        author_db=colombia["person"].find_one({"external_ids.id":ext["id"]})
        if author_db:
            break
    if author_db:
        sources=[ext["source"] for ext in author_db["external_ids"]]
        ids=[ext["id"] for ext in author_db["external_ids"]]
        for ext in author["external_ids"]:
            if not ext["id"] in ids:
                author_db["external_ids"].append(ext)
                sources.append(ext["source"])
                ids.append(ext["id"])
        entry["authors"][i]=author_db
        entry["authors"][i]["id"]=str(author_db["_id"])
        del(entry["authors"][i]["_id"])
    else:
        for j,aff in enumerate(author["affiliations"]):
            aff_db=None
            for ext in aff["external_ids"]:
                aff_db=colombia["affiliations"].find_one({"external_ids.id":ext["id"]})
                if aff_db:
                    break
            if aff_db:
                entry["authors"][i]["affiliations"][j]=aff_db
                entry["authors"][i]["affiliations"][j]["id"]=str(aff_db["_id"])
                del(entry["authors"][i]["affiliations"][j]["_id"])

entry["author_count"]=len(entry["authors"])
full_data.append(entry)
#entry["authors"]

Ratio:  63 Ma, Ernest Ernest Ma
Partial ratio:  80 Ma, Ernest Ernest Ma
Ratio:  55 Restrepo, Diego Diego Restrepo
Partial ratio:  50 Restrepo, Diego Diego Restrepo
Ratio:  48 Zapata, Oscar Óscar Zapata
Ratio:  29 Ma, E., Ernest Ma
Ratio:  64 Restrepo, D., Diego Restrepo
Partial ratio:  84 Restrepo, D., Diego Restrepo
Ratio:  57 Zapata, Ó., Óscar Zapata
Partial ratio:  80 Zapata, Ó., Óscar Zapata


In [100]:
#openalex, wos
openalex_reg=openalexco["works"].find_one({"doi":"https://doi.org/"+doi.lower()})
iu_openalex=parse_openalex(openalex_reg)

entry=iu_openalex.copy()
entry["updated"].extend(iu_wos["updated"])
entry["updated"].extend(iu_scienti["updated"])
entry["abstract"]=iu_wos["abstract"]

if iu_wos["citations_count"]:
    entry["citations_count"].extend(iu_wos["citations_count"])


sources=[ext["source"] for ext in entry["external_ids"]]
ids=[ext["id"] for ext in entry["external_ids"]]
for ext in iu_wos["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])
for ext in iu_scienti["external_ids"]:
    if not ext["id"] in ids:
        entry["external_ids"].append(ext)
        sources.append(ext["source"])
        ids.append(ext["id"])

        
sources=[ext["source"] for ext in entry["external_urls"]]
urls=[ext["url"] for ext in entry["external_urls"]]
for ext in iu_wos["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])
for ext in iu_scienti["external_urls"]:
    if not ext["url"] in urls:
        entry["external_urls"].append(ext)
        sources.append(ext["source"])
        urls.append(ext["url"])

        
sources=[ext["source"] for ext in entry["types"]]
types=[ext["type"] for ext in entry["types"]]
for ext in iu_wos["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
for ext in iu_scienti["types"]:
    if not ext["type"] in types:
        entry["types"].append(ext)
        sources.append(ext["source"])
        types.append(ext["type"])
        
entry["ranking"]=iu_scienti["ranking"]

#bibliographic info
for key,val in iu_wos["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
for key,val in iu_scienti["bibliographic_info"].items():
    if not key in entry["bibliographic_info"].keys():
        entry["bibliographic_info"][key]=val
        
#source searching
source_db=None
for ext in entry["source"]["external_ids"]:
    source_db=colombia["sources"].find_one({"external_ids.id":ext["id"]})
    if source_db:
        break
if source_db:
    entry["source"]["id"]=str(source_db["_id"])

del(entry["source"]["external_ids"])

#subjects searching
for subjects in entry["subjects"]:
    for i,sub in enumerate(subjects["subjects"]):
        for ext in sub["external_ids"]:
            sub_db=colombia["subjects"].find_one({"external_ids.id":ext["id"]})
            if sub_db:
                entry["subjects"][0]["subjects"][i]=sub_db.copy()
                entry["subjects"][0]["subjects"][i]["id"]=str(sub_db["_id"])
                del(entry["subjects"][0]["subjects"][i]["relations"])
                del(entry["subjects"][0]["subjects"][i]["_id"])
                break
        

#improve authors and affiliations info
au_name_list=[au["full_name"] for au in entry["authors"]]
for j,data in enumerate([iu_wos,iu_scienti]):
    for author in data["authors"]:
        idx=None
        match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.ratio)
        print("Ratio: ",score,author["full_name"],match)
        if score>=70:
            idx=au_name_list.index(match)
        elif score>50:
            match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.partial_ratio)
            print("Partial ratio: ",score,author["full_name"],match)
            if score>=80:
                idx=au_name_list.index(match)
            elif score>60:
                match,score=process.extractOne(author["full_name"],au_name_list,scorer=fuzz.token_sort_ratio)
                print("Token sort ratio: ",score,author["full_name"],match)
                if score>=99:
                    idx=au_name_list.index(match)
        if idx:
            sources=[ext["source"] for ext in entry["authors"][idx]["external_ids"]]
            ids=[ext["id"] for ext in entry["authors"][idx]["external_ids"]]
            for ext in author["external_ids"]:
                if not ext["id"] in ids:
                    entry["authors"][idx]["external_ids"].append(ext)
                    sources.append(ext["source"])
                    ids.append(ext["id"])
        #Create the same loop as above to improve affiliations
        
#search authors and affiliations in db
for i,author in enumerate(entry["authors"]):
    author_db=None
    for ext in author["external_ids"]:
        author_db=colombia["person"].find_one({"external_ids.id":ext["id"]})
        if author_db:
            break
    if author_db:
        sources=[ext["source"] for ext in author_db["external_ids"]]
        ids=[ext["id"] for ext in author_db["external_ids"]]
        for ext in author["external_ids"]:
            if not ext["id"] in ids:
                author_db["external_ids"].append(ext)
                sources.append(ext["source"])
                ids.append(ext["id"])
        entry["authors"][i]=author_db
        entry["authors"][i]["id"]=str(author_db["_id"])
        del(entry["authors"][i]["_id"])
    else:
        for j,aff in enumerate(author["affiliations"]):
            aff_db=None
            for ext in aff["external_ids"]:
                aff_db=colombia["affiliations"].find_one({"external_ids.id":ext["id"]})
                if aff_db:
                    break
            if aff_db:
                entry["authors"][i]["affiliations"][j]=aff_db
                entry["authors"][i]["affiliations"][j]["id"]=str(aff_db["_id"])
                del(entry["authors"][i]["affiliations"][j]["_id"])

entry["author_count"]=len(entry["authors"])
full_data.append(entry)
#entry["authors"]

Ratio:  63 Ma, Ernest Ernest Ma
Partial ratio:  80 Ma, Ernest Ernest Ma
Ratio:  55 Restrepo, Diego Diego Restrepo
Partial ratio:  50 Restrepo, Diego Diego Restrepo
Ratio:  48 Zapata, Oscar Óscar Zapata


In [101]:
#openalex
openalex_reg=openalexco["works"].find_one({"doi":"https://doi.org/"+doi.lower()})
iu_openalex=parse_openalex(openalex_reg)

entry=iu_openalex.copy()

#subjects searching
for subjects in entry["subjects"]:
    for i,sub in enumerate(subjects["subjects"]):
        for ext in sub["external_ids"]:
            sub_db=colombia["subjects"].find_one({"external_ids.id":ext["id"]})
            if sub_db:
                entry["subjects"][0]["subjects"][i]=sub_db.copy()
                entry["subjects"][0]["subjects"][i]["id"]=str(sub_db["_id"])
                del(entry["subjects"][0]["subjects"][i]["relations"])
                del(entry["subjects"][0]["subjects"][i]["_id"])
                break

#source searching
source_db=None
for ext in entry["source"]["external_ids"]:
    source_db=colombia["sources"].find_one({"external_ids.id":ext["id"]})
    if source_db:
        break
if source_db:
    entry["source"]["id"]=str(source_db["_id"])

del(entry["source"]["external_ids"])

for i,author in enumerate(entry["authors"]):
    author_db=None
    for ext in author["external_ids"]:
        author_db=colombia["person"].find_one({"external_ids.id":ext["id"]})
        if author_db:
            break
    if author_db:
        sources=[ext["source"] for ext in author_db["external_ids"]]
        ids=[ext["id"] for ext in author_db["external_ids"]]
        for ext in author["external_ids"]:
            if not ext["id"] in ids:
                author_db["external_ids"].append(ext)
                sources.append(ext["source"])
                ids.append(ext["id"])
        entry["authors"][i]=author_db
        entry["authors"][i]["id"]=str(author_db["_id"])
        del(entry["authors"][i]["_id"])
    else:
        for j,aff in enumerate(author["affiliations"]):
            aff_db=None
            for ext in aff["external_ids"]:
                aff_db=colombia["affiliations"].find_one({"external_ids.id":ext["id"]})
                if aff_db:
                    break
            if aff_db:
                entry["authors"][i]["affiliations"][j]=aff_db
                entry["authors"][i]["affiliations"][j]["id"]=str(aff_db["_id"])
                del(entry["authors"][i]["affiliations"][j]["_id"])

entry["author_count"]=len(entry["authors"])
full_data.append(entry)

### Report

In [102]:
with open("/current/data/colombia/sample_udea/sample_works.json", "w") as outfile:
    json.dump(full_data, outfile,cls=JSONEncoder)