# OpenAIRE Data EDA

## Preamble

In [None]:
%run notebook_preamble.ipy

# Use the fully-qualified option name: the bare 'max_columns' pattern can match
# more than one registered option, which pandas rejects as ambiguous.
pd.set_option('display.max_columns', 99)

In [None]:
import seaborn as sns
import xmltodict
import pyjq
import boto3
import io
from bs4 import BeautifulSoup

from eu_funding.visualization.visualize import pdf_cdf
from eu_funding.utils.misc_utils import print_nested_structure
from eu_funding.data.s3_transfer import get_files_from_s3
from eu_funding.data.openaire import parse_openaire_records, parse_publications_soup

## Data Structure

### Projects

In [None]:
BUCKET = 'im-eurito'
FOLDER = 'external/openaire/projectssoups'
KEY_PREFIX = 'soup'

In [None]:
# Peek at the first file only. next() takes exactly one item, so a lazy
# iterator is not advanced to a second file just to break out of a loop.
first_file = next(
    iter(get_files_from_s3(bucket=BUCKET, folder=FOLDER, key_prefix=KEY_PREFIX)),
    None,
)
if first_file is not None:
    soup = BeautifulSoup(first_file)

In [None]:
print(soup.prettify()[:10000])

In [None]:
records = []
for file in get_files_from_s3(bucket=BUCKET, folder=FOLDER, key_prefix=KEY_PREFIX):
    records.extend(parse_openaire_records(file))

In [None]:
df = pd.DataFrame().from_records(records)

In [None]:
pd.options.display.max_columns = 999

In [None]:
df.to_csv(os.path.join(inter_data_path, 'openaire_projects.csv'), index=False)

### Publications

In [None]:
BUCKET = 'im-eurito'
FOLDER = 'external/openaire/publicationssoups'
KEY_PREFIX = 'soup'

In [None]:
def load_publications():
    """Parse every OpenAIRE publication soup file on disk into records.

    Reads each entry of ``openaire_publication_data_path`` whose name contains
    ``.txt``, parses its bytes with BeautifulSoup and flattens the output of
    ``parse_publications_soup`` into one list.

    Returns:
        list: The records from all parsed files, concatenated in sorted
            file-name order.
    """
    records = []
    # os.listdir gives no ordering guarantee; sorting keeps the record order
    # (and anything derived from it) the same from run to run.
    for file_name in sorted(os.listdir(openaire_publication_data_path)):
        if '.txt' not in file_name:
            continue
        with open(os.path.join(openaire_publication_data_path, file_name), mode='rb') as f:
            soup = BeautifulSoup(f.read())
        records.extend(parse_publications_soup(soup))
    return records

In [None]:
records = load_publications()

In [None]:
df = pd.DataFrame().from_records(records)

In [None]:
dfs = []

# Write the parsed records out 1000 at a time, numbering the files from 001,
# and keep each chunk's frame in memory as well.
for i, record in enumerate(chunks(records, 1000), start=1):
    df = pd.DataFrame.from_records(record)
    csv_name = 'publications_parsed_{:03}.csv'.format(i)
    df.to_csv(os.path.join(openaire_publication_data_path, 'csv', csv_name), index=False)
    dfs.append(df)

In [None]:
# publications_df = pd.concat(dfs)
publications_df = pd.read_csv(os.path.join(inter_data_path, 'openaire_publications_20190702.csv'))

### Fetch Missing PubMed DOIs

In [None]:
import requests
from time import sleep
from eu_funding.utils.misc_utils import chunks

In [None]:
def get_id_converter(pub_ids, id_type, timeout=60):
    """Query the NCBI PMC ID Converter service for a batch of publication IDs.

    Args:
        pub_ids: Iterable of publication identifiers; each one is passed
            through ``str`` and the batch is sent as one ``ids`` parameter.
        id_type: Value sent as the service's ``idtype`` parameter.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the whole conversion run indefinitely.

    Returns:
        bytes: The raw response body (``response.content``).
    """
    id_converter_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/'
    params = {
        'idtype': id_type,
        'ids': ', '.join(str(i) for i in pub_ids),
        'email': 'george.richardson@nesta.org.uk',
        'tool': 'eu_funding_analytics',
    }
    response = requests.get(
        url=id_converter_url,
        params=params,
        timeout=timeout,
    )
    return response.content
    
def parse_id_converter_result(results, id_type):
    """Collect the attributes of every ``record`` element in a converter response.

    Args:
        results: Response body to hand to BeautifulSoup.
        id_type: Value stored under the ``pid_type`` key of each record.

    Returns:
        list: One attribute dict per ``record`` element, each tagged with
            ``pid_type``.
    """
    parsed = BeautifulSoup(results)
    tagged = []
    for element in parsed.findAll('record'):
        attributes = element.attrs
        attributes['pid_type'] = id_type
        tagged.append(attributes)
    return tagged

def convert_ids(pub_ids, id_type):
    """Convert publication IDs through the ID converter, 200 IDs per request.

    Args:
        pub_ids: Sequence of identifiers to convert.
        id_type: ID type forwarded to the request and stamped on each record.

    Returns:
        list: The parsed records from every batch, in batch order.
    """
    converted = []
    for batch in chunks(pub_ids, 200):
        raw = get_id_converter(batch, id_type)
        converted += parse_id_converter_result(raw, id_type)
        # Pause for a second between consecutive requests
        sleep(1)
    return converted

In [None]:
id_type = 'pmid'
pub_ids = publications_df[publications_df['pid_type'] == id_type]['pid'].values

pmid_converted_ids = convert_ids(pub_ids, id_type)

In [None]:
def doi_col(pid, pid_type):
    """Return *pid* when *pid_type* is ``'doi'``, otherwise NaN."""
    return pid if pid_type == 'doi' else np.nan

publications_df['doi'] = publications_df['pid']

In [None]:
def apply_map(element, mapping):
    """Translate *element* through *mapping*, passing unknown elements through unchanged."""
    return mapping[element] if element in mapping else element

In [None]:
# Tabulate the converter records, then pair each PubMed ID with its DOI
pmid_df = pd.DataFrame.from_records(pmid_converted_ids)
pmid_doi_map = dict(zip(pmid_df['pmid'], pmid_df['doi']))

In [None]:
publications_df['doi'] = publications_df['doi'].apply(lambda x: apply_map(x, pmid_doi_map))

In [None]:
id_type = 'pmc'
pub_ids = publications_df[publications_df['pid_type'] == id_type]['pid'].values

pmcid_converted_ids = convert_ids(pub_ids, 'pmcid')

In [None]:
pmcid_df = pd.DataFrame.from_records(pmcid_converted_ids)
# Index the DOIs by lower-cased PMC ID, then swap matching values in the
# 'doi' column for their DOI
pmcid_doi_map = {}
for pmcid, doi in zip(pmcid_df['pmcid'], pmcid_df['doi']):
    pmcid_doi_map[pmcid.lower()] = doi
publications_df['doi'] = publications_df['doi'].apply(apply_map, mapping=pmcid_doi_map)

In [None]:
publications_df.reset_index(inplace=True)

In [None]:
publications_df.to_csv(os.path.join(inter_data_path, 'openaire_publications.csv'), index=False)

In [None]:
publications_df.head()

In [None]:
# Count publications whose identifier is neither a PubMed ID nor a PMC ID.
# The identifier type is held in 'pid_type'; 'pid' holds the identifier itself,
# so comparing it with the type names would never exclude any row.
len(publications_df[
    (publications_df['pid_type'] != 'pmid')
    & (publications_df['pid_type'] != 'pmc')
])

### MAK Enrichment

#### Functions

In [None]:
from alphabet_detector import AlphabetDetector
import pandas as pd
import requests
import json
from sqlalchemy import create_engine
from sqlalchemy import text as sql_text

import os  # for the environment lookup below

# Inputs for the MAK POST request, including the API key.
# NOTE(review): a subscription key is committed in this file. It should be
# rotated and supplied only through the MAK_SUBSCRIPTION_KEY environment
# variable; the literal remains as a fallback so existing runs keep working.
HEADERS = {
    'Ocp-Apim-Subscription-Key': (
        os.environ.get('MAK_SUBSCRIPTION_KEY')
        or '4774550073674321a53be3e28595c92c'
    ),
    'Content-Type': 'application/x-www-form-urlencoded'
}

# Fields to return from MAK
FIELDS = ["Id","Ti","D","AA.AuN","AA.AuId","F.FId","L","C.CN","E",
          "J.JId","AA.AfId","CC","ECC","AA.AfN","J.JN","F.FN"]


class TitleProcessor(AlphabetDetector):
    '''Turns a pure utf-8 title into the form used for a MAK query.'''

    def process_title(self, title):
        '''Lower-case *title*, blank out unwanted characters and squeeze spaces.

        A character is kept when ``detect_alphabet`` reports at least one
        alphabet for it or when it is numeric; every other character becomes
        a space. Runs of spaces are reduced to one and the ends are stripped.
        '''
        kept = []
        for char in title.lower():
            if len(self.detect_alphabet(char)) > 0 or char.isnumeric():
                kept.append(char)
            else:
                kept.append(" ")
        spaced = "".join(kept)
        # Splitting on a single space leaves empty strings wherever spaces were
        # adjacent; dropping them collapses each run to one space
        words = [word for word in spaced.split(" ") if word]
        return " ".join(words).strip()


def mak_from_titles(raw_titles, call_limit, optional_columns, title_offset=0):
    '''Find matches to titles from the MAK database.

    Titles are normalised with ``TitleProcessor`` and posted to the MAK
    ``evaluate`` endpoint in batches, each batch being an ``OR`` of
    ``Ti='...'`` clauses. A title is matched when a returned entity's ``Ti``
    equals the normalised title exactly. The matched rows of each batch are
    also written to a JSON file under
    ``ext_data_path/mak/openaire_publications``.

    Args:
        raw_titles: An iterable of titles in the form (id, title).
        call_limit: The maximum number of MAK API calls.
            NB: Nesta's allowance is 10,000 per month.
        optional_columns: Mapping of output column name to MAK field. A field
            is either a top-level key (e.g. ``"L"``) or a ``"parent.child"``
            pair (e.g. ``"E.DOI"``). Fields absent from a result are skipped.
        title_offset: Index of the first title to query.

    Returns:
        list of dict: One row per queried title. Unmatched rows hold ``pid``,
            ``title`` and ``matched=False``. Matched rows hold ``pid``,
            ``title``, ``institutes``, ``arxiv_sources``, ``citations``,
            ``date``, ``field_names``, ``matched=True`` and any optional
            columns that were found.

    Raises:
        json.decoder.JSONDecodeError: If a response body is not valid JSON.
        KeyError: If a response carries no ``entities`` key.
    '''
    # Make titles match MAK title format (strip non-alphanums,
    # allowing foreign chars)
    tp = TitleProcessor()
    titles = [(pid, tp.process_title(t)) for pid, t in raw_titles]
    # Each query asks for at most query_count results, and stops taking
    # titles once its clauses reach char_limit characters
    query_count = 1000
    char_limit = 16000

    # Count the number of calls for book-keeping
    calls = 0

    # Iterate until done
    data = []
    while title_offset < len(titles):
        # A soft limit so that we don't overrun the API limit
        if calls >= call_limit:
            break
        calls += 1

        first_title = title_offset
        print('Querying from {}'.format(first_title))
        # Take titles until the combined clause length reaches char_limit.
        # The length starts at zero, so every batch takes at least one title.
        expr_length = 0
        while expr_length < char_limit and title_offset < len(titles):
            expr_length += len("Ti='{}',".format(titles[title_offset][1]))
            title_offset += 1

        # Generate the MAK query (OR statement of titles (Ti))
        titles_subset = titles[first_title:title_offset]
        clauses = ["Ti='" + t + "'" for _, t in titles_subset]
        print("Posting", len(clauses), "queries")
        expr = "expr=OR(" + ','.join(clauses) + ")"

        # Write and launch the query
        query = expr + "&count=" + str(query_count) + "&attributes=" + ",".join(FIELDS)
        r = requests.post('https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate',
                          data=query.encode("utf-8"), headers=HEADERS)
        try:
            js = r.json()
        except json.decoder.JSONDecodeError:
            print("Error with status code ", r.status_code)
            print(r.text)
            raise
        try:
            entities = js["entities"]
        except KeyError:
            # Without entities there is nothing to match against; show the
            # response and stop here rather than failing further down
            print(r.status_code)
            print(r.text)
            raise
        print("Got", len(entities), "results")

        # Append the results to the output
        records = []
        for pid, t in titles_subset:
            row = next((entity for entity in entities if entity["Ti"] == t), None)
            # Default in case no match is found
            if row is None:
                data.append(dict(pid=pid, title=t, matched=False))
                continue
            # If a match was found, extract info
            insts = list(set(author["AfN"] for author in row["AA"] if "AfN" in author))

            # Convert "extended metadata" (E) to json, then extract arxiv IDs
            arxiv_sources = []
            if "E" in row:
                if not isinstance(row["E"], dict):
                    row["E"] = json.loads(row["E"])
                for source in row["E"].get("S", []):
                    if "U" in source and source["U"].startswith("https://arxiv.org/"):
                        arxiv_sources.append(source["U"])
            # Built fresh for every row, so a result without "F" gets an empty
            # list instead of another row's field names
            field_names = [f['FN'] for f in row.get("F", [])]
            # Add the mandatory fields
            data_row = dict(pid=pid, title=t, institutes=insts, arxiv_sources=arxiv_sources,
                            citations=row["CC"], date=row["D"], field_names=field_names,
                            matched=True)
            # Then add optional fields
            for long, spec in optional_columns.items():
                short, _, second = spec.partition(".")
                if short not in row:
                    continue
                if not second:
                    data_row[long] = row[short]
                elif second in row[short]:
                    data_row[long] = row[short][second]
            records.append(data_row)
        # Save the batch's matches in one write; the file name encodes the
        # range of title indices that was queried
        if records:
            with open(os.path.join(
                ext_data_path,
                'mak',
                'openaire_publications',
                f'mak_oa_publications_{first_title}_{title_offset}.json'), 'w') as f:
                json.dump(records, f)
        data.extend(records)
    # Print summary statistics
    nmatch = 0
    nboth = 0
    for row in data:
        if not row["matched"]:
            continue
        nmatch += 1
        if row["citations"] > 0 and len(row["institutes"]) > 0:
            nboth += 1
    print("Made", calls, "calls")
    print("Got", nmatch, "matches from", len(data), "queries, of which",
          nboth, "contained both institutes and citation information")
    # Done
    return data

# Stolen from https://stackoverflow.com/a/434328/1571593
def chunker(seq, size):
    """Lazily yield consecutive slices of *seq*, each at most *size* items long."""
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

# Execute IN statements in chunks
def execute_IN_in_chunks(con, query, chunkable, chunk_size):
    """Run *query* once per chunk of *chunkable* and pool the fetched rows.

    Each chunk is bound, as a tuple, to the ``values`` parameter of the query.

    Returns:
        list: The rows from ``fetchall()`` of every execution, in chunk order.
    """
    rows = []
    for part in chunker(chunkable, chunk_size):
        cursor = con.execute(sql_text(query), values=tuple(part))
        rows.extend(cursor.fetchall())
    return rows

#### Collect

In [None]:
from fuzzywuzzy import fuzz

In [None]:
publications_df.reset_index(inplace=True)

In [None]:
# Assign the filled column back: an in-place fillna on a column selection may
# act on a temporary object and leave the frame itself unchanged.
publications_df['title'] = publications_df['title'].fillna('Title Missing')

In [None]:
n = 2716
optional_columns = dict(language="L", full_title="E.DN",
                        conference="CN", journal="E.BV", doi="E.DOI")

data = mak_from_titles(
    zip(publications_df['index'], publications_df['title'].values), call_limit=500,
    optional_columns=optional_columns, title_offset=n
)
df_magapi = pd.DataFrame(data)

In [None]:
len("expr=OR(Ti='from housing as asset to housing as patrimony policy ideas and the re emergence of the housing question',Ti='experimental analysis of nonlinear impairments in fibre optic transmission systems up to 7 3 thz',Ti='a novel method to measure electronic spectra of cold molecular ions',Ti='dichotomy of short and long thymic stromal lymphopoietin isoforms in inflammatory disorders of the bowel and skin',Ti='tumor associated macrophages as treatment targets in oncology',Ti='quesp and quest revisited fast and accurate quantitative cest experiments',Ti='system level analysis of swipt mimo cellular networks',Ti='antagonist properties of monoclonal antibodies targeting human cd28 role of valency and the heavy chain constant domain',Ti='alemannen franken pfalz oberrhein von den versuchen der landesgeschichte eine heimat zu geben',Ti='l2 induced gain for discrete time switched lur e systems via a suitable lyapunov function',Ti='production of scopularide a in submerged culture with scopulariopsis brevicaulis',Ti='youth political participation in a transition society',Ti='compile time function memoization',Ti='liveness through the lens of agency and causality',Ti='targeting the tumor and its microenvironment by a dual function decoy met receptor',Ti='observation of ph induced protein reorientation at the water surface',Ti='improving uncoordinated collaboration in partially observable domains with imperfect simultaneous action communication',Ti='computational invention of cadences and chord progressions by conceptual chord blending',Ti='energy efficient design for edge caching wireless networks when is coded caching beneficial ',Ti='role of dc sign in lassa virus entry into human dendritic cells ',Ti='mesoscopic moment equations for heat conduction characteristic features and slow fast mode decomposition',Ti='marker free phenotyping of tumor cells by fractal analysis of reflection interference contrast microscopy images',Ti='autonomous reinforcement of behavioral 
sequences in neural dynamics',Ti='concurrent number cruncher a gpu implementation of a general sparse linear solver',Ti='challenges for nanomechanical sensors in biological detection',Ti='measuring economic complexity of countries and products which metric to use ',Ti='youth prospects in a time of economic recession',Ti='the continuing value of twin studies in the omics era',Ti='magnetization and anisotropy of cobalt ferrite thin films',Ti='analysis of pectin mutants and natural accessions of arabidopsis highlights the impact of de methyl esterified homogalacturonan on tissue saccharification',Ti='general automatic human shape and motion capture using volumetric contour cues',Ti='clinical research on neglected tropical diseases challenges and solutions',Ti='individual nsaids and upper gastrointestinal complications',Ti='combinatorial flexibility of cytokine function during human t helper cell differentiation',Ti='analysis of the cross talk of epstein barr virus infected b cells with t cells in the marmoset',Ti='selecting translations to be post edited by sentence level automatic quality evaluation',Ti='pyrochar decomposition under french grassland monitored by 13c natural abundance',Ti='abiotic and biotic processes governing the fate of phenylurea herbicides in soils a review',Ti='dynamic herd simulations cownex a tool to assess nitrogen excretion and efficiency of a dairy cattle herd according management',Ti='the relevance of light in the formation of colloidal metal nanoparticles',Ti='how anatomy shapes dynamics a semi analytical study of the brain at rest by a simple spin model',Ti='darboux integrability and algebraic limit cycles for a class of polynomial differential systems',Ti='atm splicing variants as biomarkers for low dose dexamethasone treatment of a t',Ti='anisotropic critical state theory role of fabric',Ti='beliefs about others intentions determine whether cooperation is the faster choice',Ti='mycotoxin biotransformation by native and commercial 
enzymes present and future perspectives',Ti='ge mediated surface preparation for twin free 3c sic nucleation and growth on low off axis 4h sic substrate',Ti='participatory modelling to support decision making in water management under uncertainty two comparative case studies in the guadiana river basin spain',Ti='innovative instrumentation for eurisol report on the fifth eurisol user group topical meeting the ron cooke hub heslington east campus univ of york uk 15 17 july 2014',Ti='somos o que comemos',Ti='corporate governance value and performance of firms new empirical results on convergence from a large international database',Ti='sharing data for public security',Ti='fgf21 and cardiac physiopathology',Ti='pace simple multi hop scheduling for single radio 802 11 based stub wireless mesh networks',Ti='a simulation study of local defect resonances ldr ',Ti='openaire guidelines 1 1 guidelines for content providers of the openaire information space',Ti='physiologically based pharmacokinetic modeling of perfluoroalkyl substances in the human body',Ti='ethylene carbonate free adiponitrile based electrolytes compatible with graphite anodes',Ti='let me guide you pedagogical interaction style for a robot in children s education',Ti='trinocchio privacy preserving outsourcing by distributed verifiable computation',Ti='mapping phytoplankton blooms in deep subalpine lakes from sentinel 2a and landsat 8',Ti='requirements document wp3 d3 1',Ti='mirri policy on accession',Ti='photonics4all start up challenge report',Ti='injectable rectifiers as microdevices for remote electrical stimulation an alternative to inductive coupling',Ti='systems medicine and integrated care to combat chronic noncommunicable diseases',Ti='correlations between islet autoantibody specificity and the slc30a8 genotype with hla dqb1 and metabolic control in new onset type 1 diabetes',Ti='environmental regulation and competitiveness empirical evidence on the porter hypothesis from european manufacturing 
sectors',Ti='clinical pet imaging of insulinoma and beta cell hyperplasia',Ti='inhibiting receptor tyrosine kinase axl with small molecule inhibitor bms 777607 reduces glioblastoma growth migration and invasion in vitro and in vivo',Ti='democritus an adaptive particle in cell pic code for object plasma interactions',Ti=' mygreenservices un projet en mode living lab pilote par inria sophia antipolis relatif a la co creation de services environnementaux bases sur des capteurs citoyens ville de nice ',Ti='le role du lobe temporal median dans les liens entre musiques et paroles une approche en neuropsychologie et neuro imagerie',Ti='traitrecordj a programming language with traits and records',Ti='observation of poiseuille flow of phonons in black phosphorus',Ti='guanylate binding protein 5 impairing virion infectivity by targeting retroviral envelope glycoproteins',Ti='intercontinental karyotype environment parallelism supports a role for a chromosomal inversion in local adaptation in a seaweed fly',Ti='multiprocessor scheduling of precedence constrained mixed critical jobs',Ti='effect of the surface structure of pt 100 and pt 110 on the oxidation of carbon monoxide in alkaline solution an ftir and electrochemical study',Ti='microbial inhibition of oral epithelial wound recovery potential role for quorum sensing molecules ',Ti='psychological complaints among children in joint physical custody and other family types considering parental factors',Ti='defect induced local variation of crystal phase transition temperature in metal halide perovskites',Ti='electra irp voltage control strategy for enhancing power system stability in future grid architectures',Ti='the phenotypic architecture of tetraploid wheat triticum turgidum l effects of domestication and post domestication under contrasting nitrogen fertilisation',Ti='multi area network model of visual cortex',Ti='gearing motion in cogwheel pairs of molecular rotors weak coupling limit',Ti='pectenotoxin s abcde ring 
system a complex target to test the potential of singlet oxygen super cascades as tools for synthesis',Ti='growth of krskopolje piglets during lactation and first rearing period',Ti='evilinhd a virtual research environment open and collaborative for dh scholars',Ti='optimisation of code saturne for petascale simulations',Ti='avalokitesvara of the six syllables locating the practice of the great vehicle in the landscape of central india',Ti='release of a live elixir communication strategy',Ti='electricity in hpc centres',Ti='modeling simulation and comparison of control techniques for energy storage systems',Ti='d3 1 evaluation of systematic relations between the seismic response to fluid injection and depth injection pressure crustal stress state and local structural geology',Ti='flexible multi layer sparse approximations of matrices and applications',Ti='partitioning of trace elements and metals between quasi ultrafine accumulation and coarse aerosols in indoor and outdoor air in schools',Ti='enhancing location related hydrogeological knowledge',Ti='bistability breaks off deterministic responses to intracortical stimulation during non rem sleep',Ti='a gamma moment approach to monotonic boundary estimation',Ti='serial defaults serial profits returns to sovereign lending in habsburg spain 1566 1600',Ti='crystal structure and proton conductivity of basn0 6sc0 4o3 d insights from neutron powder diffraction and solid state nmr spectroscopy electronic supplementary information esi available rietveld fit of dry basn0 6sc0 4o3 d sample fig s1 119sn fig s2 45sc fig s3 s6 and 17o fig s7 spectra of all materials as a function of sc doping concentration 45sc mqmas of deuterated basn0 9sc0 1o3 d fig s4 45sc mqmas of dry and deuterated basn0 8sc0 2o3 d fig s5 45sc mqmas of dry and deuterated basn0 7sc0 3o3 d fig s6 17o mqmas of 17o enriched basn0 8sc0 2o3 d and basn0 6sc0 4o3 d fig s8 see doi 10 1039 c5ta09744d click here for additional data file ',Ti='genome sequence of 
bluetongue virus type 2 from india evidence for reassortment between outer capsid protein genes',Ti='accurate nuclear radii and binding energies from a chiral interaction',Ti='determining projection constants of univariate polynomial spaces',Ti='co transcriptional histone h2b monoubiquitylation is tightly coupled with rna polymerase ii elongation rate',Ti='impaired high density lipoprotein anti oxidant capacity in human abdominal aortic aneurysm',Ti='theoretical vibrational excitation cross sections and rate coefficients for electron impact resonant collisions involving rovibrationally excited n2 and no molecules',Ti='predicting species maximum dispersal distances from simple plant traits',Ti='mapping the surface adsorption forces of nanomaterials in biological systems',Ti='a novel approach for arsenic adsorbents regeneration using mgo',Ti='climate events synchronize the dynamics of a resident vertebrate community in the high arctic',Ti='efficient engineering of a bacteriophage genome using the type i e crispr cas system',Ti='effects of acceleration on gait measures in three horse gaits',Ti='internalization assays for listeria monocytogenes ',Ti='drivers phone use at red traffic lights a roadside observation study comparing calls and visual manual interactions',Ti='deformations of gr and bh thermodynamics',Ti='do kenya s climate change mitigation ambitions necessitate large scale renewable energy deployment and dedicated low carbon energy policy ',Ti='ultra high field mri post mortem structural connectivity of the human subthalamic nucleus substantia nigra and globus pallidus',Ti='cultural property',Ti='a new method for focal transient cerebral ischaemia by distal compression of the middle cerebral artery',Ti='patient safety in primary care a survey of general practitioners in the netherlands',Ti='decadal prediction skill in a multi model ensemble',Ti='the distinct role of the amygdala superior colliculus and pulvinar in processing of central and peripheral 
snakes',Ti='collision of almost parallel vortex filaments',Ti='atomic model of a cell wall cross linking enzyme in complex with an intact bacterial peptidoglycan',Ti='hierarchical reinforcement learning and central pattern generators for modeling the development of rhythmic manipulation skills',Ti='prospects for laser spectroscopy of highly charged ions with high harmonic xuv and soft x ray sources',Ti='evolution of theories of mind',Ti='tsi metamodels based multi objective robust optimization',Ti='de novo active sites for resurrected precambrian enzymes',Ti='facial colorings using hall s theorem',Ti='on sat technologies for dependency management and beyond',Ti='protein co evolution how do we combine bioinformatics and experimental approaches ',Ti='large spin relaxation anisotropy and valley zeeman spin orbit coupling in wse2 graphene h bn heterostructures',Ti='fasciola and fasciolosis in ruminants in europe identifying research needs',Ti='stochastic models of population extinction',Ti='high quality polarization entanglement state preparation and manipulation in standard telecommunication channels',Ti='the alma protostellar interferometric line survey pils first results from an unbiased submillimeter wavelength line survey of the class 0 protostellar binary iras 16293 2422 with alma',Ti='combined deterministic and stochastic approaches for modelling the evolution of food products along the cold chain part ii a case study',Ti='optical signal to noise ratio improvement through unbalanced noise beating in phase sensitive parametric amplifiers',Ti='interpreting multiple dualities conjectured from superconformal index identities',Ti='an efficient method to assemble linear dna templates for in vitro screening and selection systems',Ti='trust anchors in software defined networks',Ti='identification of trypanosoma cruzi discrete typing units dtus through the implementation of a high resolution melting hrm genotyping assay',Ti='artificially lit surface of earth at night 
increasing in radiance and extent',Ti='circularly polarized modes in magnetized spin plasmas',Ti='chronic obstructive pulmonary disease patient journey hospitalizations as window of opportunity for extra pulmonary intervention',Ti='superhydrophobic paper from nanostructured fluorinated cellulose esters',Ti='potential of natural biocides for biocontrolling phototrophic colonization on limestone',Ti='emerging techniques and exotic systems frontiers of photoionization photodetachment',Ti='single pion energy resolution of a high granularity scintillator calorimeter system',Ti='zebrafish as a model for kidney function and disease',Ti='topological order and thermal equilibrium in polariton condensates',Ti='holocene north atlantic overturning in an atmosphere ocean sea ice model compared to proxy based reconstructions',Ti='magnetically driven anisotropic structural changes in the atomic laminate mn2gac',Ti='life without geminin',Ti='monoclonal igg antibodies generated from joint derived b cells of ra patients have a strong bias toward citrullinated autoantigen recognition',Ti='population structure of atlantic mackerel scomber scombrus ',Ti='run time interoperability between neuronal network simulators based on the music framework',Ti='relationship between environmental factors dry matter loss and mycotoxin levels in stored wheat and maize infected withfusariumspecies',Ti='a fiscal union for the emu ',Ti='postoperative pain management in spanish hospitals a cohort study using the pain out registry',Ti='simulation study of cochlear implants stimulation protocols and its application to surgical planning',Ti='jornadas 2010 do departamento de quimica',Ti='why we need a token based typology a case study of analytic and lexical causatives in fifteen european languages',Ti='the grand challenge of characterizing ribonucleoprotein networks',Ti='phenomenological fingerprints of four meditations differential state changes in affect mind wandering meta cognition and interoception 
before and after daily practice across 9 months of training')")

In [None]:
import regex
from unidecode import unidecode

In [None]:
t = TitleProcessor()
x = [unidecode(t.process_title(s)) for s in publications_df['title'][2716:2716+180]]

In [None]:
x

In [None]:
from eu_funding.utils.nlp_utils import 

### Crossref Enrichment

In [None]:
from crossref.restful import Works

In [None]:
from threading import Thread

In [None]:
from fuzzywuzzy import fuzz
import concurrent.futures

In [None]:
session = requests.Session()

In [None]:
def get_doi_crossref(title, max_rows=5):
    """Look up the DOI of the Crossref work whose title best matches *title*.

    The lower-cased title is sent as a ``query.title`` search and each
    returned item's first title is compared with it using ``fuzz.ratio``.

    Args:
        title: Publication title to search for.
        max_rows: Number of candidate works to request from Crossref.

    Returns:
        The DOI of the highest-scoring candidate with a ratio of at least 90
        (the earliest such candidate on ties), or ``np.nan`` when the request
        does not return status 200 or no candidate scores high enough.
    """
    title = title.lower()
    r = requests.get(
        'https://api.crossref.org/works',
        # Handing the query over as params lets requests URL-encode the title,
        # so characters such as '&' or '#' cannot break the query string
        params={'rows': max_rows, 'query.title': title},
    )
    doi = np.nan
    if r.status_code != 200:
        return doi

    best_score = 0
    for result in r.json()['message']['items']:
        candidate_titles = result.get('title')
        # Some items carry no title at all; they cannot be compared
        if not candidate_titles:
            continue
        score = fuzz.ratio(title, candidate_titles[0].lower())
        if score >= 90 and score > best_score:
            doi = result['DOI']
            best_score = score
            # An exact match cannot be beaten, so a later near match must not
            # replace it
            if score == 100:
                break
    return doi
        

In [None]:
from crossref.restful import Etiquette

In [None]:
from eu_funding.utils.misc_utils import chunks

In [None]:
import requests
from time import sleep

In [None]:
all_titles = publications_df['title'][pd.isnull(publications_df['doi'])].str.encode('utf-8')

In [None]:
connections = 20
# NOTE(review): not passed to get_doi_crossref, whose second positional
# parameter is max_rows rather than a timeout.
timeout = 30

for i, titles in enumerate(chunks(all_titles, 1000)):
    with concurrent.futures.ThreadPoolExecutor(max_workers=connections) as executor:
        # Keep the futures in submission order. The results are written out and
        # later matched to the titles by position, so they must line up.
        futures = [executor.submit(get_doi_crossref, title.decode()) for title in titles]

    # Leaving the with-block waits for every lookup, so result() returns at once
    out = []
    for future in futures:
        try:
            out.append(future.result())
        except Exception as exc:
            # Record the failure type in the title's slot to keep positions aligned
            out.append(str(type(exc)))

    with open(os.path.join(inter_data_path, 'openaire_missing_dois', 'dois_{:03}.txt'.format(i)), 'w') as f:
        for o in out:
            f.write(str(o) + '\n')

In [None]:
missing_dois = []
# os.listdir gives no ordering guarantee, but the lines are matched to titles
# by position, so read the files in name order (the zero-padded chunk numbers
# sort correctly as text).
files = sorted(os.listdir(os.path.join(inter_data_path, 'openaire_missing_dois')))
for file in files:
    with open(os.path.join(inter_data_path, 'openaire_missing_dois', file), 'r') as f:
        missing_dois.extend(f.read().splitlines())

In [None]:
# Write through a single .loc call on the frame: chained indexing
# (df['doi'].loc[...] = ...) may assign to a temporary copy instead.
publications_df.loc[all_titles.index, 'doi'] = missing_dois
# The lookups went through text files, so a missing DOI comes back as the
# string 'nan' and a failed lookup as the text of an exception type
# ("<class '...'>"). Neither is a DOI, so turn both back into NaN.
not_a_doi = (
    (publications_df['doi'] == 'nan')
    | publications_df['doi'].astype(str).str.startswith('<class ')
)
publications_df.loc[not_a_doi, 'doi'] = np.nan

In [None]:
publications_df.head()

In [None]:
publications_df.to_csv(os.path.join(inter_data_path, 'openaire_publications_20190702.csv'), index=False)

## CrossRef Works

In [None]:
from crossref.restful import Etiquette

In [None]:
etiquette = Etiquette(
    application_version='0.1',
    application_url='http://www.eurito.eu/',
    application_name='eu_funding_analytics',
    contact_email='george.richardson@nesta.org.uk',   
)

In [None]:
def get_crossref_work(doi):
    """Fetch the Crossref work for *doi* with a client built from the notebook's etiquette."""
    return Works(etiquette=etiquette).doi(doi)

In [None]:
all_dois = publications_df['doi'][~pd.isnull(publications_df['doi'])].unique()

In [None]:
test_dois = all_dois[:100]

In [None]:
import json

In [None]:
doi_chunks = list(chunks(all_dois, 1000))
doi_chunk_indices = list(range(len(doi_chunks)))

In [None]:
start = 0
connections = 2 # API will rate limit occasionally with just 2 connections so needs babysitting

# Fetch the works for each chunk of DOIs, from chunk `start` onwards, and dump
# every chunk's responses to its own numbered file
for i, dois in zip(doi_chunk_indices[start:], doi_chunks[start:]):
    with concurrent.futures.ThreadPoolExecutor(max_workers=connections) as executor:
        pending = [executor.submit(get_crossref_work, doi) for doi in dois]
        # Responses are collected in the order the requests finish
        out = [done.result() for done in concurrent.futures.as_completed(pending)]

    with open(os.path.join(ext_data_path, 'crossref', 'works_{:04}.txt'.format(i)), 'w') as f:
        json.dump(out, f)