# OpenAIRE Data EDA

## Preamble

In [None]:
%run notebook_preamble.ipy

    pd.set_option('max_columns', 99)

In [None]:
import seaborn as sns
import xmltodict
import pyjq
import boto3
import io
from bs4 import BeautifulSoup

from eu_funding.visualization.visualize import pdf_cdf
from eu_funding.utils.misc_utils import print_nested_structure
from eu_funding.data.s3_transfer import get_files_from_s3
from eu_funding.data.openaire import parse_openaire_records, parse_publications_soup

## Data Structure

### Projects

In [None]:
# S3 location of the OpenAIRE project dumps; these three values are passed to
# get_files_from_s3 in the loading loop below.
BUCKET = 'im-eurito'
FOLDER = 'external/openaire/projectssoups'
KEY_PREFIX = 'soup'

In [None]:
# Accumulate the parsed project records from every matching S3 object.
records = []
for file in get_files_from_s3(
    bucket=BUCKET,
    folder=FOLDER,
    key_prefix=KEY_PREFIX,
):
    records += parse_openaire_records(file)

In [None]:
# from_records is a classmethod, so call it on the class directly.
df = pd.DataFrame.from_records(records)

In [None]:
pd.options.display.max_columns = 999

In [None]:
df.to_csv(os.path.join(inter_data_path, 'openaire_projects.csv'), index=False)

### Publications

In [None]:
# S3 settings rebound for the publications dump.
# NOTE(review): nothing in the visible notebook reads these after this point
# (load_publications reads from a local directory) - confirm they are still needed.
BUCKET = 'im-eurito'
FOLDER = 'external/openaire/publicationssoups'
KEY_PREFIX = 'soup'

In [None]:
def load_publications():
    """Parse every OpenAIRE publication dump in the local data directory.

    Each entry of ``openaire_publication_data_path`` whose name contains
    ``.txt`` is read as bytes, parsed with BeautifulSoup and passed to
    ``parse_publications_soup``.

    Returns:
        list: the records of all files, concatenated. Files are visited in
        sorted name order so the result is reproducible between runs
        (``os.listdir`` itself returns entries in arbitrary order).
    """
    records = []
    for file in sorted(os.listdir(openaire_publication_data_path)):
        if '.txt' not in file:
            continue
        with open(os.path.join(openaire_publication_data_path, file), mode='rb') as f:
            soup = BeautifulSoup(f.read())
        records.extend(parse_publications_soup(soup))
    return records

In [None]:
records = load_publications()

In [None]:
# from_records is a classmethod, so call it on the class directly.
df = pd.DataFrame.from_records(records)

In [None]:
dfs = []

# Write the records out in batches of 1000; output files are numbered from 1.
for i, record in enumerate(chunks(records, 1000), start=1):
    df = pd.DataFrame.from_records(record)
    df.to_csv(
        os.path.join(
            openaire_publication_data_path,
            'csv',
            'publications_parsed_{:03}.csv'.format(i),
        ),
        index=False,
    )
    dfs.append(df)

In [None]:
# Stack the per-chunk frames. ignore_index is not set, so each chunk keeps its
# own row labels and the combined index contains repeated values.
publications_df = pd.concat(dfs)

In [None]:
# Preview the concatenated frame (the previous name, publications_new_df, is
# never defined in this notebook and raised NameError).
publications_df.head()

### Fetch Missing PubMed DOIs

In [None]:
import requests
from time import sleep
from eu_funding.utils.misc_utils import chunks

In [None]:
def get_id_converter(pub_ids, id_type):
    """Query the NCBI PMC ID converter for a batch of identifiers.

    Args:
        pub_ids: iterable of identifiers; each is converted with ``str``.
        id_type: value sent as the ``idtype`` query parameter.

    Returns:
        bytes: the raw body of the HTTP response.
    """
    query = dict(
        idtype=id_type,
        ids=', '.join(str(pub_id) for pub_id in pub_ids),
        email='george.richardson@nesta.org.uk',
        tool='eu_funding_analytics',
    )
    return requests.get(
        url='https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/',
        params=query,
    ).content
    
def parse_id_converter_result(results, id_type):
    """Extract the attributes of every ``record`` element in a converter reply.

    Args:
        results: markup returned by ``get_id_converter``.
        id_type: value stored under the ``'pid_type'`` key of each record.

    Returns:
        list of dict: one attribute dict per ``record`` tag, each tagged
        with ``'pid_type'``.
    """
    parsed = BeautifulSoup(results)
    records = []
    for tag in parsed.findAll('record'):
        attributes = tag.attrs
        attributes['pid_type'] = id_type
        records.append(attributes)
    return records

def convert_ids(pub_ids, id_type):
    """Convert identifiers in batches of 200 via the ID converter service.

    Args:
        pub_ids: sequence of identifiers to convert.
        id_type: identifier type forwarded to the request and the parser.

    Returns:
        list: the parsed records of all batches, in batch order.
    """
    converted = []
    for batch in chunks(pub_ids, 200):
        raw = get_id_converter(batch, id_type)
        converted += parse_id_converter_result(raw, id_type)
        # Pause between requests.
        sleep(3)
    return converted

In [None]:
# Select the pid of every publication whose pid_type is 'pmid' and send them
# through convert_ids.
id_type = 'pmid'
pub_ids = publications_df[publications_df['pid_type'] == id_type]['pid'].values

pmid_converted_ids = convert_ids(pub_ids, id_type)

In [None]:
def doi_col(pid, pid_type):
    """Return ``pid`` when ``pid_type`` is ``'doi'``, otherwise NaN."""
    return pid if pid_type == 'doi' else np.nan

# Create the 'doi' column row by row: the pid where pid_type is 'doi', NaN elsewhere.
publications_df['doi'] = publications_df.apply(lambda x: doi_col(x['pid'], x['pid_type']), axis=1)

In [None]:
pmid_df = pd.DataFrame.from_records(pmid_converted_ids)
pmid_doi_map = dict(zip(pmid_df['pmid'], pmid_df['doi']))

# Write through a single .loc call. The previous chained form
# (df['doi'][mask] = ...) assigns into an intermediate Series, which pandas
# warns about and which does not update the frame under copy-on-write.
is_pmid = publications_df['pid_type'] == 'pmid'
# .values makes the assignment positional, so it does not depend on the row
# labels of publications_df being unique.
publications_df.loc[is_pmid, 'doi'] = (
    publications_df.loc[is_pmid, 'pid'].map(pmid_doi_map).values
)

In [None]:
# Rows are selected with the pid_type label 'pmc', while the converter is
# called with the id type 'pmcid'.
id_type = 'pmc'
pub_ids = publications_df[publications_df['pid_type'] == id_type]['pid'].values

pmcid_converted_ids = convert_ids(pub_ids, 'pmcid')

In [None]:
pmcid_df = pd.DataFrame.from_records(pmcid_converted_ids)
# Keys drop the first three characters of the returned pmcid (presumably the
# 'PMC' prefix - confirm) so they line up with the values in the 'pid' column.
pmcid_doi_map = {
    pmcid[3:]: doi for pmcid, doi in zip(pmcid_df['pmcid'], pmcid_df['doi'])
}

# Write through a single .loc call. The previous chained form
# (df['doi'][mask] = ...) assigns into an intermediate Series, which pandas
# warns about and which does not update the frame under copy-on-write.
is_pmc = publications_df['pid_type'] == 'pmc'
# .values makes the assignment positional, so it does not depend on the row
# labels of publications_df being unique.
publications_df.loc[is_pmc, 'doi'] = (
    publications_df.loc[is_pmc, 'pid'].map(pmcid_doi_map).values
)

In [None]:
# drop=True is not passed, so the old row labels are kept as a new column and
# end up in the CSV written in the next cell.
publications_df.reset_index(inplace=True)

In [None]:
publications_df.to_csv(os.path.join(inter_data_path, 'openaire_publications.csv'), index=False)

### Crossref Enrichment

In [None]:
publications_df = pd.read_csv(os.path.join(inter_data_path, 'openaire_publications.csv'))

In [None]:
from crossref.restful import Works

In [None]:
from threading import Thread

In [None]:
from fuzzywuzzy import fuzz
import concurrent.futures

In [None]:
session = requests.Session()

In [None]:
def get_doi_crossref(title, max_rows=5, timeout=30):
    """Look up the DOI of a publication by title using the Crossref works API.

    The title is lower-cased and sent as a ``query.title`` search. Each
    returned item is compared with ``fuzz.ratio`` against the query and the
    best-scoring item with a ratio of at least 90 supplies the DOI.

    Args:
        title (str): publication title to search for.
        max_rows (int): number of candidate items requested from Crossref.
        timeout (float): seconds to wait for the HTTP response before
            ``requests`` raises, so a stalled connection cannot block a
            worker thread indefinitely.

    Returns:
        str or float: the DOI of the best match, or ``np.nan`` when the
        request does not return status 200 or no candidate scores >= 90.
    """
    title = title.lower()
    # Pass the query through ``params`` so the title is URL-encoded; titles
    # containing '&', '#' or '?' would otherwise corrupt the query string.
    r = requests.get(
        'https://api.crossref.org/works',
        params={'rows': max_rows, 'query.title': title},
        timeout=timeout,
    )
    doi = np.nan
    if r.status_code != 200:
        return doi

    best_score = 0
    for result in r.json()['message']['items']:
        # Some items come back without a title; they cannot be compared.
        result_titles = result.get('title')
        if not result_titles:
            continue
        score = fuzz.ratio(title, result_titles[0].lower())
        if score < 90 or score <= best_score:
            continue
        doi = result['DOI']
        best_score = score
        if score == 100:
            # An exact match cannot be beaten; stop so that a later, weaker
            # candidate can never replace it.
            break
    return doi
        

In [None]:
from crossref.restful import Etiquette

In [None]:
from eu_funding.utils.misc_utils import chunks

In [None]:
import requests
from time import sleep

In [None]:
# Titles of the publications whose 'doi' is still null, encoded to UTF-8 bytes;
# the Series keeps the original row labels of those publications.
all_titles = publications_df['title'][pd.isnull(publications_df['doi'])].str.encode('utf-8')

In [None]:
connections = 20
# NOTE: not forwarded to get_doi_crossref - its second positional parameter is
# max_rows, so passing this value positionally would not act as a timeout.
timeout = 30

for i, titles in enumerate(chunks(all_titles, 1000)):
    with concurrent.futures.ThreadPoolExecutor(max_workers=connections) as executor:
        futures = [
            executor.submit(get_doi_crossref, title.decode())
            for title in titles
        ]
    # Leaving the ``with`` block waits for every future to finish.

    # Collect results in submission order (not completion order): the lines
    # written here are later assigned positionally against all_titles.index,
    # so line k must belong to title k.
    out = []
    for future in futures:
        try:
            data = future.result()
        except Exception as exc:
            # Record a failed lookup as missing rather than writing the
            # exception type into the file, where it would later be read
            # back as if it were a DOI.
            print('DOI lookup failed in chunk {}: {!r}'.format(i, exc))
            data = np.nan
        out.append(data)

    with open(os.path.join(inter_data_path, 'openaire_missing_dois', 'dois_{:03}.txt'.format(i)), 'w') as f:
        f.writelines(str(o) + '\n' for o in out)

In [None]:
missing_dois = []
# os.listdir returns entries in arbitrary order. Sort them so the chunks are
# concatenated in the order they were written (the zero-padded
# 'dois_{:03}.txt' names sort lexically in chunk order), which the positional
# assignment against all_titles.index relies on.
files = sorted(os.listdir(os.path.join(inter_data_path, 'openaire_missing_dois')))
for file in files:
    with open(os.path.join(inter_data_path, 'openaire_missing_dois', file), 'r') as f:
        missing_dois.extend(f.read().splitlines())

In [None]:
# Assign with a single .loc call on the frame. The chained forms
# (df['doi'].loc[...] = ... and df['doi'][mask] = ...) write into an
# intermediate Series, which pandas warns about and which does not update the
# frame under copy-on-write.
publications_df.loc[all_titles.index, 'doi'] = missing_dois
# The DOIs were read back from text files, so missing values arrive as the
# literal string 'nan'; turn them back into real NaN.
publications_df.loc[publications_df['doi'] == 'nan', 'doi'] = np.nan

In [None]:
publications_df.head()

In [None]:
publications_df.to_csv(os.path.join(inter_data_path, 'openaire_publications_20190702.csv'), index=False)

## CrossRef Works

In [None]:
from crossref.restful import Etiquette

In [None]:
# Application and contact details for this project; handed to the Works client
# built in get_crossref_work.
etiquette = Etiquette(
    application_version='0.1',
    application_url='http://www.eurito.eu/',
    application_name='eu_funding_analytics',
    contact_email='george.richardson@nesta.org.uk',   
)

In [None]:
def get_crossref_work(doi):
    """Return what the Crossref ``Works`` client yields for a single DOI.

    A new ``Works`` client is built for every call, configured with the
    module-level ``etiquette`` object.
    """
    return Works(etiquette=etiquette).doi(doi)

In [None]:
all_dois = publications_df['doi'][~pd.isnull(publications_df['doi'])].unique()

In [None]:
test_dois = all_dois[:100]

In [None]:
import json

In [None]:
# Materialise the DOI batches (1000 per batch) together with their positions so
# the download loop below can be resumed from a chosen batch.
doi_chunks = list(chunks(all_dois, 1000))
doi_chunk_indices = list(range(len(doi_chunks)))

In [None]:
start = 0
connections = 2 # API will rate limit occasionally with just 2 connections so needs babysitting

# Resume from batch ``start``; each batch is fetched concurrently and dumped
# to its own JSON file. A failed lookup propagates and stops the loop.
for i, dois in zip(doi_chunk_indices[start:], doi_chunks[start:]):
    with concurrent.futures.ThreadPoolExecutor(max_workers=connections) as executor:
        pending = [executor.submit(get_crossref_work, doi) for doi in dois]
        out = [
            finished.result()
            for finished in concurrent.futures.as_completed(pending)
        ]

    works_path = os.path.join(ext_data_path, 'crossref', 'works_{:04}.txt'.format(i))
    with open(works_path, 'w') as f:
        json.dump(out, f)