# OpenAIRE Data EDA

## Preamble

In [None]:
# Execute the shared preamble script in this kernel. Presumably it provides `pd`,
# `os` and the data-path variables used by later cells (none of them are imported
# in this notebook) — confirm against notebook_preamble.ipy.
%run notebook_preamble.ipy

# Use the fully qualified option key: pandas resolves abbreviated keys by pattern
# matching and raises when the pattern matches more than one registered option.
pd.set_option('display.max_columns', 99)

In [None]:
import seaborn as sns
import xmltodict
import pyjq
import boto3
import io

from src.visualization.visualize import pdf_cdf
from src.utils.misc_utils import print_nested_structure
from src.data.s3_transfer import get_files_from_s3
from src.data.openaire import parse_openaire_records

## Data Structure

### Projects

In [None]:
# Arguments handed to get_files_from_s3 when loading the project records.
KEY_PREFIX = 'soup'
FOLDER = 'external/openaire/projectssoups'
BUCKET = 'im-eurito'

In [None]:
# Flatten the parsed records of every file returned by get_files_from_s3
# into a single list.
records = []
for s3_file in get_files_from_s3(
    bucket=BUCKET,
    folder=FOLDER,
    key_prefix=KEY_PREFIX,
):
    records += parse_openaire_records(s3_file)

In [None]:
# from_records is a classmethod: call it on the class rather than on a
# throwaway empty DataFrame instance.
df = pd.DataFrame.from_records(records)

In [None]:
# Raise the column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', 999)

In [None]:
# Write the projects frame to 'openaire_projects.csv' under inter_data_path,
# leaving out the row index.
df.to_csv(
    path_or_buf=os.path.join(inter_data_path, 'openaire_projects.csv'),
    index=False,
)

### Publications

In [None]:
# Bucket / folder / key-prefix settings for the publication files.
# NOTE(review): the visible cells load publications from local disk and never
# read these three values — confirm whether they are still needed.
KEY_PREFIX = 'soup'
FOLDER = 'external/openaire/publicationssoups'
BUCKET = 'im-eurito'

In [None]:
from src.data.openaire import parse_publications_soup

In [None]:
def load_publications():
    """Parse every publication dump found in ``openaire_publication_data_path``.

    Each file whose name ends in ``.txt`` is read as bytes, wrapped in a
    ``BeautifulSoup`` document and passed to ``parse_publications_soup``.
    Files are visited in sorted name order so the position of each result in
    the returned list is reproducible between runs.

    Returns:
        list: one ``parse_publications_soup`` result per ``.txt`` file.
    """
    records = []
    # os.listdir returns names in arbitrary order; sort for a stable ordering.
    for file in sorted(os.listdir(openaire_publication_data_path)):
        # Test the suffix, not a substring, so names that merely contain
        # '.txt' (e.g. 'dump.txt.bak') are skipped.
        if not file.endswith('.txt'):
            continue
        with open(os.path.join(openaire_publication_data_path, file), mode='rb') as f:
            data = f.read()
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed — confirm which parser parse_publications_soup expects.
        soup = BeautifulSoup(data)
        records.append(parse_publications_soup(soup))
    return records

In [None]:
# Collect the 'oaf:result' elements whose 'resulttype' child has
# classname == "publication".
# NOTE(review): `soup` is not assigned at notebook level in the visible cells
# (it is local to load_publications) — presumably left over from an
# interactive session; confirm before running this cell.
x = []
for a in soup.findAll('oaf:result'):
    resulttype = a.find('resulttype')
    # find() returns None when the child is missing; skip such results instead
    # of failing on `.attrs`.
    if resulttype is not None and resulttype.attrs.get('classname') == "publication":
        x.append(a)

In [None]:
# Build the list of parsed publication records via load_publications();
# this rebinds `records`, replacing the project records loaded earlier.
records = load_publications()

In [None]:
dfs = []

# to_csv does not create missing directories, so make sure the output
# sub-folder exists before writing into it.
csv_dir = os.path.join(openaire_publication_data_path, 'csv')
os.makedirs(csv_dir, exist_ok=True)

# One frame and one CSV per parsed record; files are numbered from 1 in the
# order the records appear in `records`.
for i, record in enumerate(records, start=1):
    # from_records is a classmethod; no need for an empty instance.
    df = pd.DataFrame.from_records(record)
    df.to_csv(
        os.path.join(csv_dir, 'publications_parsed_{:03}.csv'.format(i)),
        index=False
    )
    dfs.append(df)

In [None]:
# Stack the per-record frames into one table. concat keeps each frame's own row
# labels by default, so index values can repeat across the stacked frames.
publications_df = pd.concat(dfs)

In [None]:
# Write the combined publications frame to 'openaire_publications.csv' under
# inter_data_path, leaving out the row index.
publications_df.to_csv(
    path_or_buf=os.path.join(inter_data_path, 'openaire_publications.csv'),
    index=False,
)

In [None]:
# Keep only the rows whose 'ec_project_codes' entry is non-empty. The comparison
# already yields a bool, so no `True if ... else False` wrapper is needed.
has_ec_code = [len(codes) > 0 for codes in publications_df['ec_project_codes']]
ec_publications = publications_df[has_ec_code]

In [None]:
# Rows whose 'ec_project_codes' entry contains the code '237920'. The `in` test
# already yields a bool, so no `True if ... else False` wrapper is needed.
in_biggest_project = ['237920' in codes for codes in ec_publications['ec_project_codes']]
biggest_project = ec_publications[in_biggest_project]

In [None]:
# Preview the first rows of the publications matched to project code '237920'.
biggest_project.head()

In [None]:
# Preview the first rows of the publications that have a non-empty 'ec_project_codes' entry.
ec_publications.head()

In [None]:
import itertools

In [None]:
# Flatten the per-row 'child_types' collections into one list.
publication_types = list(
    itertools.chain.from_iterable(ec_publications['child_types'])
)

In [None]:
# Flatten the per-row 'ec_project_codes' collections into one list.
project_codes = list(
    itertools.chain.from_iterable(ec_publications['ec_project_codes'])
)

In [None]:
from collections import Counter

In [None]:
# Tally how many times each project code occurs in the flattened project_codes list.
counts = Counter(project_codes)

In [None]:
# Frequency of each distinct value in the 'pid_type' column.
ec_publications['pid_type'].value_counts()

In [None]:
# Number of rows in ec_publications.
len(ec_publications)

- `oaf:entity_extrainfo_citations_citation`: works that this paper references
- `oaf:entity_oaf:result_children_instance`: information about the locations where this publication is available, whether it is open access, etc. (URLs but no DOI)
- `oaf:entity_oaf:result_children_result`: information about the publication instances - titles, acceptance dates of the instances
- `oaf:entity_oaf:result_collectedfrom`: where each publication instance was collected from
- `oaf:entity_oaf:result_context`: Funding bodies and grant information
- `oaf:entity_oaf:result_creator`: Author list
- `oaf:entity_oaf:result_originalid`: IDs at original source locations
- `oaf:entity_oaf:result_pid_#text`: Publication IDs (normally DOIs)
- `oaf:entity_oaf:result_rels_rel`: More detailed funding information (e.g. funding levels for EC projects). **For EC projects, \['code'\] is the project ID.**