# OpenAIRE Data EDA

## Preamble

In [1]:
# Run the shared preamble script via IPython's %run magic.
# NOTE(review): `pd` is used below without being imported in this notebook,
# so it is presumably provided by notebook_preamble.ipy — confirm.
%run notebook_preamble.ipy

# Use the fully qualified option key. The bare 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns',
# where set_option raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 99)

In [472]:
import seaborn as sns
import xmltodict
import pyjq
import boto3
import io

from src.visualization.visualize import pdf_cdf
from src.utils.misc_utils import print_nested_structure
from src.data.s3_transfer import get_files_from_s3
from src.data.openaire import parse_openaire_records

In [473]:
# IPython `??` introspection: display the source of the imported parser.
parse_openaire_records??

## Data Structure

In [456]:
# S3 coordinates handed to get_files_from_s3() as its bucket, folder and
# key_prefix arguments when the records are loaded.
BUCKET = "im-eurito"
FOLDER = "external/openaire/projectssoups"
KEY_PREFIX = "soup"

In [474]:
# Parse every file returned for the configured S3 location and flatten the
# parsed records into a single list, preserving file order.
records = [
    record
    for s3_file in get_files_from_s3(
        bucket=BUCKET, folder=FOLDER, key_prefix=KEY_PREFIX
    )
    for record in parse_openaire_records(s3_file)
]

In [475]:
# from_records is a classmethod: call it on the DataFrame class directly
# rather than on a throwaway empty instance.
df = pd.DataFrame.from_records(records)

In [476]:
# Preview the first five rows of the parsed records.
df.head(n=5)

Unnamed: 0,oaf:entity_@xmlns:oaf,oaf:entity_@xmlns:xsi,oaf:entity_@xsi:schemalocation,oaf:entity_oaf:project_acronym,oaf:entity_oaf:project_callidentifier,oaf:entity_oaf:project_children,oaf:entity_oaf:project_code,oaf:entity_oaf:project_collectedfrom_@id,oaf:entity_oaf:project_collectedfrom_@name,oaf:entity_oaf:project_contactemail,oaf:entity_oaf:project_contactfax,oaf:entity_oaf:project_contactfullname,oaf:entity_oaf:project_contactphone,oaf:entity_oaf:project_contracttype_@classid,oaf:entity_oaf:project_contracttype_@classname,oaf:entity_oaf:project_contracttype_@schemeid,oaf:entity_oaf:project_contracttype_@schemename,oaf:entity_oaf:project_datainfo_deletedbyinference,oaf:entity_oaf:project_datainfo_inferenceprovenance,oaf:entity_oaf:project_datainfo_inferred,oaf:entity_oaf:project_datainfo_provenanceaction_@classid,oaf:entity_oaf:project_datainfo_provenanceaction_@classname,oaf:entity_oaf:project_datainfo_provenanceaction_@schemeid,oaf:entity_oaf:project_datainfo_provenanceaction_@schemename,oaf:entity_oaf:project_datainfo_trust,oaf:entity_oaf:project_duration,oaf:entity_oaf:project_ecarticle29_3,oaf:entity_oaf:project_ecsc39,oaf:entity_oaf:project_enddate,oaf:entity_oaf:project_fundingtree_funder_id,oaf:entity_oaf:project_fundingtree_funder_jurisdiction,oaf:entity_oaf:project_fundingtree_funder_name,oaf:entity_oaf:project_fundingtree_funder_shortname,oaf:entity_oaf:project_fundingtree_funding_level_1_class,oaf:entity_oaf:project_fundingtree_funding_level_1_description,oaf:entity_oaf:project_fundingtree_funding_level_1_id,oaf:entity_oaf:project_fundingtree_funding_level_1_name,oaf:entity_oaf:project_fundingtree_funding_level_1_parent_funding_level_0_class,oaf:entity_oaf:project_fundingtree_funding_level_1_parent_funding_level_0_description,oaf:entity_oaf:project_fundingtree_funding_level_1_parent_funding_level_0_id,oaf:entity_oaf:project_fundingtree_funding_level_1_parent_funding_level_0_name,oaf:entity_oaf:project_fundingtree_funding_level_1_parent_funding_lev
el_0_parent,oaf:entity_oaf:project_fundingtree_funding_level_2_class,oaf:entity_oaf:project_fundingtree_funding_level_2_description,oaf:entity_oaf:project_fundingtree_funding_level_2_id,oaf:entity_oaf:project_fundingtree_funding_level_2_name,oaf:entity_oaf:project_fundingtree_funding_level_2_parent_funding_level_1_class,oaf:entity_oaf:project_fundingtree_funding_level_2_parent_funding_level_1_description,oaf:entity_oaf:project_fundingtree_funding_level_2_parent_funding_level_1_id,oaf:entity_oaf:project_fundingtree_funding_level_2_parent_funding_level_1_name,oaf:entity_oaf:project_fundingtree_funding_level_2_parent_funding_level_1_parent_funding_level_0_class,oaf:entity_oaf:project_fundingtree_funding_level_2_parent_funding_level_1_parent_funding_level_0_description,oaf:entity_oaf:project_fundingtree_funding_level_2_parent_funding_level_1_parent_funding_level_0_id,oaf:entity_oaf:project_fundingtree_funding_level_2_parent_funding_level_1_parent_funding_level_0_name,oaf:entity_oaf:project_fundingtree_funding_level_2_parent_funding_level_1_parent_funding_level_0_parent,oaf:entity_oaf:project_keywords,oaf:entity_oaf:project_oamandatepublications,oaf:entity_oaf:project_originalid,oaf:entity_oaf:project_pid_@classid,oaf:entity_oaf:project_pid_@classname,oaf:entity_oaf:project_pid_@schemeid,oaf:entity_oaf:project_pid_@schemename,oaf:entity_oaf:project_rels,oaf:entity_oaf:project_rels_rel_@inferenceprovenance,oaf:entity_oaf:project_rels_rel_@inferred,oaf:entity_oaf:project_rels_rel_@provenanceaction,oaf:entity_oaf:project_rels_rel_@trust,oaf:entity_oaf:project_rels_rel_country_@classid,oaf:entity_oaf:project_rels_rel_country_@classname,oaf:entity_oaf:project_rels_rel_country_@schemeid,oaf:entity_oaf:project_rels_rel_country_@schemename,oaf:entity_oaf:project_rels_rel_legalname,oaf:entity_oaf:project_rels_rel_legalshortname,oaf:entity_oaf:project_rels_rel_to_#text,oaf:entity_oaf:project_rels_rel_to_@class,oaf:entity_oaf:project_rels_rel_to_@scheme,oaf:entity_oaf:project_rels_
rel_to_@type,oaf:entity_oaf:project_rels_rel_websiteurl,oaf:entity_oaf:project_startdate,oaf:entity_oaf:project_subjects_#text,oaf:entity_oaf:project_subjects_@classid,oaf:entity_oaf:project_subjects_@classname,oaf:entity_oaf:project_subjects_@schemeid,oaf:entity_oaf:project_subjects_@schemename,oaf:entity_oaf:project_title,oaf:entity_oaf:project_websiteurl,organisations
0,http://namespace.openaire.eu/oaf,http://www.w3.org/2001/XMLSchema-instance,http://namespace.openaire.eu/oaf https://www.o...,SILINANO,FP7-PEOPLE-2013-IEF,,626397,openaire____::b30dac7baac631f3da7c2bb18dd9891f,CORDA - COmmon Research DAta Warehouse,johannes.barth@ph.tum.de,+49 8928912338,"Barth, Johannes",+49 89 289 12609,MC,Support for training and career development of...,ec:FP7contractTypes,ec:FP7contractTypes,False,,False,sysimport:crosswalk:entityregistry,sysimport:crosswalk:entityregistry,dnet:provenanceActions,dnet:provenanceActions,0.9,,,False,2017-01-31,ec__________::EC,EU,European Commission,EC,,,,,,,,,,ec:program,Marie-Curie Actions,ec__________::EC::FP7::SP3::PEOPLE,PEOPLE,ec:specificprogram,SP3-People,ec__________::EC::FP7::SP3,SP3,ec:frameworkprogram,SEVENTH FRAMEWORK PROGRAMME,ec__________::EC::FP7,FP7,,,False,corda_______::626397,,,,,,,False,sysimport:crosswalk:entityregistry,0.9,DE,Germany,dnet:countries,dnet:countries,TECHNISCHE UNIVERSITAET MUENCHEN,TUM,corda_______::06ddc7f379b6f1cc6173a0c1b3d333d5,hasParticipant,dnet:project_organization_relations,organization,http://www.tu-muenchen.de,2015-02-01,,,,,,"Silicene, a new material for nanoelectronics",,
1,http://namespace.openaire.eu/oaf,http://www.w3.org/2001/XMLSchema-instance,http://namespace.openaire.eu/oaf https://www.o...,STIFF-FLOP,FP7-ICT-2011-7,,287728,openaire____::b30dac7baac631f3da7c2bb18dd9891f,CORDA - COmmon Research DAta Warehouse,paul.labbett@kcl.ac.uk,+44 20 7848 8187,"Labbett, Paul",+44 20 7848 8184,CP,Collaborative project,ec:FP7contractTypes,ec:FP7contractTypes,False,,False,sysimport:crosswalk:entityregistry,sysimport:crosswalk:entityregistry,dnet:provenanceActions,dnet:provenanceActions,0.9,,,True,2015-12-31,ec__________::EC,EU,European Commission,EC,,,,,,,,,,ec:program,Information and Communication Technologies,ec__________::EC::FP7::SP1::ICT,ICT,ec:specificprogram,SP1-Cooperation,ec__________::EC::FP7::SP1,SP1,ec:frameworkprogram,SEVENTH FRAMEWORK PROGRAMME,ec__________::EC::FP7,FP7,,,True,corda_______::287728,,,,,,,,,,,,,,,,,,,,,2012-01-01,,,,,,STIFFness controllable Flexible and Learn-able...,,"[{'org_legalname': 'STICHTING E.A.E.S', 'org_u..."
2,http://namespace.openaire.eu/oaf,http://www.w3.org/2001/XMLSchema-instance,http://namespace.openaire.eu/oaf https://www.o...,RESCUER,FP7-ICT-2013-EU-Brazil,,614154,openaire____::b30dac7baac631f3da7c2bb18dd9891f,CORDA - COmmon Research DAta Warehouse,michael.prestele@zv.fraunhofer.de,+49 89 1205 7534,"Prestele, Michael",+49 89 1205 2738,CP,Collaborative project,ec:FP7contractTypes,ec:FP7contractTypes,False,,False,sysimport:crosswalk:entityregistry,sysimport:crosswalk:entityregistry,dnet:provenanceActions,dnet:provenanceActions,0.9,,,False,2016-10-31,ec__________::EC,EU,European Commission,EC,,,,,,,,,,ec:program,Information and Communication Technologies,ec__________::EC::FP7::SP1::ICT,ICT,ec:specificprogram,SP1-Cooperation,ec__________::EC::FP7::SP1,SP1,ec:frameworkprogram,SEVENTH FRAMEWORK PROGRAMME,ec__________::EC::FP7,FP7,,,False,corda_______::614154,,,,,,,,,,,,,,,,,,,,,2013-10-01,,,,,,Reliable and Smart Crowdsourcing Solution for ...,,[{'org_legalname': 'DEUTSCHES FORSCHUNGSZENTRU...
3,http://namespace.openaire.eu/oaf,http://www.w3.org/2001/XMLSchema-instance,http://namespace.openaire.eu/oaf https://www.o...,SI-BONE-POC,ERC-2012-PoC,,324564,openaire____::b30dac7baac631f3da7c2bb18dd9891f,CORDA - COmmon Research DAta Warehouse,epo@um-mainz.de,+49 6131 17 9669,"Veith, Uta",+49 6131 17 9717,CSA,Coordination and support action,ec:FP7contractTypes,ec:FP7contractTypes,False,,False,sysimport:crosswalk:entityregistry,sysimport:crosswalk:entityregistry,dnet:provenanceActions,dnet:provenanceActions,0.9,,,False,2013-12-31,ec__________::EC,EU,European Commission,EC,,,,,,,,,,ec:program,ERC,ec__________::EC::FP7::SP2::ERC,ERC,ec:specificprogram,SP2-Ideas,ec__________::EC::FP7::SP2,SP2,ec:frameworkprogram,SEVENTH FRAMEWORK PROGRAMME,ec__________::EC::FP7,FP7,,,False,corda_______::324564,,,,,,,False,sysimport:crosswalk:entityregistry,0.9,DE,Germany,dnet:countries,dnet:countries,UNIVERSITAETSMEDIZIN DER JOHANNES GUTENBERG-UN...,UMC-Mainz,corda_______::896dd2a57a9d83017147f10046d2d0c8,hasParticipant,dnet:project_organization_relations,organization,http://www.um-mainz.de/,2013-01-01,,,,,,Silica-based Nanobiomedical Approaches for Tre...,,
4,http://namespace.openaire.eu/oaf,http://www.w3.org/2001/XMLSchema-instance,http://namespace.openaire.eu/oaf https://www.o...,SMILE,ERC-2013-CoG,,616047,openaire____::b30dac7baac631f3da7c2bb18dd9891f,CORDA - COmmon Research DAta Warehouse,staffan.svard@icm.uu.se,,"Svärd, Staffan",+46 18 471 4558,ERC,Support for frontier research (ERC),ec:FP7contractTypes,ec:FP7contractTypes,False,,False,sysimport:crosswalk:entityregistry,sysimport:crosswalk:entityregistry,dnet:provenanceActions,dnet:provenanceActions,0.9,,,False,2019-03-31,ec__________::EC,EU,European Commission,EC,,,,,,,,,,ec:program,ERC,ec__________::EC::FP7::SP2::ERC,ERC,ec:specificprogram,SP2-Ideas,ec__________::EC::FP7::SP2,SP2,ec:frameworkprogram,SEVENTH FRAMEWORK PROGRAMME,ec__________::EC::FP7,FP7,,,False,corda_______::616047,,,,,,,True,sysimport:crosswalk:entityregistry,0.9,SE,Sweden,dnet:countries,dnet:countries,Uppsala Universitet,UU,dedup_wf_001::037f409ee87c2f0ec5f0a7c673fb7512,hasParticipant,dnet:project_organization_relations,organization,http://www.uu.se/,2014-04-01,,,,,,Single Molecule Investigations in Living E. coli,,


### Functions