# Exploratory analysis of the data sets related to H2020 projects for the Mobility Report 3

Author: Damir Valput
Date: 28 May 2020

In [1]:
#!pip3 install pandas_profiling
#!pip3 install dask[dataframe]

In [2]:
import re
import numpy as np
import pandas as pd
import pandas_profiling

import pickle

import matplotlib.pyplot as plt

import os
import glob

import dask.dataframe as dd

#NLTK
import nltk
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
# all CSVs in the repo

path = "datasets/data_may20/h2020_raw/"
csvs = [f for f in glob.glob(path + "*.csv", recursive = False)]
print(len(csvs))

excels = [f for f in glob.glob(path + "*.xlsx", recursive = False)]
print(len(excels))

5
7


# Load the full set of H2020 projects (not filtered for mobility)

In [4]:
"""
The new database is stored in the folder 
    - "datasets/data_may20/h2020_raw"
    - "datasets/data_may20/fp7_raw"
    - "datasets/data_may20/reference_data"

"""

# the project file (basic info)

loc = "datasets/data_may20/h2020_raw/cordis-h2020projects.xlsx"
df_h2020 = pd.read_excel(loc, header=0, names=['rcn', 'id', 'acronym', 'status', 'programme', 'topics', \
                                             'frameworkProgramme', 'title', 'startDate', 'endDate', 'projectUrl', \
                                            'objective', 'totalCost', 'ecMaxContribution', 'call', 'fundingScheme', \
                                            'coordinator', 'coordinatorCountry', 'participants', \
                                             'participantCountries', 'subjects'])

df_h2020.head()

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
0,229267,894593,ICARUS,SIGNED,H2020-EU.3.4.7.,SESAR-ER4-31-2019,H2020,INTEGRATED COMMON ALTITUDE REFERENCE SYSTEM FO...,2020-05-01,2022-07-31,...,ICARUS project proposes an innovative solution...,1385286.25,1144587.5,H2020-SESAR-2019-2,SESAR-RIA,E-GEOS SPA,IT,TOPVIEW SRL;TELESPAZIO SPA;DRONERADAR SP Z O.O...,IT;PL;BE;ES,
1,229284,897004,ISLand,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2019,H2020,Isolation and Segregation Landscape. Archaeolo...,2020-11-01,2023-10-31,...,The proposed research presents an experimental...,253052.16,253052.16,H2020-MSCA-IF-2019,MSCA-IF-GF,UNIVERSITEIT VAN AMSTERDAM,NL,,,
2,229281,896300,STRETCH,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2019,H2020,Smart Textiles for RETrofitting and Monitoring...,2020-09-01,2022-08-31,...,This project aims to develop novel techniques ...,183473.28,183473.28,H2020-MSCA-IF-2019,MSCA-IF-EF-ST,JRC -JOINT RESEARCH CENTRE- EUROPEAN COMMISSION,BE,,,
3,229265,892890,RhythmicPrediction,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2019,H2020,Rhythmic prediction in speech perception: are ...,2021-01-01,2022-12-31,...,Speech has rhythmic properties that widely dif...,191149.44,191149.44,H2020-MSCA-IF-2019,MSCA-IF-EF-ST,UNIVERSITE DE GENEVE,CH,,,
4,229235,886828,ASAP,SIGNED,H2020-EU.1.3.2.,MSCA-IF-2019,H2020,Advanced Solutions for Asphalt Pavements,2021-09-01,2023-08-31,...,The Advanced Solutions for Asphalt Pavements (...,187572.48,187572.48,H2020-MSCA-IF-2019,MSCA-IF-EF-ST,NEDERLANDSE ORGANISATIE VOOR TOEGEPAST NATUURW...,NL,,,


In [5]:
print("Projects: ", str(df_h2020.shape[0]))
print("Columns per project: ", str(df_h2020.shape[1]))
print(list(df_h2020.columns))

Projects:  29329
Columns per project:  21
['rcn', 'id', 'acronym', 'status', 'programme', 'topics', 'frameworkProgramme', 'title', 'startDate', 'endDate', 'projectUrl', 'objective', 'totalCost', 'ecMaxContribution', 'call', 'fundingScheme', 'coordinator', 'coordinatorCountry', 'participants', 'participantCountries', 'subjects']


In [6]:
df_h2020.describe()

Unnamed: 0,rcn,id,totalCost,ecMaxContribution,subjects
count,29329.0,29329.0,29131.0,29329.0,0.0
mean,211457.376931,951669.4,2204635.0,1803046.0,
std,10753.509763,4337573.0,8010990.0,4800797.0,
min,193157.0,115797.0,3937.5,956.25,
25%,201535.0,697900.0,168277.2,166156.8,
50%,211616.0,762079.0,794224.8,669761.8,
75%,221170.0,831285.0,2423075.0,2110303.0,
max,229303.0,101003700.0,856961900.0,470800000.0,


In [7]:
df_h2020.shape

(29329, 21)

In [8]:
# profile report provided by pandas_profiling
profile = df_h2020.profile_report(title='H2020 projects: summary statistics')

In [9]:
dirName = "pandas_reports"

try:
    os.makedirs(dirName)    
    print("Directory " , dirName ,  " Created ")
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  pandas_reports  already exists


In [10]:
profile.to_file(output_file="pandas_reports/h2020_projects_summary_report.html")

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=35.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




## Investigate duplicate values

In [11]:
title_duplicates = df_h2020[df_h2020.duplicated(subset=['title'], keep=False)]

title_duplicates.sort_values("title")

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
8489,203538,711257,4FOLD Phase 2,SIGNED,H2020-EU.3.4.;H2020-EU.2.3.1.,IT-1-2015,H2020,4FOLD Reduction of the International Transport...,2016-04-01,2019-03-31,...,Due to the global imbalance of import/export c...,3523766.25,2466636.00,H2020-SMEINST-2-2015,SME-2,HOLLAND CONTAINER INNOVATIONS NEDERLAND B.V.,NL,,,
26415,194677,650677,4FOLD,CLOSED,H2020-EU.3.4.;H2020-EU.2.3.1.,IT-1-2014-1,H2020,4FOLD Reduction of the International Transport...,2014-10-01,2015-03-31,...,Due to the global imbalance of import and expo...,71429.00,50000.00,H2020-SMEINST-1-2014,SME-1,HOLLAND CONTAINER INNOVATIONS NEDERLAND B.V.,NL,,,
9437,204432,675451,CompBioMed,SIGNED,H2020-EU.1.4.1.3.,EINFRA-5-2015,H2020,A Centre of Excellence in Computational Biomed...,2016-10-01,2019-09-30,...,This Centre of Excellence will advance the rol...,4938216.25,4938215.75,H2020-EINFRA-2015-1,RIA,UNIVERSITY COLLEGE LONDON,UK,UNIVERSIDAD POMPEU FABRA;THE UNIVERSITY OF SHE...,ES;UK;CH;NL;DE;FR;BE,
3858,223279,823712,CompBioMed2,SIGNED,H2020-EU.1.4.1.3.,INFRAEDI-02-2018,H2020,A Centre of Excellence in Computational Biomed...,2019-10-01,2023-09-30,...,CompBioMed2 is a proposal for the second phase...,8345472.50,7992822.50,H2020-INFRAEDI-2018-1,RIA,UNIVERSITY COLLEGE LONDON,UK,UNIVERSIDAD POMPEU FABRA;THE UNIVERSITY OF SHE...,ES;UK;CH;DE;NL;FR;IT;BE,
21038,207091,741640,XPECAM,CLOSED,H2020-EU.3.6.;H2020-EU.2.3.1.,SMEInst-12-2016-2017,H2020,A New Portable Spectral Camera System for the ...,2017-01-01,2017-04-30,...,Art conservators & Cultural Heritage preservat...,71429.00,50000.00,H2020-SMEINST-1-2016-2017,SME-1,SIGNINUM GESTAO DE PATRIMONIO CULTURAL LDA,PT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,227364,879570,I-KAM2EU,SIGNED,H2020-EU.2.3.2.2.,H2020-EEN-SGA4,H2020,enhancing Innovation and Key Account Managemen...,2020-01-01,2021-12-31,...,The proposal aims at improving the innovation ...,234423.75,234423.00,H2020-EEN-SGA4-2020-2021,H2020-EEN-SGA,PROMOFIRENZE AZIENDA SPECIALE DELLACAMERA DI C...,IT,CAMERA DI COMMERCIO INDUSTRIA ARTIGIANATO E AG...,IT,
24938,196237,663178,REFEREE,CLOSED,H2020-EU.3.4.;H2020-EU.2.3.1.,IT-1-2014-1,H2020,pREcise Fluids mEteRing EquipmEnt,2015-01-01,2015-04-30,...,"No one likes to be cheated, neither on purpose...",71429.00,50000.00,H2020-SMEINST-1-2014,SME-1,FLEXBIMEC INTERNATIONAL SRL,IT,,,
24015,199089,696776,REFEREE,SIGNED,H2020-EU.3.4.;H2020-EU.2.3.1.,IT-1-2015,H2020,pREcise Fluids mEteRing EquipmEnt,2015-11-01,2017-10-31,...,"We are FLEXBIMEC International SRL, an Italian...",1467250.00,1027075.00,H2020-SMEINST-2-2015,SME-2,FLEXBIMEC INTERNATIONAL SRL,IT,,,
23828,198495,674820,KAM2SouthPL2,CLOSED,H2020-EU.2.3.,INNOVATION,H2020,‘Key account management’ for the SME Instrumen...,2015-01-01,2016-12-31,...,The general objective of the project is to con...,292250.00,292250.00,H2020-Adhoc-2014-20,H2020-EEN-SGA,SWIETOKRZYSKIE CENTRUM INNOWACJI ITRANSFERU TE...,PL,"""Stowarzyszenie Grupy Przedsiebiorcow Przemysl...",PL,


In [12]:
df_h2020[df_h2020.acronym == "CompBioMed2"]

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects
3858,223279,823712,CompBioMed2,SIGNED,H2020-EU.1.4.1.3.,INFRAEDI-02-2018,H2020,A Centre of Excellence in Computational Biomed...,2019-10-01,2023-09-30,...,CompBioMed2 is a proposal for the second phase...,8345472.5,7992822.5,H2020-INFRAEDI-2018-1,RIA,UNIVERSITY COLLEGE LONDON,UK,UNIVERSIDAD POMPEU FABRA;THE UNIVERSITY OF SHE...,ES;UK;CH;DE;NL;FR;IT;BE,


In [13]:
objective_duplicates = df_h2020[df_h2020.duplicated(subset=['objective'], keep=False)]

#objective_duplicates.sort_values("objective")

In [14]:
# missing value in objectives

df_h2020[df_h2020['objective'].isna()]

Unnamed: 0,rcn,id,acronym,status,programme,topics,frameworkProgramme,title,startDate,endDate,...,objective,totalCost,ecMaxContribution,call,fundingScheme,coordinator,coordinatorCountry,participants,participantCountries,subjects


In [15]:
h2020_clean = df_h2020[df_h2020['objective'].notnull()]
h2020_clean.shape

(29329, 21)

In [16]:
# projects with duplicate objective
duplicate_to_remove = df_h2020[df_h2020.duplicated(subset=['objective'], keep='first')]['id']
#print(duplicate_to_remove)
h2020_clean = h2020_clean[~h2020_clean['id'].isin(duplicate_to_remove)]
h2020_clean.shape

(29314, 21)

In [17]:
# save cleaned projects file
#h2020_clean.to_csv("datasets/data_oct19/h2020_clean/projects_clean.csv")

In [18]:
# LIST OF project IDs that are kept for further analysis

h2020_project_ids = set(h2020_clean.id)
len(h2020_project_ids)

29314

# H2020: REPORTS

## Load summary reports: H2020

In [19]:
# the reports file - CSV is poorly formatted, so reading xlsx!!

loc = "datasets/data_may20/h2020_raw/cordis-h2020reports.xlsx"
"""
df_h2020 = pd.read_excel(loc, header=0, names=['rcn', 'id', 'acronym', 'status', 'programme', 'topics', \
                                             'frameworkProgramme', 'title', 'startDate', 'endDate', 'projectUrl', \
                                            'objective', 'totalCost', 'ecMaxContribution', 'call', 'fundingScheme', \
                                            'coordinator', 'coordinatorCountry', 'participants', \
                                             'participantCountries', 'subjects'])
"""

reports = pd.read_excel(loc)
reports.head()

Unnamed: 0,rcn,language,title,teaser,summary,workPerformed,finalResults,lastUpdateDate,country,projectID,projectAcronym,programme,topics,relatedFile,url
0,453405,en,Periodic Reporting for period 1 - DiAMS (SEMI-...,More than 420 M people live with diabetes glob...,More than 420 M people live with diabetes glob...,The development of the feasibility study gave ...,DiAMS will disrupt insulin therapy. It is invi...,2020-04-25 14:20:41,,867975,DiAMS,H2020-EU.3.;H2020-EU.2.3.;H2020-EU.2.1.,EIC-SMEInst-2018-2020,/docs/results/h2020/867/867975_PS/image.jpg,https://amf.ch/
1,453406,en,Periodic Reporting for period 1 - ArtIST (Pre-...,An estimated 4.1 million people are affected b...,An estimated 4.1 million people are affected b...,The EU grant has helped us pushing forward our...,"During the development of the project, and tog...",2020-04-25 14:21:02,,868365,ArtIST,H2020-EU.3.;H2020-EU.2.3.;H2020-EU.2.1.,EIC-SMEInst-2018-2020,/docs/results/h2020/868/868365_PS/figure1-clov...,http://www.clovermsdataanalysis.com
2,449137,en,Periodic Reporting for period 1 - BioSolar Lea...,Due to fast worldwide population growth (from ...,Due to fast worldwide population growth (from ...,Arborea completed a full analysis of the techn...,The expected outcome is to optimize manufactur...,2020-04-25 15:29:00,,878259,BioSolar Leaf,H2020-EU.3.;H2020-EU.2.3.;H2020-EU.2.1.,EIC-SMEInst-2018-2020,/docs/results/h2020/878/878259_PS/biosolar-lea...,http://arborea.io/
3,220274,en,Periodic Reporting for period 2 - NanOQTech (N...,Quantum technologies are developed to overcome...,Quantum technologies are developed to overcome...,The first part of the work carried out during ...,The nanoparticles developed during NanOQTech h...,2020-04-25 07:28:40,,712721,NanOQTech,H2020-EU.1.2.1.,FETOPEN-RIA-2014-2015,/docs/results/h2020/712/712721_PS/imagekit.png,http://www.nanoqtech.eu
4,449138,en,Periodic Reporting for period 1 - Sampols (A n...,"OrbiWise is a Swiss company based in Geneva, f...","OrbiWise is a Swiss company based in Geneva, f...","In this Feasibility Study, we have analyzed th...",There are two groups of competitors in the noi...,2020-04-25 15:01:40,,878246,Sampols,H2020-EU.3.;H2020-EU.2.3.;H2020-EU.2.1.,EIC-SMEInst-2018-2020,/docs/results/h2020/878/878246_PS/image-1.png,http://www.orbiwise.com


In [20]:
reports.shape

(15907, 15)

## Are columns "teaser" and "summary" the same for all H2020 projects??

In [21]:
print(reports.summary[1][250:350])

e of the Goals for Sustainable Development of the United Nations (UN).
The quick identification of t


In [22]:
pd.set_option('display.max_colwidth', -1)
print(reports.teaser[1][250:350])

e of the Goals for...


  """Entry point for launching an IPython kernel.


In [23]:
reports.profile_report()

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=29.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…






In [24]:
# keep projects only in h2020_project_ids
reports_clean = reports[reports.projectID.isin(h2020_project_ids)]

reports_clean.shape

(15896, 15)

In [25]:
# drop fields from the clean file: ['language', 'teaser', 'country', 'programme', 'topics']

reports_clean.drop(['language', 'teaser', 'country', 'programme', 'topics'], axis=1, inplace=True)
reports_clean.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


(15896, 10)

In [26]:
#reports_clean.to_csv("datasets/data_oct19/h2020_clean/reports_clean.csv")

# Merge basic project info and reports

In [29]:
h2020_full = h2020_clean.merge(reports_clean, how = 'left', left_on="id", right_on = "projectID", suffixes=("", "_report"))
h2020_full.shape

(29314, 31)

In [31]:
dirName = "datasets/data_may20/h2020_clean"

try:
    os.makedirs(dirName)    
    print("Directory " , dirName ,  " Created ")
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  datasets/data_may20/h2020_clean  Created 


In [32]:
h2020_full.to_csv("datasets/data_may20/h2020_clean/h2020_full.csv", index=False)

# Read other data - dictionary placement

df_h2020 - basic h2020 projects data - in a separate df
reports - report summaries of H2020 - in a separate df


KEYS --> all dataframes placed in the dictionary h2020:
- organisations
- pis - h2020 project investigators data
- publications - project publications data
- fellows - MSCA researches data
- reports - report summaries data, as in reports
- projects - basic project data, as in df_h2020
- deliverables

In [33]:
excels

['datasets/data_may20/h2020_raw/cordis-h2020organizations.xlsx',
 'datasets/data_may20/h2020_raw/cordis-h2020-erc-pi.xlsx',
 'datasets/data_may20/h2020_raw/cordis-h2020projectPublications.xlsx',
 'datasets/data_may20/h2020_raw/cordis-h2020-msca-fellows.xlsx',
 'datasets/data_may20/h2020_raw/cordis-h2020reports.xlsx',
 'datasets/data_may20/h2020_raw/cordis-h2020projects.xlsx',
 'datasets/data_may20/h2020_raw/cordis-h2020projectDeliverables.xlsx']

In [34]:
keys = ['organisations', 'pis', 'publications', 'fellows', 'reports', 'projects', 'deliverables']

h2020 = dict()
for key, loc in zip(keys, excels):
    h2020[key] = pd.read_excel(loc)
    
len(h2020)

7

In [35]:
h2020['organisations'].head()

Unnamed: 0,projectRcn,projectID,projectAcronym,role,id,name,shortName,activityType,endOfParticipation,ecContribution,...,organizationUrl,vatNumber,contactForm,contactType,contactTitle,contactFirstNames,contactLastNames,contactFunction,contactTelephoneNumber,contactFaxNumber
0,229267,894593,ICARUS,participant,938215526,TOPVIEW SRL,TOPV,PRC,False,172750.0,...,,IT03920880618,https://ec.europa.eu/research/participants/api/contact/indexcontactproject.html?pic=938215526&projectId=894593&programId=31045243,,,,,,,
1,229267,894593,ICARUS,participant,999939051,TELESPAZIO SPA,TPZ,PRC,False,169800.0,...,www.telespazio.it,IT04812701003,https://ec.europa.eu/research/participants/api/contact/indexcontactproject.html?pic=999939051&projectId=894593&programId=31045243,,,,,,,
2,229267,894593,ICARUS,participant,904200924,DRONERADAR SP Z O.O.,DRAD,PRC,False,228612.5,...,,PL5223135685,https://ec.europa.eu/research/participants/api/contact/indexcontactproject.html?pic=904200924&projectId=894593&programId=31045243,,,,,,,
3,229267,894593,ICARUS,participant,999483733,EUROCONTROL - EUROPEAN ORGANISATION FOR THE SAFETY OF AIR NAVIGATION,ECTL,REC,False,,...,www.eurocontrol.int,NOTAPPLICABLE,https://ec.europa.eu/research/participants/api/contact/indexcontactproject.html?pic=999483733&projectId=894593&programId=31045243,,,,,,,
4,229267,894593,ICARUS,coordinator,991678046,E-GEOS SPA,EGEOS,PRC,False,277050.0,...,www.e-geos.it,IT01032180778,https://ec.europa.eu/research/participants/api/contact/indexcontactproject.html?pic=991678046&projectId=894593&programId=31045243,,,,,,,


## Fellows dataset needs corrections!

In [36]:
key = 'fellows'
h2020[key] = pd.read_excel(path + "cordis-h2020-msca-fellows.xlsx", header = 2) # first two rows are garbage

In [37]:
h2020['fellows'].head()

Unnamed: 0.1,Unnamed: 0,projectId,projectAcronym,fundingScheme,organizationId,title,firstName,lastName
0,,653413,INNOVATIONOPTIONS,MSCA-IF,998096827.0,PROF,Lenos,Trigeorgis
1,,653413,INNOVATIONOPTIONS,MSCA-IF,999981052.0,PROF,Lenos,Trigeorgis
2,,653784,OrganoMag,MSCA-IF,999903840.0,DR,Fu-Sheng,Guo
3,,653846,Fractional,MSCA-IF,999467340.0,DR,Adolfo,González Grushin
4,,653846,Fractional,MSCA-IF,999984350.0,DR,Adolfo,González Grushin


In [38]:
for key in keys:
    print("File: ", key)
    print()
    print(h2020[key].head())

File:  organisations

   projectRcn  projectID projectAcronym         role         id  \
0  229267      894593     ICARUS         participant  938215526   
1  229267      894593     ICARUS         participant  999939051   
2  229267      894593     ICARUS         participant  904200924   
3  229267      894593     ICARUS         participant  999483733   
4  229267      894593     ICARUS         coordinator  991678046   

                                                                   name  \
0  TOPVIEW SRL                                                            
1  TELESPAZIO SPA                                                         
2  DRONERADAR SP Z O.O.                                                   
3  EUROCONTROL - EUROPEAN ORGANISATION FOR THE SAFETY OF AIR NAVIGATION   
4  E-GEOS SPA                                                             

  shortName activityType  endOfParticipation  ecContribution  ...  \
0  TOPV      PRC          False               172750.0 

## Completeness of data

In [39]:
h2020_full.columns.values

array(['rcn', 'id', 'acronym', 'status', 'programme', 'topics',
       'frameworkProgramme', 'title', 'startDate', 'endDate',
       'projectUrl', 'objective', 'totalCost', 'ecMaxContribution',
       'call', 'fundingScheme', 'coordinator', 'coordinatorCountry',
       'participants', 'participantCountries', 'subjects', 'rcn_report',
       'title_report', 'summary', 'workPerformed', 'finalResults',
       'lastUpdateDate', 'projectID', 'projectAcronym', 'relatedFile',
       'url'], dtype=object)

In [40]:
h2020_full.projectUrl

0        NaN                                                                              
1        NaN                                                                              
2        NaN                                                                              
3        NaN                                                                              
4        NaN                                                                              
        ...                                                                               
29309    NaN                                                                              
29310    NaN                                                                              
29311    NaN                                                                              
29312    http://www.alekon.ee/                                                            
29313    http://www.treeway.nl/treeway-receive-h2020-grant-for-biomarker-feasibility-study

In [41]:
# how many url's are missing?

sum(h2020_full.projectUrl.isna())

14820

In [42]:
h2020_full.shape

(29314, 31)