<a href="https://colab.research.google.com/github/ffg-kom/mapping_evolution/blob/master/getting_data_ready.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

The following code was copied from https://colab.research.google.com/github/nestauk/im_tutorials/blob/master/notebooks/01_intro_to_pandas_tutorial.ipynb.

In [12]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell execution, so edits to installed code are picked up.
%load_ext autoreload
%autoreload 2

# Install the im_tutorials package straight from its GitHub repository;
# it provides the `im_tutorials.data.cordis` loaders imported in the next cell.
# NOTE(review): the install is unpinned, so it follows whatever the repository's
# default branch currently contains.
!pip install git+https://github.com/nestauk/im_tutorials.git

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Collecting git+https://github.com/nestauk/im_tutorials.git
  Cloning https://github.com/nestauk/im_tutorials.git to /tmp/pip-req-build-mpl_eqsl
  Running command git clone -q https://github.com/nestauk/im_tutorials.git /tmp/pip-req-build-mpl_eqsl
Building wheels for collected packages: im-tutorials
  Building wheel for im-tutorials (setup.py) ... [?25l[?25hdone
  Created wheel for im-tutorials: filename=im_tutorials-0.1.1-cp36-none-any.whl size=15103 sha256=cc5fed29a8c5b1bacb2a1f495f27386b6843f0c12ec68e96dcfe70aa6c35f939
  Stored in directory: /tmp/pip-ephem-wheel-cache-tsqvbexl/wheels/47/a3/cb/bdc5f9ba49bcfd2c6864b166a1566eb2f104113bf0c3500330
Successfully built im-tutorials


In [0]:
# numpy for mathematical functions
import numpy as np
# pandas for handling tabular data
import pandas as pd
# explained later
from im_tutorials.data.cordis import h2020_projects

# for plotting
import matplotlib.pyplot as plt

# data
from im_tutorials.data.cordis import cordis_table

# Loading data

In [14]:
# Fetch the four CORDIS tables used throughout this notebook.
cordis_o_df = cordis_table('organisations')
cordis_p_df = cordis_table('projects')
cordis_p_o_df = cordis_table('project_organisations')
cordis_pub_df = cordis_table('publications')

# Start from the project<->organisation link table, attach the organisation
# columns (organization_id -> id) and then the project columns
# (project_rcn -> rcn). Both are left joins, so every link row is kept.
# Finally rename the money columns so the name says which table they came from:
# 'contribution' (link table) vs. 'ec_contribution' / 'total_cost' (projects).
cordis_project_orgs_df = (
    cordis_p_o_df
    .merge(cordis_o_df, how='left', left_on='organization_id', right_on='id')
    .merge(cordis_p_df, how='left', left_on='project_rcn', right_on='rcn')
    .rename(columns={
        'contribution': 'ec_contribution_organization',
        'ec_contribution': 'ec_contribution_project',
        'total_cost': 'total_cost_project',
    })
)

cordis_project_orgs_df.shape

(251040, 25)

In [15]:
# Number of partners per project: count the organisation links of each project.
# The result is indexed by project_rcn with a single column 'number_of_partners'.
count_partners_df = (
    cordis_project_orgs_df
    .groupby('project_rcn')[['organization_id']]
    .count()
    .rename(columns={'organization_id': 'number_of_partners'})
)

# Number of publications per project: count the publication ids of each project.
count_publications_df = (
    cordis_pub_df
    .groupby('project_rcn')[['id']]
    .count()
    .rename(columns={'id': 'number_of_publications'})
)

# Attach both counts as new columns.
# Count columns left over from a previous execution of this cell are dropped
# first; otherwise re-running the cell would merge them a second time and
# produce 'number_of_partners_x' / '_y' instead of the expected names.
# NOTE: projects without any row in the publications table find no match in
# the left join, so their 'number_of_publications' is NaN (not 0).
cordis_project_orgs_df = (
    cordis_project_orgs_df
    .drop(columns=['number_of_partners', 'number_of_publications'], errors='ignore')
    .merge(count_partners_df, on='project_rcn', how='left')
    .merge(count_publications_df, on='project_rcn', how='left')
)


# shape: should be (something, 27)
cordis_project_orgs_df.shape


(251040, 27)

In [0]:
# preparing dictionaries to effectively group data
#
# Lookup table: full funding-scheme label -> programme / initiative group.
# It is applied to the 'funding_scheme' column with Series.map() in a later cell,
# so a label that is not a key here ends up as NaN in 'funding_scheme_programme'.
# Generic schemes such as 'CSA', 'IA' and 'RIA' are grouped as '(general)'.
# NOTE(review): a few keys contain mis-encoded characters ('â€“', 'SkÅ‚odowska');
# presumably they were copied verbatim from the labels in the dataset and have to
# stay byte-identical to match them -- confirm against the data before "fixing" them.
dict_funding_schemes_along_programmes = {'BBI-CSA - Bio-based Industries Coordination and Support action': 'Bio-based Industries',
                                         'BBI-IA-DEMO - Bio-based Industries Innovation action - Demonstration': 'Bio-based Industries', 
                                         'BBI-IA-FLAG - Bio-based Industries Innovation action - Flagship': 'Bio-based Industries',
                                         'BBI-RIA - Bio-based Industries Research and Innovation action': 'Bio-based Industries', 
                                         'COFUND-EJP - COFUND (European Joint Programme)': '(general)', 
                                         'COFUND-PCP - COFUND (PCP)': '(general)', 
                                         'CS2-CSA - Coordination & support action': 'Clean Sky 2', 
                                         'CS2-IA - Innovation action': 'Clean Sky 2', 
                                         'CS2-RIA - Research and Innovation action': 'Clean Sky 2', 
                                         'CSA - Coordination and support action': '(general)', 
                                         'CSA-LS - CSA Lump sum': '(general)', 
                                         'ECSEL-CSA - ECSEL Coordination & Support action': 'ECSEL (Electronic Components and Systems for European Leadership)', 
                                         'ECSEL-IA - ECSEL Innovation Action': 'ECSEL (Electronic Components and Systems for European Leadership)', 
                                         'ECSEL-RIA - ECSEL Research and Innovation Action': 'ECSEL (Electronic Components and Systems for European Leadership)', 
                                         'ERA-NET-Cofund - ERA-NET Cofund': 'ERA-NET', 
                                         'ERC-ADG - Advanced Grant': 'ERC (European Research Council)', 
                                         'ERC-COG - Consolidator Grant': 'ERC (European Research Council)', 
                                         'ERC-POC - Proof of Concept Grant': 'ERC (European Research Council)', 
                                         'ERC-POC-LS - ERC Proof of Concept Lump Sum Pilot': 'ERC (European Research Council)', 
                                         'ERC-STG - Starting Grant': 'ERC (European Research Council)', 
                                         'FCH2-CSA - Coordination & support action': 'FCH2 (Fuel Cells and Hydrogen 2)', 
                                         'FCH2-IA - Innovation action': 'FCH2 (Fuel Cells and Hydrogen 2)', 
                                         'FCH2-RIA - Research and Innovation action': 'FCH2 (Fuel Cells and Hydrogen 2)', 
                                         'H2020-EEN-SGA - Specific Grant Agreement Enterprise Europe Network (EEN)': 'SME (Small and medium-sized Enterprises)', 
                                         'IA - Innovation action': '(general)', 
                                         'IA-LS - Innovation action Lump Sum': '(general)', 
                                         'IMI2-CSA - Coordination & support action': 'IMI2 (Innovative Medicines Initiative)', 
                                         'IMI2-RIA - Research and Innovation action': 'IMI2 (Innovative Medicines Initiative)', 
                                         'MSCA-COFUND-DP - Doctoral programmes': 'MSCA (Marie Skłodowska-Curie Actions)', 
                                         'MSCA-COFUND-FP - Fellowship programmes': 'MSCA (Marie Skłodowska-Curie Actions)', 
                                         'MSCA-IF-EF-CAR - CAR â€“ Career Restart panel': 'MSCA (Marie Skłodowska-Curie Actions)', 
                                         'MSCA-IF-EF-RI - RI â€“ Reintegration panel': 'MSCA (Marie Skłodowska-Curie Actions)', 
                                         'MSCA-IF-EF-SE - Society and Enterprise panel': 'MSCA (Marie Skłodowska-Curie Actions)', 
                                         'MSCA-IF-EF-ST - Standard EF': 'MSCA (Marie Skłodowska-Curie Actions)', 
                                         'MSCA-IF-GF - Global Fellowships': 'MSCA (Marie Skłodowska-Curie Actions)', 
                                         'MSCA-ITN-EID - European Industrial Doctorates': 'MSCA (Marie Skłodowska-Curie Actions)', 
                                         'MSCA-ITN-EJD - European Joint Doctorates': 'MSCA (Marie Skłodowska-Curie Actions)', 
                                         'MSCA-ITN-ETN - European Training Networks': 'MSCA (Marie Skłodowska-Curie Actions)', 
                                         'MSCA-RISE - Marie SkÅ‚odowska-Curie Research and Innovation Staff Exchange (RISE)': 'MSCA (Marie Skłodowska-Curie Actions)', 
                                         'PCP - Pre-Commercial Procurement': 'PCP & PPI (Pre-commercial or public Procurement)', 
                                         'PPI - Public Procurement of Innovative solutions': 'PCP & PPI (Pre-commercial or public Procurement)', 
                                         'RIA - Research and Innovation action': '(general)', 
                                         'RIA-LS - Research and Innovation action Lump Sum': '(general)', 
                                         'SESAR-CSA - Coordination and Support Action': 'SESAR (Single European Sky)', 
                                         'SESAR-IA - Innovation action': 'SESAR (Single European Sky)', 
                                         'SESAR-RIA - Research and Innovation action': 'SESAR (Single European Sky)', 
                                         'SGA-CSA - Specific Grant agreement and Coordination and Support Action': '(general)', 
                                         'SGA-RIA - SGA-RIA': '(general)', 
                                         'Shift2Rail-CSA - Coordination and Support Action': 'Shift2Rail', 
                                         'Shift2Rail-IA - Innovation action': 'Shift2Rail', 
                                         'Shift2Rail-IA-LS - Innovation Action Lump-Sum': 'Shift2Rail', 
                                         'Shift2Rail-RIA - Research and Innovation action': 'Shift2Rail', 
                                         'Shift2Rail-RIA-LS - Shift2Rail Research and Innovation Action Lump-Sum': 'Shift2Rail', 
                                         'SME-1 - SME instrument phase 1': 'SME (Small and medium-sized Enterprises)', 
                                         'SME-2 - SME instrument phase 2': 'SME (Small and medium-sized Enterprises)'
                                        }

# Lookup table: full funding-scheme label -> type of action (CSA, IA, RIA, COFUND,
# Grant, Fellowship, ...). It uses the same keys as the programme dictionary and is
# applied to the 'funding_scheme' column with Series.map() in a later cell, so a
# label that is not a key here ends up as NaN in 'funding_scheme_action'.
# The SME instrument and EEN schemes keep 'SME (...)' as their action group.
# NOTE(review): a few keys contain mis-encoded characters ('â€“', 'SkÅ‚odowska');
# presumably they were copied verbatim from the labels in the dataset and have to
# stay byte-identical to match them -- confirm against the data before "fixing" them.
dict_funding_schemes_along_actions = {'BBI-CSA - Bio-based Industries Coordination and Support action': 'CSA (Coordination and Support Action)',
                                      'BBI-IA-DEMO - Bio-based Industries Innovation action - Demonstration': 'IA (Innovation Action)',
                                      'BBI-IA-FLAG - Bio-based Industries Innovation action - Flagship': 'IA (Innovation Action)',
                                      'BBI-RIA - Bio-based Industries Research and Innovation action': 'RIA (Research and Innovation Action)',
                                      'COFUND-EJP - COFUND (European Joint Programme)': 'COFUND',
                                      'COFUND-PCP - COFUND (PCP)': 'COFUND',
                                      'CS2-CSA - Coordination & support action': 'CSA (Coordination and Support Action)',
                                      'CS2-IA - Innovation action': 'IA (Innovation Action)',
                                      'CS2-RIA - Research and Innovation action': 'RIA (Research and Innovation Action)',
                                      'CSA - Coordination and support action': 'CSA (Coordination and Support Action)',
                                      'CSA-LS - CSA Lump sum': 'CSA (Coordination and Support Action)',
                                      'ECSEL-CSA - ECSEL Coordination & Support action': 'CSA (Coordination and Support Action)',
                                      'ECSEL-IA - ECSEL Innovation Action': 'IA (Innovation Action)',
                                      'ECSEL-RIA - ECSEL Research and Innovation Action': 'RIA (Research and Innovation Action)',
                                      'ERA-NET-Cofund - ERA-NET Cofund': 'COFUND',
                                      'ERC-ADG - Advanced Grant': 'Grant',
                                      'ERC-COG - Consolidator Grant': 'Grant',
                                      'ERC-POC - Proof of Concept Grant': 'Grant',
                                      'ERC-POC-LS - ERC Proof of Concept Lump Sum Pilot': 'Grant',
                                      'ERC-STG - Starting Grant': 'Grant',
                                      'FCH2-CSA - Coordination & support action': 'CSA (Coordination and Support Action)',
                                      'FCH2-IA - Innovation action': 'IA (Innovation Action)',
                                      'FCH2-RIA - Research and Innovation action': 'RIA (Research and Innovation Action)',
                                      'H2020-EEN-SGA - Specific Grant Agreement Enterprise Europe Network (EEN)': 'SME (Small and medium-sized Enterprises)',
                                      'IA - Innovation action': 'IA (Innovation Action)',
                                      'IA-LS - Innovation action Lump Sum': 'IA (Innovation Action)',
                                      'IMI2-CSA - Coordination & support action': 'CSA (Coordination and Support Action)',
                                      'IMI2-RIA - Research and Innovation action': 'RIA (Research and Innovation Action)',
                                      'MSCA-COFUND-DP - Doctoral programmes': 'COFUND',
                                      'MSCA-COFUND-FP - Fellowship programmes': 'COFUND',
                                      'MSCA-IF-EF-CAR - CAR â€“ Career Restart panel': 'Fellowship',
                                      'MSCA-IF-EF-RI - RI â€“ Reintegration panel': 'Fellowship',
                                      'MSCA-IF-EF-SE - Society and Enterprise panel': 'Fellowship',
                                      'MSCA-IF-EF-ST - Standard EF': 'Fellowship',
                                      'MSCA-IF-GF - Global Fellowships': 'Fellowship',
                                      'MSCA-ITN-EID - European Industrial Doctorates': 'Innovative Training Networks',
                                      'MSCA-ITN-EJD - European Joint Doctorates': 'Innovative Training Networks',
                                      'MSCA-ITN-ETN - European Training Networks': 'Innovative Training Networks',
                                      'MSCA-RISE - Marie SkÅ‚odowska-Curie Research and Innovation Staff Exchange (RISE)': 'Research & Innovation Staff Exchange',
                                      'PCP - Pre-Commercial Procurement': 'Procurement',
                                      'PPI - Public Procurement of Innovative solutions': 'Procurement',
                                      'RIA - Research and Innovation action': 'RIA (Research and Innovation Action)',
                                      'RIA-LS - Research and Innovation action Lump Sum': 'RIA (Research and Innovation Action)',
                                      'SESAR-CSA - Coordination and Support Action': 'CSA (Coordination and Support Action)',
                                      'SESAR-IA - Innovation action': 'IA (Innovation Action)',
                                      'SESAR-RIA - Research and Innovation action': 'RIA (Research and Innovation Action)',
                                      'SGA-CSA - Specific Grant agreement and Coordination and Support Action': 'CSA (Coordination and Support Action)',
                                      'SGA-RIA - SGA-RIA': 'RIA (Research and Innovation Action)',
                                      'Shift2Rail-CSA - Coordination and Support Action': 'CSA (Coordination and Support Action)',
                                      'Shift2Rail-IA - Innovation action': 'IA (Innovation Action)',
                                      'Shift2Rail-IA-LS - Innovation Action Lump-Sum': 'IA (Innovation Action)',
                                      'Shift2Rail-RIA - Research and Innovation action': 'RIA (Research and Innovation Action)',
                                      'Shift2Rail-RIA-LS - Shift2Rail Research and Innovation Action Lump-Sum': 'RIA (Research and Innovation Action)',
                                      'SME-1 - SME instrument phase 1': 'SME (Small and medium-sized Enterprises)',
                                      'SME-2 - SME instrument phase 2': 'SME (Small and medium-sized Enterprises)'
                                     }


In [17]:
# Translate each row's funding scheme into its programme group and its action
# group by looking it up in the two dictionaries. Schemes that are not a key of
# the respective dictionary become NaN in the new column.
cordis_project_orgs_df['funding_scheme_programme'] = (
    cordis_project_orgs_df['funding_scheme'].map(dict_funding_schemes_along_programmes)
)
cordis_project_orgs_df['funding_scheme_action'] = (
    cordis_project_orgs_df['funding_scheme'].map(dict_funding_schemes_along_actions)
)


cordis_project_orgs_df.shape
    

(251040, 29)

In [18]:
# Keep only H2020 participations of private for-profit organisations.
is_private_for_profit = cordis_project_orgs_df['activity_type'].eq(
    'Private for-profit entities (excluding Higher or Secondary Education Establishments)'
)
is_h2020 = cordis_project_orgs_df['framework'].eq('H2020')
subset = cordis_project_orgs_df[is_private_for_profit & is_h2020]

# Columns carried into the working frame.
columns_of_interest = [
    'organization_id',
    'start_date_code',
    'funding_scheme',
    'funding_scheme_programme',
    'funding_scheme_action',
    'number_of_partners',
    'number_of_publications',
    'ec_contribution_organization',
    'ec_contribution_project',
    'total_cost_project',
]

# Order each organisation's projects by start date. reset_index() is called
# without drop=True, so the previous row labels survive as an 'index' column.
df = (
    subset[columns_of_interest]
    .sort_values(by=['organization_id', 'start_date_code'])
    .reset_index()
)

# 0-based running number of each row within its organisation (in the sort order above)
df['organization_entry_number'] = df.groupby('organization_id').cumcount()
df.shape


(37138, 12)


## Playing with the data

Just skip this part when getting data ready for exporting!

In [0]:
# List the column names of the merged project/organisation table.
cordis_project_orgs_df.columns

Index(['project_rcn', 'organization_id', 'activity_type', 'address',
       'ec_contribution_organization', 'type', 'website_x', 'id', 'name',
       'country_code', 'country_name', 'rcn', 'acronym', 'end_date_code',
       'ec_contribution_project', 'framework', 'funding_scheme',
       'funded_under', 'objective', 'project_description', 'start_date_code',
       'status', 'title', 'total_cost_project', 'website_y',
       'number_of_partners'],
      dtype='object')

In [0]:
# Count the rows of the merged table per organisation activity type;
# the label for private companies is the one used to build `subset`.
cordis_project_orgs_df['activity_type'].value_counts()

Higher or Secondary Education Establishments                                                         88335
Private for-profit entities (excluding Higher or Secondary Education Establishments)                 78810
Research Organisations                                                                               55099
Other                                                                                                15406
Public bodies (excluding Research Organisations and Secondary or Higher Education Establishments)    12454
                                                                                                       936
Name: activity_type, dtype: int64

In [0]:
# within subset, how often do organisations appear
# (value_counts() sorts by frequency, so head() shows the most frequent organisation ids)
subset['organization_id'].value_counts().head()



999993856    148
999960488     84
999909854     79
999951467     76
999908787     73
Name: organization_id, dtype: int64

In [0]:
# Inspect the subset restricted to the organisation id, project start date,
# funding scheme, partner count and the three money columns.
subset[['organization_id', 'start_date_code', 'funding_scheme', 'number_of_partners', 'ec_contribution_organization', 'ec_contribution_project', 'total_cost_project']]


In [0]:
# Print the title of the first 'funded_under' entry for the 100 rows
# labelled 10000 .. 10099 of the merged table.
for row_label in range(10000, 10100):
  programmes = cordis_project_orgs_df.loc[row_label, 'funded_under']
  print(programmes[0]['title'])



In [0]:
# Export how often each funding scheme occurs in df (one row per scheme:
# label, count).
# `header` is passed explicitly because the default of Series.to_csv differs
# between pandas versions (False with a FutureWarning in older releases, True
# from pandas 1.0 on). header=False keeps the headerless layout of the older
# default, so the file looks the same whichever version runs the notebook.
df['funding_scheme'].value_counts().to_csv('funding_scheme_grouping.csv', header=False)

  """Entry point for launching an IPython kernel.


# Preparing data for a Sankey diagram




## First two projects of every organisation

In [0]:
# Column of df whose values become the nodes of the sankey diagram;
# change it here to visualise a different grouping.
#   v   v   v   v   v   v   v   v   v   #
sankey_column = 'funding_scheme_action'
#   ^   ^   ^   ^   ^   ^   ^   ^   ^   #


dfs = df.rename(columns={sankey_column: 'sankey'})

# First (entry number 0) and second (entry number 1) row of every organisation.
entry_number = dfs['organization_entry_number']
first = dfs[entry_number == 0]
second = dfs[entry_number == 1]

# One row per organisation that has a second entry:
# columns suffixed _x come from the second step, _y from the first step.
onestep = second.merge(first, on='organization_id', how='left')

# Count organisations per (first step, second step) pair and shape the result
# into the source / target / value layout.
sankey_df = (
    onestep
    .groupby(['sankey_y', 'sankey_x'])[['organization_id']]
    .count()
    .reset_index()
    .rename(columns={'sankey_y': 'source', 'sankey_x': 'target', 'organization_id': 'value'})
)

# A trailing blank makes every target label differ from the source labels.
sankey_df['target'] = sankey_df['target'] + ' '

sankey_df.shape

(62, 3)

In [0]:
# export csv
# (semicolon-separated, without the DataFrame index; holds the first -> second project counts)
sankey_df.to_csv('sankey_onestep_data.csv', index=False, sep=';')


## All projects

There are cases where an organisation starts two or more new projects at the same time, so there is no clear order of projects/funding schemes.
We ignore this and, for the sake of getting the job done, live with the noise it adds.

In [19]:
# Column of df whose values become the nodes of the sankey diagram;
# change it here to visualise a different grouping.
#   v   v   v   v   v   v   v   v   v   #
sankey_column = 'funding_scheme_action'
#   ^   ^   ^   ^   ^   ^   ^   ^   ^   #


dfs = df.rename(columns={sankey_column:'sankey'})

# entry number of the row that follows each row within the same organisation
dfs['next_step'] = dfs['organization_entry_number'] + 1

# Self-join every row with its successor:
# left (suffix _x) is the current step, right (suffix _y) is the next step.
allsteps = dfs.merge(dfs, left_on=['organization_id', 'next_step']
                    , right_on=['organization_id', 'organization_entry_number'], how='left')


# Count the transitions current step -> next step.
# The flow starts at the current step (sankey_x) and ends at the next step
# (sankey_y); the roles of _x/_y are the opposite of the first-two-projects
# cell, where the *second* project is the left side of the merge.
# Rows without a successor have NaN in sankey_y and are dropped by groupby.
sankey_df = (
    allsteps
    .groupby(['sankey_x', 'sankey_y'])[['organization_id']]
    .count()
    .reset_index()
    .rename(columns={'sankey_x': 'source', 'sankey_y': 'target', 'organization_id': 'value'})
)

# add a blank space to all targets (so targets differ from sources)
sankey_df['target'] = sankey_df['target'] + ' '

sankey_df.shape


(70, 3)

In [0]:
# export csv
# (semicolon-separated, without the DataFrame index; holds the counts over all consecutive project pairs)
sankey_df.to_csv('sankey_allsteps_data.csv', index=False, sep=';')


# Preparing data for a network graph




## All projects

There are cases where an organisation starts two or more new projects at the same time, so there is no clear order of projects/funding schemes.
We ignore this and, for the sake of getting the job done, live with the noise it adds.

In [67]:
# The transition counts are built like in the sankey section above; variable
# names are deliberately kept identical to ease debugging.
# prepare the column that shall be used as the category of the nodes
#   v   v   v   v   v   v   v   v   v   #
sankey_column = 'funding_scheme_action'
#   ^   ^   ^   ^   ^   ^   ^   ^   ^   #


dfs = df.rename(columns={sankey_column:'sankey'})

# entry number of the row that follows each row within the same organisation
dfs['next_step'] = dfs['organization_entry_number'] + 1

# left (suffix _x) is current step, right (suffix _y) is next step
allsteps = dfs.merge(dfs, left_on=['organization_id', 'next_step']
                    , right_on=['organization_id', 'organization_entry_number'], how='left')


# Count the transitions current step (sankey_x) -> next step (sankey_y).
# Rows without a successor have NaN in sankey_y and are dropped by groupby.
sankey_df = (
    allsteps
    .groupby(['sankey_x', 'sankey_y'])[['organization_id']]
    .count()
    .reset_index()
    .rename(columns={'sankey_x': 'source', 'sankey_y': 'target', 'organization_id': 'value'})
)


#----------- from here onwards there's network specific stuff going on

# edges, directional
edges_dir = sankey_df

# create edges without direction: sorting the two endpoints gives A->B and
# B->A the same key, so their counts can be summed below
edges = [
    sorted([source, target]) + [count]
    for source, target, count in zip(edges_dir['source'], edges_dir['target'], edges_dir['value'])
]

# Columns are labelled 0 and 1 (the two endpoints) and 2 (the count); naming
# them explicitly keeps the groupby working even if there are no edges at all.
edges_df = pd.DataFrame(edges, columns=[0, 1, 2])
# sum the counts of both directions (the original referenced an undefined
# `sorted_edge_df` here, which raises NameError on a fresh kernel)
edges_df = edges_df.groupby([0, 1]).sum().reset_index()

edges_df.shape



(42, 3)

In [0]:
# export csv
# NOTE(review): this writes sankey_df (the directed source/target/value counts),
# not the undirected edges_df computed in the previous cell -- confirm which of
# the two 'network_data.csv' is supposed to contain.
sankey_df.to_csv('network_data.csv', index=False, sep=';')


# Playing around with network graphics

Code here is copied from nestauk/im_tutorials/NetworkScience.ipynb.


In [22]:
# Enable IPython's autoreload extension (mode 2: reload imported modules before
# each cell execution). It was already loaded in the first cell of the notebook,
# which is why running this prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2
# install im_tutorial package
!pip install git+https://github.com/nestauk/im_tutorials.git
# Extra packages for the network section.
# NOTE(review): all installs are unpinned, so the versions depend on the day
# the notebook is run.
!pip install pybind11
!pip install cpalgorithm
!pip install networkx
!pip install seaborn
# Needed to make some of the drag and dropping work
!pip install -U bokeh

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Collecting git+https://github.com/nestauk/im_tutorials.git
  Cloning https://github.com/nestauk/im_tutorials.git to /tmp/pip-req-build-ex78lub9
  Running command git clone -q https://github.com/nestauk/im_tutorials.git /tmp/pip-req-build-ex78lub9
Building wheels for collected packages: im-tutorials
  Building wheel for im-tutorials (setup.py) ... [?25l[?25hdone
  Created wheel for im-tutorials: filename=im_tutorials-0.1.1-cp36-none-any.whl size=15103 sha256=74ffac399bc3af6f1fb95314a5f5a25217b6b1a2261499b8c4ceaf8be6577b00
  Stored in directory: /tmp/pip-ephem-wheel-cache-mjjcn44v/wheels/47/a3/cb/bdc5f9ba49bcfd2c6864b166a1566eb2f104113bf0c3500330
Successfully built im-tutorials
Collecting pybind11
[?25l  Downloading https://files.pythonhosted.org/packages/4b/4d/ae1c4d8e8b139afa9682054dd42df3b0e3b5c1731287933021b9fd7e9cc4/pybind11-2.4.3-py2.py3-none-any.whl (150kB)
[K     |█████████████████████████

In [0]:
# importing useful Python utility libraries we'll need
from collections import Counter, defaultdict
import itertools

# matplotlib for static plots
import matplotlib.pyplot as plt
from matplotlib import pylab

# makes the plots prettier
import seaborn

# numpy for mathematical functions
import numpy as np

# pandas for handling tabular data
import pandas as pd

# networkx for the analysis of graphs
import networkx as nx

#from im_tutorials.utilities import chunks
from im_tutorials.data import *

# Useful data structure to count occurances
from collections import Counter

#Iteration lib
import itertools as it