<a href="https://colab.research.google.com/github/ffg-kom/mapping_evolution/blob/master/getting_data_ready.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

The following code was copied from https://colab.research.google.com/github/nestauk/im_tutorials/blob/master/notebooks/01_intro_to_pandas_tutorial.ipynb.

In [78]:
%load_ext autoreload
%autoreload 2

# install im_tutorial package
!pip install git+https://github.com/nestauk/im_tutorials.git

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Collecting git+https://github.com/nestauk/im_tutorials.git
  Cloning https://github.com/nestauk/im_tutorials.git to /tmp/pip-req-build-4yhmgxtr
  Running command git clone -q https://github.com/nestauk/im_tutorials.git /tmp/pip-req-build-4yhmgxtr
Building wheels for collected packages: im-tutorials
  Building wheel for im-tutorials (setup.py) ... [?25l[?25hdone
  Created wheel for im-tutorials: filename=im_tutorials-0.1.1-cp36-none-any.whl size=15103 sha256=1336cd3978348dc636e3e55bef7d8a960ddc9f158d39b345b688200958a1f5be
  Stored in directory: /tmp/pip-ephem-wheel-cache-sss3t1a9/wheels/47/a3/cb/bdc5f9ba49bcfd2c6864b166a1566eb2f104113bf0c3500330
Successfully built im-tutorials


In [0]:
# numpy for mathematical functions
import numpy as np
# pandas for handling tabular data
import pandas as pd
# explained later
from im_tutorials.data.cordis import h2020_projects

# for plotting
import matplotlib.pyplot as plt

# data
from im_tutorials.data.cordis import cordis_table

# Loading data

In [106]:
# load tables
cordis_o_df = cordis_table('organisations')
cordis_p_df = cordis_table('projects')
cordis_p_o_df = cordis_table('project_organisations')

# merge them
cordis_project_orgs_df = cordis_p_o_df.merge(
  cordis_o_df, left_on='organization_id', right_on='id', how='left'
)
cordis_project_orgs_df = cordis_project_orgs_df.merge(
  cordis_p_df, left_on='project_rcn', right_on='rcn', how='left'
)

cordis_project_orgs_df.rename( columns={'contribution': 'ec_contribution_organization'
                                        , 'ec_contribution': 'ec_contribution_project'
                                        , 'total_cost': 'total_cost_project'
                                       }, inplace=True )

cordis_project_orgs_df.head(2)

Unnamed: 0,project_rcn,organization_id,activity_type,address,ec_contribution_organization,type,website_x,id,name,country_code,country_name,rcn,acronym,end_date_code,ec_contribution_project,framework,funding_scheme,funded_under,objective,project_description,start_date_code,status,title,total_cost_project,website_y
0,85231,999635926,Other,"{'city': 'TORINO', 'street': 'CORSO DUCA DEGLI...",306448,participant,www.corep.it,999635926,CONSORZIO PER LA RICERCA E L EDUCAZIONE PERMAN...,IT,Italy,85231,EuroTraining,2010-10-31,1398009,FP7,CSA - Coordination and support action,"[{'rcn': '853', 'title': 'Specific Programme ""...",The objective of the EuroTraining proposal is ...,\nNext-Generation Nanoelectronics Components a...,2007-11-01,CLOSED,Provision of a European training infrastructure,1398009,http://www.eurotraining.net/
1,85231,999665802,Private for-profit entities (excluding Higher ...,"{'city': 'HOLTE', 'street': 'FREDERIKSLUNDSVEJ...",396542,coordinator,,999665802,TECHNOCONSULT APS,DK,Denmark,85231,EuroTraining,2010-10-31,1398009,FP7,CSA - Coordination and support action,"[{'rcn': '853', 'title': 'Specific Programme ""...",The objective of the EuroTraining proposal is ...,\nNext-Generation Nanoelectronics Components a...,2007-11-01,CLOSED,Provision of a European training infrastructure,1398009,http://www.eurotraining.net/


In [107]:
# add number of partners per project as a column
count_partners_df = cordis_project_orgs_df[['project_rcn', 'organization_id']].groupby( by='project_rcn' ).count()
count_partners_df.rename( columns={'organization_id': 'number_of_partners'}, inplace=True )

cordis_project_orgs_df = cordis_project_orgs_df.merge(
  count_partners_df, on='project_rcn', how='left'
)

# display header
cordis_project_orgs_df.head(2)


Unnamed: 0,project_rcn,organization_id,activity_type,address,ec_contribution_organization,type,website_x,id,name,country_code,country_name,rcn,acronym,end_date_code,ec_contribution_project,framework,funding_scheme,funded_under,objective,project_description,start_date_code,status,title,total_cost_project,website_y,number_of_partners
0,85231,999635926,Other,"{'city': 'TORINO', 'street': 'CORSO DUCA DEGLI...",306448,participant,www.corep.it,999635926,CONSORZIO PER LA RICERCA E L EDUCAZIONE PERMAN...,IT,Italy,85231,EuroTraining,2010-10-31,1398009,FP7,CSA - Coordination and support action,"[{'rcn': '853', 'title': 'Specific Programme ""...",The objective of the EuroTraining proposal is ...,\nNext-Generation Nanoelectronics Components a...,2007-11-01,CLOSED,Provision of a European training infrastructure,1398009,http://www.eurotraining.net/,5
1,85231,999665802,Private for-profit entities (excluding Higher ...,"{'city': 'HOLTE', 'street': 'FREDERIKSLUNDSVEJ...",396542,coordinator,,999665802,TECHNOCONSULT APS,DK,Denmark,85231,EuroTraining,2010-10-31,1398009,FP7,CSA - Coordination and support action,"[{'rcn': '853', 'title': 'Specific Programme ""...",The objective of the EuroTraining proposal is ...,\nNext-Generation Nanoelectronics Components a...,2007-11-01,CLOSED,Provision of a European training infrastructure,1398009,http://www.eurotraining.net/,5


In [0]:
# narrow our dataset down to what we want to work with
subset = cordis_project_orgs_df[
    (cordis_project_orgs_df.activity_type == 'Private for-profit entities (excluding Higher or Secondary Education Establishments)')
  & (cordis_project_orgs_df.framework == 'H2020')
]

df = subset[['organization_id', 'start_date_code', 'funding_scheme', 'number_of_partners', 'ec_contribution_organization', 'ec_contribution_project', 'total_cost_project']]


# Playing with the data

In [9]:
cordis_project_orgs_df.columns

Index(['project_rcn', 'organization_id', 'activity_type', 'address',
       'ec_contribution_organization', 'type', 'website_x', 'id', 'name',
       'country_code', 'country_name', 'rcn', 'acronym', 'end_date_code',
       'ec_contribution_project', 'framework', 'funding_scheme',
       'funded_under', 'objective', 'project_description', 'start_date_code',
       'status', 'title', 'total_cost_project', 'website_y',
       'number_of_partners'],
      dtype='object')

In [0]:
cordis_project_orgs_df['activity_type'].value_counts()

Higher or Secondary Education Establishments                                                         88335
Private for-profit entities (excluding Higher or Secondary Education Establishments)                 78810
Research Organisations                                                                               55099
Other                                                                                                15406
Public bodies (excluding Research Organisations and Secondary or Higher Education Establishments)    12454
                                                                                                       936
Name: activity_type, dtype: int64

In [36]:
subset = cordis_project_orgs_df[
    (cordis_project_orgs_df.activity_type == 'Private for-profit entities (excluding Higher or Secondary Education Establishments)')
  & (cordis_project_orgs_df.framework == 'H2020')
]

# within subset, how often do organisations appear
subset['organization_id'].value_counts().head()



999993856    148
999960488     84
999909854     79
999951467     76
999908787     73
Name: organization_id, dtype: int64

In [0]:
df = subset[['organization_id', 'start_date_code', 'funding_scheme', 'number_of_partners', 'ec_contribution_organization', 'ec_contribution_project', 'total_cost_project']].head()

In [0]:
for i in range(100):
  print(cordis_project_orgs_df.loc[i + 10000].funded_under[0]['title'])



# Preparing data for a Sankey diagram




In [0]:
# setting a new column this way will throw a SettingWithCopyWarning - but it seems to work just fine
df = df.sort_values( by=['organization_id', 'start_date_code'] ).reset_index()

# adding a column for entry_number - not necessary anymore, will be cleanly done in the next step
#df['organization_entry_number'] = df.index + 1





In [0]:
# add a column that counts entries per organisation
df['organization_entry_number'] = df.groupby('organization_id').cumcount()

df[['organization_id', 'start_date_code', 'organization_entry_number']]

In [172]:
# looking at first two entries of organizations
first = df[df['organization_entry_number'] == 0]
second = df[df['organization_entry_number'] == 1]

onestep = second.merge(first, on='organization_id', how='left')


# aggregate over funding schemes
sankey_df = onestep[['funding_scheme_y', 'funding_scheme_x', 'organization_id']].groupby(['funding_scheme_y', 'funding_scheme_x']).count()

# prepare for proper output
sankey_df.reset_index(level=['funding_scheme_y', 'funding_scheme_x'], inplace=True)
sankey_df.rename(columns={'funding_scheme_y':'funding_scheme_from', 'funding_scheme_x':'funding_scheme_to'}, inplace=True)

sankey_df


Unnamed: 0,funding_scheme_from,funding_scheme_to,organization_id
0,BBI-CSA - Bio-based Industries Coordination an...,BBI-IA-DEMO - Bio-based Industries Innovation ...,1
1,BBI-CSA - Bio-based Industries Coordination an...,IA - Innovation action,1
2,BBI-IA-DEMO - Bio-based Industries Innovation ...,BBI-IA-DEMO - Bio-based Industries Innovation ...,5
3,BBI-IA-DEMO - Bio-based Industries Innovation ...,BBI-RIA - Bio-based Industries Research and In...,6
4,BBI-IA-DEMO - Bio-based Industries Innovation ...,IA - Innovation action,5
5,BBI-IA-DEMO - Bio-based Industries Innovation ...,MSCA-ITN-ETN - European Training Networks,1
6,BBI-IA-DEMO - Bio-based Industries Innovation ...,MSCA-RISE - Marie Skłodowska-Curie Research an...,1
7,BBI-IA-DEMO - Bio-based Industries Innovation ...,RIA - Research and Innovation action,7
8,BBI-IA-DEMO - Bio-based Industries Innovation ...,SME-1 - SME instrument phase 1,1
9,BBI-IA-FLAG - Bio-based Industries Innovation ...,BBI-IA-DEMO - Bio-based Industries Innovation ...,4


In [174]:
# export csv

path = 'C:/Users/KOM/Documents/HackSTIR/mapping_evolution'
sankey_df.to_csv(r'C:/Users/KOM/Documents/HackSTIR/mapping_evolution/sankey_onestep_data.csv')


FileNotFoundError: ignored