# Import modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
import seaborn as sns
from im_tutorials.data.cordis import cordis_table

# Import Data

In [2]:
# Load the three CORDIS tables used in this notebook.
cordis_orgs_df = cordis_table('organisations')
cordis_project_orgs_df = cordis_table('project_organisations')
cordis_projects_df = cordis_table('projects')

# Start from the project-organisation link table and left-join the project
# attributes (on project_rcn == rcn) and then the organisation attributes
# (on organization_id == id). Left joins keep every link row even when the
# project or organisation lookup finds no match.
cordis_full_df = (
    cordis_project_orgs_df
    .merge(cordis_projects_df, left_on='project_rcn', right_on='rcn', how='left')
    .merge(cordis_orgs_df, left_on='organization_id', right_on='id', how='left')
)

# Only the last expression of a cell is rendered, so show the column list
# (the previous, discarded `.head()` call produced no output and was removed).
cordis_full_df.columns

Index(['project_rcn', 'organization_id', 'activity_type', 'address',
       'contribution', 'type', 'website_x', 'rcn', 'acronym', 'end_date_code',
       'ec_contribution', 'framework', 'funding_scheme', 'funded_under',
       'objective', 'project_description', 'start_date_code', 'status',
       'title', 'total_cost', 'website_y', 'id', 'name', 'country_code',
       'country_name'],
      dtype='object')

# Add labels to DataFrame

In [3]:
# Order rows by organisation and then by project start date, and renumber
# the index from zero.
cordis_full_df_sorted = (
    cordis_full_df
    .sort_values(by=['organization_id', 'start_date_code'])
    .reset_index(drop=True)
)

# num_success: 0-based position of each row within its organisation's
# sorted rows (0 for the organisation's first row).
cordis_full_df_sorted['num_success'] = (
    cordis_full_df_sorted.groupby('organization_id').cumcount()
)

# multiple_success: the largest num_success reached by each organisation,
# i.e. its number of rows minus one.
cordis_full_df_sorted_temp = (
    cordis_full_df_sorted[['organization_id', 'num_success']]
    .groupby(by=['organization_id'])
    .max()
    .rename(columns={'num_success': 'multiple_success'})
)

# Broadcast the per-organisation maximum back onto every row.
cordis_full_df_sorted = cordis_full_df_sorted.merge(
    cordis_full_df_sorted_temp, on='organization_id', how='left'
)

# group_multiple_success: 1 when the organisation has more than one row, else 0.
cordis_full_df_sorted['group_multiple_success'] = (
    cordis_full_df_sorted['multiple_success'].gt(0) * 1
)
cordis_full_df_sorted.columns

Index(['project_rcn', 'organization_id', 'activity_type', 'address',
       'contribution', 'type', 'website_x', 'rcn', 'acronym', 'end_date_code',
       'ec_contribution', 'framework', 'funding_scheme', 'funded_under',
       'objective', 'project_description', 'start_date_code', 'status',
       'title', 'total_cost', 'website_y', 'id', 'name', 'country_code',
       'country_name', 'num_success', 'multiple_success',
       'group_multiple_success'],
      dtype='object')

In [4]:
# Display the labelled frame; the rendering is truncated to the first and
# last rows and a subset of columns.
cordis_full_df_sorted

Unnamed: 0,project_rcn,organization_id,activity_type,address,contribution,type,website_x,rcn,acronym,end_date_code,...,title,total_cost,website_y,id,name,country_code,country_name,num_success,multiple_success,group_multiple_success
0,101114,-99679701,Private for-profit entities (excluding Higher ...,"{'city': 'FARNBOROUGH', 'street': 'WARWICK HOU...",439152,participant,www.baesystems.com,101114,ACTUATION2015,2016-04-30,...,ACTUATION 2015: Modular Electro Mechanical Act...,34373653,http://www.actuation2015.eu/,-99679701,BAE SYSTEMS (OPERATIONS) LIMITED,GB,United Kingdom,0,0,0
1,206608,-99552481,Other,"{'city': 'Hanoi', 'street': 'Dai Co Viet 1', '...",98582,participant,http://www1.hut.edu.vn/en,206608,SEAGAL,2010-11-14,...,South East Asia centre on European GNSS for in...,548229,,-99552481,Hanoi University of Technology,VN,Vietnam,0,0,0
2,189108,-99525116,,"{'city': 'ATHENS', 'street': 'KIFISSIAS 99', '...",0,participant,,189108,VIMSEN,2017-01-31,...,VIMSEN: Virtual Microgrids for Smart Energy Ne...,3331399,,-99525116,COSMOTE KINITES TILEPIKOINONIES AE,GR,Greece,0,0,0
3,90329,-99524889,,"{'city': 'COPENHAGEN', 'street': 'H.C. Anderse...",0,participant,,90329,SETATWORK,2010-08-31,...,Sustainable energy technology at work: themati...,1207388,,-99524889,CONFEDERATION OF DANISH INDUSTRIES,DK,Denmark,0,0,0
4,206615,-99449792,Other,"{'city': 'Zurich-Airport', 'street': 'P.O.Box ...",83920,participant,www.rega.ch,206615,HEDGE,2011-05-14,...,Helicopters Deploy GNSS in Europe,1192235,,-99449792,Swiss Air Ambulance LTD,CH,Switzerland,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251035,98892,999997930,Research Organisations,"{'city': 'PARIS', 'street': 'RUE MICHEL ANGE 3...",250659,coordinator,www.cnrs.fr,98892,Bird Vocal Network,NaT,...,Individual signatures in a vocal network: the ...,250659,,999997930,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE CNRS,FR,France,2695,2699,1
251036,107868,999997930,Research Organisations,"{'city': 'PARIS', 'street': 'RUE MICHEL ANGE 3...",280254,coordinator,www.cnrs.fr,107868,gas trapping in ice,NaT,...,Experimental study of gas strapping in amorpho...,280254,,999997930,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE CNRS,FR,France,2696,2699,1
251037,109814,999997930,Research Organisations,"{'city': 'PARIS', 'street': 'RUE MICHEL ANGE 3...",261385,coordinator,www.cnrs.fr,109814,NIEDPWIT,NaT,...,New Insights in Eating Disorders Paving the Wa...,261385,,999997930,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE CNRS,FR,France,2697,2699,1
251038,210724,999997930,Research Organisations,"{'city': 'PARIS', 'street': 'RUE MICHEL ANGE 3...",75000,coordinator,www.cnrs.fr,210724,SRBMETABONOXSTRESS,NaT,...,Metabonomic Analysis of the Sulfate-Reducing B...,75000,,999997930,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE CNRS,FR,France,2698,2699,1


In [16]:
def fundedUnder2Title(x):
    """Return the title of the first entry of a ``funded_under`` cell.

    Parameters
    ----------
    x : sequence of mappings, or a missing value
        Expected to be a non-empty sequence whose first item is a mapping
        with a ``'title'`` key.

    Returns
    -------
    The value stored under ``'title'`` in the first entry, or ``None`` when
    the cell is missing (``None``/``NaN``), empty, or its first entry has no
    ``'title'`` key.
    """
    try:
        return x[0]['title']
    except (TypeError, IndexError, KeyError):
        # TypeError: cell is not subscriptable (None, float NaN) or the first
        #            item is not a mapping.
        # IndexError: empty sequence.
        # KeyError: first entry lacks a 'title' key.
        return None
# Derive a 'funded_under_title' column by applying fundedUnder2Title to
# every 'funded_under' cell, then display the resulting frame.
cordis_full_df_sorted = cordis_full_df_sorted.assign(
    funded_under_title=cordis_full_df_sorted['funded_under'].apply(fundedUnder2Title)
)
cordis_full_df_sorted

Unnamed: 0,project_rcn,organization_id,activity_type,address,contribution,type,website_x,rcn,acronym,end_date_code,...,total_cost,website_y,id,name,country_code,country_name,num_success,multiple_success,group_multiple_success,funded_under_title
0,101114,-99679701,Private for-profit entities (excluding Higher ...,"{'city': 'FARNBOROUGH', 'street': 'WARWICK HOU...",439152,participant,www.baesystems.com,101114,ACTUATION2015,2016-04-30,...,34373653,http://www.actuation2015.eu/,-99679701,BAE SYSTEMS (OPERATIONS) LIMITED,GB,United Kingdom,0,0,0,"Specific Programme ""Cooperation"": Transport (i..."
1,206608,-99552481,Other,"{'city': 'Hanoi', 'street': 'Dai Co Viet 1', '...",98582,participant,http://www1.hut.edu.vn/en,206608,SEAGAL,2010-11-14,...,548229,,-99552481,Hanoi University of Technology,VN,Vietnam,0,0,0,"Specific Programme ""Cooperation"": Transport (i..."
2,189108,-99525116,,"{'city': 'ATHENS', 'street': 'KIFISSIAS 99', '...",0,participant,,189108,VIMSEN,2017-01-31,...,3331399,,-99525116,COSMOTE KINITES TILEPIKOINONIES AE,GR,Greece,0,0,0,"Specific Programme ""Cooperation"": Information ..."
3,90329,-99524889,,"{'city': 'COPENHAGEN', 'street': 'H.C. Anderse...",0,participant,,90329,SETATWORK,2010-08-31,...,1207388,,-99524889,CONFEDERATION OF DANISH INDUSTRIES,DK,Denmark,0,0,0,"Specific Programme ""Cooperation"": Energy"
4,206615,-99449792,Other,"{'city': 'Zurich-Airport', 'street': 'P.O.Box ...",83920,participant,www.rega.ch,206615,HEDGE,2011-05-14,...,1192235,,-99449792,Swiss Air Ambulance LTD,CH,Switzerland,0,0,0,"Specific Programme ""Cooperation"": Transport (i..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251035,98892,999997930,Research Organisations,"{'city': 'PARIS', 'street': 'RUE MICHEL ANGE 3...",250659,coordinator,www.cnrs.fr,98892,Bird Vocal Network,NaT,...,250659,,999997930,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE CNRS,FR,France,2695,2699,1,"Specific programme ""People"" implementing the S..."
251036,107868,999997930,Research Organisations,"{'city': 'PARIS', 'street': 'RUE MICHEL ANGE 3...",280254,coordinator,www.cnrs.fr,107868,gas trapping in ice,NaT,...,280254,,999997930,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE CNRS,FR,France,2696,2699,1,"Specific programme ""People"" implementing the S..."
251037,109814,999997930,Research Organisations,"{'city': 'PARIS', 'street': 'RUE MICHEL ANGE 3...",261385,coordinator,www.cnrs.fr,109814,NIEDPWIT,NaT,...,261385,,999997930,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE CNRS,FR,France,2697,2699,1,"Specific programme ""People"" implementing the S..."
251038,210724,999997930,Research Organisations,"{'city': 'PARIS', 'street': 'RUE MICHEL ANGE 3...",75000,coordinator,www.cnrs.fr,210724,SRBMETABONOXSTRESS,NaT,...,75000,,999997930,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE CNRS,FR,France,2698,2699,1,"Specific programme ""People"" implementing the S..."


# Prepare data set 1

In [None]:
# Keep only the rows with num_success == 0 (each organisation's first row
# in the sorted frame).
cordis_full_df_filtered_num_success = cordis_full_df_sorted.loc[
    cordis_full_df_sorted['num_success'].eq(0)
]

# Restrict to the three monetary columns plus the binary label.
cordis_full_df_group_multiple_success = cordis_full_df_filtered_num_success.loc[
    :, ['contribution', 'ec_contribution', 'total_cost', 'group_multiple_success']
]

In [None]:
# Z-score every column (subtract the column mean, divide by the column
# standard deviation), then put the untouched 0/1 label back in place of its
# standardised version.
# NOTE(review): the mean/std are computed over all rows, before the
# train/test split performed further down.
normalized_df_group_multiple_success = (
    cordis_full_df_group_multiple_success
    .sub(cordis_full_df_group_multiple_success.mean())
    .div(cordis_full_df_group_multiple_success.std())
    .assign(
        group_multiple_success=cordis_full_df_group_multiple_success['group_multiple_success']
    )
)

In [None]:
# Inspect twenty rows by position (rows 10000 through 10019).
normalized_df_group_multiple_success.iloc[10000:10020]

In [None]:
# Pairwise scatter plots of the standardised columns, coloured by the
# binary label, with histograms on the diagonal.
sns.pairplot(
    data=normalized_df_group_multiple_success,
    hue='group_multiple_success',
    diag_kind='hist',
)

# Logistic regression + SVM 1

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as ss

In [None]:
# Features: the three standardised monetary columns.
X = normalized_df_group_multiple_success[['contribution', 'ec_contribution', 'total_cost']]
# Target: the binary group_multiple_success label.
y = normalized_df_group_multiple_success['group_multiple_success']

# Shuffled 80/20 train/test split with a fixed seed so the split is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, shuffle=True, random_state=42,
)

In [None]:
# Unpenalised logistic-regression baseline fitted on the training split.
# The string penalty='none' is rejected by current scikit-learn releases;
# an infinite C (inverse regularisation strength) yields the same
# unregularised model across versions.
lr_model = LogisticRegression(solver='lbfgs', C=np.inf)
lr_model.fit(X_train, y_train)

In [None]:
# One-column summary of the fitted model: a row per feature coefficient,
# followed by the intercept and the train/test accuracy.
(
    pd.DataFrame(lr_model.coef_, columns=X.columns, index=['Results'])
    .assign(
        intercept=lr_model.intercept_,
        train_accuracy=lr_model.score(X_train, y_train),
        test_accuracy=lr_model.score(X_test, y_test),
    )
    .transpose()
)

In [None]:
# Count how many training rows fall in each label class.
y_train.value_counts()

In [None]:
# Cross-validation settings reused by the following cells: accuracy scoring,
# 5 folds, all available cores, no progress output.
cv_kwargs = {'scoring': 'accuracy', 'cv': 5, 'n_jobs': -1, 'verbose': 0}

# Cross-validate the logistic regression on the training split and report
# the mean score with a two-standard-deviation spread.
scores = cross_val_score(lr_model, X_train, y_train, **cv_kwargs)
print(f"Average cross val score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

In [None]:
# Hyper-parameter search for the logistic regression.
param_grid = [
    # Regularised models fitted with liblinear (supports both l1 and l2).
    {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear'],
        'max_iter': [1000],
        'intercept_scaling': [0.00010, 0.1, 0.5, 1, 5, 10],
    },
    # Unpenalised baseline fitted with lbfgs. The string penalty 'none' is
    # rejected by current scikit-learn releases, so regularisation is
    # switched off through an infinite C instead.
    {'solver': ['lbfgs'], 'C': [np.inf]},
]

# Exhaustive search using the shared cross-validation settings.
grid = GridSearchCV(lr_model, param_grid, **cv_kwargs)
grid.fit(X_train, y_train)

print(f"Best parameters: {grid.best_params_}, with score {grid.best_score_:.3f}")

In [None]:
# RBF-kernel support vector classifier.
svm_model = SVC(
    C=1,
    kernel='rbf',
    gamma='auto',
)
# Mean cross-validated score under the shared cross-validation settings.
cross_val_score(svm_model, X_train, y_train, **cv_kwargs).mean()

In [None]:
# Score the SVM again with F1 over 5 folds; the arguments are given
# explicitly here rather than through cv_kwargs.
scores = cross_val_score(estimator=svm_model, X=X_train, y=y_train, scoring='f1', cv=5)
print(f"Average cross val score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

In [None]:
# Switch the shared cross-validation settings to F1 scoring. This mutates
# cv_kwargs in place, so any cell run after this one that expands
# **cv_kwargs (including earlier cells if re-run) will score with F1.
cv_kwargs.update(scoring='f1')