# Import modules

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
import seaborn as sns
from im_tutorials.data.cordis import cordis_table

# Import Data

In [4]:
# Load the three CORDIS tables that are joined below.
cordis_orgs_df = cordis_table('organisations')
cordis_project_orgs_df = cordis_table('project_organisations')
cordis_projects_df = cordis_table('projects')

# Left-join project attributes and organisation attributes onto every
# project_organisations row.
cordis_full_df = (
    cordis_project_orgs_df
    .merge(cordis_projects_df, how='left', left_on='project_rcn', right_on='rcn')
    .merge(cordis_orgs_df, how='left', left_on='organization_id', right_on='id')
)

# Keep only the H2020 rows whose activity type is "private for-profit".
is_private_for_profit = (
    cordis_full_df['activity_type']
    == 'Private for-profit entities (excluding Higher or Secondary Education Establishments)'
)
is_h2020 = cordis_full_df['framework'] == 'H2020'
cordis_full_df = cordis_full_df[is_private_for_profit & is_h2020]

# Only the last expression of a cell is rendered, so the head() call is
# evaluated but not shown.
cordis_full_df.head()
cordis_full_df.columns

ConnectionError: HTTPSConnectionPool(host='s3.us-east-2.amazonaws.com', port=443): Max retries exceeded with url: /innovation-mapping-tutorials/cordis/mysql/cordis_organisations.pkl.bz2 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f80a27f9358>: Failed to establish a new connection: [Errno -2] Name or service not known',))

# Add labels and features

In [None]:
# Add num_success, multiple_success and group_multiple_success.

# Order each organisation's rows by start_date_code and renumber the index.
cordis_full_df_sorted = (
    cordis_full_df
    .sort_values(by=['organization_id', 'start_date_code'])
    .reset_index(drop=True)
)

# num_success: 0-based position of the row within its organisation
# (0 for the organisation's first row in the sort order above).
cordis_full_df_sorted['num_success'] = cordis_full_df_sorted.groupby('organization_id').cumcount()

# multiple_success: the largest num_success reached by the organisation,
# i.e. its number of rows minus one.
cordis_full_df_sorted_temp = (
    cordis_full_df_sorted
    .groupby(by=['organization_id'])[['num_success']]
    .max()
    .rename(columns={'num_success': 'multiple_success'})
)
cordis_full_df_sorted = cordis_full_df_sorted.merge(
    cordis_full_df_sorted_temp, how='left', on='organization_id'
)

# group_multiple_success: 1 when the organisation has more than one row, else 0.
cordis_full_df_sorted['group_multiple_success'] = cordis_full_df_sorted['multiple_success'].gt(0) * 1
cordis_full_df_sorted.columns

In [None]:
# Class balance of the label. The counts are per row of cordis_full_df_sorted
# (one row per merged project_organisations record), not per organisation.
cordis_full_df_sorted['group_multiple_success'].value_counts()

In [None]:
# Add funded_under_title
def fundedUnder2Title(x):
    """Return the ``'title'`` of the first entry of ``x``.

    Parameters
    ----------
    x : sequence of mappings
        Expected to look like ``[{'title': ..., ...}, ...]``.

    Returns
    -------
    The value stored under ``'title'`` in ``x[0]``, or ``None`` when ``x`` is
    empty, is not subscriptable (e.g. ``None`` or a float NaN for a missing
    cell), or its first entry has no ``'title'`` key.
    """
    try:
        return x[0]['title']
    except (TypeError, IndexError, KeyError):
        # Missing or malformed entry: report "no title" instead of aborting
        # the whole column-wise apply on a single bad row.
        return None
# Element-wise: take the title of the first 'funded_under' entry of each row.
cordis_full_df_sorted['funded_under_title'] = (
    cordis_full_df_sorted['funded_under'].map(fundedUnder2Title)
)

In [None]:
# Add funding_scheme_mean_ec_contribution: mean ec_contribution over all rows
# that share the row's funding_scheme, broadcast back onto every row.
# The string alias 'mean' is used (matching the 'count' alias used for
# num_of_partners) instead of passing the NumPy callable.
cordis_full_df_sorted['funding_scheme_mean_ec_contribution'] = (
    cordis_full_df_sorted.groupby(['funding_scheme'])['ec_contribution'].transform('mean')
)

In [None]:
# Add importance: ratio of the row's 'contribution' to its 'ec_contribution'.
# A zero 'ec_contribution' yields inf (or NaN for 0/0) rather than an error.
# NOTE(review): presumably participant share of the project's EC funding — confirm
# the meaning of the two columns against the CORDIS schema.
cordis_full_df_sorted['importance'] = cordis_full_df_sorted['contribution'].div(
    cordis_full_df_sorted['ec_contribution']
)

In [None]:
# Add num_of_partners: number of non-null organization_id values among the rows
# of this frame that share the row's project_rcn. Only rows present in
# cordis_full_df_sorted are counted.
organisations_by_project = cordis_full_df_sorted.groupby(['project_rcn'])['organization_id']
cordis_full_df_sorted['num_of_partners'] = organisations_by_project.transform('count')

In [None]:
# Add country_total_ec_contribution: sum of ec_contribution over all rows that
# share the row's country_code, broadcast back onto every row.
# The string alias 'sum' is used instead of passing the NumPy callable.
cordis_full_df_sorted['country_total_ec_contribution'] = (
    cordis_full_df_sorted.groupby(['country_code'])['ec_contribution'].transform('sum')
)

In [None]:
# Add country_mean_ec_contribution: mean of ec_contribution over all rows that
# share the row's country_code, broadcast back onto every row.
# The string alias 'mean' is used instead of passing the NumPy callable.
cordis_full_df_sorted['country_mean_ec_contribution'] = (
    cordis_full_df_sorted.groupby(['country_code'])['ec_contribution'].transform('mean')
)

In [None]:
# One-hot encode 'status' and append the indicator columns. The new column
# names are the distinct status values found in the data (expected here:
# "CLOSED", "ONGOING", "SIGNED", "TERMINATED").
# Re-running this cell appends the same columns a second time.
status_dummies = pd.get_dummies(cordis_full_df_sorted['status'])
cordis_full_df_sorted = pd.concat([cordis_full_df_sorted, status_dummies], axis=1)

In [None]:
# Ideas for further features: funded_under_mean_ec_contribution, partner_multiple_success

In [None]:
# Preview the first rows only; rendering the whole frame bloats the notebook.
cordis_full_df_sorted.head()

# Prepare data set

In [None]:
# Continuous inputs (these are the columns that get standardised later).
input_var_cont = [
    'contribution',
    'ec_contribution',
    'total_cost',
    'funding_scheme_mean_ec_contribution',
    'importance',
    'num_of_partners',
    'country_total_ec_contribution',
    'country_mean_ec_contribution',
]

# Indicator columns produced by one-hot encoding 'status'.
input_var_disc = ["CLOSED", "ONGOING", "SIGNED", "TERMINATED"]

# Binary target.
output_var = ['group_multiple_success']

# Every column kept in the modelling frame, in input-then-target order.
all_var = [*input_var_cont, *input_var_disc, *output_var]

In [None]:
# Keep one row per organisation: the row with num_success == 0, i.e. the first
# one in the per-organisation ordering used when num_success was computed.
is_first_row_of_org = cordis_full_df_sorted['num_success'] == 0
cordis_full_df_filtered_num_success = cordis_full_df_sorted.loc[is_first_row_of_org]

# Restrict to the modelling columns (inputs followed by the target).
cordis_full_df_group_multiple_success = cordis_full_df_filtered_num_success.loc[:, all_var]

In [None]:
# Preview of the z-scored continuous inputs: (value - column mean) / column std.
(
    cordis_full_df_group_multiple_success[input_var_cont]
    .sub(cordis_full_df_group_multiple_success[input_var_cont].mean())
    .div(cordis_full_df_group_multiple_success[input_var_cont].std())
)

In [None]:
# Build the standardised modelling frame.
# Start from a copy of the un-normalised frame so that the frame exists before
# columns are assigned into it (assigning into an undefined name raises
# NameError) and so that the status indicators and the
# 'group_multiple_success' label are carried over unchanged.
normalized_df_group_multiple_success = cordis_full_df_group_multiple_success.copy()

# z-score the continuous inputs only: (value - column mean) / column std.
continuous_inputs = cordis_full_df_group_multiple_success[input_var_cont]
normalized_df_group_multiple_success[input_var_cont] = (
    (continuous_inputs - continuous_inputs.mean()) / continuous_inputs.std()
)

In [None]:
# Spot-check 20 rows by position (rows 10000..10019); empty if the frame is shorter.
normalized_df_group_multiple_success.iloc[10000:10020]

In [None]:
# Pairwise scatter plots of the modelling frame, coloured by the binary label,
# with histograms on the diagonal.
sns.pairplot(
    normalized_df_group_multiple_success,
    diag_kind='hist',
    hue='group_multiple_success',
)

# Logistic regression + SVM 1

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as ss
from sklearn.metrics import classification_report

In [None]:
# Features: the standardised continuous inputs (the same columns, in the same
# order, as input_var_cont — reused here so the two lists cannot drift apart).
# The status indicator columns are not used as features in this section.
X = normalized_df_group_multiple_success[input_var_cont]
# Target: the binary label.
y = normalized_df_group_multiple_success['group_multiple_success']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,  # 80% of the rows for training, 20% held out for testing
    shuffle=True,
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
# Baseline: logistic regression without regularisation, fitted on the training split.
# NOTE(review): newer scikit-learn releases expect penalty=None instead of the
# string 'none' — confirm against the installed version before changing it.
lr_params = {'solver': 'lbfgs', 'penalty': 'none'}
lr_model = LogisticRegression(**lr_params)
lr_model.fit(X_train, y_train)

In [None]:
# One-column summary of the fitted baseline: a coefficient per feature, then the
# intercept and the accuracy on the training and test splits.
lr_summary = pd.DataFrame(lr_model.coef_, columns=X.columns, index=['Results']).assign(
    intercept=lr_model.intercept_,
    train_accuracy=lr_model.score(X_train, y_train),  # Train score
    test_accuracy=lr_model.score(X_test, y_test),  # Test score
)
lr_summary.T

In [None]:
# Class balance of the training labels; useful context for reading the
# accuracy figures, since accuracy is the scoring metric used below.
y_train.value_counts()

In [None]:
# Shared cross-validation settings, reused by the searches further down:
# 5 folds, accuracy scoring, all available cores, no progress output.
cv_kwargs = {'scoring': 'accuracy', 'cv': 5, 'n_jobs': -1, 'verbose': 0}

# Cross-validated accuracy of the baseline model on the training split.
scores = cross_val_score(lr_model, X_train, y_train, **cv_kwargs)
print(f"Average cross val score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

In [None]:
# Sub-grid 1: regularised models (L1 / L2) with the liblinear solver.
liblinear_grid = {
    'solver': ['liblinear'],
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10, 100],
    'intercept_scaling': [0.00010, 0.1, 0.5, 1, 5, 10],
    'max_iter': [1000],
}
# Sub-grid 2: the unpenalised lbfgs model, so the baseline competes in the search.
unpenalised_grid = {'solver': ['lbfgs'], 'penalty': ['none']}
param_grid = [liblinear_grid, unpenalised_grid]

# Exhaustive search over both sub-grids using the shared CV settings.
grid = GridSearchCV(lr_model, param_grid, **cv_kwargs)
grid.fit(X_train, y_train)

print(f"Best parameters: {grid.best_params_}, with score {grid.best_score_:.3f}")

In [None]:
# Per-class precision / recall / F1 of the best grid-search model on the test
# split, shown as a table rounded to three decimals.
test_report = classification_report(y_test, grid.predict(X_test), output_dict=True)
pd.DataFrame(test_report).T.round(3)

In [None]:
# RBF-kernel SVM with fixed hyper-parameters; report its mean cross-validated
# score under the shared CV settings.
svm_model = SVC(kernel='rbf', C=1, gamma='auto')
svm_cv_scores = cross_val_score(svm_model, X_train, y_train, **cv_kwargs)
svm_cv_scores.mean()

In [None]:
# Disabled experiment: randomized hyper-parameter search for the SVM. The code is
# wrapped in a string literal, so nothing in it runs; the cell only evaluates
# (and displays) the string.
# If re-enabled, note that it mutates the shared cv_kwargs dict (scoring becomes
# 'f1') and rebinds `grid`, replacing the logistic-regression search result.
'''
cv_kwargs['scoring'] = 'f1'
param_grid = {'C': ss.expon(scale=5),
              'gamma': ss.expon(scale=5),
              'kernel': ['rbf', 'poly'],
              'degree': [2, 3, 4]}
grid = RandomizedSearchCV(svm_model, param_grid, n_iter=20, random_state=0, **cv_kwargs)

grid.fit(X_train, y_train)

print(f"Best parameters: {grid.best_params_}, with score {grid.best_score_:.3f}")
'''

In [None]:
# Disabled cell: test-set classification report for `grid`. The code is wrapped
# in a string literal, so it does not run. If re-enabled, it reports on
# whichever search was most recently assigned to `grid`.
'''
(pd.DataFrame(
    classification_report(y_test, grid.predict(X_test), output_dict=True)
 ).T.round(3)
)
'''