In [None]:
# Importing relevant libraries

import pandas as pd
import numpy as np
import datetime as dt
import pickle

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score,\
precision_recall_curve, f1_score, fbeta_score,\
accuracy_score, confusion_matrix, roc_auc_score, roc_curve, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from sklearn import preprocessing
from sklearn.utils import resample, shuffle
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.neural_network import MLPClassifier



import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Loading datasets as pickle

#with open("data/clean_eu.pickle", 'rb') as eu:
#    eu_df = pickle.load(eu)
    
#with open("data/clean_us.pickle", 'rb') as us:
#    us_df = pickle.load(us)

In [None]:
# Loading datasets as csv

eu_df = pd.read_csv("data/clean_eu.csv")
us_df = pd.read_csv("data/clean_us.csv")

# "status" is dropped because the boolean "label" column took over as the
# target that indicates success; the two funding-date columns are dropped too.
# The columns are passed via `columns=` rather than as `drop(labels, 1)`:
# the positional axis argument was deprecated in pandas 1.3 and is rejected
# by pandas 2.0.
UNUSED_COLUMNS = ["status", "first_funding_at", "last_funding_at"]

eu_df = eu_df.drop(columns=UNUSED_COLUMNS)
# Whatever column is first after the drop above is removed as well
# (described by the original author as a stray "Unnamed: 0" column).
eu_df = eu_df.drop(columns=eu_df.columns[0])

us_df = us_df.drop(columns=UNUSED_COLUMNS)
us_df = us_df.drop(columns=us_df.columns[0])

In [None]:
# Summary of the EU frame (columns, dtypes, non-null counts) after the column drops
eu_df.info()

In [None]:
# Summary of the US frame (columns, dtypes, non-null counts) after the column drops
us_df.info()

In [None]:
# Class balance of the target column in each full dataset: EU first, then US.
# `target_count` is rebound on every pass, so it ends up holding the US counts.
for dataset_position, dataset in enumerate((eu_df, us_df)):
    target_count = dataset['label'].value_counts()

    # Separator between the two reports (not printed before the first one)
    if dataset_position > 0:
        print("\n")

    negatives, positives = target_count[0], target_count[1]
    print(f'Class 0: {negatives}')
    print(f'Class 1: {positives}')
    print(f'Proportion: {round(negatives / positives, 2)} : 1')

In [None]:
# Stratified, shuffled 75 % / 25 % train/test split, done separately per region.
# Only train and test sets are produced here; no validation set is carved out.
split_options = dict(test_size=0.25, random_state=42, shuffle=True)

# EU: target is the "label" column, everything else is a feature
y_eu = eu_df['label']
X_eu = eu_df.drop(columns=['label'])
X_train_eu, X_test_eu, y_train_eu, y_test_eu = train_test_split(
    X_eu, y_eu, stratify=y_eu, **split_options)

# US: same procedure
y_us = us_df['label']
X_us = us_df.drop(columns=['label'])
X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(
    X_us, y_us, stratify=y_us, **split_options)

In [None]:
# Report the shape of each full feature frame and of its train/test parts.
shape_report = [
    ("X_eu shape:", X_eu),
    ("X_train_eu shape:", X_train_eu),
    ("X_test_eu shape:", X_test_eu),
    ("\nX_us shape:", X_us),
    ("X_train_us shape:", X_train_us),
    ("X_test_us shape:", X_test_us),
]
for caption, frame in shape_report:
    print(caption, frame.shape)

In [None]:
# DISABLED CELL: everything below is one bare string literal, so none of the
# code inside it runs. It holds a manual random-oversampling approach
# (sklearn.utils.resample on the label==0 rows, sized to the label==1 count).
# NOTE(review): presumably superseded by the ADASYN cell further down - confirm
# before deleting.
'''# Upsampling minority class

# Separate majority and minority classes
X_train_eu['label'] = y_train_eu
X_train_us['label'] = y_train_us

#EU
eu_minority = X_train_eu[X_train_eu.label==0]
eu_majority = X_train_eu[X_train_eu.label==1]

# n is the number of majority class (label = 1)
n = X_train_eu.label.value_counts()[1]

# Upsample minority class
eu_minority_upsampled = resample(eu_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=n,    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
eu_upsampled = pd.concat([eu_majority, eu_minority_upsampled])
eu_upsampled = shuffle(eu_upsampled)


#US
us_minority = X_train_us[X_train_us.label==0]
us_majority = X_train_us[X_train_us.label==1]

# n is the number of majority class (label = 1)
n = X_train_us.label.value_counts()[1]

# Upsample minority class
us_minority_upsampled = resample(us_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=n,    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
us_upsampled = pd.concat([us_majority, us_minority_upsampled])
us_upsampled = shuffle(us_upsampled)'''

In [None]:
# DISABLED CELL: a bare string literal, so nothing in it runs. The code inside
# would split `eu_upsampled` / `us_upsampled` back into features and target;
# those frames are only created by the (also disabled) manual upsampling cell.
'''# EU seperating X y
X_train_eu = eu_upsampled.drop(columns=['label'])
y_train_eu = eu_upsampled['label']

# US seperating X y
X_train_us = us_upsampled.drop(columns=['label'])
y_train_us = us_upsampled['label']'''

In [None]:
# Class balance of the training targets after the train/test split.
# The `_upsampled` suffix is kept because later cells reuse these names; when
# the notebook is run top to bottom, no resampling has happened at this point.
target_count_upsampled = y_train_eu.value_counts()
target_count_upsampled_us = y_train_us.value_counts()

# EU report first, then US, separated by a blank block
for report_position, class_counts in enumerate(
        (target_count_upsampled, target_count_upsampled_us)):
    if report_position > 0:
        print("\n")

    print(f'Class 0: {class_counts[0]}')
    print(f'Class 1: {class_counts[1]}')
    print(f'Proportion: {round(class_counts[0] / class_counts[1], 2)} : 1')

    # Share of whichever class is larger. Always using class 0 here would
    # mislabel the figure whenever class 0 is the minority.
    majority_share = class_counts.max() / class_counts.sum()
    print('Percentage of Majority Class: {:f}'.format(
        round(majority_share, 4) * 100))

In [None]:
# (rows, columns) of the US training features
X_train_us.shape

In [None]:
# Separating the 4 feature groups per region: three text/categorical columns
# plus the remaining columns, which are treated as the numeric block.
eu_text_columns = ["industry", "country_code", "city"]
us_text_columns = ["industry", "state_code", "region"]

# EU - train
X_train_industry_eu = X_train_eu.industry
X_train_country_eu = X_train_eu.country_code
X_train_city_eu = X_train_eu.city
X_train_eu_nums = X_train_eu.drop(columns=eu_text_columns)

# EU - test (every piece must come from the TEST frame so the rows line up)
X_test_industry_eu = X_test_eu.industry
X_test_country_eu = X_test_eu.country_code
X_test_city_eu = X_test_eu.city
X_test_eu_nums = X_test_eu.drop(columns=eu_text_columns)

# US - train
X_train_industry_us = X_train_us.industry
X_train_state_us = X_train_us.state_code
X_train_region_us = X_train_us.region
X_train_us_nums = X_train_us.drop(columns=us_text_columns)

# US - test (every piece must come from the TEST frame so the rows line up)
X_test_industry_us = X_test_us.industry
X_test_state_us = X_test_us.state_code
X_test_region_us = X_test_us.region
X_test_us_nums = X_test_us.drop(columns=us_text_columns)

In [None]:
# Feature encoding (EU and US)

def _fit_bag_of_words(train_frame, test_frame, column, min_df):
    """Fit a CountVectorizer on one training column and encode train and test.

    Parameters
    ----------
    train_frame, test_frame : DataFrame
        Train and test feature frames that both contain `column`.
    column : str
        Name of the text/categorical column to encode.
    min_df : int
        Passed to CountVectorizer as `min_df`.

    Returns
    -------
    (vectorizer, train_encoded, test_encoded)
        The fitted vectorizer and the transformed train and test columns.

    Notes
    -----
    The training column is converted to `str` in place, as the notebook did
    before. The test column is converted on the fly, leaving the test frame
    unmodified, so a missing value in the test split is encoded like any
    other string instead of making `transform` raise.
    """
    train_frame[column] = train_frame[column].astype(str)

    vectorizer = CountVectorizer(min_df=min_df)
    vectorizer.fit(train_frame[column])

    train_encoded = vectorizer.transform(train_frame[column])
    test_encoded = vectorizer.transform(test_frame[column].astype(str))
    return vectorizer, train_encoded, test_encoded


# NOTE(review): with its default settings CountVectorizer splits text into word
# tokens, so a multi-word city/region name presumably becomes several features
# rather than one category - confirm this is intended.

# EU: industry (text), country and city (categorical)
vectorizer_industry, X_train_industry_eu, X_test_industry_eu = _fit_bag_of_words(
    X_train_eu, X_test_eu, "industry", min_df=5)
vectorizer_country, X_train_country_eu, X_test_country_eu = _fit_bag_of_words(
    X_train_eu, X_test_eu, "country_code", min_df=1)
vectorizer_city, X_train_city_eu, X_test_city_eu = _fit_bag_of_words(
    X_train_eu, X_test_eu, "city", min_df=1)

# US: industry (text), state and region (categorical)
vectorizer_industry_us, X_train_industry_us, X_test_industry_us = _fit_bag_of_words(
    X_train_us, X_test_us, "industry", min_df=5)
vectorizer_state, X_train_state_us, X_test_state_us = _fit_bag_of_words(
    X_train_us, X_test_us, "state_code", min_df=1)
vectorizer_region, X_train_region_us, X_test_region_us = _fit_bag_of_words(
    X_train_us, X_test_us, "region", min_df=1)

In [None]:
# Convert the encoded industry matrices to dense arrays for inspection.
# NOTE(review): with Jupyter's default settings only the last expression of a
# cell is displayed, so the EU array is presumably built and then discarded -
# confirm whether both were meant to be shown.
X_train_industry_eu.toarray()
X_train_industry_us.toarray()

In [None]:
# Standardise the numeric block. Each region gets its own scaler, fitted on
# that region's training rows only and then applied to both train and test.

# EU
scaler_eu = preprocessing.StandardScaler().fit(X_train_eu_nums)
X_train_eu_nums_scaled = scaler_eu.transform(X_train_eu_nums)
X_test_eu_nums_scaled = scaler_eu.transform(X_test_eu_nums)

# US
scaler_us = preprocessing.StandardScaler().fit(X_train_us_nums)
X_train_us_nums_scaled = scaler_us.transform(X_train_us_nums)
X_test_us_nums_scaled = scaler_us.transform(X_test_us_nums)

In [None]:
# (rows, encoded city features) of the EU training city matrix
X_train_city_eu.shape

In [None]:
# Assemble one input matrix X per split for the classifiers: the scaled
# numeric block followed by the encoded categorical/text blocks, stacked
# column-wise. scipy.sparse.hstack is used because the encoded blocks are
# sparse matrices (they are only densified via .toarray() for inspection).

# EU: numeric | country | city | industry
eu_train_blocks = [X_train_eu_nums_scaled, X_train_country_eu,
                   X_train_city_eu, X_train_industry_eu]
eu_test_blocks = [X_test_eu_nums_scaled, X_test_country_eu,
                  X_test_city_eu, X_test_industry_eu]
X_train_eu_full = hstack(eu_train_blocks)
X_test_eu_full = hstack(eu_test_blocks)

# US: numeric | state | region | industry
us_train_blocks = [X_train_us_nums_scaled, X_train_state_us,
                   X_train_region_us, X_train_industry_us]
us_test_blocks = [X_test_us_nums_scaled, X_test_state_us,
                  X_test_region_us, X_test_industry_us]
X_train_us_full = hstack(us_train_blocks)
X_test_us_full = hstack(us_test_blocks)

In [None]:
# Oversampling the training data with ADASYN (the test matrices are untouched).
#
# NOTE(review): the grid-search cells further down cross-validate on these
# already-resampled matrices, so synthetic samples can end up in validation
# folds next to the real points they were generated from. Consider resampling
# inside the CV loop instead (e.g. an imblearn Pipeline).
# NOTE(review): the inputs are overwritten, so re-running this cell resamples
# data that has already been resampled.
ada = ADASYN(random_state=42)

eu_resampled = ada.fit_resample(X_train_eu_full, y_train_eu)
us_resampled = ada.fit_resample(X_train_us_full, y_train_us)

X_train_eu_full, y_train_eu = eu_resampled
X_train_us_full, y_train_us = us_resampled

# SMOTE sampler kept around for experiments. To try it, call
# smote.fit_resample(...) on the same inputs in place of the ADASYN calls above.
smote = SMOTE(random_state=42)

In [None]:
# Class balance of the training targets after oversampling (EU, then US)
target_count_upsampled = y_train_eu.value_counts()
target_count_upsampled_us = y_train_us.value_counts()

for report_position, class_counts in enumerate(
        (target_count_upsampled, target_count_upsampled_us)):
    # Separator between the two reports (not printed before the first one)
    if report_position > 0:
        print("\n")

    print(f'Class 0: {class_counts[0]}')
    print(f'Class 1: {class_counts[1]}')
    print(f'Proportion: {round(class_counts[0] / class_counts[1], 2)} : 1')

    # Share of whichever class is larger. Always using class 0 here would
    # mislabel the figure whenever class 0 is the smaller class.
    majority_share = class_counts.max() / class_counts.sum()
    print('Percentage of Majority Class: {:f}'.format(
        round(majority_share, 4) * 100))

# Classifiers

In [None]:
# Remove warnings in output
# NOTE(review): this mutes every warning for the rest of the session,
# including scikit-learn's convergence and failed-fit warnings.
import warnings
import sklearn
warnings.filterwarnings('ignore')

# GridSearch, Cross-validation and Model Tuning to pick best model parameters

# Stratified 5-fold splits, materialised as lists of (train_idx, test_idx)
# pairs. A bare `.split(...)` generator can only be consumed once, so it would
# be empty for any second use.
cv_eu = list(StratifiedKFold(n_splits=5).split(X_train_eu_full, y_train_eu))
cv_us = list(StratifiedKFold(n_splits=5).split(X_train_us_full, y_train_us))

# Only penalty/solver pairs that LogisticRegression can actually fit are
# searched:
#   * lbfgs     -> l2 or no penalty
#   * liblinear -> l1 or l2
# 'elasticnet' is left out: it requires solver='saga' plus an l1_ratio, neither
# of which this search provides, so it could only ever produce failed fits.
# "No penalty" is spelled None from scikit-learn 1.2 on and 'none' before that;
# C has no effect without a penalty, so that candidate is fitted once.
_sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
no_penalty = None if _sklearn_major_minor >= (1, 2) else 'none'
lr_C_values = [0.001, 0.01, 0.1, 1, 10, 100]


# EU

# LogReg
lr_eu_grid = LogisticRegression()

grid_lr_eu_values = [
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': lr_C_values},
    {'solver': ['lbfgs'], 'penalty': [no_penalty]},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': lr_C_values},
]

grid_lr_eu = GridSearchCV(estimator=lr_eu_grid,
                          param_grid=grid_lr_eu_values,
                          scoring='recall',
                          cv=cv_eu)

grid_lr_eu.fit(X_train_eu_full, y_train_eu)

print("EU LogReg best parameters:", grid_lr_eu.best_params_)

# Threshold metrics on the held-out test set, using the refitted best model
y_pred_lr_eu_grid = grid_lr_eu.predict(X_test_eu_full)

acc_lr_eu_grid = accuracy_score(y_test_eu, y_pred_lr_eu_grid)
f1_lr_eu_grid = f1_score(y_test_eu, y_pred_lr_eu_grid)
fb_lr_eu_grid = fbeta_score(y_test_eu, y_pred_lr_eu_grid, beta=3)

print("Logistic Regression EU Accuracy: ", acc_lr_eu_grid)
print("Logistic Regression EU f1 Score: ", f1_lr_eu_grid)
print("Logistic Regression EU f-beta Score: ", fb_lr_eu_grid)

# Positive-class probabilities, computed once and shared by the ROC curve
# and the area under it (AUC)
proba_lr_eu_grid = grid_lr_eu.predict_proba(X_test_eu_full)[:, 1]

fpr_lr_eu_grid, tpr_lr_eu_grid, thresholds_lr_eu_grid = roc_curve(
    y_test_eu, proba_lr_eu_grid)
auc_lr_eu_grid = roc_auc_score(y_test_eu, proba_lr_eu_grid)


# US

# LogReg
lr_us_grid = LogisticRegression()

grid_lr_us_values = [
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': lr_C_values},
    {'solver': ['lbfgs'], 'penalty': [no_penalty]},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': lr_C_values},
]

grid_lr_us = GridSearchCV(estimator=lr_us_grid,
                          param_grid=grid_lr_us_values,
                          scoring='recall',
                          cv=cv_us)

grid_lr_us.fit(X_train_us_full, y_train_us)

print("US LogReg best parameters:", grid_lr_us.best_params_)

y_pred_lr_us_grid = grid_lr_us.predict(X_test_us_full)

acc_lr_us_grid = accuracy_score(y_test_us, y_pred_lr_us_grid)
f1_lr_us_grid = f1_score(y_test_us, y_pred_lr_us_grid)
fb_lr_us_grid = fbeta_score(y_test_us, y_pred_lr_us_grid, beta=3)

print("Logistic Regression US Accuracy: ", acc_lr_us_grid)
print("Logistic Regression US f1 Score: ", f1_lr_us_grid)
print("Logistic Regression US f-beta Score: ", fb_lr_us_grid)

# Positive-class probabilities, computed once for both ROC and AUC
proba_lr_us_grid = grid_lr_us.predict_proba(X_test_us_full)[:, 1]

fpr_lr_us_grid, tpr_lr_us_grid, thresholds_lr_us_grid = roc_curve(
    y_test_us, proba_lr_us_grid)
auc_lr_us_grid = roc_auc_score(y_test_us, proba_lr_us_grid)

In [None]:
# Grid search: SVM (C and gamma, scored on recall with 5-fold CV)

# ---------- EU ----------
svm_eu_grid = svm.SVC(probability=True)
grid_svm_eu_values = dict(C=[0.1, 1, 10, 100, 1000],
                          gamma=[1, 0.1, 0.01, 0.001, 0.0001])

grid_svm_eu = GridSearchCV(svm_eu_grid, grid_svm_eu_values,
                           scoring='recall', cv=5, verbose=1, n_jobs=-1)
grid_svm_eu.fit(X_train_eu_full, y_train_eu)

print("EU SVM best parameters:", grid_svm_eu.best_params_)

# Test-set predictions from the refitted best estimator
y_pred_svm_eu_grid = grid_svm_eu.predict(X_test_eu_full)

acc_svm_eu_grid = accuracy_score(y_test_eu, y_pred_svm_eu_grid)
f1_svm_eu_grid = f1_score(y_test_eu, y_pred_svm_eu_grid)
fb_svm_eu_grid = fbeta_score(y_test_eu, y_pred_svm_eu_grid, beta=3)

print("SVM EU Accuracy: ", acc_svm_eu_grid)
print("SVM EU f1 Score: ", f1_svm_eu_grid)
print("SVM EU f-beta Score: ", fb_svm_eu_grid)

# ROC curve and AUC share one set of positive-class probabilities
proba_svm_eu_grid = grid_svm_eu.predict_proba(X_test_eu_full)[:, 1]
fpr_svm_eu_grid, tpr_svm_eu_grid, thresholds_svm_eu_grid = roc_curve(
    y_test_eu, proba_svm_eu_grid)
auc_svm_eu_grid = roc_auc_score(y_test_eu, proba_svm_eu_grid)


# ---------- US ----------
svm_us_grid = svm.SVC(probability=True)
grid_svm_us_values = dict(C=[0.1, 1, 10, 100, 1000],
                          gamma=[1, 0.1, 0.01, 0.001, 0.0001])

grid_svm_us = GridSearchCV(svm_us_grid, grid_svm_us_values,
                           scoring='recall', cv=5, verbose=1, n_jobs=-1)
grid_svm_us.fit(X_train_us_full, y_train_us)

print("US SVM best parameters:", grid_svm_us.best_params_)

y_pred_svm_us_grid = grid_svm_us.predict(X_test_us_full)

acc_svm_us_grid = accuracy_score(y_test_us, y_pred_svm_us_grid)
f1_svm_us_grid = f1_score(y_test_us, y_pred_svm_us_grid)
fb_svm_us_grid = fbeta_score(y_test_us, y_pred_svm_us_grid, beta=3)

print("SVM US Accuracy: ", acc_svm_us_grid)
print("SVM US f1 Score: ", f1_svm_us_grid)
print("SVM US f-beta Score: ", fb_svm_us_grid)

# ROC curve and AUC share one set of positive-class probabilities
proba_svm_us_grid = grid_svm_us.predict_proba(X_test_us_full)[:, 1]
fpr_svm_us_grid, tpr_svm_us_grid, thresholds_svm_us_grid = roc_curve(
    y_test_us, proba_svm_us_grid)
auc_svm_us_grid = roc_auc_score(y_test_us, proba_svm_us_grid)

In [None]:
#Gridsearch Random Forest

# 'auto' is not searched for max_features: for a classifier it means the same
# as 'sqrt' (so it only duplicated those fits), and scikit-learn 1.3 removed it.
# The base estimators are seeded so the search is repeatable; 42 matches the
# seed used by the fixed-parameter Random Forest cell.

# EU

rf_eu_grid = RandomForestClassifier(random_state=42)

grid_rf_eu_values = {'n_estimators': [200, 500],
                     'max_features': ['sqrt', 'log2'],
                     'max_depth': [4, 5, 6, 7, 8],
                     'criterion': ['gini', 'entropy']}

grid_rf_eu = GridSearchCV(estimator=rf_eu_grid,
                          param_grid=grid_rf_eu_values,
                          scoring='recall',
                          cv=5, verbose=1, n_jobs=-1)

grid_rf_eu.fit(X_train_eu_full, y_train_eu)

print("EU Random Forest best parameters:", grid_rf_eu.best_params_)

# Threshold metrics on the held-out test set, using the refitted best model
y_pred_rf_eu_grid = grid_rf_eu.predict(X_test_eu_full)

acc_rf_eu_grid = accuracy_score(y_test_eu, y_pred_rf_eu_grid)
f1_rf_eu_grid = f1_score(y_test_eu, y_pred_rf_eu_grid)
fb_rf_eu_grid = fbeta_score(y_test_eu, y_pred_rf_eu_grid, beta=3)

print("Random Forest EU Accuracy: ", acc_rf_eu_grid)
print("Random Forest EU f1 Score: ", f1_rf_eu_grid)
print("Random Forest EU f-beta Score: ", fb_rf_eu_grid)

# Positive-class probabilities, computed once for both the ROC curve and AUC
proba_rf_eu_grid = grid_rf_eu.predict_proba(X_test_eu_full)[:, 1]

fpr_rf_eu_grid, tpr_rf_eu_grid, thresholds_rf_eu_grid = roc_curve(
    y_test_eu, proba_rf_eu_grid)
auc_rf_eu_grid = roc_auc_score(y_test_eu, proba_rf_eu_grid)


# US

rf_us_grid = RandomForestClassifier(random_state=42)

grid_rf_us_values = {'n_estimators': [200, 500],
                     'max_features': ['sqrt', 'log2'],
                     'max_depth': [4, 5, 6, 7, 8],
                     'criterion': ['gini', 'entropy']}

grid_rf_us = GridSearchCV(estimator=rf_us_grid,
                          param_grid=grid_rf_us_values,
                          scoring='recall',
                          cv=5, verbose=1, n_jobs=-1)

grid_rf_us.fit(X_train_us_full, y_train_us)

print("US Random Forest best parameters:", grid_rf_us.best_params_)

y_pred_rf_us_grid = grid_rf_us.predict(X_test_us_full)

acc_rf_us_grid = accuracy_score(y_test_us, y_pred_rf_us_grid)
f1_rf_us_grid = f1_score(y_test_us, y_pred_rf_us_grid)
fb_rf_us_grid = fbeta_score(y_test_us, y_pred_rf_us_grid, beta=3)

print("Random Forest US Accuracy: ", acc_rf_us_grid)
print("Random Forest US f1 Score: ", f1_rf_us_grid)
print("Random Forest US f-beta Score: ", fb_rf_us_grid)

# Positive-class probabilities, computed once for both the ROC curve and AUC
proba_rf_us_grid = grid_rf_us.predict_proba(X_test_us_full)[:, 1]

fpr_rf_us_grid, tpr_rf_us_grid, thresholds_rf_us_grid = roc_curve(
    y_test_us, proba_rf_us_grid)
auc_rf_us_grid = roc_auc_score(y_test_us, proba_rf_us_grid)

In [None]:
# Grid search: XGBoost (scored on recall with 5-fold CV)

# ---------- EU ----------
xgb_eu_grid = XGBClassifier()
grid_xgb_eu_values = dict(
    subsample=[0.5, 0.75, 1],
    colsample_bytree=[0.5, 0.75, 1],
    max_depth=[2, 6, 12],
    min_child_weight=[1, 5, 15],
    learning_rate=[0.3, 0.1, 0.03],
    n_estimators=[50, 100, 150, 200],
)

grid_xgb_eu = GridSearchCV(xgb_eu_grid, grid_xgb_eu_values,
                           scoring='recall', cv=5, verbose=1, n_jobs=-1)
grid_xgb_eu.fit(X_train_eu_full, y_train_eu)

print("EU XGBoost best parameters:", grid_xgb_eu.best_params_)

# Test-set predictions from the refitted best estimator
y_pred_xgb_eu_grid = grid_xgb_eu.predict(X_test_eu_full)

acc_xgb_eu_grid = accuracy_score(y_test_eu, y_pred_xgb_eu_grid)
f1_xgb_eu_grid = f1_score(y_test_eu, y_pred_xgb_eu_grid)
fb_xgb_eu_grid = fbeta_score(y_test_eu, y_pred_xgb_eu_grid, beta=3)

print("XGB EU Accuracy: ", acc_xgb_eu_grid)
print("XGB EU f1 Score: ", f1_xgb_eu_grid)
print("XGB EU f-beta Score: ", fb_xgb_eu_grid)

# ROC curve and AUC share one set of positive-class probabilities
proba_xgb_eu_grid = grid_xgb_eu.predict_proba(X_test_eu_full)[:, 1]
fpr_xgb_eu_grid, tpr_xgb_eu_grid, thresholds_xgb_eu_grid = roc_curve(
    y_test_eu, proba_xgb_eu_grid)
auc_xgb_eu_grid = roc_auc_score(y_test_eu, proba_xgb_eu_grid)


# ---------- US ----------
xgb_us_grid = XGBClassifier()
grid_xgb_us_values = dict(
    subsample=[0.5, 0.75, 1],
    colsample_bytree=[0.5, 0.75, 1],
    max_depth=[2, 6, 12],
    min_child_weight=[1, 5, 15],
    learning_rate=[0.3, 0.1, 0.03],
    n_estimators=[50, 100, 150, 200],
)

grid_xgb_us = GridSearchCV(xgb_us_grid, grid_xgb_us_values,
                           scoring='recall', cv=5, verbose=1, n_jobs=-1)
grid_xgb_us.fit(X_train_us_full, y_train_us)

print("US XGBoost best parameters:", grid_xgb_us.best_params_)

y_pred_xgb_us_grid = grid_xgb_us.predict(X_test_us_full)

acc_xgb_us_grid = accuracy_score(y_test_us, y_pred_xgb_us_grid)
f1_xgb_us_grid = f1_score(y_test_us, y_pred_xgb_us_grid)
fb_xgb_us_grid = fbeta_score(y_test_us, y_pred_xgb_us_grid, beta=3)

print("XGB US Accuracy: ", acc_xgb_us_grid)
print("XGB US f1 Score: ", f1_xgb_us_grid)
print("XGB US f-beta Score: ", fb_xgb_us_grid)

# ROC curve and AUC share one set of positive-class probabilities
proba_xgb_us_grid = grid_xgb_us.predict_proba(X_test_us_full)[:, 1]
fpr_xgb_us_grid, tpr_xgb_us_grid, thresholds_xgb_us_grid = roc_curve(
    y_test_us, proba_xgb_us_grid)
auc_xgb_us_grid = roc_auc_score(y_test_us, proba_xgb_us_grid)

In [None]:
#Gridsearch Multilayer Perceptron

# The networks are seeded so the search is repeatable.
# The grid is split by solver: MLPClassifier only uses `learning_rate` when
# solver='sgd', so varying it under 'adam' just fitted every candidate twice.
# For 'adam' it is pinned to 'constant' so best_params_ keeps the same keys.
mlp_layer_options = [(50, 50, 50), (50, 100, 50), (100,)]
mlp_activation_options = ['relu', 'tanh', 'logistic']
mlp_alpha_options = [0.0001, 0.05]

# EU

mlp_eu_grid = MLPClassifier(random_state=42)

grid_mlp_eu_values = [
    {'solver': ['sgd'],
     'hidden_layer_sizes': mlp_layer_options,
     'activation': mlp_activation_options,
     'alpha': mlp_alpha_options,
     'learning_rate': ['constant', 'adaptive']},
    {'solver': ['adam'],
     'hidden_layer_sizes': mlp_layer_options,
     'activation': mlp_activation_options,
     'alpha': mlp_alpha_options,
     'learning_rate': ['constant']},
]

grid_mlp_eu = GridSearchCV(estimator=mlp_eu_grid,
                           param_grid=grid_mlp_eu_values,
                           scoring='recall',
                           cv=5, verbose=1, n_jobs=-1)

grid_mlp_eu.fit(X_train_eu_full, y_train_eu)

print("EU Multilayer Perceptron best parameters:", grid_mlp_eu.best_params_)

# Threshold metrics on the held-out test set, using the refitted best model
y_pred_mlp_eu_grid = grid_mlp_eu.predict(X_test_eu_full)

acc_mlp_eu_grid = accuracy_score(y_test_eu, y_pred_mlp_eu_grid)
f1_mlp_eu_grid = f1_score(y_test_eu, y_pred_mlp_eu_grid)
fb_mlp_eu_grid = fbeta_score(y_test_eu, y_pred_mlp_eu_grid, beta=3)

print("MLP EU Accuracy: ", acc_mlp_eu_grid)
print("MLP EU f1 Score: ", f1_mlp_eu_grid)
print("MLP EU f-beta Score: ", fb_mlp_eu_grid)

# Positive-class probabilities, computed once for both the ROC curve and AUC
proba_mlp_eu_grid = grid_mlp_eu.predict_proba(X_test_eu_full)[:, 1]

fpr_mlp_eu_grid, tpr_mlp_eu_grid, thresholds_mlp_eu_grid = roc_curve(
    y_test_eu, proba_mlp_eu_grid)
auc_mlp_eu_grid = roc_auc_score(y_test_eu, proba_mlp_eu_grid)


# US

mlp_us_grid = MLPClassifier(random_state=42)

grid_mlp_us_values = [
    {'solver': ['sgd'],
     'hidden_layer_sizes': mlp_layer_options,
     'activation': mlp_activation_options,
     'alpha': mlp_alpha_options,
     'learning_rate': ['constant', 'adaptive']},
    {'solver': ['adam'],
     'hidden_layer_sizes': mlp_layer_options,
     'activation': mlp_activation_options,
     'alpha': mlp_alpha_options,
     'learning_rate': ['constant']},
]

grid_mlp_us = GridSearchCV(estimator=mlp_us_grid,
                           param_grid=grid_mlp_us_values,
                           scoring='recall',
                           cv=5, verbose=1, n_jobs=-1)

grid_mlp_us.fit(X_train_us_full, y_train_us)

print("US Multilayer Perceptron best parameters:", grid_mlp_us.best_params_)

y_pred_mlp_us_grid = grid_mlp_us.predict(X_test_us_full)

acc_mlp_us_grid = accuracy_score(y_test_us, y_pred_mlp_us_grid)
f1_mlp_us_grid = f1_score(y_test_us, y_pred_mlp_us_grid)
fb_mlp_us_grid = fbeta_score(y_test_us, y_pred_mlp_us_grid, beta=3)

print("MLP US Accuracy: ", acc_mlp_us_grid)
print("MLP US f1 Score: ", f1_mlp_us_grid)
print("MLP US f-beta Score: ", fb_mlp_us_grid)

# Positive-class probabilities, computed once for both the ROC curve and AUC
proba_mlp_us_grid = grid_mlp_us.predict_proba(X_test_us_full)[:, 1]

fpr_mlp_us_grid, tpr_mlp_us_grid, thresholds_mlp_us_grid = roc_curve(
    y_test_us, proba_mlp_us_grid)
auc_mlp_us_grid = roc_auc_score(y_test_us, proba_mlp_us_grid)

In [None]:
# Logistic Regression with fixed hyperparameters (L2 penalty, C=0.01, lbfgs)

# ---------- EU ----------
# Build and train in one step
lr_eu = LogisticRegression(solver='lbfgs', penalty="l2", C=0.01).fit(
    X_train_eu_full, y_train_eu)

# Accuracy, f1 and f-beta (beta=3) from the hard test-set predictions
y_pred_lr_eu = lr_eu.predict(X_test_eu_full)
acc_lr_eu = accuracy_score(y_test_eu, y_pred_lr_eu)
f1_lr_eu = f1_score(y_test_eu, y_pred_lr_eu)
fb_lr_eu = fbeta_score(y_test_eu, y_pred_lr_eu, beta=3)

print("Logistic Regression EU Accuracy: ", acc_lr_eu)
print("Logistic Regression EU f1 Score: ", f1_lr_eu)
print("Logistic Regression EU f-beta Score: ", fb_lr_eu)

# ROC curve and AUC from the positive-class probabilities
proba_lr_eu = lr_eu.predict_proba(X_test_eu_full)[:, 1]
fpr_lr_eu, tpr_lr_eu, thresholds_lr_eu = roc_curve(y_test_eu, proba_lr_eu)
auc_lr_eu = roc_auc_score(y_test_eu, proba_lr_eu)


# ---------- US ----------
lr_us = LogisticRegression(solver='lbfgs', penalty='l2', C=0.01).fit(
    X_train_us_full, y_train_us)

y_pred_lr_us = lr_us.predict(X_test_us_full)
acc_lr_us = accuracy_score(y_test_us, y_pred_lr_us)
f1_lr_us = f1_score(y_test_us, y_pred_lr_us)
fb_lr_us = fbeta_score(y_test_us, y_pred_lr_us, beta=3)

print("\nLogistic Regression US Accuracy: ", acc_lr_us)
print("Logistic Regression US f1 Score: ", f1_lr_us)
print("Logistic Regression US f-beta Score: ", fb_lr_us)

# ROC curve and AUC from the positive-class probabilities
proba_lr_us = lr_us.predict_proba(X_test_us_full)[:, 1]
fpr_lr_us, tpr_lr_us, thresholds_lr_us = roc_curve(y_test_us, proba_lr_us)
auc_lr_us = roc_auc_score(y_test_us, proba_lr_us)

In [None]:
# Support Vector Machine (SVM) with fixed hyperparameters
#
# NOTE(review): the SVM grid search in this notebook leaves `kernel` at its
# default and tunes C and gamma, whereas this model uses kernel="linear".
# gamma presumably has no effect with a linear kernel - confirm that these
# settings are the intended ones.
svm_settings = dict(C=0.1, gamma=1, kernel="linear", probability=True)

# ---------- EU ----------
svm_eu = svm.SVC(**svm_settings).fit(X_train_eu_full, y_train_eu)

# Accuracy, f1 and f-beta (beta=3) from the hard test-set predictions
y_pred_svm_eu = svm_eu.predict(X_test_eu_full)
acc_svm_eu = accuracy_score(y_test_eu, y_pred_svm_eu)
f1_svm_eu = f1_score(y_test_eu, y_pred_svm_eu)
fb_svm_eu = fbeta_score(y_test_eu, y_pred_svm_eu, beta=3)

print("SVM EU Accuracy: ", acc_svm_eu)
print("SVM EU f1 Score: ", f1_svm_eu)
print("SVM EU f-beta Score: ", fb_svm_eu)

# ROC curve and AUC from the positive-class probabilities
proba_svm_eu = svm_eu.predict_proba(X_test_eu_full)[:, 1]
fpr_svm_eu, tpr_svm_eu, thresholds_svm_eu = roc_curve(y_test_eu, proba_svm_eu)
auc_svm_eu = roc_auc_score(y_test_eu, proba_svm_eu)


# ---------- US ----------
svm_us = svm.SVC(**svm_settings).fit(X_train_us_full, y_train_us)

y_pred_svm_us = svm_us.predict(X_test_us_full)
acc_svm_us = accuracy_score(y_test_us, y_pred_svm_us)
f1_svm_us = f1_score(y_test_us, y_pred_svm_us)
fb_svm_us = fbeta_score(y_test_us, y_pred_svm_us, beta=3)

print("\nSVM US Accuracy: ", acc_svm_us)
print("SVM US f1 Score: ", f1_svm_us)
print("SVM US f-beta Score: ", fb_svm_us)

# ROC curve and AUC from the positive-class probabilities
proba_svm_us = svm_us.predict_proba(X_test_us_full)[:, 1]
fpr_svm_us, tpr_svm_us, thresholds_svm_us = roc_curve(y_test_us, proba_svm_us)
auc_svm_us = roc_auc_score(y_test_us, proba_svm_us)

In [None]:
# Random Forest with fixed hyperparameters
# Settings shared by both regions; only max_depth differs (EU 8, US 7).
rf_shared_settings = dict(n_estimators=200,
                          max_features='log2',
                          criterion='gini',
                          bootstrap=True,
                          oob_score=True,
                          random_state=42,
                          n_jobs=-1)

# ---------- EU ----------
rf_eu = RandomForestClassifier(max_depth=8, **rf_shared_settings).fit(
    X_train_eu_full, y_train_eu)

# Accuracy, f1 and f-beta (beta=3) from the hard test-set predictions
y_pred_rf_eu = rf_eu.predict(X_test_eu_full)
acc_rf_eu = accuracy_score(y_test_eu, y_pred_rf_eu)
f1_rf_eu = f1_score(y_test_eu, y_pred_rf_eu)
fb_rf_eu = fbeta_score(y_test_eu, y_pred_rf_eu, beta=3)

print("Random Forest EU Accuracy: ", acc_rf_eu)
print("Random Forest EU f1 Score: ", f1_rf_eu)
print("Random Forest EU f-beta Score: ", fb_rf_eu)

# ROC curve and AUC from the positive-class probabilities
proba_rf_eu = rf_eu.predict_proba(X_test_eu_full)[:, 1]
fpr_rf_eu, tpr_rf_eu, thresholds_rf_eu = roc_curve(y_test_eu, proba_rf_eu)
auc_rf_eu = roc_auc_score(y_test_eu, proba_rf_eu)


# ---------- US ----------
rf_us = RandomForestClassifier(max_depth=7, **rf_shared_settings).fit(
    X_train_us_full, y_train_us)

y_pred_rf_us = rf_us.predict(X_test_us_full)
acc_rf_us = accuracy_score(y_test_us, y_pred_rf_us)
f1_rf_us = f1_score(y_test_us, y_pred_rf_us)
fb_rf_us = fbeta_score(y_test_us, y_pred_rf_us, beta=3)

print("\nRandom Forest US Accuracy: ", acc_rf_us)
print("Random Forest US f1 Score: ", f1_rf_us)
print("Random Forest US f-beta Score: ", fb_rf_us)

# ROC curve and AUC from the positive-class probabilities
proba_rf_us = rf_us.predict_proba(X_test_us_full)[:, 1]
fpr_rf_us, tpr_rf_us, thresholds_rf_us = roc_curve(y_test_us, proba_rf_us)
auc_rf_us = roc_auc_score(y_test_us, proba_rf_us)


'''
# Feature importance

rf_eu_featuress = pd.DataFrame({
    'feature': X_eu.columns,
    'importance': rf_eu.feature_importances_
}).sort_values(by='importance', ascending=False)

# Look at top 10 features
rf_eu_featuress[0:10]
'''

In [None]:
# XGBoost with fixed hyperparameters (one parameter set per region)

# ---------- EU ----------
xgb_eu_settings = dict(colsample_bytree=0.75,
                       learning_rate=0.1,
                       max_depth=12,
                       min_child_weight=1,
                       n_estimators=100,
                       subsample=1)
xgb_eu = XGBClassifier(**xgb_eu_settings)
xgb_eu.fit(X_train_eu_full, y_train_eu)

# Accuracy, f1 and f-beta (beta=3) from the hard test-set predictions
y_pred_xgb_eu = xgb_eu.predict(X_test_eu_full)
acc_xgb_eu = accuracy_score(y_test_eu, y_pred_xgb_eu)
f1_xgb_eu = f1_score(y_test_eu, y_pred_xgb_eu)
fb_xgb_eu = fbeta_score(y_test_eu, y_pred_xgb_eu, beta=3)

print("XGBoost EU Accuracy: ", acc_xgb_eu)
print("XGBoost EU f1 Score: ", f1_xgb_eu)
print("XGBoost EU f-beta Score: ", fb_xgb_eu)

# ROC curve and AUC from the positive-class probabilities
proba_xgb_eu = xgb_eu.predict_proba(X_test_eu_full)[:, 1]
fpr_xgb_eu, tpr_xgb_eu, thresholds_xgb_eu = roc_curve(y_test_eu, proba_xgb_eu)
auc_xgb_eu = roc_auc_score(y_test_eu, proba_xgb_eu)


# ---------- US ----------
xgb_us_settings = dict(colsample_bytree=0.5,
                       learning_rate=0.3,
                       max_depth=6,
                       min_child_weight=1,
                       n_estimators=100,
                       subsample=1)
xgb_us = XGBClassifier(**xgb_us_settings)
xgb_us.fit(X_train_us_full, y_train_us)

y_pred_xgb_us = xgb_us.predict(X_test_us_full)
acc_xgb_us = accuracy_score(y_test_us, y_pred_xgb_us)
f1_xgb_us = f1_score(y_test_us, y_pred_xgb_us)
fb_xgb_us = fbeta_score(y_test_us, y_pred_xgb_us, beta=3)

print("\nXGBoost US Accuracy: ", acc_xgb_us)
print("XGBoost US f1 Score: ", f1_xgb_us)
print("XGBoost US f-beta Score: ", fb_xgb_us)

# ROC curve and AUC from the positive-class probabilities
proba_xgb_us = xgb_us.predict_proba(X_test_us_full)[:, 1]
fpr_xgb_us, tpr_xgb_us, thresholds_xgb_us = roc_curve(y_test_us, proba_xgb_us)
auc_xgb_us = roc_auc_score(y_test_us, proba_xgb_us)

In [None]:
# Multilayer Perceptron with fixed hyperparameters
# (MLPClassifier is already imported in the notebook's import cell, so it is
# not imported a second time here.)

# EU
mlp_eu = MLPClassifier(
    hidden_layer_sizes=(50, 50, 50),
    activation='relu',
    alpha=0.05,
    solver="adam",
    verbose=False,
    random_state=42,
    learning_rate='constant'
)

mlp_eu.fit(X_train_eu_full, y_train_eu)

# Accuracy, f1 and f-beta (beta=3) from the hard test-set predictions
y_pred_mlp_eu = mlp_eu.predict(X_test_eu_full)
acc_mlp_eu = accuracy_score(y_test_eu, y_pred_mlp_eu)
f1_mlp_eu = f1_score(y_test_eu, y_pred_mlp_eu)
fb_mlp_eu = fbeta_score(y_test_eu, y_pred_mlp_eu, beta=3)

print("MLP EU Accuracy: ", acc_mlp_eu)
print("MLP EU f1 Score: ", f1_mlp_eu)
print("MLP EU f-beta Score: ", fb_mlp_eu)

# Positive-class probabilities, computed once for both ROC and AUC (EU MLP)
proba_mlp_eu = mlp_eu.predict_proba(X_test_eu_full)[:, 1]

# Calculating ROC curve for EU MLP
fpr_mlp_eu, tpr_mlp_eu, thresholds_mlp_eu = roc_curve(y_test_eu, proba_mlp_eu)

# Calculating area under the curve (AUC) for ROC for EU MLP
auc_mlp_eu = roc_auc_score(y_test_eu, proba_mlp_eu)


# US
# NOTE(review): alpha=0.001 and random_state=1 differ from the EU model, and
# alpha=0.001 is not one of the values searched by the MLP grid search in this
# notebook - confirm these settings are intentional.
mlp_us = MLPClassifier(
    hidden_layer_sizes=(50, 50, 50),
    activation='relu',
    alpha=0.001,
    solver="sgd",
    verbose=False,
    random_state=1,
    learning_rate='constant'
)

mlp_us.fit(X_train_us_full, y_train_us)

y_pred_mlp_us = mlp_us.predict(X_test_us_full)
acc_mlp_us = accuracy_score(y_test_us, y_pred_mlp_us)
f1_mlp_us = f1_score(y_test_us, y_pred_mlp_us)
fb_mlp_us = fbeta_score(y_test_us, y_pred_mlp_us, beta=3)

print("\nMLP US Accuracy: ", acc_mlp_us)
print("MLP US f1 Score: ", f1_mlp_us)
print("MLP US f-beta Score: ", fb_mlp_us)

# Positive-class probabilities, computed once for both ROC and AUC (US MLP)
proba_mlp_us = mlp_us.predict_proba(X_test_us_full)[:, 1]

# Calculating ROC curve for US MLP
fpr_mlp_us, tpr_mlp_us, thresholds_mlp_us = roc_curve(y_test_us, proba_mlp_us)

# Calculating area under the curve (AUC) for ROC for US MLP
auc_mlp_us = roc_auc_score(y_test_us, proba_mlp_us)

In [None]:
# Model comparison: gather ROC AUC, accuracy, f1 and f-beta of every classifier
# (the `*_grid` results) into parallel lists, one set per region.

# Display names shared by both regions, defined once. The order must match
# every metric list below, because later cells pair names and scores with zip().
model_names = [
    "Logistic Regression",
    "Support Vector Machine",
    "Random Forest",
    "XGBoost",
    "Multilayer Perceptron",
]


# EU
clfs_eu = ["lr", "svm", "rf", "xgb", "mlp"]

model_aucs_eu = [
    auc_lr_eu_grid,
    auc_svm_eu_grid,
    auc_rf_eu_grid,
    auc_xgb_eu_grid,
    auc_mlp_eu_grid,
]

model_acc_eu = [
    acc_lr_eu_grid,
    acc_svm_eu_grid,
    acc_rf_eu_grid,
    acc_xgb_eu_grid,
    acc_mlp_eu_grid,
]

model_f1_eu = [
    f1_lr_eu_grid,
    f1_svm_eu_grid,
    f1_rf_eu_grid,
    f1_xgb_eu_grid,
    f1_mlp_eu_grid,
]

model_fb_eu = [
    fb_lr_eu_grid,
    fb_svm_eu_grid,
    fb_rf_eu_grid,
    fb_xgb_eu_grid,
    fb_mlp_eu_grid,
]


# US
clfs_us = ["lr", "svm", "rf", "xgb", "mlp"]

model_aucs_us = [
    auc_lr_us_grid,
    auc_svm_us_grid,
    auc_rf_us_grid,
    auc_xgb_us_grid,
    auc_mlp_us_grid,
]

model_acc_us = [
    acc_lr_us_grid,
    acc_svm_us_grid,
    acc_rf_us_grid,
    acc_xgb_us_grid,
    acc_mlp_us_grid,
]

model_f1_us = [
    f1_lr_us_grid,
    f1_svm_us_grid,
    f1_rf_us_grid,
    f1_xgb_us_grid,
    f1_mlp_us_grid,
]

model_fb_us = [
    fb_lr_us_grid,
    fb_svm_us_grid,
    fb_rf_us_grid,
    fb_xgb_us_grid,
    fb_mlp_us_grid,
]

In [None]:
# EU: overlay the ROC curves of all five classifiers, then print their scores.

# (legend label, false-positive rates, true-positive rates) per classifier
roc_curves_eu = [
    ("Logistic Regression", fpr_lr_eu_grid, tpr_lr_eu_grid),
    ("SVM - Linear", fpr_svm_eu_grid, tpr_svm_eu_grid),
    ("Random Forest", fpr_rf_eu_grid, tpr_rf_eu_grid),
    ("XGBoost", fpr_xgb_eu_grid, tpr_xgb_eu_grid),
    ("Multilayer Perceptron", fpr_mlp_eu_grid, tpr_mlp_eu_grid),
]
for roc_label, roc_fpr, roc_tpr in roc_curves_eu:
    plt.plot(roc_fpr, roc_tpr, lw=1, label=roc_label)

# Diagonal = chance-level reference line.
plt.plot([0, 1], [0, 1], c='violet', ls='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('EU Model Comparison - ROC curve')
plt.legend(ncol=2, fontsize='small')
sns.despine()

# Print EU scores, one group per metric. "{:.3f}" rounds to three decimals;
# the previous "{:3f}" only set a minimum width and printed six decimals.
score_groups_eu = [
    ("ROC AUC", model_aucs_eu),
    ("Accuracy", model_acc_eu),
    ("f1", model_f1_eu),
    ("f-beta", model_fb_eu),
]
for group_idx, (metric_name, metric_scores) in enumerate(score_groups_eu):
    if group_idx:
        # Blank separator between metric groups (none after the last one).
        print("\n")
    for clf_name, clf_score in zip(model_names, metric_scores):
        print("EU {} score = {:.3f} for {}".format(metric_name, clf_score, clf_name))

In [None]:
# US: overlay the ROC curves of all five classifiers, then print their scores.

# (legend label, false-positive rates, true-positive rates) per classifier
roc_curves_us = [
    ("Logistic Regression", fpr_lr_us_grid, tpr_lr_us_grid),
    ("SVM - Linear", fpr_svm_us_grid, tpr_svm_us_grid),
    ("Random Forest", fpr_rf_us_grid, tpr_rf_us_grid),
    ("XGBoost", fpr_xgb_us_grid, tpr_xgb_us_grid),
    ("Multilayer Perceptron", fpr_mlp_us_grid, tpr_mlp_us_grid),
]
for roc_label, roc_fpr, roc_tpr in roc_curves_us:
    plt.plot(roc_fpr, roc_tpr, lw=1, label=roc_label)

# Diagonal = chance-level reference line.
plt.plot([0, 1], [0, 1], c='violet', ls='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('US Model Comparison - ROC curve')
plt.legend(ncol=2, fontsize='small')
sns.despine()

# Print US scores, one group per metric. "{:.3f}" rounds to three decimals;
# the previous "{:3f}" only set a minimum width and printed six decimals.
score_groups_us = [
    ("ROC AUC", model_aucs_us),
    ("Accuracy", model_acc_us),
    ("f1", model_f1_us),
    ("f-beta", model_fb_us),
]
for group_idx, (metric_name, metric_scores) in enumerate(score_groups_us):
    if group_idx:
        # Blank separator between metric groups (none after the last one).
        print("\n")
    for clf_name, clf_score in zip(model_names, metric_scores):
        print("US {} score = {:.3f} for {}".format(metric_name, clf_score, clf_name))

In [None]:
# EU logistic regression: coefficient table, largest coefficient first.
best_log_eu = grid_lr_eu.best_estimator_

# Pair each column name with its coefficient.
# NOTE(review): zip() stops at the shorter input, so this assumes the model was
# fitted on exactly the columns of X_train_eu, in the same order -- confirm.
eu_coef_pairs = list(zip(X_train_eu.columns, best_log_eu.coef_[0]))
eu_coef_pairs_desc = sorted(eu_coef_pairs, key=lambda pair: pair[1], reverse=True)

lr_eu_coefs = pd.DataFrame(eu_coef_pairs_desc, columns=['Feature', 'Coefficient'])
lr_eu_coefs

In [None]:
# US logistic regression: coefficient table, largest coefficient first.
best_log_us = grid_lr_us.best_estimator_

# Pair each column name with its coefficient.
# NOTE(review): zip() stops at the shorter input, so this assumes the model was
# fitted on exactly the columns of X_train_us, in the same order -- confirm.
us_coef_pairs = list(zip(X_train_us.columns, best_log_us.coef_[0]))
us_coef_pairs_desc = sorted(us_coef_pairs, key=lambda pair: pair[1], reverse=True)

lr_us_coefs = pd.DataFrame(us_coef_pairs_desc, columns=['Feature', 'Coefficient'])
lr_us_coefs

In [None]:
# Baseline probability implied by the logistic-regression intercepts (EU and US).


def _intercept_probability(search):
    """Convert the intercept of a search's best estimator from log-odds to a probability.

    Parameters
    ----------
    search : fitted search object
        Must expose ``best_estimator_`` whose ``intercept_`` holds the fitted
        intercept (as ``grid_lr_eu`` / ``grid_lr_us`` do in the coefficient cells).

    Returns
    -------
    float
        ``1 / (1 + exp(-intercept))``, i.e. the logistic function of the intercept.
        For a logistic regression this is the predicted probability when every
        input feature equals zero.
    """
    # The search wrapper itself has no `intercept_` attribute; the fitted
    # parameters live on the refitted best estimator.
    logodds = search.best_estimator_.intercept_[0]
    # Same value as odds / (1 + odds) with odds = exp(logodds), but it does not
    # produce inf / inf for a large positive intercept.
    return 1 / (1 + np.exp(-logodds))


# Built from adjacent literals so that source indentation does not leak into
# the printed sentence (a backslash continuation inside the literal did).
_intercept_message = (
    "{}: All else considered, companies that make it past their "
    "first funding round, probability of success is {:.2f}%"
)

# EU Intercept
print(_intercept_message.format("EU", 100 * _intercept_probability(grid_lr_eu)))

# US Intercept
print(_intercept_message.format("US", 100 * _intercept_probability(grid_lr_us)))

In [None]:
# Confusion matrix and classification report for the LogReg `*_grid` predictions.

# EU: confusion matrix
conf_lr_eu = confusion_matrix(y_test_eu, y_pred_lr_eu_grid)
# plot() returns the display object itself, so the name stays bound to it.
conf_lr_eu_show = ConfusionMatrixDisplay(confusion_matrix=conf_lr_eu).plot()
conf_lr_eu_show.ax_.set_title("LogReg EU")
plt.show()

# EU: per-class precision / recall / f1
report_lr_eu = classification_report(y_test_eu, y_pred_lr_eu_grid)
print(report_lr_eu)

# US: confusion matrix
conf_lr_us = confusion_matrix(y_test_us, y_pred_lr_us_grid)
conf_lr_us_show = ConfusionMatrixDisplay(confusion_matrix=conf_lr_us).plot()
conf_lr_us_show.ax_.set_title("LogReg US")
plt.show()

# US: per-class precision / recall / f1
report_lr_us = classification_report(y_test_us, y_pred_lr_us_grid)
print(report_lr_us)

In [None]:
# Confusion matrix and classification report for the SVM `*_grid` predictions.

# EU: confusion matrix
conf_svm_eu = confusion_matrix(y_test_eu, y_pred_svm_eu_grid)
# plot() returns the display object itself, so the name stays bound to it.
conf_svm_eu_show = ConfusionMatrixDisplay(confusion_matrix=conf_svm_eu).plot()
conf_svm_eu_show.ax_.set_title("SVM EU")
plt.show()

# EU: per-class precision / recall / f1
report_svm_eu = classification_report(y_test_eu, y_pred_svm_eu_grid)
print(report_svm_eu)

# US: confusion matrix
conf_svm_us = confusion_matrix(y_test_us, y_pred_svm_us_grid)
conf_svm_us_show = ConfusionMatrixDisplay(confusion_matrix=conf_svm_us).plot()
conf_svm_us_show.ax_.set_title("SVM US")
plt.show()

# US: per-class precision / recall / f1
report_svm_us = classification_report(y_test_us, y_pred_svm_us_grid)
print(report_svm_us)

In [None]:
# Confusion matrix and classification report for the Random Forest `*_grid` predictions.

# EU: confusion matrix
conf_rf_eu = confusion_matrix(y_test_eu, y_pred_rf_eu_grid)
# plot() returns the display object itself, so the name stays bound to it.
conf_rf_eu_show = ConfusionMatrixDisplay(confusion_matrix=conf_rf_eu).plot()
conf_rf_eu_show.ax_.set_title("Random Forest EU")
plt.show()

# EU: per-class precision / recall / f1
report_rf_eu = classification_report(y_test_eu, y_pred_rf_eu_grid)
print(report_rf_eu)

# US: confusion matrix
conf_rf_us = confusion_matrix(y_test_us, y_pred_rf_us_grid)
conf_rf_us_show = ConfusionMatrixDisplay(confusion_matrix=conf_rf_us).plot()
conf_rf_us_show.ax_.set_title("Random Forest US")
plt.show()

# US: per-class precision / recall / f1
report_rf_us = classification_report(y_test_us, y_pred_rf_us_grid)
print(report_rf_us)

In [None]:
# Confusion matrix and classification report for the XGBoost `*_grid` predictions.

# EU: confusion matrix
conf_xgb_eu = confusion_matrix(y_test_eu, y_pred_xgb_eu_grid)
# plot() returns the display object itself, so the name stays bound to it.
conf_xgb_eu_show = ConfusionMatrixDisplay(confusion_matrix=conf_xgb_eu).plot()
conf_xgb_eu_show.ax_.set_title("XGBoost EU")
plt.show()

# EU: per-class precision / recall / f1
report_xgb_eu = classification_report(y_test_eu, y_pred_xgb_eu_grid)
print(report_xgb_eu)

# US: confusion matrix
conf_xgb_us = confusion_matrix(y_test_us, y_pred_xgb_us_grid)
conf_xgb_us_show = ConfusionMatrixDisplay(confusion_matrix=conf_xgb_us).plot()
conf_xgb_us_show.ax_.set_title("XGBoost US")
plt.show()

# US: per-class precision / recall / f1
report_xgb_us = classification_report(y_test_us, y_pred_xgb_us_grid)
print(report_xgb_us)

In [None]:
# Confusion matrix and classification report for the Multilayer Perceptron `*_grid` predictions.

# EU: confusion matrix
conf_mlp_eu = confusion_matrix(y_test_eu, y_pred_mlp_eu_grid)
# plot() returns the display object itself, so the name stays bound to it.
conf_mlp_eu_show = ConfusionMatrixDisplay(confusion_matrix=conf_mlp_eu).plot()
conf_mlp_eu_show.ax_.set_title("Multilayer Perceptron EU")
plt.show()

# EU: per-class precision / recall / f1
report_mlp_eu = classification_report(y_test_eu, y_pred_mlp_eu_grid)
print(report_mlp_eu)

# US: confusion matrix
conf_mlp_us = confusion_matrix(y_test_us, y_pred_mlp_us_grid)
conf_mlp_us_show = ConfusionMatrixDisplay(confusion_matrix=conf_mlp_us).plot()
conf_mlp_us_show.ax_.set_title("Multilayer Perceptron US")
plt.show()

# US: per-class precision / recall / f1
report_mlp_us = classification_report(y_test_us, y_pred_mlp_us_grid)
print(report_mlp_us)

In [None]:
# Feature importance of the EU XGBoost model -- DISABLED.
# The code below is wrapped in a triple-quoted string, so none of it runs; as the
# last expression of the cell, the string itself is merely echoed as cell output.
# NOTE(review): it pairs `X_eu.columns` with `xgb_eu.feature_importances_`, but
# `xgb_eu` is fitted on `X_train_eu_full`; if those have different feature counts
# the DataFrame constructor would fail -- confirm before re-enabling.
'''xgb_eu_feats = pd.DataFrame({
    'feature': X_eu.columns,
    'importance': xgb_eu.feature_importances_
}).sort_values(by='importance', ascending=False)

# Look at top 10 features
print("Feature Importance XGBoost top 10 EU: ")
xgb_eu_feats[0:10]'''

In [None]:
# DISABLED: list every EU XGBoost feature with its importance, highest first.
# The code is wrapped in a triple-quoted string, so none of it runs; the string
# is only echoed as the cell output.
# NOTE(review): it reads `X_train_eu_full.columns`; that only works if
# X_train_eu_full is a DataFrame (not a sparse matrix / ndarray) -- confirm
# before re-enabling.
'''#xgb_eu.get_booster().get_score(importance_type="gain")

sorted_idx = np.argsort(xgb_eu.feature_importances_)[::-1]

for index in sorted_idx:
    print([X_train_eu_full.columns[index], xgb_eu.feature_importances_[index]]) '''

In [None]:
# DISABLED: plot the 15 most important EU XGBoost features.
# The code is wrapped in a triple-quoted string, so none of it runs; the string
# is only echoed as the cell output.
# NOTE(review): `plot_importance` is not imported in the notebook's import cell
# (only XGBClassifier is imported from xgboost there); re-enabling this would
# presumably need `from xgboost import plot_importance` -- confirm.
'''import matplotlib.pyplot as plt

plot_importance(xgb_eu, max_num_features = 15)
plt.show()'''