In [None]:
# Importing relevant libraries

import pandas as pd
import numpy as np
import datetime as dt
import pickle

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score,\
precision_recall_curve, f1_score, fbeta_score,\
accuracy_score, confusion_matrix, roc_auc_score, roc_curve, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import svm
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from sklearn import preprocessing
from sklearn.utils import resample, shuffle
from imblearn.over_sampling import ADASYN


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the cleaned EU and US startup datasets from CSV.

eu_df = pd.read_csv("data/clean_eu.csv")
us_df = pd.read_csv("data/clean_us.csv")

# Drop "status" (the boolean "label" column is used as the target instead) and
# the two funding-date columns, then drop whatever column is left in first
# position -- presumably the stray "Unnamed: 0" index column; verify against
# the CSV header.
# NOTE: `columns=` replaces the positional axis argument (`drop([...], 1)`),
# which pandas 2.0 no longer accepts.
cols_to_drop = ["status", "first_funding_at", "last_funding_at"]

eu_df = eu_df.drop(columns=cols_to_drop)
eu_df = eu_df.drop(columns=eu_df.columns[0])

us_df = us_df.drop(columns=cols_to_drop)
us_df = us_df.drop(columns=us_df.columns[0])

In [None]:
# Quick structural overview of the US frame after the column drops
us_df.info()

In [None]:
# One-hot encode the categorical columns.
# drop_first=True drops the first level of each variable, so that level acts
# as the reference category.

# EU: industry, country and city
industry_dummies_eu, country_dummies_eu, city_dummies_eu = (
    pd.get_dummies(eu_df[cat_col], drop_first=True)
    for cat_col in ("industry", "country_code", "city")
)

# US: industry, state and region
industry_dummies_us, state_dummies_us, region_dummies_us = (
    pd.get_dummies(us_df[cat_col], drop_first=True)
    for cat_col in ("industry", "state_code", "region")
)

In [None]:
# Distinct values of the US "region" column (the source of region_dummies_us)
us_df["region"].unique()

In [None]:
# Names of the numerical feature columns used for both datasets

X_col_num = [
    'funding_rounds',
    'funding_total_usd',
    'avg_time_between_rounds',
    'avg_raised_amount_usd',
    'num_of_investors',
]

# Numerical part of the feature matrix, one frame per dataset
X_num_eu = eu_df.loc[:, X_col_num]
X_num_us = us_df.loc[:, X_col_num]



In [None]:
# Assemble the feature matrices: numerical columns first, then each block of
# dummy columns attached by an index-on-index merge.

# EU: numerical + industry + country + city
X_eu = X_num_eu
for eu_block in (industry_dummies_eu, country_dummies_eu, city_dummies_eu):
    X_eu = X_eu.merge(eu_block, left_index=True, right_index=True)

# Constant intercept column
# NOTE(review): a constant column carries no information once standardized,
# and sklearn's LogisticRegression fits its own intercept by default --
# confirm whether this column is still needed.
X_eu['intercept'] = 1


# US: numerical + industry + state + region
X_us = X_num_us
for us_block in (industry_dummies_us, state_dummies_us, region_dummies_us):
    X_us = X_us.merge(us_block, left_index=True, right_index=True)

# Constant intercept column (same caveat as for the EU matrix)
X_us['intercept'] = 1

In [None]:
# Target vectors: the "label" column of each dataset

# EU
y_eu = eu_df["label"]

# US
y_us = us_df["label"]

In [None]:
# Pairwise plots of the numerical EU features, coloured by target class
pairplot_cols = [*X_col_num, 'label']
sns.pairplot(eu_df[pairplot_cols], hue='label')


In [None]:
# Take a closer look at some of the features

# Average raise amount: density of 'avg_raised_amount_usd' per target class (EU)
plt.figure(figsize=(5, 5))
for class_value, class_name, class_color in ((0, 'Fail', 'teal'),
                                             (1, 'Success', 'royalblue')):
    # `fill=True` replaces the deprecated `shade=True` keyword of kdeplot
    sns.kdeplot(eu_df.loc[eu_df.label == class_value, 'avg_raised_amount_usd'],
                label=class_name,
                fill=True,
                color=class_color)
# The labels above are only rendered if a legend is requested explicitly
plt.legend()
sns.despine()

In [None]:
# Number of Investors: density of 'num_of_investors' per target class (EU)
plt.figure(figsize=(5, 5))
for class_value, class_name, class_color in ((0, 'Fail', 'teal'),
                                             (1, 'Success', 'royalblue')):
    # `fill=True` replaces the deprecated `shade=True` keyword of kdeplot
    sns.kdeplot(eu_df.loc[eu_df.label == class_value, 'num_of_investors'],
                label=class_name,
                fill=True,
                color=class_color)
# The labels above are only rendered if a legend is requested explicitly
plt.legend()
sns.despine()

In [None]:
# Class balance of the target, per dataset

target_count_eu = y_eu.value_counts()
target_count_us = y_us.value_counts()

# Print, for each dataset: the size of class 0 and class 1, their ratio, and
# the share of class 0 in percent.
for region_header, class_counts in (("EU: ", target_count_eu),
                                    ("\nUS: ", target_count_us)):
    n_class0 = class_counts[0]
    n_class1 = class_counts[1]
    print(region_header)
    print(f'Class 0: {n_class0}')
    print(f'Class 1: {n_class1}')
    print(f'Proportion: {round(n_class0 / n_class1, 2)} : 1')
    print('Percentage of Majority Class: {:f}'.format(
        round(n_class0 / sum(class_counts), 4) * 100))

In [None]:
# Oversampling
ada = ADASYN()

# EU
X_eu, y_eu = ada.fit_resample(X_eu, y_eu)

# US
X_us, y_us = ada.fit_resample(X_us, y_us)

In [None]:
# Count target values after Upsampling

#EU
target_count_upsampled_eu = y_eu.value_counts()

# # print class balance
print("EU: ")
print(f'Class 0: {target_count_upsampled_eu[0]}')
print(f'Class 1: {target_count_upsampled_eu[1]}')
print(f'Proportion: {round(target_count_upsampled_eu[0] / target_count_upsampled_eu[1], 2)} : 1')
print('Percentage of Majority Class: {:f}'.format(
    round(target_count_upsampled_eu[0] / sum(target_count_upsampled_eu), 4) * 100))

#US
target_count_upsampled_us = y_us.value_counts()

# # print class balance
print("\nUS: ")
print(f'Class 0: {target_count_upsampled_us[0]}')
print(f'Class 1: {target_count_upsampled_us[1]}')
print(f'Proportion: {round(target_count_upsampled_us[0] / target_count_upsampled_us[1], 2)} : 1')
print('Percentage of Majority Class: {:f}'.format(
    round(target_count_upsampled_us[0] / sum(target_count_upsampled_us), 4) * 100))

In [None]:
# Split the data with 80% to train and 20% to test
# Stratify to ensure train and test sets have 
# similar proportions of either target class

# EU
X_train_eu, X_test_eu, y_train_eu, y_test_eu = train_test_split(X_eu,
                                                    y_eu,
                                                    test_size=0.2,
                                                    random_state=40,
                                                    stratify=y_eu)

# US
X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(X_us,
                                                    y_us,
                                                    test_size=0.2,
                                                    random_state=40,
                                                    stratify=y_us)

In [None]:
# Standardizing
# The scaler is always fitted on the training split and then applied to the
# matching test split. The same StandardScaler object is re-fitted for the US
# data, so the EU transforms must happen before the US fit.

scaler = StandardScaler()

# EU: fit on train, apply to train and test
eu_train_scaled_arr = scaler.fit_transform(X_train_eu.values)
eu_test_scaled_arr = scaler.transform(X_test_eu.values)

X_train_scaled_eu = pd.DataFrame(eu_train_scaled_arr, columns=X_eu.columns)
X_test_scaled_eu = pd.DataFrame(eu_test_scaled_arr, columns=X_eu.columns)

# US: re-fit on train, apply to train and test
us_train_scaled_arr = scaler.fit_transform(X_train_us.values)
us_test_scaled_arr = scaler.transform(X_test_us.values)

X_train_scaled_us = pd.DataFrame(us_train_scaled_arr, columns=X_us.columns)
X_test_scaled_us = pd.DataFrame(us_test_scaled_arr, columns=X_us.columns)

In [None]:
# Logistic Regression
# For each dataset: fit on the scaled training data, then report accuracy and
# F1 on the test data and keep ROC curve / AUC for the model comparison.

# EU
logreg_eu = LogisticRegression(C=10, solver='lbfgs')
logreg_eu.fit(X_train_scaled_eu, y_train_eu)

# Hard class predictions -> accuracy and F1
y_pred_lr_eu = logreg_eu.predict(X_test_scaled_eu)
acc_lr_eu = accuracy_score(y_test_eu, y_pred_lr_eu)
f1_lr_eu = f1_score(y_test_eu, y_pred_lr_eu)

print("Logistic Regression EU Accuracy: ", acc_lr_eu)
print("Logistic Regression EU f1 Score: ", f1_lr_eu)

# Positive-class probabilities, shared by the ROC curve and its AUC
proba_lr_eu = logreg_eu.predict_proba(X_test_scaled_eu)[:, 1]
fpr_lr_eu, tpr_lr_eu, thresholds_lr_eu = roc_curve(y_test_eu, proba_lr_eu)
auc_lr_eu = roc_auc_score(y_test_eu, proba_lr_eu)


# US
logreg_us = LogisticRegression(C=10, solver='lbfgs')
logreg_us.fit(X_train_scaled_us, y_train_us)

# Hard class predictions -> accuracy and F1
y_pred_lr_us = logreg_us.predict(X_test_scaled_us)
acc_lr_us = accuracy_score(y_test_us, y_pred_lr_us)
f1_lr_us = f1_score(y_test_us, y_pred_lr_us)

print("\nLogistic Regression US Accuracy: ", acc_lr_us)
print("Logistic Regression US f1 Score: ", f1_lr_us)

# Positive-class probabilities, shared by the ROC curve and its AUC
proba_lr_us = logreg_us.predict_proba(X_test_scaled_us)[:, 1]
fpr_lr_us, tpr_lr_us, thresholds_lr_us = roc_curve(y_test_us, proba_lr_us)
auc_lr_us = roc_auc_score(y_test_us, proba_lr_us)

In [None]:
# Features: logistic-regression coefficients of the EU model
# (the model was fitted on standardized features, so the coefficients are on
# the standardized scale)

# EU
# (feature name, coefficient) pairs for every column
lr_coefs_eu = list(zip(X_eu.columns, logreg_eu.coef_[0]))
lr_coefs_df_eu = pd.DataFrame(lr_coefs_eu)

# Keep only coefficients with absolute value above 0.07, largest first
lr_top_coefs_eu = sorted(
    (pair for pair in lr_coefs_eu if np.abs(pair[1]) > .07),
    key=lambda pair: pair[1],
    reverse=True)
lr_top_coefs_df_eu = pd.DataFrame(lr_top_coefs_eu)

plt.barh([name for name, _ in lr_top_coefs_eu],
         width=[coef for _, coef in lr_top_coefs_eu])
plt.title('LogOdds')
# Positional argument instead of `b=False`: matplotlib 3.5 renamed the `b`
# keyword to `visible` and later removed it, while the positional form works
# on every version.
plt.grid(False)
sns.despine()

In [None]:
# SVM (linear kernel)
# probability=True enables predict_proba, which is needed for the ROC curves.

# EU
svm_eu = svm.SVC(kernel="linear", probability=True)
svm_eu.fit(X_train_scaled_eu, y_train_eu)

# Hard class predictions -> accuracy and F1
y_pred_svm_eu = svm_eu.predict(X_test_scaled_eu)
acc_svm_eu = accuracy_score(y_test_eu, y_pred_svm_eu)
f1_svm_eu = f1_score(y_test_eu, y_pred_svm_eu)

print("SVM EU Accuracy: ", acc_svm_eu)
print("SVM EU f1 Score: ", f1_svm_eu)

# Positive-class probabilities, shared by the ROC curve and its AUC
proba_svm_eu = svm_eu.predict_proba(X_test_scaled_eu)[:, 1]
fpr_svm_eu, tpr_svm_eu, thresholds_svm_eu = roc_curve(y_test_eu, proba_svm_eu)
auc_svm_eu = roc_auc_score(y_test_eu, proba_svm_eu)


# US
svm_us = svm.SVC(kernel="linear", probability=True)
svm_us.fit(X_train_scaled_us, y_train_us)

# Hard class predictions -> accuracy and F1
y_pred_svm_us = svm_us.predict(X_test_scaled_us)
acc_svm_us = accuracy_score(y_test_us, y_pred_svm_us)
f1_svm_us = f1_score(y_test_us, y_pred_svm_us)

print("SVM US Accuracy: ", acc_svm_us)
print("SVM US f1 Score: ", f1_svm_us)

# Positive-class probabilities, shared by the ROC curve and its AUC
proba_svm_us = svm_us.predict_proba(X_test_scaled_us)[:, 1]
fpr_svm_us, tpr_svm_us, thresholds_svm_us = roc_curve(y_test_us, proba_svm_us)
auc_svm_us = roc_auc_score(y_test_us, proba_svm_us)

In [None]:
# Random Forest
# Both forests use the same settings: 500 bootstrapped trees, out-of-bag
# scoring enabled, a fixed seed and all available cores.

# EU
rf_eu = RandomForestClassifier(
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1234,
    n_jobs=-1,
)
rf_eu.fit(X_train_scaled_eu, y_train_eu)

# Hard class predictions -> accuracy and F1
y_pred_rf_eu = rf_eu.predict(X_test_scaled_eu)
acc_rf_eu = accuracy_score(y_test_eu, y_pred_rf_eu)
f1_rf_eu = f1_score(y_test_eu, y_pred_rf_eu)

print("Random Forest EU Accuracy: ", acc_rf_eu)
print("Random Forest EU f1 Score: ", f1_rf_eu)

# Positive-class probabilities, shared by the ROC curve and its AUC
proba_rf_eu = rf_eu.predict_proba(X_test_scaled_eu)[:, 1]
fpr_rf_eu, tpr_rf_eu, thresholds_rf_eu = roc_curve(y_test_eu, proba_rf_eu)
auc_rf_eu = roc_auc_score(y_test_eu, proba_rf_eu)


# US
rf_us = RandomForestClassifier(
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    random_state=1234,
    n_jobs=-1,
)
rf_us.fit(X_train_scaled_us, y_train_us)

# Hard class predictions -> accuracy and F1
y_pred_rf_us = rf_us.predict(X_test_scaled_us)
acc_rf_us = accuracy_score(y_test_us, y_pred_rf_us)
f1_rf_us = f1_score(y_test_us, y_pred_rf_us)

print("Random Forest US Accuracy: ", acc_rf_us)
print("Random Forest US f1 Score: ", f1_rf_us)

# Positive-class probabilities, shared by the ROC curve and its AUC
proba_rf_us = rf_us.predict_proba(X_test_scaled_us)[:, 1]
fpr_rf_us, tpr_rf_us, thresholds_rf_us = roc_curve(y_test_us, proba_rf_us)
auc_rf_us = roc_auc_score(y_test_us, proba_rf_us)

In [None]:
# XGBoost
# For each dataset: fit a default XGBClassifier on the scaled training data,
# report accuracy and F1 on the test data and keep ROC curve / AUC for the
# model comparison.

# EU
xgb_eu = XGBClassifier()
xgb_eu.fit(X_train_scaled_eu, y_train_eu)

# Hard class predictions -> accuracy and F1
y_pred_xgb_eu = xgb_eu.predict(X_test_scaled_eu)
acc_xgb_eu = accuracy_score(y_test_eu, y_pred_xgb_eu)
f1_xgb_eu = f1_score(y_test_eu, y_pred_xgb_eu)

print("XGBoost EU Accuracy: ", acc_xgb_eu)
print("XGBoost EU f1 Score: ", f1_xgb_eu)

# Positive-class probabilities, shared by the ROC curve and its AUC
proba_xgb_eu = xgb_eu.predict_proba(X_test_scaled_eu)[:, 1]
fpr_xgb_eu, tpr_xgb_eu, thresholds_xgb_eu = roc_curve(y_test_eu, proba_xgb_eu)
auc_xgb_eu = roc_auc_score(y_test_eu, proba_xgb_eu)


# US
# Fixed: this model used to be a RandomForestClassifier (copy-paste from the
# random-forest cell), so every "XGBoost US" number was in fact a second
# random forest. It now mirrors the EU model.
xgb_us = XGBClassifier()
xgb_us.fit(X_train_scaled_us, y_train_us)

# Hard class predictions -> accuracy and F1
y_pred_xgb_us = xgb_us.predict(X_test_scaled_us)
acc_xgb_us = accuracy_score(y_test_us, y_pred_xgb_us)
f1_xgb_us = f1_score(y_test_us, y_pred_xgb_us)

print("XGBoost US Accuracy: ", acc_xgb_us)
print("XGBoost US f1 Score: ", f1_xgb_us)

# Positive-class probabilities, shared by the ROC curve and its AUC
proba_xgb_us = xgb_us.predict_proba(X_test_scaled_us)[:, 1]
fpr_xgb_us, tpr_xgb_us, thresholds_xgb_us = roc_curve(y_test_us, proba_xgb_us)
auc_xgb_us = roc_auc_score(y_test_us, proba_xgb_us)

In [None]:
# Feature importance extraction

# Random Forest, EU: pair every feature name with the importance reported by
# the fitted forest and order the table from most to least important.
rf_eu_feats = pd.DataFrame({
    'feature': X_eu.columns,
    'importance': rf_eu.feature_importances_,
})
rf_eu_feats = rf_eu_feats.sort_values(by='importance', ascending=False)

# Show the ten most important features
print("Feature Importance Random Forest top 10 EU: ")
rf_eu_feats.head(10)

In [None]:
# Random Forest, US: pair every feature name with the importance reported by
# the fitted forest and order the table from most to least important.
rf_us_feats = pd.DataFrame({
    'feature': X_us.columns,
    'importance': rf_us.feature_importances_,
})
rf_us_feats = rf_us_feats.sort_values(by='importance', ascending=False)

# Show the ten most important features
print("Feature Importance Random Forest top 10 US: ")
rf_us_feats.head(10)

In [None]:
# XGBoost, EU: pair every feature name with the importance reported by the
# fitted xgb_eu model and order the table from most to least important.
xgb_eu_feats = pd.DataFrame({
    'feature': X_eu.columns,
    'importance': xgb_eu.feature_importances_,
})
xgb_eu_feats = xgb_eu_feats.sort_values(by='importance', ascending=False)

# Show the ten most important features
print("Feature Importance XGBoost top 10 EU: ")
xgb_eu_feats.head(10)

In [None]:
# US: pair every feature name with the importance reported by the fitted
# model stored in xgb_us and order the table from most to least important.
xgb_us_feats = pd.DataFrame({
    'feature': X_us.columns,
    'importance': xgb_us.feature_importances_,
})
xgb_us_feats = xgb_us_feats.sort_values(by='importance', ascending=False)

# Show the ten most important features
print("Feature Importance XGBoost top 10 US: ")
xgb_us_feats.head(10)

In [None]:
# Model Comparison EU
# Collects the per-model test metrics, overlays the ROC curves and prints the
# scores. All lists follow the order of `model_names`.
models = ['lr', 'svm', 'rf', 'xgb']

model_acc_eu = [
    acc_lr_eu, acc_svm_eu, acc_rf_eu, acc_xgb_eu]

model_aucs_eu = [
    auc_lr_eu, auc_svm_eu, auc_rf_eu, auc_xgb_eu]

model_fbetas_eu = [f1_lr_eu, f1_svm_eu, f1_rf_eu, f1_xgb_eu]

model_names = [
    'Logistic Regression', 'Support Vector Machine', 'Random Forest', 'XGBoost',]

# Plot ROC Curves

plt.plot(fpr_lr_eu, tpr_lr_eu, lw=1, label='Logistic Regression')
plt.plot(fpr_svm_eu, tpr_svm_eu, lw=1, label='SVM')
plt.plot(fpr_rf_eu, tpr_rf_eu, lw=1, label='Random Forest')
plt.plot(fpr_xgb_eu, tpr_xgb_eu, lw=1, label='XGBoost')

# Diagonal reference line (TPR == FPR)
plt.plot([0, 1], [0, 1], c='violet', ls='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Model Comparison EU - ROC curve')
plt.legend(ncol=2, fontsize='small')
sns.despine()

# The scores are printed with three decimals. The previous spec `{:3f}` was a
# minimum field width of 3 (a no-op here), not a precision.

# Print EU AUC Scores
for name, score in zip(model_names, model_aucs_eu):
    print("EU ROC AUC score = {:.3f} for {}".format(score, name))
print("\n")

# Print EU Accuracy Scores
for name, score in zip(model_names, model_acc_eu):
    print("EU Accuracy score = {:.3f} for {}".format(score, name))
print("\n")

# Print EU f1 Scores
for name, score in zip(model_names, model_fbetas_eu):
    print("EU f1 score = {:.3f} for {}".format(score, name))

In [None]:
# Model Comparison US
# Collects the per-model test metrics, overlays the ROC curves and prints the
# scores. All lists follow the order of `model_names`, which is defined in the
# EU comparison cell.
model_acc_us = [
    acc_lr_us, acc_svm_us, acc_rf_us, acc_xgb_us]

model_aucs_us = [
    auc_lr_us, auc_svm_us, auc_rf_us, auc_xgb_us]

model_fbetas_us = [f1_lr_us, f1_svm_us, f1_rf_us, f1_xgb_us]

# Plot ROC Curves

plt.plot(fpr_lr_us, tpr_lr_us, lw=1, label='Logistic Regression')
plt.plot(fpr_svm_us, tpr_svm_us, lw=1, label='SVM')
plt.plot(fpr_rf_us, tpr_rf_us, lw=1, label='Random Forest')
plt.plot(fpr_xgb_us, tpr_xgb_us, lw=1, label='XGBoost')

# Diagonal reference line (TPR == FPR)
plt.plot([0, 1], [0, 1], c='violet', ls='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Model Comparison US - ROC curve')
plt.legend(ncol=2, fontsize='small')
sns.despine()

# The scores are printed with three decimals. The previous spec `{:3f}` was a
# minimum field width of 3 (a no-op here), not a precision.

# Print US AUC Scores
for name, score in zip(model_names, model_aucs_us):
    print("US ROC AUC score = {:.3f} for {}".format(score, name))
print("\n")

# Print US Accuracy Scores
for name, score in zip(model_names, model_acc_us):
    print("US Accuracy score = {:.3f} for {}".format(score, name))
print("\n")

# Print US f1 Scores
for name, score in zip(model_names, model_fbetas_us):
    print("US f1 score = {:.3f} for {}".format(score, name))