In [228]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Load and Pre-Process the Data

In [229]:
df = pd.read_csv('../data/founder_V0.3_founder.csv')

In [230]:
df.columns = df.columns.str.replace(' ', '_')
df.columns

Index(['Full_Name', 'Primary_Job_Title', 'Bio', 'Gender',
       'Number_of_News_Articles', 'Number_of_Founded_Organizations',
       'Number_of_Portfolio_Companies', 'Number_of_Investments_x',
       'Number_of_Partner_Investments', 'Number_of_Lead_Investments_x',
       'Number_of_Exits_x', 'Number_of_Events_x', 'Categories',
       'Headquarters_Location_', 'Operating_Status', 'Founded_Date',
       'Closed_Date', 'Company_Type', 'Number_of_Founders', 'Success',
       'Founder'],
      dtype='object')

In [231]:
df.shape

(18361, 21)

In [232]:
filtered_df = df.drop(['Full_Name', 'Primary_Job_Title', 'Bio', 'Number_of_Portfolio_Companies', 'Number_of_Investments_x','Number_of_Partner_Investments', 'Number_of_Lead_Investments_x', 'Number_of_Events_x', 'Founded_Date', 'Closed_Date', 'Company_Type', 'Founder'], axis=1)


In [233]:
print(filtered_df.shape)
if '0' in filtered_df.columns:
    print('T')

(18361, 9)


In [234]:
categories = df['Categories']
split_categories = categories.str.split(',', expand=True)
categories_firstCol = split_categories.iloc[:, 0]
category_type_dummies = pd.get_dummies(categories_firstCol)
print(category_type_dummies.shape)

(18361, 584)


In [235]:
filtered_df = pd.concat([filtered_df, category_type_dummies], axis=1)
print(filtered_df.shape)
for colName in filtered_df.columns:
    if colName == '0':
        print('True')

filtered_df = filtered_df.drop(['0'], axis = 1)
print(filtered_df.shape)

(18361, 593)
True
(18361, 592)


In [236]:
hq = df['Headquarters_Location_']
hq_dummies = pd.get_dummies(hq)
print(hq_dummies.shape)

hq_dummies = hq_dummies.drop(['0'], axis = 1)
print(hq_dummies.shape)

(18361, 142)
(18361, 141)


In [237]:
filtered_df = pd.concat([filtered_df, hq_dummies], axis=1)
print(filtered_df.shape)

(18361, 733)


In [238]:
# Drop the initial columns
filtered_df = filtered_df.drop(['Headquarters_Location_', 'Categories'], axis=1)
print(filtered_df.columns[:25])
# filtered_df = filtered_df.drop(['0'], axis = 1)
print(filtered_df.shape)

Index(['Gender', 'Number_of_News_Articles', 'Number_of_Founded_Organizations',
       'Number_of_Exits_x', 'Operating_Status', 'Number_of_Founders',
       'Success', '3D Printing', '3D Technology', 'A/B Testing', 'Accounting',
       'Ad Network', 'Ad Retargeting', 'Ad Targeting', 'Advanced Materials',
       'Adventure Travel', 'Advertising', 'Advertising Platforms', 'Advice',
       'Aerospace', 'Affiliate Marketing', 'AgTech', 'Agriculture',
       'Air Transportation', 'Alternative Medicine'],
      dtype='object')
(18361, 731)


# Inspecting the Dataset

In [239]:
y = filtered_df['Success'].values
X = filtered_df.drop('Success', axis=1)
print("Shape of X: " + str(X.values.shape))
print("Shape of y: " + str(y.shape))


Shape of X: (18361, 730)
Shape of y: (18361,)


### Create a DMatrix

In [240]:
data_dMatrix = xgboost.DMatrix(data = X, label=y)

# Train & Test Split

In [241]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 123) 


### XGBoost

In [242]:
# instantiate
xgb_reg = xgboost.XGBRegressor(objective='binary:logistic', learning_rate=.3, colsample_bytree=.3, reg_alpha= 9)

# fit the model to training data
xgb_reg.fit(x_train, y_train)

# y prediction
y_pred_xgb = xgb_reg.predict(x_test)

xgb_score_TEST = accuracy_score(y_test, y_pred_xgb.round())

print("XGB testing data accuracy: " + str(xgb_score_TEST))

XGB testing data accuracy: 0.6966781630059902


In [243]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

print("RMSE: " + str(rmse))
print("MSE: " + str(mean_squared_error(y_test, y_pred_xgb)))

RMSE: 0.4494758165169884
MSE: 0.20202850963361346


In [245]:
params = {
    "objective": "binary:logistic", # classification, dependent has two levels 0 and 1
    "colsame_bytree": .3,
    "learning_rate": .3,
    "reg_alpha":9,
}

cv_results = xgb.cv(dtrain = data_dMatrix, params=params, nfold = 5, metrics="rmse", num_boost_round = 50, early_stopping_rounds = 5,as_pandas = True)

In [246]:
print(cv_results['test-rmse-mean'].tail(1))

49    0.451171
Name: test-rmse-mean, dtype: float64


In [247]:
xgb_reg2 = xgb.train(params=params, dtrain= data_dMatrix, num_boost_round=10)