In [31]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split

# Load and Pre-Process the Data

In [32]:
# Location of the founder dataset. Set the FOUNDER_DATA_PATH environment variable to
# point at a different copy; the fallback is the original local path, so existing runs
# behave exactly as before.
DATA_PATH = os.environ.get(
    'FOUNDER_DATA_PATH',
    '/Users/danielmurphy/Desktop/predict-startup-success/data/founder_V0.3_founder.csv',
)
df = pd.read_csv(DATA_PATH)

In [33]:
# Display the column labels of the freshly loaded frame.
df.columns

Index(['Full Name', 'Primary Job Title', 'Bio', 'Gender',
       'Number of News Articles', 'Number of Founded Organizations',
       'Number of Portfolio Companies', 'Number of Investments_x',
       'Number of Partner Investments', 'Number of Lead Investments_x',
       'Number of Exits_x', 'Number of Events_x', 'Categories',
       'Headquarters Location ', 'Operating Status', 'Founded Date',
       'Closed Date', 'Company Type', 'Number of Founders', 'Success',
       'Founder'],
      dtype='object')

In [34]:
# Display the (rows, columns) dimensions of the raw frame.
df.shape

(18361, 21)

In [35]:
# Build the working frame by removing the columns listed below from the raw data.
# `df` itself is left untouched, so this cell can be re-run safely.
filtered_df = df.drop(
    columns=[
        'Full Name',
        'Primary Job Title',
        'Bio',
        'Number of Portfolio Companies',
        'Number of Investments_x',
        'Number of Partner Investments',
        'Number of Lead Investments_x',
        'Number of Events_x',
        'Founded Date',
        'Closed Date',
        'Company Type',
        'Founder',
    ]
)


In [36]:
# Show the frame that remains after the column drop (pandas truncates the middle rows).
print(filtered_df)

       Gender  Number of News Articles  Number of Founded Organizations  \
0           1                    14445                                7   
1           1                      100                                2   
2           1                    19678                               10   
3           1                      128                                1   
4           1                      861                                1   
...       ...                      ...                              ...   
18356       1                        0                                1   
18357       1                        0                                1   
18358       1                        0                                1   
18359       1                        0                                1   
18360       1                        0                                2   

       Number of Exits_x                                         Categories  \
0                   

In [37]:
# One-hot encode each row's first listed category: 'Categories' is split on commas and
# only the leading piece (column 0 of the expanded split) is kept before encoding.
# NOTE(review): the printed result contains a dummy column named '0', which suggests a
# placeholder value in 'Categories' rather than a real category — confirm.
categories_dummy = pd.get_dummies(
    df['Categories'].str.split(',', expand=True)[0]
)
print(categories_dummy)

       0  3D Printing  3D Technology  A/B Testing  Accounting  Ad Network  \
0      0            0              0            0           0           0   
1      0            0              0            0           0           0   
2      0            0              0            0           0           0   
3      0            0              0            0           0           0   
4      0            0              0            0           0           0   
...   ..          ...            ...          ...         ...         ...   
18356  0            0              0            0           0           0   
18357  0            0              0            0           0           0   
18358  0            0              0            0           0           0   
18359  0            0              0            0           0           0   
18360  0            0              0            0           0           0   

       Ad Retargeting  Ad Targeting  Advanced Materials  Adventure Travel  

In [38]:
# Append the category indicator columns to the right of the working frame (rows are
# aligned on the shared index).
# Running this cell twice appends the same columns again, because `filtered_df` is
# rebound to its own extended copy.
filtered_df = pd.concat(
    [filtered_df, categories_dummy],
    axis='columns',
)
print(filtered_df)

       Gender  Number of News Articles  Number of Founded Organizations  \
0           1                    14445                                7   
1           1                      100                                2   
2           1                    19678                               10   
3           1                      128                                1   
4           1                      861                                1   
...       ...                      ...                              ...   
18356       1                        0                                1   
18357       1                        0                                1   
18358       1                        0                                1   
18359       1                        0                                1   
18360       1                        0                                2   

       Number of Exits_x                                         Categories  \
0                   

In [39]:
# One-hot encode the headquarters location. The trailing space in the column label
# 'Headquarters Location ' is part of the name in the source data.
# NOTE(review): the printed headers look like the location values carry a leading
# space — confirm before matching on these column names.
hq_dummy = pd.get_dummies(df['Headquarters Location '])
print(hq_dummy)

        Africa   Alabama   Albania   Argentina   Arizona   Arkansas   Armenia  \
0            0         0         0           0         0          0         0   
1            0         0         0           0         0          0         0   
2            0         0         0           0         0          0         0   
3            0         0         0           0         0          0         0   
4            0         0         0           0         0          0         0   
...        ...       ...       ...         ...       ...        ...       ...   
18356        0         0         0           0         0          0         0   
18357        0         0         0           0         0          0         0   
18358        0         0         0           0         0          0         0   
18359        0         0         0           0         0          0         0   
18360        0         0         0           0         0          0         0   

        Asia   Australia   

In [40]:
# Append the headquarters indicator columns to the right of the working frame (rows
# are aligned on the shared index).
# Running this cell twice appends the same columns again, because `filtered_df` is
# rebound to its own extended copy.
filtered_df = pd.concat(
    [filtered_df, hq_dummy],
    axis='columns',
)
print(filtered_df)

       Gender  Number of News Articles  Number of Founded Organizations  \
0           1                    14445                                7   
1           1                      100                                2   
2           1                    19678                               10   
3           1                      128                                1   
4           1                      861                                1   
...       ...                      ...                              ...   
18356       1                        0                                1   
18357       1                        0                                1   
18358       1                        0                                1   
18359       1                        0                                1   
18360       1                        0                                2   

       Number of Exits_x                                         Categories  \
0                   

In [43]:
# Remove the two raw text columns from the working frame.
# Re-running this cell on its own raises KeyError, since the labels are gone after
# the first run.
filtered_df = filtered_df.drop(
    columns=['Headquarters Location ', 'Categories'],
)
print(filtered_df)

       Gender  Number of News Articles  Number of Founded Organizations  \
0           1                    14445                                7   
1           1                      100                                2   
2           1                    19678                               10   
3           1                      128                                1   
4           1                      861                                1   
...       ...                      ...                              ...   
18356       1                        0                                1   
18357       1                        0                                1   
18358       1                        0                                1   
18359       1                        0                                1   
18360       1                        0                                2   

       Number of Exits_x  Operating Status  Number of Founders  Success  0  \
0                    

# Inspecting the Dataset

In [59]:
# Separate the target ('Success') from the feature matrix and report both shapes.
# X stays a DataFrame so column labels are available later; y is a plain NumPy array.
# NOTE(review): 'Operating Status' is still among the features; if 'Success' was
# derived from it, this leaks the target into the model — confirm.
X = filtered_df.drop(columns='Success')
y = filtered_df['Success'].values
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (18361, 732)
Shape of y: (18361,)


# Train & Test Split

In [60]:
# Hold out 30% of the rows for testing; the fixed seed keeps the partition identical
# on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Random Forest

### n_estimators = 500

In [61]:
# Fit a 500-tree random forest and report accuracy on the train and test splits.
# random_state is pinned (same seed as the train/test split) so the fitted forest and
# its scores are identical on every re-run; without it each execution draws different
# bootstrap samples and feature subsets.
rand_forest = RandomForestClassifier(n_estimators=500, random_state=42)
rand_forest.fit(x_train, y_train)

# Hard class predictions for the held-out rows.
y_pred_rf = rand_forest.predict(x_test)

# score() is mean accuracy in [0, 1]; express it as a percentage with two decimals.
rf_train_score = round(rand_forest.score(x_train, y_train) * 100, 2)
rf_test_score = round(rand_forest.score(x_test, y_test) * 100, 2)

print("Train Score : " + str(rf_train_score))
print("Test Score : " + str(rf_test_score))

Train Score : 96.79
Test Score : 72.34


### n_estimators = 1000

In [50]:
# Fit a 1000-tree random forest and report accuracy on the train and test splits.
# random_state is pinned (same seed as the train/test split) so the fitted forest and
# its scores are identical on every re-run; without it each execution draws different
# bootstrap samples and feature subsets.
rand_forest = RandomForestClassifier(n_estimators=1000, random_state=42)
rand_forest.fit(x_train, y_train)

# Hard class predictions for the held-out rows.
y_pred_rf = rand_forest.predict(x_test)

# score() is mean accuracy in [0, 1]; express it as a percentage with two decimals.
rf_train_score = round(rand_forest.score(x_train, y_train) * 100, 2)
rf_test_score = round(rand_forest.score(x_test, y_test) * 100, 2)

print("Train Score : " + str(rf_train_score))
print("Test Score : " + str(rf_test_score))

Train Score : 96.79
Test Score : 72.32


### n_estimators = 2000

In [51]:
# Fit a 2000-tree random forest and report accuracy on the train and test splits.
# random_state is pinned (same seed as the train/test split) so the fitted forest, its
# scores and any feature importances read from it are identical on every re-run;
# without it each execution draws different bootstrap samples and feature subsets.
rand_forest = RandomForestClassifier(n_estimators=2000, random_state=42)
rand_forest.fit(x_train, y_train)

# Hard class predictions for the held-out rows.
y_pred_rf = rand_forest.predict(x_test)

# score() is mean accuracy in [0, 1]; express it as a percentage with two decimals.
rf_train_score = round(rand_forest.score(x_train, y_train) * 100, 2)
rf_test_score = round(rand_forest.score(x_test, y_test) * 100, 2)

print("Train Score : " + str(rf_train_score))
print("Test Score : " + str(rf_test_score))

Train Score : 96.79
Test Score : 72.41


In [66]:
# Rank the features by the importance the forest assigned to them and show the top ten.
# `rand_forest` is whichever forest was fitted most recently, since every training cell
# rebinds the same name.
X = pd.DataFrame(X)
rand_forest_feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rand_forest.feature_importances_}
)
rand_forest_feature_importance = rand_forest_feature_importance.sort_values(
    by='importance',
    ascending=False,
)

rand_forest_feature_importance.head(10)

Unnamed: 0,feature,importance
1,Number of News Articles,0.174112
5,Number of Founders,0.087164
2,Number of Founded Organizations,0.035062
607,California,0.018907
675,New York,0.013277
0,Gender,0.012476
612,China,0.009464
507,Software,0.009361
3,Number of Exits_x,0.008397
356,Mobile,0.007979


In [62]:
# Print the feature matrix. X is already a DataFrame, so it is printed directly.
print(X)

       Gender  Number of News Articles  Number of Founded Organizations  \
0           1                    14445                                7   
1           1                      100                                2   
2           1                    19678                               10   
3           1                      128                                1   
4           1                      861                                1   
...       ...                      ...                              ...   
18356       1                        0                                1   
18357       1                        0                                1   
18358       1                        0                                1   
18359       1                        0                                1   
18360       1                        0                                2   

       Number of Exits_x  Operating Status  Number of Founders  0  \
0                      6      