In [115]:
# must go first
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Reloads functions each time so you can edit a script 
# and not need to restart the kernel
%load_ext autoreload
%autoreload 2

# plotting
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_context("poster", font_scale=1.3)

import sys
import os
import datetime

sns.set()
sns.set_context('poster', font_scale=1.3)
sns.set_style("white")

import warnings
warnings.filterwarnings('ignore')

# basic wrangling
import pandas as pd
import numpy as np
import yaml
import json
import re

# eda tools
import missingno as msno
import collections
import itertools
from tqdm import tqdm
import nltk

# model building
from sklearn import feature_selection
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import Pipeline

# pandas options
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 75)
pd.set_option('display.width', 1000)

# Update matplotlib defaults to something nicer
mpl_update = {
    'font.size': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'figure.figsize': [12.0, 8.0],
    'axes.labelsize': 20,
    'axes.labelcolor': '#677385',
    'axes.titlesize': 20,
    'lines.color': '#0055A7',
    'lines.linewidth': 3,
    'text.color': '#677385',
    'font.family': 'sans-serif',
    'font.sans-serif': 'Tahoma'
}
mpl.rcParams.update(mpl_update)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [116]:
# Create helper functions for specifying paths and appending
# directories with relevant python source code.
# This is a lot at the top of your notebook but if you get the jupyter
# extension for collapsing headings, you can always have this and the
# imports collapsed

root_dir = os.curdir
max_nest = 10  # arbitrary, 3 would probably suffice
nest = 0
while "src" not in os.listdir(root_dir) and nest < max_nest:
    # Look up the directory structure for a src directory
    root_dir = os.path.join(os.pardir, root_dir)
    nest += 1
    
# If you don't find the src directory, the root directory is this directory
root_dir = os.path.abspath(root_dir) if nest < max_nest else os.path.abspath(
    os.curdir)

# Add the root directory to be able to import from src, etc
sys.path.append(root_dir)

# Get the source directory and append path to access
# python packages/scripts within directory
if "src" in os.listdir(root_dir):
    src_dir = os.path.join(root_dir, "src")

# If data or figures directory don't exist in project directory,
# they will be saved to this directory
data_dir = os.path.join(
    root_dir, "data") if "data" in os.listdir(root_dir) else os.curdir
external_data_dir = os.path.join(
    data_dir, "external") if "external" in os.listdir(data_dir) else os.curdir
figure_dir = os.path.join(
    root_dir,
    "figures") if "figures" in os.listdir(root_dir) else os.curdir
models_dir = os.path.join(
    root_dir,
    "models") if "models" in os.listdir(root_dir) else os.curdir

# Prepends the directory path for specifying paths to data or figures
# dataplus("data.csv") -> "/Users/cmawer/project/data/data.csv"
# figplus("cool.png") -> "/Users/cmawer/project/figures/cool.png"
dataplus = lambda x: os.path.join(data_dir, x)
dataextplus = lambda x: os.path.join(external_data_dir, x)
figplus = lambda x: os.path.join(figure_dir, x)
modelsplus = lambda x: os.path.join(models_dir, x)

# Prepends the date to a string (e.g. to save dated files)
# dateplus("cool-figure.png") -> "2018-12-05-cool-figure.png"
now = datetime.datetime.now().strftime("%Y-%m-%d")
dateplus = lambda x: "%s-%s" % (now, x)

In [117]:
selected_features = ['funding_rounds',
 'founded_month',
 'founded_quarter',
 'founded_year',
 'country_esp',
 'country_ind',
 'country_other',
 'country_usa',
 'days_to_fund',
 'months_to_fund',
 'days_between_rounds',
 'months_between_rounds',
 'funding_round_type_debt_financing',
 'funding_round_type_post_ipo_debt',
 'funding_round_type_post_ipo_equity',
 'funding_round_type_private_equity',
 'funding_round_type_venture',
 'unique_investors',
 'median_investor_value',
 'no_acquisitions',
 'no_ipos',
 'market_biotechnology',
 'market_clean technology',
 'market_enterprise software',
 'market_finance',
 'market_health and wellness',
 'market_hospitality',
 'market_internet',
 'market_mobile',
 'market_other']

In [118]:
df = pd.read_csv('../data/auxiliary/aggregated_data.csv')

In [143]:
X = df[selected_features]
y = np.log(df['raised_amount_usd_mean']+1)

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [146]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=200, random_state=0)  
regressor.fit(X_train, y_train)  
y_pred = regressor.predict(X_test)  

In [147]:
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))  

Mean Absolute Error: 2.9283272297945597
Mean Squared Error: 19.777838261760547
Root Mean Squared Error: 4.447228154902843


In [148]:
metrics.r2_score(y_test, y_pred)

0.32313067749947755

In [149]:
list(zip(y_test, y_pred))

[(13.623139877437296, 14.669270026440572),
 (0.0, 17.93177468191317),
 (17.188582433059054, 13.96957462536487),
 (14.914123179965662, 8.892927243039598),
 (12.899222326086994, 11.122064831196228),
 (12.429220196836383, 14.219790867180125),
 (15.127216331453326, 16.065132805956434),
 (15.424948670398354, 13.550985768699507),
 (13.563021225467923, 10.534196679562328),
 (16.03335453134676, 16.10396462571131),
 (17.19250325547961, 9.050941934228005),
 (0.0, 3.085592520178251),
 (13.910821646859095, 14.26992616539661),
 (15.65247721933023, 16.609795328710113),
 (14.811821556234522, 13.464963592925514),
 (0.0, 9.509499168747762),
 (12.509368436980559, 10.852963809302912),
 (15.607270193858982, 13.979732733338423),
 (14.309611195634162, 14.851337451072077),
 (14.038654909278163, 14.33314334160587),
 (0.0, 10.9136985628765),
 (16.88820396524698, 14.07057635556127),
 (11.952659945988438, 10.924600695817201),
 (0.0, 7.089309766794393),
 (13.122365377402328, 14.441802433825906),
 (14.752975655609

In [173]:
df2 = df[df['raised_amount_usd_mean'] > 0]
X = df2[selected_features]
y = np.log(df2['raised_amount_usd_mean'])
X.head()

Unnamed: 0,funding_rounds,founded_month,founded_quarter,founded_year,country_esp,country_ind,country_other,country_usa,days_to_fund,months_to_fund,days_between_rounds,months_between_rounds,funding_round_type_debt_financing,funding_round_type_post_ipo_debt,funding_round_type_post_ipo_equity,funding_round_type_private_equity,funding_round_type_venture,unique_investors,median_investor_value,no_acquisitions,no_ipos,market_biotechnology,market_clean technology,market_enterprise software,market_finance,market_health and wellness,market_hospitality,market_internet,market_mobile,market_other
0,1,6.0,2.0,2012.0,0,0,0,1,29.0,0.952792,29.0,0.952792,0.0,0.0,0.0,0.0,0.0,6.0,9.0,0.0,146.0,0,0,0,0,0,0,0,0,0
1,2,12.0,4.0,2008.0,0,0,0,1,541.0,17.774492,55.5,1.823446,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,50.0,0,0,0,0,0,0,0,0,0
2,1,10.0,4.0,2012.0,0,0,1,0,-78.0,-2.562681,-78.0,-2.562681,0.0,0.0,0.0,0.0,0.0,1.0,8.0,0.0,146.0,0,0,0,0,0,0,0,0,1
3,1,4.0,2.0,2011.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,143.0,0,0,0,0,0,0,0,0,1
4,2,1.0,1.0,2014.0,0,0,0,1,228.0,7.490914,20.0,0.657098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,261.0,0,0,0,0,0,0,0,0,1


In [181]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

regressor = RandomForestRegressor(n_estimators = 100, oob_score = True, n_jobs = -1,random_state =50, max_features = "auto", min_samples_leaf = 50)  
regressor.fit(X_train, y_train)  
y_pred = regressor.predict(X_test)  

In [182]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))  
print('R2: ', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 1.0197772765419382
Mean Squared Error: 1.85138324993821
Root Mean Squared Error: 1.3606554486489995


In [187]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   44.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 648 out of 648 | elapsed: 12.7min finished


{'bootstrap': True,
 'max_depth': 110,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 300}

In [189]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))  
    print('R2: ', metrics.r2_score(y_test, y_pred))

In [190]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

Mean Absolute Error: 1.0197772765419382
Mean Squared Error: 1.85138324993821
Root Mean Squared Error: 1.3606554486489995
R2:  0.5779953306072414


In [192]:
X_test

Unnamed: 0,funding_rounds,founded_month,founded_quarter,founded_year,country_esp,country_ind,country_other,country_usa,days_to_fund,months_to_fund,days_between_rounds,months_between_rounds,funding_round_type_debt_financing,funding_round_type_post_ipo_debt,funding_round_type_post_ipo_equity,funding_round_type_private_equity,funding_round_type_venture,unique_investors,median_investor_value,no_acquisitions,no_ipos,market_biotechnology,market_clean technology,market_enterprise software,market_finance,market_health and wellness,market_hospitality,market_internet,market_mobile,market_other
25194,1,12.0,4.0,2002.0,0,0,0,1,4261.0,139.994661,4261.000000,139.994661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0,0,0,0,0,0,0,0,0,1
7580,1,1.0,1.0,2009.0,0,0,0,0,1803.0,59.237356,1803.000000,59.237356,0.0,0.0,0.0,0.0,0.0,4.0,7.5,0.0,63.0,1,0,0,0,0,0,0,0,0
31687,5,1.0,1.0,1993.0,0,0,0,1,4527.0,148.734060,274.800000,9.028522,1.0,0.0,0.0,0.0,4.0,5.0,9.0,1.0,0.0,0,0,0,0,0,0,0,0,1
4814,1,1.0,1.0,2003.0,0,0,0,0,4083.0,134.146492,4083.000000,134.146492,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,84.0,1,0,0,0,0,0,0,0,0
20835,1,3.0,1.0,2012.0,0,0,1,0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,146.0,0,0,0,0,0,0,0,0,0
39758,3,8.0,3.0,2007.0,0,1,0,0,1910.0,62.752829,173.333333,5.694847,0.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,279.0,0,0,0,0,0,0,0,0,0
18005,1,5.0,2.0,2012.0,0,0,1,0,153.0,5.026797,153.000000,5.026797,0.0,0.0,0.0,0.0,0.0,1.0,8.0,0.0,146.0,0,0,0,0,0,0,0,0,1
16927,3,1.0,1.0,2005.0,0,0,0,1,789.0,25.922504,375.000000,12.320582,1.0,0.0,0.0,0.0,2.0,2.0,9.0,0.0,236.0,0,0,0,0,0,0,0,0,0
34112,3,2.0,1.0,2013.0,0,0,0,1,21.0,0.689953,112.666667,3.701650,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,230.0,0,0,0,0,0,0,0,0,1
36017,1,5.0,2.0,2013.0,0,0,1,0,235.0,7.720898,235.000000,7.720898,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,230.0,0,0,0,0,0,0,0,0,0
