In [1]:
import pandas as pd
import math
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
# Load the processed ticket data set
# (see the "Examine Variable Distributions & Skews" notebook for how the columns were examined)
ticket_csv = 'ProcessedTicketDataLogs.csv'
TicketData = pd.read_csv(ticket_csv)

In [3]:
# Choose the predictor columns and the target column used by every model below
feature_columns = [
    'face_value', 'sold_out', 'days_to_show_log', 'num_blogs_log', 'num_news_log',
    'num_reviews_log', 'discovery', 'familiarity', 'hotttnesss', 'num_years_active',
]
X = TicketData[feature_columns]
y = TicketData['FV_delta_log']
# Summary statistics of the target (shown as the cell output)
y.describe()

count    1260.000000
mean        3.650238
std         0.610114
min         1.982380
25%         3.249793
50%         3.647667
75%         4.056037
max         5.342334
Name: FV_delta_log, dtype: float64

### Tune Random Forest model for number of features

In [65]:
# Create training and test sets (67% train / 33% test).
# A fixed random_state makes the partition reproducible: re-running this cell
# no longer silently changes the data that the tuned models below were fitted
# and scored on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
# Tune for ideal number of features
# Try every possible max_features value, from 1 up to the number of columns in X,
# so the sweep stays correct if the predictor list changes.
Features = list(range(1, X.shape[1] + 1))
oob_score_RF = []   # out-of-bag score for each max_features value
MSE_Test_RF = []    # test-set MSE for each max_features value
for i in Features:
    RF = RandomForestRegressor(n_estimators = 1000,   # Number of trees - the more the better!
                               max_features = i,      # How many features to randomly choose in each node
                               min_samples_leaf = 5,  # Minimum number of observations at each terminal node
                               oob_score = True)      # Needed so that RF.oob_score_ is available after fit
    RF.fit(X_train,y_train)
    oob_score_RF.append(RF.oob_score_)
    y_hat_test = RF.predict(X_test)
    MSE_Test  = metrics.mean_squared_error(y_test, y_hat_test)
    MSE_Test_RF.append(MSE_Test)

# Check OOB scores
Depth_Choice_df = pd.DataFrame({'Out of Bag Score': oob_score_RF ,'Number of Features': Features})
Depth_Choice_df.plot(x ='Number of Features',y = 'Out of Bag Score' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x10e742810>

#### Looks like we maximize OOB score at 3 features.

In [6]:
# Plot the test-set MSE recorded for each max_features value tried above
mse_by_features = {'MSE': MSE_Test_RF, 'Number of Features': Features}
Depth_Choice_df = pd.DataFrame(mse_by_features)
Depth_Choice_df.plot(x='Number of Features', y='MSE')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1108c7e10>

#### Looks like we minimize MSE at 8 features.

### Tune for ideal number of trees with max_features set to 3

In [7]:
# Candidate forest sizes; max_features is held at 3 and min_samples_leaf at 5.
NumberOfTrees = [1000, 2000, 5000, 10000, 15000]
oob_score_RF, MSE_Test_RF = [], []
for n_trees in NumberOfTrees:
    RF = RandomForestRegressor(n_estimators=n_trees,
                               max_features=3,
                               min_samples_leaf=5,
                               oob_score=True)
    RF.fit(X_train, y_train)
    # Score the fitted forest on the held-out set
    y_hat_test = RF.predict(X_test)
    MSE_Test = metrics.mean_squared_error(y_test, y_hat_test)
    # Record both metrics for this forest size
    oob_score_RF.append(RF.oob_score_)
    MSE_Test_RF.append(MSE_Test)

# Out-of-bag score against forest size
oob_by_trees = {'Out of Bag Score': oob_score_RF, 'Number of Trees': NumberOfTrees}
Depth_Choice_df = pd.DataFrame(oob_by_trees)
Depth_Choice_df.plot(x='Number of Trees', y='Out of Bag Score')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x11799e410>

#### Looks like we maximize OOB score at around 5,000 trees.

In [8]:
# Plot the test-set MSE recorded for each forest size tried above
mse_by_trees = {'MSE': MSE_Test_RF, 'Number of Trees': NumberOfTrees}
Depth_Choice_df = pd.DataFrame(mse_by_trees)
Depth_Choice_df.plot(x='Number of Trees', y='MSE')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x117785c50>

#### Looks like we minimize MSE at around 10,000 trees.

### Check Feature Importances

In [9]:
# Pair each importance with its column name and list the pairs in ascending order.
# RF is the forest left over from the tuning loop that ran last.
importance_pairs = list(zip(RF.feature_importances_, X.columns.values))
importance_pairs.sort()
importance_pairs

[(0.01759317220397277, 'sold_out'),
 (0.067421645149920426, 'num_reviews_log'),
 (0.083239651512945073, 'discovery'),
 (0.097645869034532193, 'familiarity'),
 (0.11027874655610719, 'num_news_log'),
 (0.11346697609278358, 'days_to_show_log'),
 (0.11363251207459972, 'num_blogs_log'),
 (0.11821640405415777, 'face_value'),
 (0.1259803270842329, 'hotttnesss'),
 (0.15252469623674789, 'num_years_active')]

#### Generate final Random Forest Model with ideal parameters

In [10]:
# Fit the forest with the settings picked during tuning, then report its
# out-of-bag score and its MSE on the held-out test set.
final_rf_params = dict(n_estimators=10000,
                       max_features=3,
                       min_samples_leaf=5,
                       oob_score=True)
RF_final = RandomForestRegressor(**final_rf_params)
RF_final.fit(X_train, y_train)
print("OOB Score is: %s" % RF_final.oob_score_)

y_hat_test = RF_final.predict(X_test)
MSE_Test = metrics.mean_squared_error(y_test, y_hat_test)
print("MSE is: %s" % MSE_Test)

OOB Score is: 0.233869584
MSE is: 0.309592783755


## Boosting

In [11]:
# Baseline: gradient boosting with hand-picked (untuned) settings
untuned_params = {
    'learning_rate': 0.01,    # lambda, a tuning parameter, usually between 0.01 and 0.1
    'n_estimators': 10000,    # B, a tuning parameter; using a large B can cause overfitting
    'max_depth': 2,           # d, another tuning parameter, usually max_depth < 5
    'min_samples_leaf': 5,
}
GBR_Tree = GradientBoostingRegressor(**untuned_params)
GBR_Tree.fit(X_train, y_train)

# Score the baseline on the held-out set
y_hat_test = GBR_Tree.predict(X_test)
MSE_Test = metrics.mean_squared_error(y_test, y_hat_test)
print("MSE for untuned boosting is: %s" % MSE_Test)

MSE for untuned boosting is: 0.343617137215


### Tune our algorithm for tree depth, number of trees, and lambda

In [12]:
# Grid search over tree depth, learning rate (lambda) and number of trees.
# Scores[d][l][t] holds the test-set MSE for Depth[d], lambdas[l], NumberOfTrees[t].
# The candidate lists are defined once up front instead of being rebuilt on
# every loop iteration.
Depth = range(1,5)
lambdas = [0.01, 0.03, 0.06, 0.1]
NumberOfTrees = [1000,2000,5000,10000]
Scores = []
for i in Depth:
    lambda_scores = []
    for j in lambdas:
        tree_scores = []
        for k in NumberOfTrees:
            GBR_Tree = GradientBoostingRegressor(learning_rate = j,
                                                 n_estimators = k,
                                                 max_depth = i,
                                                 min_samples_leaf = 5)
            GBR_Tree.fit(X_train,y_train)
            y_hat_test = GBR_Tree.predict(X_test)
            MSE_Test = metrics.mean_squared_error(y_test, y_hat_test)
            tree_scores.append(MSE_Test)
        print("Tree scores for depth %s and lambda %s:" % (i, j))
        # Function-call form of print: same output as the old statement form,
        # consistent with the other print calls, and valid on Python 3.
        print(tree_scores)
        lambda_scores.append(tree_scores)
    Scores.append(lambda_scores)

print("\nDone!")

Tree scores for depth 1 and lambda 0.01:
[0.33623938431043493, 0.33586521098890232, 0.33070489788032326, 0.3303267532114843]
Tree scores for depth 1 and lambda 0.03:
[0.33350422947405473, 0.33025922863585805, 0.33034935211131228, 0.33314307281735422]
Tree scores for depth 1 and lambda 0.06:
[0.33003352739116304, 0.33057635822863196, 0.33305662527812474, 0.33959283658182565]
Tree scores for depth 1 and lambda 0.1:
[0.33070486642434394, 0.33007126351903643, 0.33831520374040613, 0.34592644885404655]
Tree scores for depth 2 and lambda 0.01:
[0.32633929140590756, 0.32394666118145704, 0.33181055546219318, 0.34361713721546255]
Tree scores for depth 2 and lambda 0.03:
[0.32673837057355165, 0.33484021247051771, 0.34857610684165274, 0.37289778965566334]
Tree scores for depth 2 and lambda 0.06:
[0.3343005386367302, 0.3437989681558416, 0.37553879237718613, 0.412467880585765]
Tree scores for depth 2 and lambda 0.1:
[0.33836131023709481, 0.35660778155936979, 0.40112577859956033, 0.44024860514287339]

#### We get the lowest MSE at depth=3, lambda=0.01, and trees=1000.
#### Let's try to localize around a depth of 3 to further minimize MSE

In [15]:
# Finer search around the best grid point, with max_depth fixed at 3.
# lambda_scores[l][t] holds the test-set MSE for lambdas[l], NumberOfTrees[t].
Scores = []
lambdas = [0.01, 0.01001, 0.01002]
NumberOfTrees = [1025, 1050, 1075]  # defined once instead of on every iteration
lambda_scores = []
for i in lambdas:
    tree_scores = []
    for j in NumberOfTrees:
        GBR_Tree = GradientBoostingRegressor(learning_rate = i,
                                             n_estimators = j,
                                             max_depth = 3,
                                             min_samples_leaf = 5)
        GBR_Tree.fit(X_train,y_train)
        y_hat_test = GBR_Tree.predict(X_test)
        MSE_Test = metrics.mean_squared_error(y_test, y_hat_test)
        tree_scores.append(MSE_Test)
    print("Tree scores for lambda %s:" % i)
    # Function-call form of print: same output as the old statement form,
    # consistent with the other print calls, and valid on Python 3.
    print(tree_scores)
    lambda_scores.append(tree_scores)
Scores.append(lambda_scores)

print("\nDone!")

Tree scores for lambda 0.01:
[0.31105952723309194, 0.31112807391373221, 0.31127447006317205]
Tree scores for lambda 0.01001:
[0.31153522848580734, 0.31122142702615613, 0.3110241691979449]
Tree scores for lambda 0.01002:
[0.31411273799315464, 0.31428113287838205, 0.31409061194480725]

Done!


#### I think the best we can do is lambda of 0.01, max_depth of 3, and # of trees at 1,025.

In [66]:
# Create Boosting model with the settings chosen during tuning
GBR_Tree = GradientBoostingRegressor(learning_rate = 0.01,
                                     n_estimators = 1025,
                                     max_depth = 3,
                                     min_samples_leaf = 5)
GBR_Tree.fit(X_train,y_train)
y_hat_test = GBR_Tree.predict(X_test)
# Arguments in (y_true, y_pred) order, matching every other MSE call in this notebook
MSE_Test = metrics.mean_squared_error(y_test, y_hat_test)
# Function-call form of print works on both Python 2 and Python 3
print(MSE_Test)

0.350218090356


## Test out the Boosting Model

In [69]:
# Spot-check the boosting model on a single row.
# NOTE(review): the row is taken from X (the full data set), so it may belong to
# the training split rather than the held-out split - confirm before treating
# this as an out-of-sample check.
instance = 137
# Double brackets keep the row as a one-row DataFrame; predict() needs 2-D input
# and a bare X.iloc[instance] would be a 1-D Series.
X_test1 = X.iloc[[instance]]
print(TicketData[['FV_delta', 'FV_delta_log']].iloc[instance])
y_test1 = GBR_Tree.predict(X_test1)
# predict() returns a length-1 array; take the scalar before exponentiating
# to bring the log-scale prediction back to the FV_delta scale.
print("Prediction is: %s" % math.exp(y_test1[0]))

FV_delta        11.550000
FV_delta_log     2.446685
Name: 137, dtype: float64
Prediction is: 21.5359303438


