In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
%matplotlib notebook

In [4]:
# Read in Data
TicketData = pd.read_csv('Data/ProcessedTicketDataLogs.csv') # See "Examine Variable Distributions & Skews" notebook
TicketData['FV_delta'].describe()

count    1260.000000
mean       46.480698
std        31.878243
min         7.260000
25%        25.785000
50%        38.385000
75%        57.745000
max       209.000000
Name: FV_delta, dtype: float64

In [5]:
# Box plot of markups
TicketData['FV_delta'].plot.box()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x11203bc10>

### Remove outliers

In [6]:
# Remove outliers function
def RemoveOutliersFromDataFrame(df, column_name):
    q1 = df[column_name].quantile(0.25)
    q3 = df[column_name].quantile(0.75)
    IQR = q3 - q1
    df.drop(df[df[column_name] > q3+1.5*IQR].index, inplace = True)
    df.drop(df[df[column_name] < q1-1.5*IQR].index, inplace = True)
    return df

In [7]:
# Remove markup outliers
TicketData = RemoveOutliersFromDataFrame(TicketData, 'FV_delta')
TicketData['FV_delta'].describe()

count    1192.000000
mean       40.869807
std        20.696057
min         7.260000
25%        25.302500
50%        37.120000
75%        51.900000
max       104.900000
Name: FV_delta, dtype: float64

In [8]:
# Box plot of markups with outliers remove
TicketData['FV_delta'].plot.box()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x11203a550>

In [9]:
# Set Desired Prediction Variables
X = TicketData[['face_value', 'sold_out', 'days_to_show', 'num_blogs', 'num_news', 'num_reviews',
                'discovery', 'familiarity', 'hotttnesss', 'num_years_active']]
y = TicketData['FV_delta']
y.describe()

count    1192.000000
mean       40.869807
std        20.696057
min         7.260000
25%        25.302500
50%        37.120000
75%        51.900000
max       104.900000
Name: FV_delta, dtype: float64

### Tune Random Forest model for number of features

In [8]:
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [9]:
# Tune for ideal number of features
Features = range(1,11) # We have 10 features in total.
oob_score_RF = []
MSE_Test_RF = []
for i in Features:
    RF = RandomForestRegressor(n_estimators = 1000, #Number of trees - the more the better!
                           max_features = i,     #How many features to randomly choose in each node 
                           min_samples_leaf = 5, #Minimum number of observations at each terminal node
                           oob_score = True)
    RF.fit(X_train,y_train)
    oob_score_RF.append(RF.oob_score_)
    y_hat_test = RF.predict(X_test)
    MSE_Test  = metrics.mean_squared_error(y_test, y_hat_test)
    MSE_Test_RF.append(MSE_Test)

# Check OOB scores
Depth_Choice_df = pd.DataFrame({'Out of Bag Score': oob_score_RF ,'Number of Features': Features})
Depth_Choice_df.plot(x ='Number of Features',y = 'Out of Bag Score' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1148d7750>

#### Looks like we maxmize OOB score at 2  features.

In [10]:
# Check MSE's
Depth_Choice_df = pd.DataFrame({'MSE': MSE_Test_RF ,'Number of Features': Features})
Depth_Choice_df.plot(x ='Number of Features',y = 'MSE' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x114ce45d0>

#### Looks like we minimize MSE at 8 features.

### Tune for ideal number of trees with max_features set to 8

In [11]:
NumberOfTrees = [1000,2000,5000,10000,15000,20000]
oob_score_RF = []
MSE_Test_RF = []
for i in NumberOfTrees:
    RF = RandomForestRegressor(n_estimators = i, 
                               max_features = 8, 
                               min_samples_leaf = 5, 
                               oob_score = True)
    RF.fit(X_train,y_train)
    oob_score_RF.append(RF.oob_score_)
    y_hat_test = RF.predict(X_test)
    MSE_Test  = metrics.mean_squared_error(y_test, y_hat_test)
    MSE_Test_RF.append(MSE_Test)

Depth_Choice_df = pd.DataFrame({'Out of Bag Score': oob_score_RF ,'Number of Trees': NumberOfTrees})
Depth_Choice_df.plot(x ='Number of Trees',y = 'Out of Bag Score' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x116cabe90>

#### Looks like we maximize OOB score at 5,000 of trees

In [12]:
Depth_Choice_df = pd.DataFrame({'MSE': MSE_Test_RF ,'Number of Trees': NumberOfTrees})
Depth_Choice_df.plot(x ='Number of Trees',y = 'MSE' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x117520110>

#### Looks like we minimize MSE at 5,000 trees.

### Check Feature Importances

In [13]:
# Check feature importances
sorted(zip(RF.feature_importances_,X.columns.values))

[(0.019122638683807328, 'sold_out'),
 (0.07638689615620757, 'num_reviews'),
 (0.088009621941953622, 'familiarity'),
 (0.097252680838548961, 'discovery'),
 (0.098578076025170588, 'num_news'),
 (0.10871390046650277, 'num_blogs'),
 (0.1169508361476027, 'hotttnesss'),
 (0.12461576593001537, 'face_value'),
 (0.13091751663667109, 'days_to_show'),
 (0.13945206717351824, 'num_years_active')]

#### Generate final Random Forest Model with ideal parameters

In [133]:
RF_final = RandomForestRegressor(n_estimators = 5000, 
                                 max_features = 8, 
                                 min_samples_leaf = 5, 
                                 oob_score = True)
RF_final.fit(X_train,y_train)
print("OOB Score is: %s" % RF_final.oob_score_)
y_hat_test = RF_final.predict(X_test)
MSE_Test  = metrics.mean_squared_error(y_test, y_hat_test)
print("MSE is: %s" % MSE_Test)

OOB Score is: 0.112973522368
MSE is: 386.646618235


## Boosting

In [15]:
# Run boosting without tuning
GBR_Tree = GradientBoostingRegressor(learning_rate = 0.01, # This is lambda, a tuning parameter, usually between 0.01 and 0.1
                                     n_estimators = 10000, #This is B, a tuning parameter, using large B can cause overfitting
                                     max_depth = 2, #This is d, another tuning parameter, usually max_depth < 5
                                     min_samples_leaf = 5  )
GBR_Tree.fit(X_train,y_train)
y_hat_test = GBR_Tree.predict(X_test)
MSE_Test  = metrics.mean_squared_error(y_test, y_hat_test)
print("MSE for untuned boosting is: %s" % MSE_Test)

MSE for untuned boosting is: 454.357979026


### Tune our algorithm for number of trees and lambda

In [16]:
Depth = range(1,5)
Scores = []
for i in Depth:
    lambdas = [0.01, 0.03, 0.06, 0.1]
    lambda_scores = []
    for j in lambdas:
        NumberOfTrees = [1000,2000,5000,10000]
        tree_scores = []
        for k in NumberOfTrees:
            GBR_Tree = GradientBoostingRegressor(learning_rate = j,
                                             n_estimators = k,
                                             max_depth = i,
                                             min_samples_leaf = 5)
            GBR_Tree.fit(X_train,y_train)
            y_hat_test = GBR_Tree.predict(X_test)
            MSE_Test = metrics.mean_squared_error(y_test, y_hat_test)
            tree_scores.append(MSE_Test)
        print("Tree scores for depth %s and lambda %s:" % (i, j))
        print tree_scores
        lambda_scores.append(tree_scores)
    Scores.append(lambda_scores)

print("\nDone!")

Tree scores for depth 1 and lambda 0.01:
[420.65348633138569, 421.02822923577367, 427.45406876449744, 438.52781252475029]
Tree scores for depth 1 and lambda 0.03:
[422.64765076352495, 430.57165801458922, 441.48732477270426, 440.17319845463737]
Tree scores for depth 1 and lambda 0.06:
[430.73091124716569, 440.12119134033389, 440.22615727826377, 439.85721882091497]
Tree scores for depth 1 and lambda 0.1:
[438.8168269328707, 440.77735684247705, 439.87421000365077, 441.18967878375173]
Tree scores for depth 2 and lambda 0.01:
[412.25047848572251, 419.47718675071303, 437.1108129659512, 454.19807353319692]
Tree scores for depth 2 and lambda 0.03:
[426.28124138685416, 441.30739106472697, 477.30784266717478, 516.83375081613792]
Tree scores for depth 2 and lambda 0.06:
[442.71573023723414, 464.51067514949801, 516.37108718717388, 559.98027530954096]
Tree scores for depth 2 and lambda 0.1:
[458.8689876238904, 491.66186446512506, 546.96541139392423, 594.95831422473373]
Tree scores for depth 3 and l

#### We get the lowest MSE (~404.305) at depth=4, lambda=0.01, and trees=1000.
#### Let's try to localize around a depth of 4 to further minimize MSE

In [23]:
Scores = []
lambdas = [0.01049, 0.0105, 0.01051]
lambda_scores = []
for i in lambdas:
    NumberOfTrees = [615, 620, 625, 630]
    tree_scores = []
    for j in NumberOfTrees:
        GBR_Tree = GradientBoostingRegressor(learning_rate = i,
                                         n_estimators = j,
                                         max_depth = 4,
                                         min_samples_leaf = 5)
        GBR_Tree.fit(X_train,y_train)
        y_hat_test = GBR_Tree.predict(X_test)
        MSE_Test = metrics.mean_squared_error(y_test, y_hat_test)
        tree_scores.append(MSE_Test)
    print("Tree scores for lambda %s:" % i)
    print tree_scores
    lambda_scores.append(tree_scores)
Scores.append(lambda_scores)

print("\nDone!")

Tree scores for lambda 0.01048:
[409.60584462757453, 409.78998597415409, 409.9600803628378, 409.94442899520971]
Tree scores for lambda 0.01049:
[406.39671958857298, 406.61883644055223, 406.5066534000984, 406.56886285427441]
Tree scores for lambda 0.0105:
[403.55500223636193, 403.18477927423601, 403.69060595044704, 403.57519622866778]
Tree scores for lambda 0.01051:
[405.25116706011823, 405.14882015483943, 405.02366074243326, 404.63643074045586]

Done!


#### I think the best we can do is lambda of 0.0105, max_depth of 4, and # of trees at 620.

In [33]:
# Plot results of Boosting at lambda of 0.0105 and max_depth of 4
Scores_df = pd.DataFrame({'MSE': Scores[0][2],'Number of Trees': NumberOfTrees})
Scores_df.plot(x ='Number of Trees',y = 'MSE' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x117b0da90>

In [24]:
# Create Boosting model
GBR_Tree = GradientBoostingRegressor(learning_rate = 0.0105,
                                 n_estimators = 620,
                                 max_depth = 4,
                                 min_samples_leaf = 5)
GBR_Tree.fit(X_train,y_train)
y_hat_test = GBR_Tree.predict(X_test)
MSE_Test = metrics.mean_squared_error(y_test, y_hat_test)
print MSE_Test

403.15368308


## Not sure why tuned boosting is giving us a worse MSE than Random Forest

## Let's test our Random Forest Model

In [122]:
#instances to test: 200 (Avett Bros - low error!), 190 (Avett Brothers, high error), 850

instance = 190 # Choose a row in the dataframe

X_instance = X.iloc[instance]
print TicketData[['date', 'artist', 'venue', 'city', 'face_value', 'min_price']].iloc[instance]
print "\n"
print("Actual markup is %s" % TicketData['FV_delta'].iloc[instance])
prediction_result = RF_final.predict(X_instance)

error = TicketData['FV_delta'].iloc[instance] - prediction_result
    
print("\nPredicted markup is %s" % prediction_result)
print("Prediction is off by %s" % error)

date          2016-04-23T20:00:00-0500
artist              The Avett Brothers
venue                  Chicago Theatre
city                           Chicago
face_value                          45
min_price                        120.5
Name: 204, dtype: object


Actual markup is 75.5

Predicted markup is [ 37.00667236]
Prediction is off by [ 38.49332764]




In [123]:
# Check Avett Bros ticket data
TicketData[TicketData['artist'] == "The Avett Brothers"][['date', 'artist', 'venue', 'sold_out']].head(10)

Unnamed: 0,date,artist,venue,sold_out
77,2016-06-19T20:00:00-0500,The Avett Brothers,ACL Live at The Moody Theater,1
203,2016-04-22T20:00:00-0500,The Avett Brothers,Chicago Theatre,0
204,2016-04-23T20:00:00-0500,The Avett Brothers,Chicago Theatre,0
214,2016-04-21T19:00:00-0500,The Avett Brothers,Chicago Theatre,0
265,2016-06-18T20:30:00-0500,The Avett Brothers,Gexa Energy Pavilion,0
415,2016-04-29T18:45:00-0700,The Avett Brothers,Greek Theatre Los Angeles,0
644,2016-05-06T20:00:00-0500,The Avett Brothers,Bridgestone Arena,0
771,2016-04-08T20:00:00-0400,The Avett Brothers,Madison Square Garden,0
953,2016-05-14T19:30:00-0400,The Avett Brothers,Mann Center for the Performing Arts,0


In [128]:
# Modify Avett Brows record to have sold_out = 1
Avett_bros_test = TicketData.loc[204]
Avett_bros_test['sold_out'] = 1
print Avett_bros_test

event_id                              9467026
date                 2016-04-23T20:00:00-0500
artist                     The Avett Brothers
venue                         Chicago Theatre
min_price                               120.5
max_price                                 900
total_postings                             71
total_tickets                             201
city                                  Chicago
state                                      IL
ticket_vendor                    Ticketmaster
face_value                                 45
sold_out                                    1
sk_artist_id                           348455
days_to_show                               41
num_blogs                                2884
num_news                                 1242
num_reviews                                72
discovery                            0.369704
familiarity                          0.678902
hotttnesss                           0.594076
num_years_active                  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [129]:
X_avett_bros_test = Avett_bros_test[['face_value', 'sold_out', 'days_to_show', 'num_blogs', 'num_news', 'num_reviews',
                'discovery', 'familiarity', 'hotttnesss', 'num_years_active']]

In [131]:
# Re-run prediction with new input
X_instance = X_avett_bros_test
print TicketData[['date', 'artist', 'venue', 'city', 'face_value', 'min_price']].iloc[instance]
print "\n"
print("Actual markup is %s" % TicketData['FV_delta'].iloc[instance])
prediction_result = RF_final.predict(X_instance)

error = TicketData['FV_delta'].iloc[instance] - prediction_result
    
print("\nPredicted markup is %s" % prediction_result)
print("Prediction is off by %s" % error)

date          2016-04-23T20:00:00-0500
artist              The Avett Brothers
venue                  Chicago Theatre
city                           Chicago
face_value                          45
min_price                        120.5
Name: 204, dtype: object


Actual markup is 75.5

Predicted markup is [ 45.49741609]
Prediction is off by [ 30.00258391]


