In [15]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
# Read in Data
TicketData = pd.read_csv('ProcessedTicketDataLogs.csv') # See "Examine Variable Distributions & Skews" notebook
TicketData.head()

Unnamed: 0,event_id,date,artist,venue,min_price,max_price,total_postings,total_tickets,city,state,...,hotttnesss,num_years_active,FV_delta,maxPrice_FV_delta,FV_delta_log,face_value_log,days_to_show_log,num_blogs_log,num_news_log,num_reviews_log
0,9408915,2016-06-09T19:30:00-0400,Selena Gomez,Philips Arena,85.04,1500,106,359,Atlanta,GA,...,0.862321,8,50.04,1465.0,3.912823,3.555348,4.477337,9.156412,7.697121,1.94591
1,9411812,2016-04-23T20:00:00-0400,Ciara,Center Stage Theatre,55.99,119,14,89,Atlanta,GA,...,0.729409,14,26.99,90.0,3.295466,3.367296,3.713572,9.003193,6.869014,3.951244
2,9425600,2016-06-29T19:00:00-0400,Demi Lovato and Nick Jonas,Philips Arena,46.86,1749,182,780,Atlanta,GA,...,0.835224,14,16.91,1719.05,2.827905,3.399529,4.682131,8.709795,7.482119,2.564949
3,9428103,2016-04-08T20:30:00-0400,They Might Be Giants,Variety Playhouse,60.32,1112,13,73,Atlanta,GA,...,0.619015,34,35.32,1087.0,3.564449,3.218876,3.258097,7.641564,5.442418,5.117994
4,9432150,2016-05-04T19:00:00-0400,Prong,Masquerade Atlanta,48.37,70,4,18,Atlanta,GA,...,0.589147,30,32.37,54.0,3.477232,2.772589,3.951244,7.012115,5.666427,2.639057


In [None]:
TicketData.columns.values

In [19]:
# Set desired variables
X = TicketData[['face_value_log', 'sold_out', 'days_to_show_log', 'num_blogs_log', 'num_news_log', 'num_reviews_log',
                'discovery', 'familiarity', 'hotttnesss', 'num_years_active']]
y = TicketData['FV_delta_log']
y.describe()

count    1260.000000
mean        3.650238
std         0.610114
min         1.982380
25%         3.249793
50%         3.647667
75%         4.056037
max         5.342334
Name: FV_delta_log, dtype: float64

In [None]:
X.head()

### Run Random Forest Regressor

In [11]:
RF = RandomForestRegressor(n_estimators = 1000,
                          max_features = 4,
                          min_samples_leaf = 5,
                          oob_score = True)

RF.fit(X,y)
print("oob_score = %s" % RF.oob_score)
scores = cross_val_score(RF_nolog, X, y, cv=10)
print("cv scores = %s" % scores.mean())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=4, max_leaf_nodes=None, min_samples_leaf=5,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10000, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [12]:
# Check feature importances
sorted(zip(RF.feature_importances_,X.columns.values))

[(0.012630130929927074, 'sold_out'),
 (0.0757319373689678, 'num_reviews_log'),
 (0.086413740799612393, 'discovery'),
 (0.099307499336176483, 'num_news_log'),
 (0.10236455722581117, 'familiarity'),
 (0.10881001923622323, 'num_blogs_log'),
 (0.1125606636327402, 'face_value_log'),
 (0.11377638591247612, 'days_to_show_log'),
 (0.13690023668296541, 'hotttnesss'),
 (0.15150482887510189, 'num_years_active')]

#### Our out of bag score here is ~0.259 which is good given our number of observations (~1,200). It's better than the R^2 we got from linear regression (~0.09)

### Tune for ideal number of features

In [14]:
Features = range(1,10)
oob_score_RF = []
for i in Features:
    RF = RandomForestRegressor(n_estimators = 1000, #Number of trees - the more the better!
                           max_features = i,     #How many features to randomly choose in each node 
                           min_samples_leaf = 5, #Minimum number of observations at each terminal node
                           oob_score = True)
    RF.fit(X,y)  
    oob_score_RF.append(RF.oob_score_)

Depth_Choice_df = pd.DataFrame({'Out of Bag Score': oob_score_RF ,'Number of Features': Features})
Depth_Choice_df.plot(x ='Number of Features',y = 'Out of Bag Score' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1205e8350>

### Non log variables

In [24]:
X_nolog = TicketData[['face_value', 'sold_out', 'days_to_show', 'num_blogs', 'num_news', 'num_reviews',
                'discovery', 'familiarity', 'hotttnesss', 'num_years_active']]
y_nolog = TicketData['FV_delta']
y_nolog.describe()

count    1260.000000
mean       46.480698
std        31.878243
min         7.260000
25%        25.785000
50%        38.385000
75%        57.745000
max       209.000000
Name: FV_delta, dtype: float64

In [67]:
RF_nolog = RandomForestRegressor(n_estimators = 1000, 
                                 max_features = 4, 
                                 min_samples_leaf = 5, 
                                 oob_score = True)
RF_nolog.fit(X_nolog,y_nolog)
print("oob_score = %s" % RF_nolog.oob_score_)
scores = cross_val_score(RF_nolog, X_nolog, y_nolog, cv=10, scoring='mean_squared_error')
mean_score = (np.mean(np.sqrt(-scores)))
print("cv scores = %s" % mean_score)

True
0.200035939332


In [71]:
#print("oob_score = %s" % RF_nolog.oob_score_)
scores = cross_val_score(RF_nolog, X_nolog, y_nolog, cv=10, scoring='mean_squared_error')
print (np.mean(np.sqrt(-scores)))

27.7506927226


### Try classification on sold_out values

In [27]:
X_class = TicketData[['face_value', 'days_to_show', 'num_blogs', 'num_news', 'num_reviews',
                'discovery', 'familiarity', 'hotttnesss', 'num_years_active']]
y_class = TicketData['sold_out']

In [28]:
RFClass = RandomForestClassifier(n_estimators = 1000, 
                                 max_features = 4, 
                                 min_samples_leaf = 5,
                                 oob_score = True)
RFClass.fit(X_class,y_class)
print("oob score = %s" % RFClass.oob_score_)
scores = cross_val_score(RFClass, X_class, y_class, cv=10)
print("cv score = %s" % scores.mean())

0.947619047619
0.948417647794


In [29]:
# Check feature importances
sorted(zip(RF.feature_importances_,X.columns.values))

[(0.012784408424418215, 'sold_out'),
 (0.075970101788573177, 'discovery'),
 (0.077265495383765342, 'num_reviews_log'),
 (0.083608449391464168, 'num_news_log'),
 (0.085203320323721521, 'familiarity'),
 (0.096145046011877802, 'num_blogs_log'),
 (0.11222645778722984, 'face_value_log'),
 (0.13000207079591314, 'days_to_show_log'),
 (0.16067160994001317, 'num_years_active'),
 (0.16612304015302379, 'hotttnesss')]

In [47]:
X_class.head(9)

Unnamed: 0,face_value,days_to_show,num_blogs,num_news,num_reviews,discovery,familiarity,hotttnesss,num_years_active
0,35.0,88,9475,2202,7,0.439948,0.770825,0.862321,8
1,29.0,41,8129,962,52,0.391567,0.749624,0.729409,14
2,29.95,108,6062,1776,13,0.427074,0.769929,0.835224,14
3,25.0,26,2083,231,167,0.368564,0.70152,0.619015,34
4,16.0,52,1110,289,14,0.409728,0.61652,0.589147,30
5,14.0,13,353,10,0,0.447226,0.455172,0.504101,6
6,50.5,30,30702,11724,108,0.376889,0.842991,0.91195,8
7,29.5,10,2292,483,43,0.330652,0.749976,0.61644,46
8,20.0,19,4497,397,27,0.484888,0.61957,0.700804,6


In [58]:
y_class.sum()

62

In [72]:
## Check the confusion matrix
# If we have a nice mixture of type 1 and type 2 error, then the model can be used for interpretation of the coefficients

from sklearn.metrics import confusion_matrix
y_hat = RFClass.predict(X_class)
confusion_matrix(y_class, y_hat)

array([[1198,    0],
       [  54,    8]])

## TODOs 3/30/16:
### -Use validation instead of cross validation since our feature size (~10) is much smaller than the # of observations (1260)
### -Could be interesting to look at interpretability for sold_out (e.g., as X variable increases, what happens to probability of being sold_out