In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
# Read in Data
TicketData = pd.read_csv('ProcessedTicketDataLogs.csv') # See "Examine Variable Distributions & Skews" notebook

In [3]:
print TicketData['FV_delta'].describe()

count    1260.000000
mean       46.480698
std        31.878243
min         7.260000
25%        25.785000
50%        38.385000
75%        57.745000
max       209.000000
Name: FV_delta, dtype: float64


In [4]:
# Remove outliers function
def RemoveOutliersFromDataFrame(df, column_name):
    q1 = df[column_name].quantile(0.25)
    q3 = df[column_name].quantile(0.75)
    IQR = q3 - q1
    df.drop(df[df[column_name] > q3+1.5*IQR].index, inplace = True)
    df.drop(df[df[column_name] < q1-1.5*IQR].index, inplace = True)
    return df

In [5]:
# Remove markup outliers
TicketData = RemoveOutliersFromDataFrame(TicketData, 'FV_delta')
TicketData['FV_delta'].describe()

count    1192.000000
mean       40.869807
std        20.696057
min         7.260000
25%        25.302500
50%        37.120000
75%        51.900000
max       104.900000
Name: FV_delta, dtype: float64

In [6]:
TicketData['FV_delta'].plot.box()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1125f3cd0>

### Create Markup Buckets
#### Bucket 1: 0-25
#### Bucket 2: 25-37
#### Bucket 3: 37-52
#### Bucket 4: >52

In [16]:
TicketData['FV_delta_bucket'] = 4

mask_1 = (TicketData['FV_delta'] <= 25)
mask_2 = ((TicketData['FV_delta'] > 25) & 
            (TicketData['FV_delta'] <= 37))  
mask_3 = ((TicketData['FV_delta'] > 37) & 
            (TicketData['FV_delta'] <= 52))

TicketData.loc[mask_1,'FV_delta_bucket'] = 1
TicketData.loc[mask_2,'FV_delta_bucket'] = 2
TicketData.loc[mask_3,'FV_delta_bucket'] = 3

TicketData[['FV_delta','FV_delta_bucket']].head(10)

Unnamed: 0,FV_delta,FV_delta_bucket
0,50.04,3
1,26.99,2
2,16.91,1
3,35.32,2
4,32.37,2
5,15.62,1
7,13.51,1
8,22.58,1
10,44.3,3
11,43.44,3


In [17]:
TicketData['FV_delta_bucket'].value_counts()

3    303
2    299
4    297
1    293
Name: FV_delta_bucket, dtype: int64

In [18]:
# Set Desired Prediction Variables
X = TicketData[['face_value', 'sold_out', 'days_to_show', 'num_blogs', 'num_news', 'num_reviews',
                'discovery', 'familiarity', 'hotttnesss', 'num_years_active']]
y = TicketData['FV_delta_bucket']

### Tune Random Forest model for number of features

In [19]:
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [20]:
# Function for getting the % that we predicted correctly as compared to the actual values 
# Inputs should both be series'
def GetCorrectPct(prediction, actual):
    count = 0
    for i, x in enumerate(actual):
        if prediction[i] == x:
            count = count + 1
    correct_pct = float(count)/len(prediction)
    return correct_pct

In [21]:
# Tune for ideal number of features
Features = range(1,11) # We have 10 features in total.
oob_score_RF = []
Scores = []
for i in Features:
    RF = RandomForestClassifier(n_estimators = 1000, #Number of trees - the more the better!
                           max_features = i,     #How many features to randomly choose in each node 
                           min_samples_leaf = 5, #Minimum number of observations at each terminal node
                           oob_score = True)
    RF.fit(X_train,y_train)
    oob_score_RF.append(RF.oob_score_)
    y_hat_test = RF.predict(X_test)
    correct_pct = GetCorrectPct(y_hat_test, y_test)
    Scores.append(correct_pct)

# Check OOB scores
Depth_Choice_df = pd.DataFrame({'Out of Bag Score': oob_score_RF ,'Number of Features': Features})
Depth_Choice_df.plot(x ='Number of Features',y = 'Out of Bag Score' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x11265e410>

#### Looks like we maxmize OOB score at 7 features.

In [22]:
# Check % correct
Depth_Choice_df = pd.DataFrame({'Scores': Scores ,'Number of Features': Features})
Depth_Choice_df.plot(x ='Number of Features',y = 'Scores' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x115739850>

In [24]:
# Check to see which buckets have highest errors
def GetCorrectBuckets(prediction, actual):
    correct_buckets = []
    for i, x in enumerate(actual):
        if prediction[i] == x:
            correct_buckets.append(x)
    return correct_buckets

y_hat_test = RF.predict(X_test)
correct_buckets = GetCorrectBuckets(y_hat_test, y_test)
print correct_buckets.count(1)
print correct_buckets.count(2)
print correct_buckets.count(3)
print correct_buckets.count(4)
print "Total number of observations in each bucket: "
print np.bincount(y_test)

43
27
37
36
Total number of observations in each bucket: 
[  0 105  93  92 104]


### Tune for ideal number of trees with max_features set to 4

In [25]:
NumberOfTrees = [1000,2000,5000,10000,15000]
oob_score_RF = []
Scores = []
for i in NumberOfTrees:
    RF = RandomForestClassifier(n_estimators = i, 
                               max_features = 4, 
                               min_samples_leaf = 5, 
                               oob_score = True)
    RF.fit(X_train,y_train)
    oob_score_RF.append(RF.oob_score_)
    y_hat_test = RF.predict(X_test)
    correct_pct = GetCorrectPct(y_hat_test, y_test)
    Scores.append(correct_pct)

Depth_Choice_df = pd.DataFrame({'Out of Bag Score': oob_score_RF ,'Number of Trees': NumberOfTrees})
Depth_Choice_df.plot(x ='Number of Trees',y = 'Out of Bag Score' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1180df0d0>

#### Looks like we maximize OOB score at either 2,000 or 10,000 trees

In [26]:
Depth_Choice_df = pd.DataFrame({'Score': Scores ,'Number of Trees': NumberOfTrees})
Depth_Choice_df.plot(x ='Number of Trees',y = 'Score' )

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x115c31d50>

#### Looks like we maximize the correct percentage at around 10,000 trees
### Let's use 10,000 trees.

### Check Feature Importances

In [27]:
# Check feature importances
sorted(zip(RF.feature_importances_,X.columns.values))

[(0.0040644606068993647, 'sold_out'),
 (0.09045094151019939, 'num_reviews'),
 (0.09492208511071773, 'discovery'),
 (0.10199329204394217, 'num_news'),
 (0.1057769306615686, 'num_years_active'),
 (0.1078924401911653, 'familiarity'),
 (0.11481216440592178, 'num_blogs'),
 (0.11594624672589547, 'hotttnesss'),
 (0.13050175155128135, 'face_value'),
 (0.13363968719240946, 'days_to_show')]

#### Generate final Random Forest Model with ideal parameters

In [28]:
RF_final = RandomForestClassifier(n_estimators = 10000, 
                                 max_features = 4, 
                                 min_samples_leaf = 5, 
                                 oob_score = True)
RF_final.fit(X_train,y_train)
print("OOB Score is: %s" % RF_final.oob_score_)
y_hat_test = RF_final.predict(X_test)
correct_pct = GetCorrectPct(y_hat_test, y_test)
print("Correct percent is: %s" % correct_pct)

OOB Score is: 0.34335839599
Correct percent is: 0.383248730964


## Boosting

In [29]:
# Run boosting without tuning
GBR_Tree = GradientBoostingClassifier(learning_rate = 0.01, # This is lambda, a tuning parameter, usually between 0.01 and 0.1
                                     n_estimators = 10000, #This is B, a tuning parameter, using large B can cause overfitting
                                     max_depth = 2, #This is d, another tuning parameter, usually max_depth < 5
                                     min_samples_leaf = 5  )
GBR_Tree.fit(X_train,y_train)
y_hat_test = GBR_Tree.predict(X_test)
correct_pct = GetCorrectPct(y_hat_test, y_test)
print("Correct pct for untuned boosting is: %s" % correct_pct)

Correct pct for untuned boosting is: 0.383248730964


### Tune our algorithm for number of trees and lambda

In [30]:
Depth = range(1,5)
Scores = []
for i in Depth:
    lambdas = [0.01, 0.03, 0.06, 0.1]
    lambda_scores = []
    for j in lambdas:
        NumberOfTrees = [1000,2000,5000,10000]
        tree_scores = []
        for k in NumberOfTrees:
            GBR_Tree = GradientBoostingClassifier(learning_rate = j,
                                             n_estimators = k,
                                             max_depth = i,
                                             min_samples_leaf = 5)
            GBR_Tree.fit(X_train,y_train)
            y_hat_test = GBR_Tree.predict(X_test)
            correct_pct = GetCorrectPct(y_hat_test, y_test)
            tree_scores.append(correct_pct)
        print("Tree scores for depth %s and lambda %s:" % (i, j))
        print tree_scores
        lambda_scores.append(tree_scores)
    Scores.append(lambda_scores)

print("\nDone!")

Tree scores for depth 1 and lambda 0.01:
[0.3096446700507614, 0.3248730964467005, 0.3604060913705584, 0.38071065989847713]
Tree scores for depth 1 and lambda 0.03:
[0.33756345177664976, 0.36548223350253806, 0.36548223350253806, 0.3629441624365482]
Tree scores for depth 1 and lambda 0.06:
[0.36548223350253806, 0.3730964467005076, 0.3604060913705584, 0.350253807106599]
Tree scores for depth 1 and lambda 0.1:
[0.38071065989847713, 0.3553299492385787, 0.350253807106599, 0.3680203045685279]
Tree scores for depth 2 and lambda 0.01:
[0.39086294416243655, 0.3883248730964467, 0.4010152284263959, 0.383248730964467]
Tree scores for depth 2 and lambda 0.03:
[0.383248730964467, 0.3883248730964467, 0.39086294416243655, 0.383248730964467]
Tree scores for depth 2 and lambda 0.06:
[0.3883248730964467, 0.39086294416243655, 0.37817258883248733, 0.39847715736040606]
Tree scores for depth 2 and lambda 0.1:
[0.38578680203045684, 0.3934010152284264, 0.39847715736040606, 0.39086294416243655]
Tree scores for d

#### We get the best correct % around a depth 4, lambda 0.06, and # of trees around 5000-10000:
#### Let's try to localize around a depth of 4 to further minimize MSE

In [33]:
Scores = []
lambdas = [ 0.048, 0.049, 0.05, 0.051]
lambda_scores = []
for i in lambdas:
    NumberOfTrees = [3900, 4000, 4100]
    tree_scores = []
    for j in NumberOfTrees:
        GBR_Tree = GradientBoostingClassifier(learning_rate = i,
                                         n_estimators = j,
                                         max_depth = 4,
                                         min_samples_leaf = 5)
        GBR_Tree.fit(X_train,y_train)
        y_hat_test = GBR_Tree.predict(X_test)
        correct_pct = GetCorrectPct(y_hat_test, y_test)
        tree_scores.append(correct_pct)
    print("Tree scores for lambda %s:" % i)
    print tree_scores
    lambda_scores.append(tree_scores)
Scores.append(lambda_scores)

print("\nDone!")

Tree scores for lambda 0.048:
[0.4035532994923858, 0.4035532994923858, 0.4035532994923858]
Tree scores for lambda 0.049:
[0.40609137055837563, 0.4086294416243655, 0.40609137055837563]
Tree scores for lambda 0.05:
[0.41624365482233505, 0.41624365482233505, 0.41624365482233505]
Tree scores for lambda 0.051:
[0.40609137055837563, 0.4086294416243655, 0.4010152284263959]

Done!


#### I think the best we can do is lambda of 0.05, max_depth of 4, and # of trees at about 4000.

In [34]:
# Create Boosting model
GBR_Tree = GradientBoostingClassifier(learning_rate = 0.05,
                                 n_estimators = 4000,
                                 max_depth = 4,
                                 min_samples_leaf = 5)
GBR_Tree.fit(X_train,y_train)
y_hat_test = GBR_Tree.predict(X_test)
correct_pct = GetCorrectPct(y_hat_test, y_test)
print correct_pct

0.416243654822


## Let's test the model!

In [None]:
instance = 920 # Choose a row in the dataframe

X_instance = X.iloc[instance]
print TicketData[['date', 'artist', 'venue', 'city']].iloc[instance]
print "\n"
print TicketData[['FV_delta', 'FV_delta_bucket']].iloc[instance]
prediction_result = GBR_Tree.predict(X_instance)

if prediction_result == TicketData['FV_delta_bucket'].iloc[instance]:
    right_or_wrong = "correct!"
else:
    right_or_wrong = "wrong!"

if prediction_result == 3:
    result = "more than $50"
elif prediction_result == 2:
    result = "between $25 and $50"
elif prediction_result == 1:
    result = "less than $25"
    
print("\nPredicted bucket is %s or %s" % (prediction_result, result))
print("Prediction is %s" % right_or_wrong)