In [2]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib notebook

from sklearn import cross_validation
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Load the ticket data set from CSV; later cells select both the raw and
# the `_log` columns from this frame.
TicketData = pd.read_csv(filepath_or_buffer='ProcessedTicketDataLogs.csv')

In [4]:
# Predictor matrix: the ten candidate columns (log-scaled versions where available)
X = TicketData.loc[:, [
    'face_value_log',
    'sold_out',
    'days_to_show_log',
    'num_blogs_log',
    'num_news_log',
    'num_reviews_log',
    'discovery',
    'familiarity',
    'hotttnesss',
    'num_years_active',
]]

# Target column: FV_delta_log
y = TicketData.loc[:, 'FV_delta_log']

# Summary statistics of the target
y.describe()

count    1260.000000
mean        3.650238
std         0.610114
min         1.982380
25%         3.249793
50%         3.647667
75%         4.056037
max         5.342334
Name: FV_delta_log, dtype: float64

In [5]:
# Create training and test sets, holding out 33% of the rows for testing.
# The split is random, so the seed is pinned: without it the depth search and
# feature importances computed from X_train/X_test change on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
# Find ideal depth for Decision tree
# For every candidate depth, fit on the training split and record the error
# on the held-out test split.
Depth = range(1,50)
Scores = []
for i in Depth:
    treereg = DecisionTreeRegressor(max_depth=i)
    treereg.fit(X_train, y_train)
    y_hat_test = treereg.predict(X_test)
    MSE_Test = metrics.mean_squared_error(y_test, y_hat_test)
    # What is stored is the square root of the MSE, i.e. the RMSE
    Scores.append(np.sqrt(MSE_Test))

# The values are RMSE, so the column and the plotted series are named RMSE
# (they were previously labeled 'MSE', which misdescribed the curve).
DepthChoice_df = pd.DataFrame({'RMSE':Scores, 'Depth': Depth})
DepthChoice_df.plot(x='Depth', y='RMSE')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x10bde5f50>

#### From the above graph, the depth which minimizes the test-set RMSE (the plotted values are the square root of the MSE) appears to be 2

In [8]:
# Fit a depth-2 tree on the training split and rank the features by importance
treereg = DecisionTreeRegressor(max_depth=2).fit(X_train, y_train)
ImportanceDataFrame = pd.DataFrame({
    'feature': X.columns.values,
    'importance': treereg.feature_importances_,
})
# Displayed sorted, largest importance first (the stored frame stays unsorted)
ImportanceDataFrame.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
8,hotttnesss,0.437088
9,num_years_active,0.425077
0,face_value_log,0.137836
1,sold_out,0.0
2,days_to_show_log,0.0
3,num_blogs_log,0.0
4,num_news_log,0.0
5,num_reviews_log,0.0
6,discovery,0.0
7,familiarity,0.0


#### Looks like hotttnesss, num_years_active, and face_value_log are the most important features for predicting markup.

## Look at Correlation

In [10]:
# Pairwise Pearson correlation between the predictor columns
X.corr(method='pearson')

Unnamed: 0,face_value_log,sold_out,days_to_show_log,num_blogs_log,num_news_log,num_reviews_log,discovery,familiarity,hotttnesss,num_years_active
face_value_log,1.0,-0.118131,0.133465,0.306221,0.355113,0.289395,-0.289627,0.495419,0.297793,0.43066
sold_out,-0.118131,1.0,-0.046508,-0.010099,-0.055228,-0.008542,0.068325,-0.069765,-0.01053,-0.106128
days_to_show_log,0.133465,-0.046508,1.0,0.068224,0.099083,0.066102,-0.049983,0.123117,0.102635,0.112628
num_blogs_log,0.306221,-0.010099,0.068224,1.0,0.883017,0.572103,-0.42199,0.779726,0.480484,0.119415
num_news_log,0.355113,-0.055228,0.099083,0.883017,1.0,0.638935,-0.484872,0.776408,0.415826,0.21348
num_reviews_log,0.289395,-0.008542,0.066102,0.572103,0.638935,1.0,-0.585214,0.651261,0.143206,0.374543
discovery,-0.289627,0.068325,-0.049983,-0.42199,-0.484872,-0.585214,1.0,-0.588727,0.324259,-0.567568
familiarity,0.495419,-0.069765,0.123117,0.779726,0.776408,0.651261,-0.588727,1.0,0.511999,0.414604
hotttnesss,0.297793,-0.01053,0.102635,0.480484,0.415826,0.143206,0.324259,0.511999,1.0,-0.146628
num_years_active,0.43066,-0.106128,0.112628,0.119415,0.21348,0.374543,-0.567568,0.414604,-0.146628,1.0


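hotttnesss is moderately correlated with several other predictors — familiarity (0.51), num_blogs_log (0.48), and num_news_log (0.42) — and more weakly with face_value_log (0.30) and num_years_active (-0.15), so we should probably not include it in the model.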

In [22]:
# Rebuild X without hotttnesss.
# Note: this selection also uses the raw columns (face_value, days_to_show,
# num_blogs, num_news, num_reviews) instead of the `_log` versions used above.
X = TicketData[[
    'face_value',
    'sold_out',
    'days_to_show',
    'num_blogs',
    'num_news',
    'num_reviews',
    'discovery',
    'familiarity',
    'num_years_active',
]]

In [23]:
# Depth search repeated with 10-fold cross-validation over all rows of X and y
Depth = range(1, 50)
score = []
for i in Depth:
    treereg = DecisionTreeRegressor(max_depth=i)
    scores = cross_val_score(estimator=treereg, X=X, y=y, scoring="mean_squared_error", cv=10)
    # Per-fold scores are negated before the square root, then averaged across folds
    score.append(np.sqrt(-scores).mean())

# One row per depth, plotted as cross-validated error against depth
DepthChoice_df = pd.DataFrame({'cv_scores': score, 'Depth': Depth})
DepthChoice_df.plot(x='Depth', y='cv_scores')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x10c664350>

Ideal depth appears to be 3

In [24]:
# Fit a depth-3 tree on all rows and rank the features by importance
treereg = DecisionTreeRegressor(max_depth=3).fit(X, y)
ImportanceDataFrame = pd.DataFrame({
    'feature': X.columns.values,
    'importance': treereg.feature_importances_,
})
# Displayed sorted, largest importance first (the stored frame stays unsorted)
ImportanceDataFrame.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
8,num_years_active,0.460742
4,num_news,0.366901
2,days_to_show,0.061944
0,face_value,0.058725
6,discovery,0.051688
1,sold_out,0.0
3,num_blogs,0.0
5,num_reviews,0.0
7,familiarity,0.0


## Create Decision Tree Regression Model

In [25]:
# Keep only the two features ranked highest by the depth-3 tree above
X = TicketData.loc[:, ['num_news', 'num_years_active']]

In [34]:
# Manual 10-fold cross-validation of the depth-3 tree on the selected features.
# The folds are shuffled, so the shuffle is seeded; without a seed the fold
# assignment, and therefore the printed error, differs on every run.
kf = cross_validation.KFold(len(TicketData), n_folds=10, shuffle=True, random_state=42)
scores = []
for train_index, test_index in kf:
    # Fit on this fold's training rows, score on its held-out rows
    treereg = DecisionTreeRegressor(max_depth=3)
    treereg.fit(X.iloc[train_index], y.iloc[train_index])
    scores.append(metrics.mean_squared_error(y.iloc[test_index], treereg.predict(X.iloc[test_index])))

# Mean of the per-fold MSE values (MSE, not RMSE — no square root is taken here)
print(np.mean(scores))

0.334123812239


In [35]:
# Cross-validated RMSE of the depth-3 tree using 10-fold cross_val_score.
treereg = DecisionTreeRegressor(max_depth=3)
scores = cross_val_score(treereg, X, y, cv=10, scoring='mean_squared_error')
# The scores are negated before the square root, which implies this scorer
# reports MSE with its sign flipped; the per-fold RMSEs are then averaged.
# print is written in call form so the cell runs under both Python 2 and 3,
# matching the print(...) used in the previous cell.
print(np.mean(np.sqrt(-scores)))

0.576667146193
