In [16]:
import math
import matplotlib.pyplot as plt
%matplotlib notebook
import numpy as np
import pandas as pd
import random
import statsmodels.formula.api as smf

from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import feature_selection
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing # for scaling the data
import statsmodels.formula.api as smf

In [2]:
# Load the pre-processed (log-transformed) ticket data from the working directory
TicketData = pd.read_csv(filepath_or_buffer='ProcessedTicketDataLogs.csv')

# Use Lasso Regression to find the most important variables

In [3]:
# Columns used in the Lasso analysis: ten candidate predictors followed by
# the target (FV_delta_log) in the last position.
lasso_columns = [
    'face_value_log',
    'sold_out',
    'days_to_show_log',
    'num_blogs_log',
    'num_news_log',
    'num_reviews_log',
    'discovery',
    'familiarity',
    'hotttnesss',
    'num_years_active',
    'FV_delta_log',
]
Variable_df = TicketData[lasso_columns]
Variable_df.head()

Unnamed: 0,face_value_log,sold_out,days_to_show_log,num_blogs_log,num_news_log,num_reviews_log,discovery,familiarity,hotttnesss,num_years_active,FV_delta_log
0,3.555348,0,4.477337,9.156412,7.697121,1.94591,0.439948,0.770825,0.862321,8,3.912823
1,3.367296,0,3.713572,9.003193,6.869014,3.951244,0.391567,0.749624,0.729409,14,3.295466
2,3.399529,0,4.682131,8.709795,7.482119,2.564949,0.427074,0.769929,0.835224,14,2.827905
3,3.218876,0,3.258097,7.641564,5.442418,5.117994,0.368564,0.70152,0.619015,34,3.564449
4,2.772589,0,3.951244,7.012115,5.666427,2.639057,0.409728,0.61652,0.589147,30,3.477232


### Scale the data.

In [4]:
# Standardise every column of Variable_df. preprocessing.scale returns a
# plain numpy array, so wrap it back into a DataFrame that carries the
# original Variable_df column names.
Variable_df_scale = pd.DataFrame(
    preprocessing.scale(Variable_df),
    columns=Variable_df.columns.values,
)
Variable_df_scale.head()

Unnamed: 0,face_value_log,sold_out,days_to_show_log,num_blogs_log,num_news_log,num_reviews_log,discovery,familiarity,hotttnesss,num_years_active,FV_delta_log
0,0.372121,-0.227493,0.846768,1.375961,1.31266,-0.614585,0.239389,1.256185,2.319036,-0.827995,0.430556
1,-0.066195,-0.227493,0.076353,1.273687,0.843052,0.616312,-0.360665,1.084041,1.047406,-0.407772,-0.581715
2,0.008935,-0.227493,1.053345,1.077844,1.190735,-0.234611,0.079721,1.24891,2.059786,-0.407772,-1.34837
3,-0.412136,-0.227493,-0.383088,0.364802,0.03405,1.332477,-0.645959,0.693453,-0.008784,0.992971,-0.140667
4,-1.452351,-0.227493,0.316094,-0.055355,0.161083,-0.189123,-0.135414,0.003283,-0.294544,0.712822,-0.283676


### Run Lasso Regression

In [5]:
# Target: the scaled FV_delta_log column. Predictors: every other scaled column.
y_lasso = Variable_df_scale['FV_delta_log']
X_lasso = Variable_df_scale.drop('FV_delta_log', axis=1)

In [6]:
# Create training and test sets (33% of the rows are held out for testing).
# random_state is fixed so that the split -- and therefore the MSE-vs-alpha
# curve used below to pick alpha -- is identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X_lasso, y_lasso, test_size=0.33, random_state=42)

In [7]:
# Sweep the Lasso penalty over 10^-10 ... 10^10, one value per integer
# exponent, and record the mean squared error on the held-out test set.
alphas_index = np.linspace(-10, 10, 21)   # exponents, i.e. log10(alpha)
alphas = np.power(10.0, alphas_index)     # the alphas themselves

Scores = []
for a in alphas:
    lm = linear_model.Lasso(alpha=a)
    lm.fit(X_train, y_train)
    test_predictions = lm.predict(X_test)
    Scores.append(metrics.mean_squared_error(y_test, test_predictions))

# Plot test MSE against log10(alpha)
MSE_Lasso_CV_df = pd.DataFrame({'MSE_Lasso': Scores, 'Log_alphas': alphas_index})
MSE_Lasso_CV_df.plot(x='Log_alphas', y='MSE_Lasso')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1117d2a90>

#### From the above graph, it looks like the ideal alpha is 10^-3 (i.e. log10(alpha) = -3)

### Use Lasso Regression to find most significant variables

In [8]:
# Refit the Lasso on the full scaled data set with the alpha chosen above
# (10^-3), then list (coefficient, feature) pairs in ascending coefficient order.
lm = linear_model.Lasso(alpha=10 ** -3).fit(X_lasso, y_lasso)
coefficient_feature_pairs = zip(lm.coef_, X_lasso.columns)
sorted(coefficient_feature_pairs)

[(-0.33055393604368116, 'familiarity'),
 (-0.30831655082782616, 'discovery'),
 (-0.061658698486772807, 'num_blogs_log'),
 (-0.0010283055307501879, 'num_reviews_log'),
 (0.022689061491567599, 'num_news_log'),
 (0.057162538340669429, 'face_value_log'),
 (0.06264697551684871, 'days_to_show_log'),
 (0.12417420023515652, 'sold_out'),
 (0.22574460169029595, 'num_years_active'),
 (0.35455412807878184, 'hotttnesss')]

## Check variable correlations

In [9]:
# Pairwise Pearson correlations between all scaled variables (target included)
Variable_df_scale.corr(method='pearson')

Unnamed: 0,face_value_log,sold_out,days_to_show_log,num_blogs_log,num_news_log,num_reviews_log,discovery,familiarity,hotttnesss,num_years_active,FV_delta_log
face_value_log,1.0,-0.118131,0.133465,0.306221,0.355113,0.289395,-0.289627,0.495419,0.297793,0.43066,0.16907
sold_out,-0.118131,1.0,-0.046508,-0.010099,-0.055228,-0.008542,0.068325,-0.069765,-0.01053,-0.106128,0.089191
days_to_show_log,0.133465,-0.046508,1.0,0.068224,0.099083,0.066102,-0.049983,0.123117,0.102635,0.112628,0.100003
num_blogs_log,0.306221,-0.010099,0.068224,1.0,0.883017,0.572103,-0.42199,0.779726,0.480484,0.119415,0.046993
num_news_log,0.355113,-0.055228,0.099083,0.883017,1.0,0.638935,-0.484872,0.776408,0.415826,0.21348,0.076709
num_reviews_log,0.289395,-0.008542,0.066102,0.572103,0.638935,1.0,-0.585214,0.651261,0.143206,0.374543,0.097296
discovery,-0.289627,0.068325,-0.049983,-0.42199,-0.484872,-0.585214,1.0,-0.588727,0.324259,-0.567568,-0.123464
familiarity,0.495419,-0.069765,0.123117,0.779726,0.776408,0.651261,-0.588727,1.0,0.511999,0.414604,0.121318
hotttnesss,0.297793,-0.01053,0.102635,0.480484,0.415826,0.143206,0.324259,0.511999,1.0,-0.146628,0.055042
num_years_active,0.43066,-0.106128,0.112628,0.119415,0.21348,0.374543,-0.567568,0.414604,-0.146628,1.0,0.228289


<h4>hotttnesss is the most significant variable (it has the largest Lasso coefficient), and the only variable it is not correlated with is sold_out, so we could use those 2 variables together <br>
num_years_active is the variable most correlated with FV_delta_log, but it is also correlated with everything else, so we should probably drop it <br>
sold_out is only weakly correlated with days_to_show_log, familiarity, and discovery. However, familiarity is correlated with both discovery and days_to_show_log, so we should drop it <br>
</h4>

<h3>Best variables to use: <br>
sold_out, days_to_show_log, and discovery OR <br>
hotttnesss and sold_out</h3>

## Linear Regression for hotttnesss and sold_out

In [11]:
def get_linear_model_metrics(X, y, algo):
    """Fit ``algo`` on ``(X, y)``, print summary metrics and plot the residuals.

    Parameters
    ----------
    X : feature table passed unchanged to ``f_regression``, ``algo.fit``,
        ``algo.predict`` and ``algo.score``.
    y : target values, aligned with the rows of ``X``.
    algo : estimator object providing ``fit``, ``predict`` and ``score``, and
        exposing ``coef_`` and ``intercept_`` once fitted (for example
        ``LinearRegression()``).

    Returns
    -------
    The same ``algo`` object, now fitted on ``(X, y)``.

    Notes
    -----
    The printed "P Values" come from ``feature_selection.f_regression``, which
    tests each column of ``X`` against ``y`` on its own. They are therefore
    univariate p-values, not the p-values of the coefficients in the joint
    fit. R-squared is computed on the same data the model was fitted on.
    """
    # Univariate F-test p-value of each column of X against y
    pvals = feature_selection.f_regression(X, y)[1]
    algo.fit(X, y)
    residuals = y - algo.predict(X)

    # Print the summary values. str.format is used instead of the print
    # statement so the function runs on both Python 2 and Python 3.
    print('P Values: {}'.format(pvals))
    print('Coefficients: {}'.format(algo.coef_))
    print('y-intercept: {}'.format(algo.intercept_))
    print('R-Squared: {}'.format(algo.score(X, y)))

    # Histogram of the residuals in a fresh figure
    plt.figure()
    plt.hist(residuals)
    plt.title('Residuals of the fitted model')
    plt.xlabel('Residual (observed - predicted)')
    plt.ylabel('Count')

    # Hand the fitted model back to the caller
    return algo

In [14]:
# Predictors and target for this model, taken from the unscaled data
predictor_columns = ['hotttnesss', 'sold_out']
X = TicketData[predictor_columns]
y = TicketData['FV_delta_log']

In [15]:
# Fit an ordinary least-squares model on (X, y); the helper prints the
# metrics, draws the residual histogram and returns the fitted estimator.
linreg = LinearRegression()
lm = get_linear_model_metrics(X=X, y=y, algo=linreg)

P Values: [ 0.05077904  0.00152898]
Coefficients: [ 0.32668256  0.25314252]
y-intercept: 3.43526089114
R-Squared: 0.0110892052272


<IPython.core.display.Javascript object>

In [17]:
# Refit the same model with statsmodels' R-style formula API to get the
# coefficient p-values from the joint fit.
# The columns are named explicitly in the formula: the previous 'y ~ X' form
# picked up whichever X and y variables happened to be defined in the kernel,
# so its result depended on the order in which the cells had been run.
lm1 = smf.ols(formula='FV_delta_log ~ hotttnesss + sold_out',
              data=TicketData).fit()
print(lm1.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     7.048
Date:                Sun, 03 Apr 2016   Prob (F-statistic):           0.000904
Time:                        17:49:50   Log-Likelihood:                -1157.8
No. Observations:                1260   AIC:                             2322.
Df Residuals:                    1257   BIC:                             2337.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      3.4353      0.103     33.349      0.0

<h4>Our R^2 is low (~0.01), so the linear model doesn't capture much of the variability of our data. However, the p values of our coefficients (0.046, 0.001) are all decently low, so our result is significant. 
</h4>

<h3>We can interpret the above coefficients as the following: <br>
<ul>
<li>All other variables constant, for every increase of 0.1 in the hotttnesss metric, the ticket markup on StubHub increases by ~3.3%.</li>
<li>All other variables constant, if a show sells out, the ticket markup on StubHub increases by ~25%.</li>
</ul>
</h3>

### Additional analysis: Linear Regression for sold_out, days_to_show_log, and discovery

In [12]:
# Predictors and target for the alternative model, taken from the unscaled data
predictor_columns = ['sold_out', 'days_to_show_log', 'discovery']
X = TicketData[predictor_columns]
y = TicketData['FV_delta_log']

In [13]:
# Fit an ordinary least-squares model on (X, y); the helper prints the
# metrics, draws the residual histogram and returns the fitted estimator.
linreg = LinearRegression()
lm = get_linear_model_metrics(X=X, y=y, algo=linreg)

P Values: [  1.52897892e-03   3.77759430e-04   1.10758581e-05]
Coefficients: [ 0.28857962  0.06058843 -0.94954951]
y-intercept: 3.81504962213
R-Squared: 0.03447674575


<IPython.core.display.Javascript object>

<h4>Our R^2 is low (~0.03), so the linear model doesn't capture much of the variability of our data. However, the p values of our coefficients (1.52897892e-03, 3.77759430e-04, 1.10758581e-05) are all very low, so our result is significant. 
</h4>

<h3>We can interpret the above coefficients as the following: <br>
<ul>
<li>All other variables constant, if a show sells out, the ticket markup on StubHub increases by ~29%.</li>
<li>All other variables constant, for every 1% increase in the number of days to the show, the ticket markup on StubHub increases by ~0.06% (both variables are log-transformed, so the coefficient of ~0.06 is an elasticity).</li>
<li>All other variables constant, for every increase of 0.01 in EchoNest's discovery value, the ticket markup on StubHub decreases by ~0.95%. (possible explanation of discovery here: http://blog.echonest.com/). EchoNest says discovery is: "a measure of how unexpectedly popular the artist is."</li>
</ul>
</h3>

In [None]:
# Refit the same model with statsmodels' R-style formula API to get the
# coefficient p-values from the joint fit.
# The columns are named explicitly in the formula: the previous 'y ~ X' form
# picked up whichever X and y variables happened to be defined in the kernel,
# so its result depended on the order in which the cells had been run.
lm1 = smf.ols(formula='FV_delta_log ~ sold_out + days_to_show_log + discovery',
              data=TicketData).fit()
print(lm1.summary())