# Running Models w/ SciKit Learn

## Load libraries and dataframes

In [1]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import sklearn.metrics

In [None]:
#Pickle in the log transformed ticket listing count
data_log = pd.read_pickle("./data_log")

In [2]:
#Pickle in the main dataframe with all features + target
c_df = pd.read_pickle('./c_df_dummies.pkl')
c_df = c_df.reset_index(drop=True)

In [None]:
# Drop un-logged data
c_df = c_df.drop(columns=['average_price', 'highest_price', 'lowest_price', 'ticket_listing_count'])

In [None]:
# Add main frame containing dummies with logged continuous values
c_df = pd.concat([c_df, data_log], axis=1)
c_df.head()

In [3]:
c_df.head()

Unnamed: 0,average_price,highest_price,lowest_price,ticket_listing_count,venue_score,performer_genre_alternative,performer_genre_blues,performer_genre_country,performer_genre_electronic,performer_genre_hip-hop,...,event_start_21,event_start_22,event_start_23,venue_capacity_XL_venue,venue_capacity_large_venue,venue_capacity_medium_,venue_capacity_small,venue_region_long island,venue_region_new york city,venue_region_upstate
0,337.0,584.0,253.0,4.0,0.431162,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,337.0,584.0,253.0,4.0,0.455722,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
2,22.0,24.0,20.0,3.0,0.636922,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,45.0,80.0,30.0,4.0,0.501232,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,92.0,120.0,71.0,10.0,0.681791,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


## Without lowest and highest price

In [4]:
X = c_df.drop(columns=['average_price', 'highest_price', 'lowest_price'])
y = pd.DataFrame(c_df, columns=['average_price'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size = 0.2)

print(len(X_train), len(X_test), len(y_train), len(y_test))

885 222 885 222


In [6]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

print('Intercept: \n', linreg.intercept_)
print('Coefficients: \n', linreg.coef_)

y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)


train_residuals = y_hat_train - y_train
test_residuals = y_hat_test - y_test

Intercept: 
 [-18.24614153]
Coefficients: 
 [[ 3.14384792e-02  3.91218957e+02 -4.13258379e+01  1.00159204e+01
   5.64483477e+00 -1.68901479e+01  2.00933674e+01  6.23879234e+00
   4.78367294e+00  2.95955055e+01 -1.81561075e+01  4.40437873e+00
   1.31720671e+01 -1.10617326e+01  2.64949717e+01  1.31580166e+01
   2.21834657e+01  5.91227449e+01 -9.98194594e+00 -5.00431524e+01
   0.00000000e+00 -6.74488138e+01 -7.74598068e+00 -2.74700041e+00
  -3.85068676e+00 -1.28151268e+01 -2.21019923e+01  1.62379663e+01
   3.30228207e+01  1.12095080e+01 -1.12095080e+01  1.20792265e-13
   1.05779428e+02 -6.77677265e+00 -6.61738529e+01 -2.16192945e+01
   2.50247117e+01 -7.08187841e-01 -3.38317785e+00 -5.71064543e+01
   3.90998745e+01 -1.41362742e+01  7.25581860e+00 -3.91137721e+01
  -2.55492525e+01  5.74072060e+01 -6.88188160e+00 -2.05939910e+00
   8.94128070e+00]]


In [7]:
mse_train = np.sum((y_train - y_hat_train)**2)/len(y_train)
mse_test = np.sum((y_test - y_hat_test)**2) / len(y_test)
rmse_train=np.sqrt(mse_train)
rmse_test=np.sqrt(mse_test)

print('Train Mean Squarred Error: \n', mse_train)
print('Test Mean Squarred Error: \n', mse_test)
print('Prediction: \n', linreg.predict(X_test))

from sklearn.metrics import mean_squared_error

train_mse = mean_squared_error(y_train, y_hat_train)
test_mse = mean_squared_error(y_test, y_hat_test)
print('Train Mean Squarred Error:', train_mse)
print('Test Mean Squarred Error:', test_mse)

Train Mean Squarred Error: 
 average_price    12837.367967
dtype: float64
Test Mean Squarred Error: 
 average_price    14691.985182
dtype: float64
Prediction: 
 [[108.79525298]
 [151.04827254]
 [177.02877026]
 [242.32590655]
 [130.8215992 ]
 [325.89226326]
 [165.68830412]
 [208.3425132 ]
 [156.10895067]
 [281.76685605]
 [262.00137528]
 [188.11407552]
 [111.73396284]
 [181.31346173]
 [293.08367385]
 [190.31061752]
 [148.80032284]
 [403.8428178 ]
 [166.0325841 ]
 [157.24499651]
 [107.22984258]
 [ 68.24519797]
 [263.51888261]
 [225.09001733]
 [ 94.67489118]
 [151.73622053]
 [285.90104323]
 [146.70581511]
 [179.72874444]
 [137.58469667]
 [400.34363483]
 [196.78381986]
 [141.0987998 ]
 [364.74424548]
 [386.76069239]
 [207.17987029]
 [ 98.2185652 ]
 [347.82758985]
 [286.17026215]
 [200.33561428]
 [228.95304088]
 [183.73954936]
 [214.93130507]
 [145.8693896 ]
 [278.5200597 ]
 [219.54415388]
 [199.48699373]
 [292.56076602]
 [154.41271602]
 [119.10825858]
 [170.72549015]
 [161.2942777 ]
 [165.0

In [8]:
#Testing by guessing mean value
from sklearn import metrics
import numpy as np

y_null = np.zeros_like(y_hat_test, dtype=float)

# fill the array with the mean value of y_test
y_null.fill(y_hat_test.mean())
# print(y_null)

np.sqrt(metrics.mean_squared_error(y_hat_test, y_null))

78.04568794025796

### With lowest price and highest price

In [9]:
X = c_df.drop(columns=['average_price'])
y = pd.DataFrame(c_df, columns=['average_price'])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size = 0.2)

In [11]:
print(len(X_train), len(X_test), len(y_train), len(y_test))

885 222 885 222


In [12]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

print('Intercept: \n', linreg.intercept_)
print('Coefficients: \n', linreg.coef_)

y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)


train_residuals = y_hat_train - y_train
test_residuals = y_hat_test - y_test

mse_train = np.sum((y_train - y_hat_train)**2)/len(y_train)
mse_test = np.sum((y_test - y_hat_test)**2) / len(y_test)
rmse_train=np.sqrt(mse_train)
rmse_test=np.sqrt(mse_test)

print('Train Mean Squarred Error: \n', mse_train)
print('Test Mean Squarred Error: \n', mse_test)
print('Prediction: \n', linreg.predict(X_test))

Intercept: 
 [-57.78337073]
Coefficients: 
 [[ 3.99525577e-02  1.02946224e+00 -2.06841821e-02  2.40953679e+02
  -1.36217442e+01  3.50567186e+00 -5.00040433e+00 -2.73224527e+00
  -4.32027209e+00  1.49292094e+01 -4.05814669e+00  1.11821680e+01
   1.15763302e-01 -1.01626119e+01  2.49719354e+00 -8.80319611e+00
  -2.46264358e-01 -3.62831977e-01  3.23071960e+01  4.28911160e+01
   8.67444581e+00 -4.41583228e+01  0.00000000e+00 -2.26367242e+01
  -2.95083826e+00 -1.87163353e+00 -3.77757522e+00 -1.03007394e+01
  -4.57413966e+00  1.72063310e+01  6.26859508e+00 -4.38452233e+00
   4.38452233e+00  7.10542736e-15 -1.35001837e+01 -1.31429144e+01
  -1.10880042e+01  3.33465800e+01  1.52169161e+01  8.43753225e+00
  -1.00158223e+00 -2.12898453e+01  2.47260658e+01 -2.17045642e+01
  -8.13085718e+00 -2.14973778e+01 -1.25652385e+01  4.21934735e+01
  -9.47775726e-01 -1.17803456e+01  1.27281213e+01]]
Train Mean Squarred Error: 
 average_price    4425.219368
dtype: float64
Test Mean Squarred Error: 
 average_pri

In [13]:
from sklearn.metrics import mean_squared_error

train_mse = mean_squared_error(y_train, y_hat_train)
test_mse = mean_squared_error(y_test, y_hat_test)
print('Train Mean Squarred Error:', train_mse)
print('Test Mean Squarred Error:', test_mse)

Train Mean Squarred Error: 4425.219368184643
Test Mean Squarred Error: 4053.376500409685


In [14]:
rmse_test

average_price    63.666133
dtype: float64

In [15]:
rmse_train

average_price    66.522322
dtype: float64

In [16]:
#Testing by guessing mean value
from sklearn import metrics
import numpy as np

y_null = np.zeros_like(y_hat_test, dtype=float)

# fill the array with the mean value of y_test
y_null.fill(y_hat_test.mean())
# print(y_null)

np.sqrt(metrics.mean_squared_error(y_hat_test, y_null))

109.78464349184561

In [17]:
X_test.shape

(222, 51)

In [18]:
y_test.shape

(222, 1)

In [19]:
# r 2 means that this is the percent 
linreg.score(X,y)

0.7673174158055928

In [20]:
weights = linreg.coef_
variables = c_df.columns

In [21]:
linreg.intercept_

array([-57.78337073])

In [22]:
linreg.score(X,y)

0.7673174158055928

In [23]:
y_pred = linreg.predict(X_test)
y_pred

array([[ 93.76466352],
       [129.73520921],
       [147.79242901],
       [194.16862237],
       [139.05509921],
       [329.70506421],
       [124.41210065],
       [229.88607262],
       [127.35168597],
       [173.95742618],
       [269.49717179],
       [123.10815973],
       [117.77936177],
       [132.78976772],
       [289.7308008 ],
       [172.0997246 ],
       [134.3546232 ],
       [734.91725085],
       [175.31210585],
       [162.47451756],
       [179.36489647],
       [120.60738674],
       [239.19195337],
       [368.69118984],
       [121.01372742],
       [107.21459102],
       [220.18036103],
       [546.07781232],
       [148.62450319],
       [127.64435071],
       [305.31385768],
       [138.67375278],
       [176.07315341],
       [269.83403432],
       [234.97186197],
       [157.05597208],
       [ 88.80500274],
       [344.54399323],
       [195.28243363],
       [179.77123599],
       [187.70029918],
       [142.97336769],
       [144.37443429],
       [189

## With Lowest Price

In [24]:
X = c_df.drop(columns=['average_price', 'highest_price'])
y = pd.DataFrame(c_df, columns=['average_price'])

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size = 0.2)

print(len(X_train), len(X_test), len(y_train), len(y_test))

885 222 885 222


In [26]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

print('Intercept: \n', linreg.intercept_)
print('Coefficients: \n', linreg.coef_)

y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)


train_residuals = y_hat_train - y_train
test_residuals = y_hat_test - y_test

Intercept: 
 [-141.79341841]
Coefficients: 
 [[ 1.10205740e+00  2.86993767e-02  4.19327144e+02 -1.74822231e+01
   1.31801428e-01 -7.28122492e+00  7.44386291e-01 -5.95881077e+00
   1.08663085e+01  9.38375887e-01  2.14778244e+01 -3.43643765e+00
  -1.09829665e+01  1.26263294e+01 -2.34448299e+00  7.09093507e+00
   2.11933398e+01  3.34145059e+01  4.86435012e+01 -1.12319870e+01
  -4.30836496e+01 -1.56319402e-13 -5.53255254e+01 -7.13124517e+00
   7.43290805e+00 -7.97452667e+00 -9.86265681e+00 -9.08453828e+00
   1.52168008e+01  1.14032581e+01 -8.39618550e+00  8.39618550e+00
   0.00000000e+00 -1.58406656e+01 -1.80039674e+01 -1.34631164e+01
   3.89115639e+01  2.79358466e+01  7.74327216e+00 -4.85321713e+00
  -2.61516368e+01  2.52778301e+01 -2.15559094e+01 -9.48727502e+00
  -3.20010564e+01 -1.16834216e+01  5.31717530e+01  1.70593507e+00
  -1.54647446e+01  1.37588095e+01]]


In [28]:
mse_train = np.sum((y_train - y_hat_train)**2)/len(y_train)
mse_test = np.sum((y_test - y_hat_test)**2) / len(y_test)
rmse_train=np.sqrt(mse_train)
rmse_test=np.sqrt(mse_test)

print('Train Mean Squared Error: \n', mse_train)
print('Test Mean Squared Error: \n', mse_test)
print('Prediction: \n', linreg.predict(X_test))

from sklearn.metrics import mean_squared_error

train_mse = mean_squared_error(y_train, y_hat_train)
test_mse = mean_squared_error(y_test, y_hat_test)
print('Train Mean Squared Error:', train_mse)
print('Test Mean Squared Error:', test_mse)

Train Mean Squared Error: 
 average_price    6894.576288
dtype: float64
Test Mean Squared Error: 
 average_price    8418.35275
dtype: float64
Prediction: 
 [[ 74.21465518]
 [121.66160658]
 [145.30149757]
 [231.9615435 ]
 [134.00559327]
 [364.80412606]
 [119.46431598]
 [241.95968846]
 [110.50634911]
 [189.24130126]
 [228.83877284]
 [113.84805973]
 [120.44139644]
 [126.00058615]
 [302.4978812 ]
 [177.43075405]
 [121.24052193]
 [640.08452012]
 [184.33394732]
 [149.46114285]
 [160.84519408]
 [ 97.56652077]
 [248.62953116]
 [379.46285585]
 [116.81426906]
 [119.14617821]
 [249.71184852]
 [136.88500809]
 [153.14321718]
 [126.34826885]
 [391.03338345]
 [152.27888036]
 [169.64929252]
 [319.46086095]
 [333.72986769]
 [182.38932221]
 [ 84.83269222]
 [394.11898116]
 [235.41568549]
 [200.26732519]
 [182.50936244]
 [155.0411598 ]
 [165.23213834]
 [193.25016082]
 [238.52398482]
 [171.88628665]
 [157.32455045]
 [295.29634215]
 [135.42104836]
 [126.85805677]
 [153.42637013]
 [151.87894167]
 [227.133971

In [29]:
#Testing by guessing mean value
from sklearn import metrics
import numpy as np

y_null = np.zeros_like(y_hat_test, dtype=float)

# fill the array with the mean value of y_test
y_null.fill(y_hat_test.mean())
# print(y_null)

np.sqrt(metrics.mean_squared_error(y_hat_test, y_null))

99.46087343186332

## With highest price

In [30]:
X = c_df.drop(columns=['average_price', 'lowest_price'])
y = pd.DataFrame(c_df, columns=['average_price'])

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size = 0.2)

print(len(X_train), len(X_test), len(y_train), len(y_test))

885 222 885 222


In [32]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

print('Intercept: \n', linreg.intercept_)
print('Coefficients: \n', linreg.coef_)

y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)


train_residuals = y_hat_train - y_train
test_residuals = y_hat_test - y_test

Intercept: 
 [68.74502849]
Coefficients: 
 [[ 4.58078584e-02 -2.53894278e-02  1.88826640e+02 -3.50987586e+01
   1.31377408e+01  7.28366514e+00 -1.95444287e+01  2.00044167e+01
   1.12466369e+01 -1.23554321e+00  1.71778561e+01 -1.29715851e+01
   4.18281048e+00  1.51722451e+00 -1.78086285e+01  1.66169418e+01
  -1.09504696e+01  2.17621127e+01  5.17358514e+01  1.27474897e+01
  -5.07496995e+01 -7.10542736e-15 -2.90536330e+01 -2.90647986e+00
  -1.26463296e+01  6.49895687e-01 -1.30944237e+01 -1.59474045e+01
   1.84419497e+01  2.55027923e+01  1.43283581e+01 -1.43283581e+01
   2.84217094e-14  9.92773928e+01 -2.05125131e+00 -5.94696000e+01
  -2.34281833e+01  1.06616088e+01  7.26128937e-01  9.21911775e-01
  -4.91942301e+01  3.74233161e+01 -1.48670937e+01  7.54648225e+00
  -2.65335148e+01 -2.55130688e+01  4.45001014e+01 -9.27590354e+00
   1.15251427e+00  8.12338927e+00]]


In [33]:
mse_train = np.sum((y_train - y_hat_train)**2)/len(y_train)
mse_test = np.sum((y_test - y_hat_test)**2) / len(y_test)
rmse_train=np.sqrt(mse_train)
rmse_test=np.sqrt(mse_test)

print('Train Mean Squarred Error: \n', mse_train)
print('Test Mean Squarred Error: \n', mse_test)
print('Prediction: \n', linreg.predict(X_test))

from sklearn.metrics import mean_squared_error

train_mse = mean_squared_error(y_train, y_hat_train)
test_mse = mean_squared_error(y_test, y_hat_test)
print('Train Mean Squarred Error:', train_mse)
print('Test Mean Squarred Error:', test_mse)

Train Mean Squarred Error: 
 average_price    9557.273385
dtype: float64
Test Mean Squarred Error: 
 average_price    8997.203869
dtype: float64
Prediction: 
 [[128.59869044]
 [158.08564331]
 [177.48851635]
 [198.21141001]
 [136.85161792]
 [288.58808481]
 [167.87008591]
 [197.03841711]
 [171.97887748]
 [257.25490876]
 [306.11386795]
 [193.12225601]
 [109.33943135]
 [184.92005775]
 [279.15651818]
 [183.225522  ]
 [161.75488587]
 [530.41637528]
 [157.0707709 ]
 [171.57767724]
 [132.51310005]
 [ 96.87738332]
 [251.57362862]
 [224.39892234]
 [101.16191463]
 [135.59457452]
 [249.30828887]
 [615.12668911]
 [172.53987472]
 [138.22208045]
 [301.35819912]
 [177.82347606]
 [150.62043458]
 [304.42420106]
 [269.52384668]
 [176.26141585]
 [101.76205602]
 [294.48329588]
 [236.32191656]
 [176.83053561]
 [231.3970178 ]
 [167.73565933]
 [187.26317237]
 [144.85265495]
 [211.11122263]
 [226.96500901]
 [197.69788076]
 [259.25916005]
 [152.52289364]
 [137.00234572]
 [146.09621936]
 [143.39858902]
 [158.305

In [34]:
#Testing by guessing mean value
from sklearn import metrics
import numpy as np

y_null = np.zeros_like(y_hat_test, dtype=float)

# fill the array with the mean value of y_test
y_null.fill(y_hat_test.mean())
# print(y_null)

np.sqrt(metrics.mean_squared_error(y_hat_test, y_null))

92.87016105938608