In [4]:
import os
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [8]:
#Load the csv file you created in step 3

step3_output = pd.read_csv(r"C:/Users/Curtis/Documents/SpringboardGuidedCapstone/data/step3_output.csv")

step3_output.head()

Unnamed: 0,Name,state,summit_elev,vertical_drop,trams,fastEight,fastSixes,fastQuads,quad,triple,...,SkiableTerrain_ac,Snow Making_ac,daysOpenLastYear,yearsOpen,averageSnowfall,AdultWeekday,AdultWeekend,projectedDaysOpen,NightSkiing_ac,cluster
0,Hilltop Ski Area,Alaska,2090,294,0,0,0,0,0,1,...,30,30,150,36,69,30,34,152,30,2
1,Sunrise Park Resort,Arizona,11100,1800,0,0,0,1,2,3,...,800,80,115,49,250,74,78,104,80,1
2,Yosemite Ski & Snowboard Area,California,7800,600,0,0,0,0,0,1,...,88,174,110,84,300,47,47,107,0,1
3,Boreal Mountain Resort,California,7700,500,0,0,0,1,1,3,...,380,200,150,54,400,49,64,150,200,1
4,Dodge Ridge,California,8200,1600,0,0,0,0,1,2,...,862,174,115,69,350,78,78,140,0,1


In [10]:
#Create dummy variables for state. Add the dummies back to the dataframe and remove the original column for state.

dfo = pd.concat([step3_output, pd.get_dummies(step3_output['state'])], axis=1).drop(['state'], axis =1)
print(dfo.shape)
dfo.head()


(176, 60)


Unnamed: 0,Name,summit_elev,vertical_drop,trams,fastEight,fastSixes,fastQuads,quad,triple,double,...,Rhode Island,South Dakota,Tennessee,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,Hilltop Ski Area,2090,294,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Sunrise Park Resort,11100,1800,0,0,0,1,2,3,1,...,0,0,0,0,0,0,0,0,0,0
2,Yosemite Ski & Snowboard Area,7800,600,0,0,0,0,0,1,3,...,0,0,0,0,0,0,0,0,0,0
3,Boreal Mountain Resort,7700,500,0,0,0,1,1,3,1,...,0,0,0,0,0,0,0,0,0,0
4,Dodge Ridge,8200,1600,0,0,0,0,1,2,5,...,0,0,0,0,0,0,0,0,0,0


In [48]:
#Using sklearn preprocessing standardize the scale of the features of the dataframe except the name of the resort which we 
#done't need in the dataframe for modeling, so it can be droppped here as well. Also, we want to hold out our response 
#variable(s) so we can have their true values available for model performance review. Let's set AdultWeekend to the 
#y variable as our response for scaling and modeling. Later we will go back and consider the AdultWeekday, 
#dayOpenLastYear, and projectedDaysOpen. For now leave them in the development dataframe.

# first we import the preprocessing package from the sklearn library
from sklearn.preprocessing import StandardScaler

# Declare an explanatory variable, called X,and assign it the result of dropping 'Name' and 'AdultWeekend' from the df
X = dfo.drop(columns=['Name','AdultWeekend', 'summit_elev'], axis=0)

# Declare a response variable, called y, and assign it the AdultWeekend column of the df 
y= dfo['AdultWeekend']
# Here we use the StandardScaler() method of the preprocessing package, and then call the fit() method with parameter X 
scaler = StandardScaler().fit(X)

# Declare a variable called X_scaled, and assign it the result of calling the transform() method with parameter X 
X_scaled=scaler.transform(X) 

In [49]:
#Using sklearn model selection import train_test_split, and create a 75/25 split with the y = AdultWeekend. 
#We will start by using the adult weekend ticket price as our response variable for modeling.

# Import the train_test_split function from the sklearn.model_selection utility.  
from sklearn.model_selection import train_test_split

# Get the 1-dimensional flattened array of our response variable y by calling the ravel() function on y
y = y.ravel()

# Call the train_test_split() function with the first two parameters set to X_scaled and y 
# Declare four variables, X_train, X_test, y_train and y_test separated by commas 
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=1)

In [50]:
# Ridge Regression using alpha = 0.05

from sklearn.metrics import explained_variance_score,mean_absolute_error
from sklearn.linear_model import Ridge

## training the model

ridgeReg = Ridge(alpha=0.05, normalize=True)

model = ridgeReg.fit(X_train, y_train)

# Make a variable called y_pred and assign it the result of calling predict() on our model variable with parameter X_test

y_pred = model.predict(X_test)

#calculating mean squared error

mse = np.mean((y_pred - y_test)**2) 
score = ridgeReg.score(X_test, y_test)

print('mse:', mse)
print('score:', score)

mse: 3786.7942995821463
score: -19.377953719507108


In [51]:
explained_variance_score(y_test, y_pred)

-19.1058743674432

In [52]:
mean_absolute_error(y_test, y_pred)

15.33388843481714

In [53]:
ridgeReg.intercept_

58.68672910187364

In [54]:
# You might want to make a pandas DataFrame displaying the coefficients for each state like so: 
pd.DataFrame(abs(ridgeReg.coef_), X.columns, columns=['Coefficient']).sort_values(by='Coefficient', ascending=False)

Unnamed: 0,Coefficient
New Jersey,31.523183
AdultWeekday,8.366257
vertical_drop,2.335199
cluster,1.695697
quad,1.640879
Virginia,1.618273
Idaho,1.615547
TerrainParks,1.60707
New Hampshire,1.476297
triple,1.447682


In [55]:
# Ridge Regression using alpha = 0.5

from sklearn.metrics import explained_variance_score,mean_absolute_error
from sklearn.linear_model import Ridge

## training the model

ridgeReg = Ridge(alpha=0.5, normalize=True)

model = ridgeReg.fit(X_train, y_train)

# Make a variable called y_pred and assign it the result of calling predict() on our model variable with parameter X_test

y_pred = model.predict(X_test)

#calculating mean squared error

mse = np.mean((y_pred - y_test)**2) 
score = ridgeReg.score(X_test, y_test)

print('mse:', mse)
print('score:', score)

mse: 2065.5471241874297
score: -10.115371042677717


In [56]:
explained_variance_score(y_test, y_pred)

-9.96832767210034

In [57]:
mean_absolute_error(y_test, y_pred)

13.228722854607884

In [58]:
ridgeReg.intercept_

58.21591852922764

In [59]:
# Ridge Regression using alpha = 5.0

from sklearn.metrics import explained_variance_score,mean_absolute_error
from sklearn.linear_model import Ridge

## training the model

ridgeReg = Ridge(alpha=5.0, normalize=True)

model = ridgeReg.fit(X_train, y_train)

# Make a variable called y_pred and assign it the result of calling predict() on our model variable with parameter X_test

y_pred = model.predict(X_test)

#calculating mean squared error

mse = np.mean((y_pred - y_test)**2) 
score = ridgeReg.score(X_test, y_test)

print('mse:', mse)
print('score:', score)

mse: 193.87596798138955
score: -0.04330871716093698


In [60]:
explained_variance_score(y_test, y_pred)

-0.041166829453287423

In [61]:
mean_absolute_error(y_test, y_pred)

10.015747218543112

In [62]:
ridgeReg.intercept_

57.06681350973964

In [63]:
# Ridge Regression using alpha = 10.0

from sklearn.metrics import explained_variance_score,mean_absolute_error
from sklearn.linear_model import Ridge

## training the model

ridgeReg = Ridge(alpha=10.0, normalize=True)

model = ridgeReg.fit(X_train, y_train)

# Make a variable called y_pred and assign it the result of calling predict() on our model variable with parameter X_test

y_pred = model.predict(X_test)

#calculating mean squared error

mse = np.mean((y_pred - y_test)**2) 
score = ridgeReg.score(X_test, y_test)

print('mse:', mse)
print('score:', score)

mse: 139.6588714019168
score: 0.24845085505148967


In [64]:
explained_variance_score(y_test, y_pred)

0.24861440766660947

In [65]:
mean_absolute_error(y_test, y_pred)

10.17851679434491

In [66]:
ridgeReg.intercept_

56.86550718600624

In [74]:
# Ridge Regression using alpha = 12.0

from sklearn.metrics import explained_variance_score,mean_absolute_error
from sklearn.linear_model import Ridge

## training the model

ridgeReg = Ridge(alpha=12.0, normalize=True)

model = ridgeReg.fit(X_train, y_train)

# Make a variable called y_pred and assign it the result of calling predict() on our model variable with parameter X_test

y_pred = model.predict(X_test)

#calculating mean squared error

mse = np.mean((y_pred - y_test)**2) 
score = ridgeReg.score(X_test, y_test)

print('mse:', mse)
print('score:', score)

mse: 138.10898875068042
score: 0.25679127030484705
