Goal: Find predictors of season length
# Imports and setup


In [1]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from scipy import stats
from tqdm import tqdm
import json
from pandas import json_normalize

#model zoo
from sklearn.linear_model import ElasticNet, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor as GradBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor as GPRegressor

#model selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [2]:
# Processed inputs, one parquet file per table. Each path is declared right
# next to the read that consumes it.
betas_path = r"../../data/processed/snow_betas.parquet"
betas_df = pd.read_parquet(betas_path)

stations_path = r"../../data/processed/stationswki.parquet"
stations_df = pd.read_parquet(stations_path)

path_season_length = r"../../data/processed/season_length.parquet"
season_length_df = pd.read_parquet(path_season_length)

In [3]:
# Preview the first rows of the station table.
stations_df.head()

Unnamed: 0,name,region,state_province,peak_elevation_ft,base_elevation_ft,skiable_acreage,total_lifts,avg_annual_snowfall_in,avg_elevation_ft
253,Mount Hood Skibowl,Cascades,Oregon,5027.0,3500.0,960.0,9.0,300.0,4263.5
254,Spout Springs,Cascades,Oregon,5450.0,4920.0,200.0,2.0,130.0,5185.0
252,Hoodoo,Cascades,Oregon,5703.0,4668.0,806.0,5.0,360.0,5185.5
251,Ferguson Ridge,Cascades,Oregon,5840.0,5200.0,170.0,2.0,300.0,5520.0
257,Warner Canyon,Cascades,Oregon,6003.0,5271.0,200.0,1.0,50.0,5637.0


In [4]:
# Preview the last rows of the betas table.
betas_df.tail()

Unnamed: 0,station,beta,region
Berkshire East,,0.104517,New_England
Showdown Montana,,-0.104772,Rockies_Other
Terry Peak,,0.003419,Other
Timberline,,0.130186,East
Arapahoe Basin,,0.118733,Colorado


In [5]:
# Keep only the `beta` column and name the index 'station'.
# errors="ignore" keeps this cell re-runnable: betas_df is overwritten here, so
# on a second execution the two columns are already gone and a plain drop()
# would raise KeyError.
betas_df = (betas_df
            .drop(columns=["station", "region"], errors="ignore")
            .rename_axis('station'))
betas_df.head()

Unnamed: 0_level_0,beta
station,Unnamed: 1_level_1
Big Squaw,-0.206543
Mt. Holiday,-0.692675
Song Mountain,0.029599
Blue Hills,0.589103
Snowshoe,0.082373


In [6]:
# Preview the first six rows of the per-season length table.
season_length_df.head(6)

Unnamed: 0,station,region,length,timestamp_start,timestamp_end
0,49 Degrees North,Cascades,145,2010-11-16,2011-04-10
1,49 Degrees North,Cascades,143,2011-11-18,2012-04-09
2,49 Degrees North,Cascades,142,2012-11-19,2013-04-10
3,49 Degrees North,Cascades,145,2013-11-19,2014-04-13
4,49 Degrees North,Cascades,138,2014-11-14,2015-04-01
5,49 Degrees North,Cascades,125,2015-12-07,2016-04-10


In [7]:
# Mean season length for every (station, region) pair, flattened back into a
# frame with `station`, `region` and `length` columns.
season_length_avg_df = (
    season_length_df
    .groupby(['station', 'region'])['length']
    .mean()
    .reset_index()
)
season_length_avg_df.head()

Unnamed: 0,station,region,length
0,49 Degrees North,Cascades,138.666667
1,Afton Alps,Other,127.555556
2,Alpental,Cascades,154.444444
3,Alpine Valley,Other,114.555556
4,Alta,Utah,163.222222


In [8]:
# Assemble a single modelling table indexed by station:
#   1. attach each station's beta (betas_df is joined on its index),
#   2. attach the per-station mean season length,
#   3. drop the columns that are not carried forward and tidy the names.
# Both merges use pandas' default inner join, so a station missing from either
# right-hand table is dropped from the result.
complete_df = (
    stations_df
    .merge(betas_df, left_on="name", right_index=True)
    .merge(season_length_avg_df, left_on='name', right_on='station')
    .drop(columns=['region_y', 'total_lifts', 'state_province', 'name'])
    .rename(columns={'region_x': 'region', 'length': 'season_len'})
    .set_index('station')
)
complete_df.head()

Unnamed: 0_level_0,region,peak_elevation_ft,base_elevation_ft,skiable_acreage,avg_annual_snowfall_in,avg_elevation_ft,beta,season_len
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Spout Springs,Cascades,5450.0,4920.0,200.0,130.0,5185.0,0.185223,91.5
Hoodoo,Cascades,5703.0,4668.0,806.0,360.0,5185.5,0.817764,130.222222
Willamette Pass,Cascades,6683.0,5120.0,555.0,430.0,5901.5,0.043802,124.125
Timberline Lodge,Cascades,8540.0,4850.0,1430.0,540.0,6695.0,0.093837,246.428571
Anthony Lakes,Cascades,8000.0,7100.0,1100.0,300.0,7550.0,0.037908,127.0


In [11]:
# One-hot encode `region` for inspection. The result is only displayed, not
# assigned, so complete_df itself still holds the original `region` column.
pd.get_dummies(data=complete_df, columns=['region'])

Unnamed: 0_level_0,peak_elevation_ft,base_elevation_ft,skiable_acreage,avg_annual_snowfall_in,avg_elevation_ft,beta,season_len,region_Cascades,region_Colorado,region_East,region_Sierras,region_Utah
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Spout Springs,5450.0,4920.0,200.0,130.0,5185.0,0.185223,91.5,1,0,0,0,0
Hoodoo,5703.0,4668.0,806.0,360.0,5185.5,0.817764,130.222222,1,0,0,0,0
Willamette Pass,6683.0,5120.0,555.0,430.0,5901.5,0.043802,124.125,1,0,0,0,0
Timberline Lodge,8540.0,4850.0,1430.0,540.0,6695.0,0.093837,246.428571,1,0,0,0,0
Anthony Lakes,8000.0,7100.0,1100.0,300.0,7550.0,0.037908,127.0,1,0,0,0,0
49 Degrees North,5774.0,3923.0,2325.0,300.0,4848.5,-0.000528,138.666667,1,0,0,0,0
Crystal Mountain,7012.0,3912.0,2600.0,350.0,5462.0,0.101911,179.222222,1,0,0,0,0
White Pass,6500.0,4500.0,1400.0,350.0,5500.0,0.051971,155.0,1,0,0,0,0
Mission Ridge,6820.0,4570.0,2000.0,200.0,5695.0,0.160408,147.666667,1,0,0,0,0
Sunlight Mountain,9895.0,7885.0,470.0,250.0,8890.0,-0.009455,123.666667,0,1,0,0,0


In [None]:
def xy_split(data=None, ycol='season_len'):
    """Split a frame into features and target.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns and the target column.
    ycol : str, default 'season_len'
        Name of the target column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``X`` is ``data`` without ``ycol``; ``y`` is a one-column frame holding
        ``ycol``. Both are new objects, so ``data`` is left untouched.
    """
    # drop() already returns a new frame; only the column selection needs an
    # explicit copy to be independent of `data`.
    return data.drop(columns=[ycol]), data[[ycol]].copy()


# `region` is a string column, which the regressors cannot consume once the
# frame is converted with .to_numpy(). One-hot encode it before splitting;
# dtype=float keeps the resulting array purely numeric.
encoded_df = pd.get_dummies(data=complete_df, columns=['region'], dtype=float)

# Hold out 20% for testing (the previous test_size=.8 trained on only 20% of
# the rows) and fix the seed so the split is reproducible across runs.
train, test = train_test_split(encoded_df, test_size=.2, random_state=42)
X_train, y_train = xy_split(train)
X_test, y_test = xy_split(test)

In [None]:
# Show the first 20 rows of the test-split features as a raw NumPy array.
X_test.head(20).to_numpy()

## Regression

### Feature work

In [None]:
# Profile the merged modelling table. The previous argument, stations_wbeta_df,
# is not defined anywhere in this notebook and raised NameError on a fresh
# kernel; complete_df is the stations + beta + season length table built above.
ProfileReport(complete_df)

# Modeling

In [None]:
# Candidate regressors, one module-level variable per model. `models` lists the
# variable names; the tuning loop and the `params` dict are keyed by them.
# The tree ensembles are stochastic, so their seed is fixed to make repeated
# runs comparable.
linear = ElasticNet()
rforest = RandomForestRegressor(random_state=42)
gboost = GradBoostRegressor(random_state=42)
knn = KNeighborsRegressor()
svm = SVR()
gp = GPRegressor()
models = ['linear', 'rforest', 'gboost', 'knn', 'svm', 'gp']

In [None]:
# Hyper-parameter search spaces for RandomizedSearchCV, keyed by model name.
# Tuples are sampled as discrete choices, scipy distributions continuously.
#
# NOTE: scipy.stats.uniform(loc, scale) covers [loc, loc + scale]; the second
# argument is a width, not an upper bound. The gboost and gp ranges below were
# written as if it were (low, high) and have been corrected accordingly.
params = {}
params['linear'] = {'alpha': stats.uniform(.0001, 100),   # [1e-4, ~100]
                    'l1_ratio': stats.uniform(.1, .9)}    # [0.1, 1.0]
# NOTE(review): 'mse'/'mae' were renamed 'squared_error'/'absolute_error' in
# scikit-learn 1.0 and the old spellings were removed in 1.2 -- switch the
# names when upgrading scikit-learn.
params['rforest'] = {'criterion': ('mse', 'mae'),
                     'max_features': ('log2', 'sqrt', .2, .15),
                     'max_depth': (5, 15, 30),
                     'min_samples_split': (10, 20, 30)}
# Was uniform(.08, .12), i.e. [0.08, 0.20]; the intended window is [0.08, 0.12].
params['gboost'] = {'learning_rate': stats.uniform(.08, .04),
                    'n_estimators': (80, 100, 120)}
params['knn'] = {'n_neighbors': (3, 5, 7),
                 'weights': ('uniform', 'distance'),
                 'p': (1.5, 2, 2.5)}
params['svm'] = {'C': (.4, .7, 1, 1.3, 2, 10),
                 'epsilon': (.001, .01, .1),
                 'kernel': ('linear', 'poly', 'rbf')}
# Was uniform(1e-8, 1e-12), i.e. the degenerate range [1e-8, 1e-8 + 1e-12];
# now spans [1e-12, 1e-8].
params['gp'] = {'alpha': stats.uniform(loc=1e-12, scale=1e-8 - 1e-12)}

In [None]:
# Nested cross-validation: for every candidate model, an inner
# RandomizedSearchCV tunes the hyper-parameters and an outer cross_validate
# scores the tuned search on MAE and MSE. One row per (model, outer fold).

# Explicit name -> estimator lookup instead of eval() on the name string.
model_zoo = {'linear': linear, 'rforest': rforest, 'gboost': gboost,
             'knn': knn, 'svm': svm, 'gp': gp}

# Most scikit-learn regressors expect a 1-D target; y_train is a one-column
# frame, so flatten it once up front.
y_train_1d = y_train.to_numpy().ravel()

fold_scores = []
for model in tqdm(models):
    # random_state makes the sampled hyper-parameter candidates reproducible.
    cv_object = RandomizedSearchCV(estimator=model_zoo[model],
                                   param_distributions=params[model],
                                   n_iter=20, cv=5, random_state=42)
    scores = cross_validate(estimator=cv_object,
                            scoring=['neg_mean_absolute_error', 'neg_mean_squared_error'],
                            X=X_train.to_numpy(), y=y_train_1d,
                            cv=5, n_jobs=-1, return_estimator=True)
    # Tag every row with its model; without this the stacked table cannot be
    # attributed back to the estimator that produced it.
    scores_df = pd.DataFrame(scores).assign(model=model)
    fold_scores.append(scores_df)

# Concatenate once at the end rather than growing a frame inside the loop.
cv_scores_df = (pd.concat(fold_scores, axis=0, ignore_index=True)
                if fold_scores else pd.DataFrame())

In [None]:
# scores_df is whatever the last iteration of the tuning loop assigned, i.e.
# the scores of the final entry in `models` only.
#json_normalize(scores_df)
scores_df

In [None]:
# First 20 rows of the stacked cross-validation results for all models.
cv_scores_df.head(20)

In [None]:
# Report the hyper-parameters each fitted search settled on, one line per outer
# fold. The 'estimator' column exists because cross_validate was called with
# return_estimator=True. (The previous loop iterated over the undefined name
# `cv_scores` and had no body, so the cell could not even be parsed.)
for fitted_search in cv_scores_df['estimator']:
    print(type(fitted_search.best_estimator_).__name__, fitted_search.best_params_)

In [None]:
# Display the stacked cross-validation results. `cv_scores` was never defined;
# the results table is named cv_scores_df.
cv_scores_df