In [1]:
%matplotlib inline

In [53]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [3]:
results = pd.read_csv('../data/parsed_data.csv')

In [4]:
results['ice_out'] = pd.to_datetime(results['ice_out'])
results.head()
results1 = results[(results.Year>=1950)&(results.Year<=2016)][['Year', 'JDOY']].copy()
results1.columns = ['Year', 'iceout']

In [9]:
df = pd.read_csv('../data/PANN.csv', parse_dates=['date'], index_col='date')
df = df.dropna(how='all', axis=1)
df['doy'] = df.index.dayofyear

df['week'] = df.index.week
df['year'] = df.index.year
# push OCT, NOV, and DEC into the appropriate Ice Classic Results Year
df['year'] = np.where(df.index.month == 10, df.year+1, df.year)
df['year'] = np.where(df.index.month == 11, df.year+1, df.year)
df['year'] = np.where(df.index.month == 12, df.year+1, df.year)

In [11]:
df2 = pd.pivot_table(df, values='actual_mean_temp', columns='doy', index='year')
df2 = df2.dropna(how='all', axis=1).copy()

In [13]:
col_names = df2.columns.tolist()
col_names_corr = []
for col in col_names:
    col = 'day'+str(col)
    col_names_corr.append(col)

In [15]:
df2.columns = col_names_corr

In [16]:
df3 = df2.merge(results1, how='inner', left_index=True, right_on='Year')
df_hold = df3[(df3.Year<=2016)&(df3.Year>=2013)].copy()
df3 = df3[df3.Year<2014]
df3 = df3.drop(['Year'], axis=1)
df_hold = df_hold.drop(['Year'], axis=1)

In [19]:
features = df3.columns.tolist()
features = [x for x in features if x != 'iceout']

In [21]:
df3 = df3.fillna(method='ffill')
df3 = df3.fillna(method='bfill')

y = df3.iceout.values
X = df3[features].values

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [32]:
rf = RandomForestRegressor(n_estimators=40, criterion='mse')
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=40, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [33]:
y_rf = rf.predict(X_test)
rf.score(X_test, y_test)

-0.13192238728800243

In [37]:
rf.predict(df_hold.iloc[-1][features].fillna(method='ffill').values)



array([ 123.87232639])

In [46]:
df_hold.iloc[-2]['iceout']

114.60069444444444

In [29]:
parameters = {'kernel':('linear', 'rbf', 'sigmoid', 'poly'), 'C':[.1, 1,10,100]}
svr = svm.SVR()
clf = GridSearchCV(svr, parameters)
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100], 'kernel': ('linear', 'rbf', 'sigmoid', 'poly')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [30]:
clf.score(X_test, y_test)

-0.058902881675715113

In [31]:
clf.predict(df_hold.iloc[-1][features].fillna(method='ffill').values)



array([ 124.20643004])

In [45]:
erf = ExtraTreesRegressor()
erf.fit(X_train, y_train)
erf.score(X_test, y_test)
erf.predict(df_hold.iloc[-2][features].fillna(method='ffill').values)



array([ 135.91576389])

In [52]:
areg = AdaBoostRegressor()
areg.fit(X_train, y_train)
areg.score(X_test, y_test)
areg.predict(df_hold.iloc[-1][features].fillna(method='ffill').values)



array([ 126.46717172])

In [80]:
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
...     max_depth=10, loss='lad').fit(X_train, y_train)

In [81]:
est.score(X_test, y_test)

-0.41205469677768819

In [83]:
est.predict(df_hold.iloc[-4][features].fillna(method='ffill').values)



array([ 136.12270632])

In [84]:
est

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='lad', max_depth=10,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [88]:
est1 = GradientBoostingRegressor()

parameters = {'loss':('lad', 'huber', 'ls'), 'n_estimators':[1,10,100, 500, 1000], 'max_depth':[1,2,3,4,5,6,7,8,9,10,20]}
clf = GridSearchCV(est, parameters)
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='lad', max_depth=10,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [1, 10, 100, 500, 1000], 'loss': ('lad', 'huber', 'ls'), 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [89]:
clf.score(X_test, y_test)

-0.060849464027114893

In [90]:
clf.predict(df_hold.iloc[-4][features].fillna(method='ffill').values)



array([ 125.15896759])