In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [3]:
results = pd.read_csv('../data/parsed_data.csv')

In [4]:
results['ice_out'] = pd.to_datetime(results['ice_out'])
results.head()
results1 = results[(results.Year>=1950)&(results.Year<=2016)][['Year', 'JDOY']].copy()
results1.columns = ['Year', 'iceout']

In [100]:
df = pd.read_csv('../data/PANN2.csv', parse_dates=['date'], index_col='date')
df = df.dropna(how='all', axis=1)
df['doy'] = df.index.dayofyear


df['year'] = df.index.year
# push OCT, NOV, and DEC into the appropriate Ice Classic Results Year
df['year'] = np.where(df.index.month == 10, df.year+1, df.year)
df['year'] = np.where(df.index.month == 11, df.year+1, df.year)
df['year'] = np.where(df.index.month == 12, df.year+1, df.year)

df = df.resample('W-MON').mean()
df['week'] = df.index.week

In [104]:
df.columns = ['actual_mean_temp', 'wind', 'doy', 'year', 'week']

df_temp = df[['actual_mean_temp', 'doy', 'week', 'year']]
df_wind = df[['wind', 'doy', 'week', 'year']]

In [107]:
df2 = pd.pivot_table(df_temp, values='actual_mean_temp', columns='week', index='year')
df2 = df2.dropna(how='all', axis=1).copy()

df3 = pd.pivot_table(df_wind, values='wind', columns='week', index='year')
df3 = df3.dropna(how='all', axis=1).copy()

In [108]:
col_names = df2.columns.tolist()
col_names_corr = []
for col in col_names:
    col = 'temp'+str(col)
    col_names_corr.append(col)
    
col_names_w = df3.columns.tolist()
col_names_corr_w = []
for col in col_names_w:
    col = 'wind'+str(col)
    col_names_corr_w.append(col)

In [109]:
df2.columns = col_names_corr
df3.columns = col_names_corr_w

In [110]:
df4 = df2.merge(results1, how='inner', left_index=True, right_on='Year')
df5 = df4.merge(df3, how='inner', left_on="Year", right_index=True)
df_hold = df5[(df5.Year<=2016)&(df5.Year>=2013)].copy()
df5 = df5[df5.Year<2014]
df5 = df5.drop(['Year'], axis=1)
df_hold = df_hold.drop(['Year'], axis=1)

In [111]:
df5 = df5.fillna(method='ffill').fillna(method='bfill')

In [112]:
features = df5.columns.tolist()
features = [x for x in features if x != 'iceout']

In [113]:
y = df5.iceout.values
X = df5[features].values

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [139]:
rf = RandomForestRegressor(n_estimators=125, criterion='mse')
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=125, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [140]:
y_rf = rf.predict(X_test)
rf.score(X_test, y_test)

0.12310818779471987

In [144]:
rf.predict(df_hold.iloc[-3][features].fillna(method='ffill').values)



array([ 124.83803889])

In [51]:
df_hold.iloc[-4]['iceout']

140.61180555555555

In [145]:
parameters = {'kernel':('linear', 'rbf', 'sigmoid', 'poly'), 'C':[.1, 1,10,100]}
svr = svm.SVR()
clf = GridSearchCV(svr, parameters)
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kernel': ('linear', 'rbf', 'sigmoid', 'poly'), 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [146]:
clf.score(X_test, y_test)

-0.05536353759858792

In [148]:
clf.predict(df_hold.iloc[-1][features].fillna(method='ffill').values)



array([ 122.6266523])

In [150]:
erf = ExtraTreesRegressor()
erf.fit(X_train, y_train)
print(erf.score(X_test, y_test))
erf.predict(df_hold.iloc[-1][features].fillna(method='ffill').values)

0.142277129403




array([ 136.21305556])

In [153]:
areg = AdaBoostRegressor()
areg.fit(X_train, y_train)
print(areg.score(X_test, y_test))
areg.predict(df_hold.iloc[-1][features].fillna(method='ffill').values)

-0.126348344677




array([ 128.57003968])

In [65]:
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
...     max_depth=10, loss='lad').fit(X_train, y_train)

In [66]:
est.score(X_test, y_test)

-3.4649926545346545

In [67]:
est.predict(df_hold.iloc[-4][features].fillna(method='ffill').values)



array([ 136.7163351])

In [68]:
est

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='lad', max_depth=10,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [69]:
est1 = GradientBoostingRegressor()

parameters = {'loss':('lad', 'huber', 'ls'), 'n_estimators':[1,10,100, 500, 1000], 'max_depth':[1,2,3,4,5,6,7,8,9,10,20]}
clf = GridSearchCV(est, parameters)
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='lad', max_depth=10,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20], 'n_estimators': [1, 10, 100, 500, 1000], 'loss': ('lad', 'huber', 'ls')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [70]:
clf.score(X_test, y_test)

-1.8083367542085318

In [73]:
clf.predict(df_hold.iloc[-3][features].fillna(method='ffill').values)



array([ 122.24796619])