In [1]:
%matplotlib inline

In [13]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [3]:
results = pd.read_csv('../data/parsed_data.csv')

In [4]:
results['ice_out'] = pd.to_datetime(results['ice_out'])
results.head()
results1 = results[(results.Year>=1950)&(results.Year<=2016)][['Year', 'JDOY']].copy()
results1.columns = ['Year', 'iceout']

In [5]:
df = pd.read_csv('../data/PANN2.csv', parse_dates=['date'], index_col='date')
df = df.dropna(how='all', axis=1)
df['doy'] = df.index.dayofyear


df['year'] = df.index.year
# push OCT, NOV, and DEC into the appropriate Ice Classic Results Year
df['year'] = np.where(df.index.month == 10, df.year+1, df.year)
df['year'] = np.where(df.index.month == 11, df.year+1, df.year)
df['year'] = np.where(df.index.month == 12, df.year+1, df.year)

df = df.resample('W-MON').median()
df['week'] = df.index.week

In [6]:
df.columns = ['actual_mean_temp', 'wind', 'doy', 'year', 'week']

df_temp = df[['actual_mean_temp', 'doy', 'week', 'year']]
df_wind = df[['wind', 'doy', 'week', 'year']]

In [7]:
df2 = pd.pivot_table(df_temp, values='actual_mean_temp', columns='week', index='year')
df2 = df2.dropna(how='all', axis=1).copy()

df3 = pd.pivot_table(df_wind, values='wind', columns='week', index='year')
df3 = df3.dropna(how='all', axis=1).copy()

In [8]:
col_names = df2.columns.tolist()
col_names_corr = []
for col in col_names:
    col = 'temp'+str(col)
    col_names_corr.append(col)
    
col_names_w = df3.columns.tolist()
col_names_corr_w = []
for col in col_names_w:
    col = 'wind'+str(col)
    col_names_corr_w.append(col)

In [9]:
df2.columns = col_names_corr
df3.columns = col_names_corr_w

In [10]:
df4 = df2.merge(results1, how='inner', left_index=True, right_on='Year')
df5 = df4.merge(df3, how='inner', left_on="Year", right_index=True)
df_hold = df5[(df5.Year<=2016)&(df5.Year>=2013)].copy()
df5 = df5[df5.Year<2014]
df5 = df5.drop(['Year'], axis=1)
df_hold = df_hold.drop(['Year'], axis=1)

In [11]:
df5 = df5.fillna(method='ffill').fillna(method='bfill')

In [12]:
features = df5.columns.tolist()
features = [x for x in features if x != 'iceout']

In [24]:
std_scale = preprocessing.StandardScaler().fit(df5)
df_std = pd.DataFrame(std_scale.transform(df5))
df_std.columns = df5.columns

minmax_scale = preprocessing.MinMaxScaler().fit(df5)
df_minmax = pd.DataFrame(minmax_scale.transform(df5))
df_minmax.columns = df5.columns

In [25]:
y = df_minmax.iceout.values
X = df_minmax[features].values

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [None]:
tpot = TPOTRegressor(generations=40, population_size=300, verbosity=2, scoring='r2', warm_start=False, cv=9)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

In [None]:
tpot2 = TPOTRegressor(generations=30, population_size=275, verbosity=2, scoring='mean_squared_error', warm_start=False, cv=3)
tpot2.fit(X_train, y_train)
print(tpot2.score(X_test, y_test))

Optimization Progress:   6%|▋         | 550/8525 [04:14<33:34,  3.96pipeline/s]  

Generation 1 - Current best internal CV score: 0.03778541117976199


Optimization Progress:  10%|▉         | 825/8525 [06:06<24:57,  5.14pipeline/s]  

Generation 2 - Current best internal CV score: 0.03778541117976199


Optimization Progress:  13%|█▎        | 1100/8525 [08:24<26:31,  4.67pipeline/s] 

Generation 3 - Current best internal CV score: 0.03778541117976199


Optimization Progress:  16%|█▌        | 1375/8525 [11:18<40:11,  2.96pipeline/s]  

Generation 4 - Current best internal CV score: 0.03778541117976199


Optimization Progress:  19%|█▉        | 1650/8525 [13:43<30:09,  3.80pipeline/s]  

Generation 5 - Current best internal CV score: 0.03778541117976199


Optimization Progress:  23%|██▎       | 1925/8525 [16:55<35:52,  3.07pipeline/s]  

Generation 6 - Current best internal CV score: 0.03778541117976199


Optimization Progress:  26%|██▌       | 2200/8525 [20:05<26:39,  3.95pipeline/s]   

Generation 7 - Current best internal CV score: 0.03778541117976199


Optimization Progress:  29%|██▉       | 2475/8525 [23:01<25:13,  4.00pipeline/s]   

Generation 8 - Current best internal CV score: 0.03778541117976199


Optimization Progress:  32%|███▏      | 2750/8525 [25:46<34:20,  2.80pipeline/s]   

Generation 9 - Current best internal CV score: 0.03778541117976199


Optimization Progress:  35%|███▌      | 3025/8525 [28:20<26:51,  3.41pipeline/s]   

Generation 10 - Current best internal CV score: 0.03778541117976199


Optimization Progress:  39%|███▊      | 3300/8525 [31:02<28:07,  3.10pipeline/s]   

Generation 11 - Current best internal CV score: 0.03248031294398702


Optimization Progress:  42%|████▏     | 3575/8525 [34:05<41:42,  1.98pipeline/s]   

Generation 12 - Current best internal CV score: 0.03242916133376584


Optimization Progress:  45%|████▌     | 3850/8525 [36:43<35:40,  2.18pipeline/s]   

Generation 13 - Current best internal CV score: 0.03242916133376584


Optimization Progress:  48%|████▊     | 4125/8525 [39:29<20:11,  3.63pipeline/s]   

Generation 14 - Current best internal CV score: 0.03242916133376584


Optimization Progress:  52%|█████▏    | 4400/8525 [41:41<16:24,  4.19pipeline/s]   

Generation 15 - Current best internal CV score: 0.03057738000890678


Optimization Progress:  55%|█████▍    | 4675/8525 [44:11<19:14,  3.33pipeline/s]   

Generation 16 - Current best internal CV score: 0.03057738000890678


Optimization Progress:  58%|█████▊    | 4950/8525 [46:48<16:43,  3.56pipeline/s]   

Generation 17 - Current best internal CV score: 0.03057738000890678


Optimization Progress:  61%|██████▏   | 5225/8525 [49:18<17:00,  3.23pipeline/s]   

Generation 18 - Current best internal CV score: 0.030562448354449236


Optimization Progress:  65%|██████▍   | 5500/8525 [51:28<13:55,  3.62pipeline/s]   

Generation 19 - Current best internal CV score: 0.030562448354449236


Optimization Progress:  68%|██████▊   | 5775/8525 [53:40<12:21,  3.71pipeline/s]   

Generation 20 - Current best internal CV score: 0.030562448354449236


Optimization Progress:  71%|███████   | 6050/8525 [55:57<11:16,  3.66pipeline/s]   

Generation 21 - Current best internal CV score: 0.030562448354449236


Optimization Progress:  74%|███████▍  | 6325/8525 [58:15<09:13,  3.97pipeline/s]   

Generation 22 - Current best internal CV score: 0.030562448354449236


Optimization Progress:  77%|███████▋  | 6600/8525 [1:00:46<07:43,  4.15pipeline/s] 

Generation 23 - Current best internal CV score: 0.03053852728687564


Optimization Progress:  81%|████████  | 6875/8525 [1:03:19<08:07,  3.38pipeline/s]   

Generation 24 - Current best internal CV score: 0.03053852728687564


Optimization Progress:  90%|█████████ | 7700/8525 [1:12:50<05:20,  2.58pipeline/s]   

Generation 27 - Current best internal CV score: 0.03053852728687564


In [None]:
tpot2.export('pipeline.py')

In [None]:
df_hold['iceout']

In [33]:
tpot2.predict(df_hold[features].iloc[-4].fillna(0))



array([ 129.12308852])

In [24]:
df_hold['iceout'].iloc[-4]

140.61180555555555