In [None]:
import pandas as pd
from pandas.plotting import register_matplotlib_converters
import sys
import matplotlib.pyplot as plt
import numpy as np

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import datetime as dt
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import joblib

In [None]:
# Let matplotlib understand pandas datetime types on plot axes.
register_matplotlib_converters()

# Open the store read-only. The HDFStore default mode ('a') would silently create an
# empty file when the path is wrong, leaving daily_keys empty with no error.
with pd.HDFStore('daily_combined_data.hdf', mode='r') as store:
    # The first character of each store key is dropped (HDFStore reports keys as
    # absolute paths such as '/name') so the result can be passed as `key=` later.
    daily_keys = [key[1:] for key in store.keys()]

In [None]:
def plot_errors(model, X_valid, y_valid, output_filename):
    """Save a histogram of the validation residuals for ``model``.

    The residuals are ``y_valid - model.predict(X_valid)``, drawn with 100 bins.
    The figure is written to ``models/errors_<output_filename>.png`` and the
    current pyplot figure is closed afterwards.
    """
    residuals = y_valid - model.predict(X_valid)
    plt.hist(residuals, bins=100)
    target_path = 'models/errors_{}.png'.format(output_filename)
    plt.savefig(target_path)
    plt.close()

def plot_model(df, model, output_filename, filter_month=None):
    """Plot observed against predicted water level and save the figure.

    Parameters
    ----------
    df : DataFrame holding 'date', 'water_level', 'month' (only read when
        filtering) and every column listed in the notebook-level X_COLUMNS.
        The 'date' column is the one prepare_df() adds.
    model : fitted estimator exposing ``predict``.
    output_filename : inserted into ``models/predict_<output_filename>.png``.
    filter_month : when truthy, only rows whose 'month' equals this value are drawn.
    """
    if filter_month:
        df = df[df['month'] == filter_month]
    # real values in blue, predicted values in red
    plt.plot(df['date'], df['water_level'], 'b.', alpha=0.5)
    plt.scatter(df['date'], model.predict(df[X_COLUMNS]), s=1, alpha=0.5, c='r')
    # Save BEFORE showing: once plt.show() returns, the figure has typically been
    # released by the backend, so a later savefig() writes an empty image.
    plt.savefig('models/predict_{}.png'.format(output_filename))
    plt.show()
    plt.close()
    

    
def model_scores(model, X_train, X_valid, y_train, y_valid):
    """Return the model's score on the training and validation splits.

    The result is a dict with keys 'train_score' and 'valid_score', each holding
    whatever ``model.score`` returns for that split (R^2 for scikit-learn
    regressors).
    """
    train_score = model.score(X_train, y_train)
    valid_score = model.score(X_valid, y_valid)
    return dict(train_score=train_score, valid_score=valid_score)

def train_and_generate_model(X_train, y_train, C, n_components, scaler):
    """Fit and return an impute -> scale -> PCA -> SVR pipeline.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    C : regularisation parameter handed to ``SVR``.
    n_components : number of components kept by ``PCA``.
    scaler : scaler instance placed between the imputer and PCA.
    """
    steps = [
        SimpleImputer(),
        scaler,
        # assume we have some kind of time, precip and temp dimensionality
        PCA(n_components=n_components),
        SVR(C=C, gamma='scale'),
    ]
    pipeline = make_pipeline(*steps)
    pipeline.fit(X_train, y_train)
    return pipeline

def prepare_df(df):
    """Split a station frame into model features and target.

    Side effect: a 'date' column is added to ``df`` IN PLACE, holding
    ``datetime.toordinal`` applied to each index entry (so the index is expected
    to be datetime-like). plot_model() reads that column from the same frame.

    Returns
    -------
    (X, y) : ``X`` is ``df[X_COLUMNS]`` (X_COLUMNS is a notebook-level list) and
        ``y`` is the 'water_level' column as a NumPy array.
    """
    target_column = 'water_level'
    df['date'] = df.index.map(dt.datetime.toordinal)
    X = df[X_COLUMNS]
    y = df[target_column].values
    return (X, y)

In [None]:
def create_models(key, df=None, filter_month=None, show_plot=False, n_components=5, C=100, scaler=None,
                  random_state=None):
    """Train a water-level model for one station and report its scores.

    Parameters
    ----------
    key : station key; used to read the daily HDF file when ``df`` is None and
        to name the plot / model files.
    df : optional pre-loaded frame. prepare_df() adds a 'date' column to it in place.
    filter_month : forwarded to plot_model() when ``show_plot`` is true.
    show_plot : when true, plot observed vs. predicted values over the whole frame.
    n_components, C : forwarded to the PCA and SVR steps of the pipeline.
    scaler : scaler instance for the pipeline. ``None`` (default) builds a fresh
        ``MinMaxScaler`` per call, so pipelines returned by different calls never
        share one scaler object that a later fit would silently refit.
    random_state : forwarded to ``train_test_split``; pass an int for a
        reproducible split. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (model, scores) : the fitted pipeline and the dict from model_scores(), or
        ``(None, None)`` when the frame has no rows.

    The fitted pipeline is also dumped to ``models/model-<key>.pckl`` whenever
    the notebook-level ``SAVE_MODELS`` flag is true.
    """
    if df is None:
        df = pd.read_hdf('daily_combined_data.hdf', key=key).sort_index().dropna()

    # Bail out before doing any work on (or adding columns to) an empty frame.
    if len(df) == 0:
        return None, None

    if scaler is None:
        scaler = MinMaxScaler()

    X, y = prepare_df(df)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=random_state)

    model = train_and_generate_model(X_train, y_train, C=C, n_components=n_components, scaler=scaler)

    if show_plot:
        plot_model(df, model, key, filter_month)
    if SAVE_MODELS:
        joblib.dump(model, 'models/model-{}.pckl'.format(key))

    return model, model_scores(model, X_train, X_valid, y_train, y_valid)

In [None]:
def PCA_test(key, daily=True):
    """Print model scores for PCA component counts 2 through 7.

    The station frame is read from the daily HDF file when ``daily`` is truthy,
    otherwise from the monthly one. It is sorted by index and rows with missing
    values are dropped. Each line printed is: key, component count, scores dict.
    A blank line is printed at the end.
    """
    if daily:
        source_file = 'daily_combined_data.hdf'
    else:
        source_file = 'monthly_combined_data.hdf'

    station_frame = pd.read_hdf(source_file, key=key).sort_index().dropna()
    for component_count in range(2, 8):
        _, scores = create_models(key, df=station_frame, n_components=component_count)
        print(key, component_count, scores)
    print()

In [None]:
def SVR_test(key, daily=True):
    """Print model scores for SVR ``C`` values 10**-3 through 10**2.

    The station frame is read from the daily HDF file when ``daily`` is truthy,
    otherwise from the monthly one. It is sorted by index and rows with missing
    values are dropped. Each line printed is: key, exponent, scores dict.
    A blank line is printed at the end.
    """
    if daily:
        source_file = 'daily_combined_data.hdf'
    else:
        source_file = 'monthly_combined_data.hdf'

    station_frame = pd.read_hdf(source_file, key=key).sort_index().dropna()
    for exponent in range(-3, 3):
        _, scores = create_models(key, df=station_frame, C=10**exponent)
        print(key, exponent, scores)
    print()

In [None]:
def Scaler_test(key, daily=True):
    """Print model scores for the Standard, MinMax and Robust scalers, in that order.

    The station frame is read from the daily HDF file when ``daily`` is truthy,
    otherwise from the monthly one. It is sorted by index and rows with missing
    values are dropped. Each line printed is: key, scaler label, scores dict.
    A blank line is printed at the end.
    """
    if daily:
        source_file = 'daily_combined_data.hdf'
    else:
        source_file = 'monthly_combined_data.hdf'

    station_frame = pd.read_hdf(source_file, key=key).sort_index().dropna()
    candidates = (
        ('standard', StandardScaler),
        ('minmax', MinMaxScaler),
        ('robust', RobustScaler),
    )
    for label, scaler_cls in candidates:
        # A new scaler instance is built right before each training run.
        _, scores = create_models(key, df=station_frame, scaler=scaler_cls())
        print(key, label, scores)
    print()

In [None]:
# Notebook-wide settings. The functions defined in earlier cells read these as
# globals when they are called, so this cell must run before any model is built.

# When True, create_models() dumps every fitted pipeline to models/ with joblib.
SAVE_MODELS = False
# Feature columns fed to the model. 'date' is not read from the HDF file; it is
# the column prepare_df() adds from the frame's index.
X_COLUMNS = ['date','year', 'month', 'rain', 'precip', 'mean_temp', 'snow_on_grnd', 'snow_precip']
# Display the station keys found in the daily HDF store.
daily_keys

In [None]:
# WILL TERMINATE <5 mins. Use this to build models. Specific keys can also be set, shown above ^
# daily models
SAVE_MODELS=True
my_model_scores = dict()
# There are thirty stations in the dataframe
# Some of these do not have enough good data to build a model off of
# change slice_size to build models for more than the first 5 stations
slice_size = 5
for key in daily_keys[:slice_size]:
    # create_models() already writes models/model-<key>.pckl while SAVE_MODELS is
    # True, so the model is not dumped a second time here.
    model, scores = create_models(key, show_plot=True)
    print(key, scores)
    # scores is None for a station whose frame is empty after dropna().
    my_model_scores[key] = scores

In [None]:
# Display the scores collected per station key by the model-building loop.
my_model_scores

In [None]:
# WILL NOT TERMINATE QUICKLY
# Sweep the number of PCA components for every station (each run retrains a model).
# Result so far: 5 components looks pretty good.
for key in daily_keys:
    PCA_test(key)

In [None]:
# WILL NOT TERMINATE QUICKLY
# Sweep the SVR C value for every station (each run retrains a model).
# Result so far: C = 1e2 (100) looks good in terms of the time/accuracy tradeoff.
for key in daily_keys:
    SVR_test(key)

In [None]:
# WILL NOT TERMINATE QUICKLY
# Compare the candidate scalers for every station (each run retrains a model).
# Result so far: minmax looks the best.
for key in daily_keys:
    Scaler_test(key)

In [None]:
# Use this to filter a date range out of a dataframe and check the prediction
# Station whose stored daily data and saved model are inspected in this cell.
key =  'hydro_08MH149'
# Load that station's rows, sorted by index, dropping rows with missing values.
df = pd.read_hdf('daily_combined_data.hdf', key=key).sort_index().dropna()

# Bounds of the date range to keep (both ends are included by the filter).
start ='1990-01-01'#format: %Y-%m-%d 
end = '2000-07-16' #format: %Y-%m-%d
def filter_by_date_range(df, start, end):
    """Return the rows of ``df`` whose index date lies in ``[start, end]``.

    The index is parsed with ``pd.to_datetime`` (format ``%Y-%m-%d``); entries
    that cannot be parsed become NaT and are dropped. Both bounds are inclusive.
    The index is read directly, so the frame's index may have any name (or none).

    Parameters
    ----------
    df : DataFrame indexed by dates (datetime values or ``%Y-%m-%d`` strings).
    start, end : bounds comparable with timestamps, e.g. ``'1990-01-01'``.

    Returns
    -------
    A new DataFrame (``df`` is not modified) with a datetime index named 'Date/Time'.
    """
    timestamps = pd.to_datetime(df.index, format="%Y-%m-%d", errors='coerce')
    in_range = timestamps.notna() & (timestamps >= start) & (timestamps <= end)

    # .copy() gives callers an independent frame they can add columns to.
    selected = df[in_range].copy()
    selected.index = timestamps[in_range].rename('Date/Time')
    return selected

# Load the pipeline previously saved for this station. The path uses the same
# 'models/model-<key>.pckl' pattern the model is saved under, instead of slicing
# the 'hydro_' prefix off the key and re-adding it (which only works for keys
# that start with exactly that prefix).
# NOTE: joblib.load unpickles arbitrary objects; only load model files you created.
model = joblib.load('./models/model-{}.pckl'.format(key))

# Predict water levels for the rows inside [start, end].
model.predict(prepare_df(filter_by_date_range(df, start, end))[0])

In [None]:
# Retrain the model for one station, plot observed vs. predicted values, and
# display its train/validation scores. No month filter is applied here; pass
# filter_month=<month number> to restrict the plot to a single month.
# Note: while SAVE_MODELS is True this also overwrites the saved model file.
create_models('hydro_08MH149', show_plot=True)[1]