In [None]:
import pandas as pd
from pandas.plotting import register_matplotlib_converters
import sys
import matplotlib.pyplot as plt
import numpy as np

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import datetime as dt
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import joblib

In [None]:
# HDF5 store that holds one monthly data table per key.
HDF_NAME = 'monthly_combined_data.hdf'
# Open the store read-only: this cell only lists keys, and the default
# append mode would silently create an empty store if the file were missing.
with pd.HDFStore(HDF_NAME, mode='r') as store:
    # Drop the first character of every key (the leading '/' of HDFStore
    # key names) so the keys can be reused in file names.
    monthly_keys = [key[1:] for key in store.keys()]

# When True, create_models() dumps every fitted model under models/ with joblib.
SAVE_MODELS = False
# Feature columns fed to the model pipeline.
X_COLUMNS = ['year', 'month', 'monthly_water_mean', 'monthly_water_total',
       'monthly_water_min', 'monthly_water_max', 'min_water_day',
       'max_water_day', 'mean_max_temp', 'mean_min_temp', 'mean_temp',
       'month_min_temp', 'month_max_temp', 'rain', 'total_snow', 'precip',
       'remaining_snow']
# Target column predicted by the model.
# NOTE(review): Y_COLUMN is also listed in X_COLUMNS, so the target is part
# of the model input -- confirm this is intended and not target leakage.
Y_COLUMN = 'monthly_water_mean'

In [None]:
def plot_errors(model, X_valid, y_valid, output_filename):
    """Save a histogram of the model's residuals on the validation data.

    The residuals are ``y_valid - model.predict(X_valid)``, drawn with 100
    bins and written to ``models/errors_<output_filename>.png``. The pyplot
    figure is closed afterwards; nothing is returned.
    """
    residuals = y_valid - model.predict(X_valid)
    plt.hist(residuals, bins=100)
    figure_path = 'models/errors_{}.png'.format(output_filename)
    plt.savefig(figure_path)
    plt.close()

def plot_model(df, model, output_filename, filter_month=None):
    """Plot actual vs. predicted target values, save the figure, then show it.

    Args:
        df: DataFrame containing the ``X_COLUMNS`` features and the
            ``Y_COLUMN`` target; its index is used as the x-axis.
        model: Fitted estimator exposing ``predict``.
        output_filename: Suffix of the saved image, which is written to
            ``models/predict_<output_filename>.png``.
        filter_month: When truthy, only rows whose ``month`` column equals
            this value are plotted.
    """
    if filter_month:
        df = df[df['month'] == filter_month]
    # real values in blue, predicted values in red
    plt.plot(df[Y_COLUMN], 'b.', alpha=0.5)
    plt.scatter(df.index, model.predict(df[X_COLUMNS]), s=1, alpha=0.7, c='r')
    # Save BEFORE showing: once plt.show() has run, pyplot may no longer hold
    # the drawn figure, in which case savefig would write a blank image.
    plt.savefig('models/predict_{}.png'.format(output_filename))
    plt.show()
    plt.close()
    
def plot_df_vals(df):
    """Overlay standardized line plots of selected columns of ``df``.

    Every column of ``df`` is rescaled with a ``StandardScaler`` and the
    original column labels and index are restored on the result. Six fixed
    columns are then drawn against the index, each in its own colour.
    """
    standardized = pd.DataFrame(StandardScaler().fit_transform(df))
    standardized.columns = df.columns
    standardized.index = df.index

    # (column, colour) pairs, drawn in this order.
    series_colors = (
        ('water_level', 'b'),
        ('precip', 'r'),
        ('rain', 'yellow'),
        ('mean_temp', 'g'),
        ('snow_precip', 'aqua'),
        ('snow_on_grnd', 'magenta'),
    )
    for column, color in series_colors:
        plt.plot(standardized.index, standardized[column], c=color)
    
def model_scores(model, X_train, X_valid, y_train, y_valid):
    """Return the model's ``score`` on the training and validation sets.

    Returns:
        dict with the keys ``'train_score'`` and ``'valid_score'``.
    """
    train_score = model.score(X_train, y_train)
    valid_score = model.score(X_valid, y_valid)
    return dict(train_score=train_score, valid_score=valid_score)

def train_and_generate_model(X_train, y_train, C, n_components, scaler):
    """Build and fit the imputer -> scaler -> PCA -> SVR pipeline.

    Args:
        X_train: Training features.
        y_train: Training targets.
        C: ``C`` parameter handed to ``SVR``.
        n_components: Number of components kept by ``PCA``.
        scaler: Scaler instance placed between the imputer and PCA.

    Returns:
        The pipeline after ``fit(X_train, y_train)`` has been called on it.
    """
    steps = (
        SimpleImputer(),
        scaler,
        # assume we have some kind of time, precip and temp dimensionality
        PCA(n_components=n_components),
        SVR(C=C, gamma='scale'),
    )
    pipeline = make_pipeline(*steps)
    pipeline.fit(X_train, y_train)
    return pipeline

def prepare_df(df):
    """Split ``df`` into the model inputs and the target values.

    Returns:
        A ``(features, target)`` tuple: the ``X_COLUMNS`` sub-frame of ``df``
        and the ``.values`` of its ``Y_COLUMN`` column.
    """
    features = df[X_COLUMNS]
    target = df[Y_COLUMN].values
    return features, target

In [None]:
def create_models(key, df=None, filter_month=None, show_plot=False, n_components=5, C=100, scaler=None, random_state=None):
    """Train one pipeline for ``key`` and report its train/validation scores.

    Args:
        key: HDF key of the dataset; also used in plot and model file names.
        df: Pre-loaded DataFrame. When None, the table stored under ``key``
            in ``HDF_NAME`` is read, sorted by index and stripped of rows
            containing NaN.
        filter_month: Passed to ``plot_model`` to restrict the plotted rows.
        show_plot: When True, plot predictions against the real values.
        n_components: Number of PCA components used by the pipeline.
        C: ``C`` parameter of the SVR.
        scaler: Scaler instance used by the pipeline; it is fitted in place.
            When None, a fresh ``MinMaxScaler`` is created for this call.
        random_state: Forwarded to ``train_test_split``; pass an int to get
            a reproducible train/validation split.

    Returns:
        ``(model, scores)`` where ``scores`` is the dict produced by
        ``model_scores``, or ``(None, None)`` when there are no rows.
    """
    if df is None:
        df = pd.read_hdf(HDF_NAME, key=key).sort_index().dropna()

    # Nothing to train on: bail out before touching any columns.
    if len(df) == 0:
        return None, None

    # A default of ``MinMaxScaler()`` in the signature would be one shared
    # instance that every call re-fits, silently altering the pipelines
    # returned by earlier calls. Build a new scaler per call instead.
    if scaler is None:
        scaler = MinMaxScaler()

    X, y = prepare_df(df)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=random_state)

    model = train_and_generate_model(X_train, y_train, C=C, n_components=n_components, scaler=scaler)

    if show_plot:
        plot_model(df, model, key, filter_month)
    if SAVE_MODELS:
        joblib.dump(model, 'models/model-{}.pckl'.format(key))

    return model, model_scores(model, X_train, X_valid, y_train, y_valid)

In [None]:
def PCA_test(key):
    """Print the scores obtained on ``key`` for PCA sizes 2 through 7.

    The table stored under ``key`` is read once (sorted by index, rows with
    NaN dropped) and reused for every run. One line is printed per
    component count, followed by an empty line.
    """
    frame = pd.read_hdf(HDF_NAME, key=key).sort_index().dropna()
    for n_components in range(2, 8):
        _, scores = create_models(key, df=frame, n_components=n_components)
        print(key, n_components, scores)
    print()

In [None]:
def SVR_test(key):
    """Print the scores obtained on ``key`` for C = 10**-3 through 10**2.

    The table stored under ``key`` is read once (sorted by index, rows with
    NaN dropped) and reused for every run. One line is printed per
    exponent, followed by an empty line.
    """
    frame = pd.read_hdf(HDF_NAME, key=key).sort_index().dropna()
    for exponent in range(-3, 3):
        _, scores = create_models(key, df=frame, C=10 ** exponent)
        print(key, exponent, scores)
    print()

In [None]:
def Scaler_test(key):
    """Print the scores obtained on ``key`` with each candidate scaler.

    The table stored under ``key`` is read once (sorted by index, rows with
    NaN dropped) and reused for every run. A new scaler instance is built
    for each run; one line is printed per scaler, followed by an empty line.
    """
    frame = pd.read_hdf(HDF_NAME, key=key).sort_index().dropna()
    labelled_scalers = (
        ('standard', StandardScaler),
        ('minmax', MinMaxScaler),
        ('robust', RobustScaler),
    )
    for label, scaler_cls in labelled_scalers:
        _, scores = create_models(key, df=frame, scaler=scaler_cls())
        print(key, label, scores)
    print()

In [None]:
# Train and plot one model per monthly key. Accuracy suffers from the lack
# of data points available per key.
for key in monthly_keys:
    df = pd.read_hdf(HDF_NAME, key=key).sort_index()
    # Re-index every row by the first day of its (year, month) so the plot
    # has dates on the x-axis. A list is built instead of a row-wise
    # df.apply(..., axis=1), which returns a DataFrame for an empty frame
    # and would make the index assignment fail before create_models() can
    # return (None, None) for it.
    df.index = [
        dt.date(year=int(year), month=int(month), day=1)
        for year, month in zip(df['year'], df['month'])
    ]
    print(key, create_models(key, df=df, show_plot=True)[1])
# real values are in blue, predicted values are in red

In [None]:
# WILL NOT RUN QUICKLY, USED FOR OPTIMIZING
# Sweep the number of PCA components (PCA_test tries 2 through 7) for every
# monthly key and print the resulting scores. 5 looks pretty good.
for key in monthly_keys:
    PCA_test(key)

In [None]:
# WILL NOT RUN QUICKLY, USED FOR OPTIMIZING
# Sweep the SVR C value (SVR_test tries 10**-3 through 10**2) for every
# monthly key and print the resulting scores. C = 10**2 (100, the default in
# create_models) looks good in terms of the time/accuracy tradeoff.
for key in monthly_keys:
    SVR_test(key)

In [None]:
# WILL NOT RUN QUICKLY, USED FOR OPTIMIZING
# Compare the standard, min-max and robust scalers (see Scaler_test) for
# every monthly key and print the resulting scores. minmax looks the best.
for key in monthly_keys:
    Scaler_test(key)