In [1]:
import joblib
import numpy as np
import pandas as pd
import datetime as dt

In [2]:
# Column names given to the generated regression input. They are assigned
# positionally in generate_regression_input, so the order here matters.
X_COLUMNS = ['year', 'month', 'date', 'rain', 'precip', 'mean_temp', 'snow_on_grnd', 'snow_precip']

station_data = pd.read_csv('./index_data/closest_weather_to_hydro_stations.csv')

# The first CSV column is renamed by position to 'hydro_id'; every other
# column is renamed by its header. Each header appears exactly once
# (the original dict repeated 'First Year' and 'Last Year').
rename_map = {
    station_data.columns[0]: 'hydro_id',
    'Latitude (Decimal Degrees)': 'latitude',
    'Longitude (Decimal Degrees)': 'longitude',
    'Name': 'weather_name',
    'Climate ID': 'weather_climate_id',
    'Station ID': 'weather_station_id',
    'First Year': 'weather_first_year',
    'Last Year': 'weather_last_year',
    'Elevation (m)': 'weather_elevation(m)',
}
# Reassign instead of inplace=True so re-running the cell stays predictable.
station_data = station_data.rename(columns=rename_map)

# Module-level placeholders read by genday(); they are rebound to real
# DataFrames once get_weather_cluster_data() has been called.
mean_data, stdev_data = None, None

In [40]:
def load_model(hydro_id):
    """Load the persisted model for one hydro station.

    Reads ``./models/model-hydro_<hydro_id>.pckl`` with ``joblib.load`` and
    returns whatever object that file contains.

    NOTE(review): joblib files are pickle-based, so only load model files
    that come from a trusted source.
    """
    model_path = './models/model-hydro_{hydro_id}.pckl'.format(hydro_id=hydro_id)
    return joblib.load(model_path)


def get_weather_cluster_data(hydro_id):
    """Load the weather statistics for the station paired with a hydro station.

    Looks up ``hydro_id`` in the module-level ``station_data`` table to find
    its ``weather_station_id``, then reads that station's two CSV files from
    ``./cluster_data/``.

    Parameters
    ----------
    hydro_id
        Value compared against ``station_data['hydro_id']``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Contents of ``<weather_station_id>-daily.csv`` and
        ``<weather_station_id>-stdev.csv``, in that order.

    Raises
    ------
    IndexError
        If no row of ``station_data`` has this ``hydro_id``. The exception
        type is unchanged from the previous implicit failure, but the
        message now names the offending id.
    """
    station_ids = station_data[station_data['hydro_id'] == hydro_id]['weather_station_id'].values
    if len(station_ids) == 0:
        raise IndexError(
            'no weather station is mapped to hydro station {!r}'.format(hydro_id)
        )
    # If several rows match, the first one wins (same as before).
    weather_station_id = station_ids[0]

    # Local names deliberately differ from the module-level mean_data /
    # stdev_data so it is clear this function does not rebind them.
    daily_means = pd.read_csv('./cluster_data/{}-daily.csv'.format(weather_station_id))
    daily_stdevs = pd.read_csv('./cluster_data/{}-stdev.csv'.format(weather_station_id))

    return daily_means, daily_stdevs


def filter_df(df, month, day):
    """Return the rows of ``df`` whose 'month' and 'day' columns match.

    ``df`` must have 'month' and 'day' columns; the result keeps the
    original index and all columns.
    """
    on_month = df['month'] == month
    on_day = df['day'] == day
    return df[on_month & on_day]


def genday(month, day, cluster_id):
    """Draw one day of synthetic weather for a calendar day and cluster.

    Reads the module-level ``mean_data`` and ``stdev_data`` frames (both must
    already be loaded, e.g. with ``get_weather_cluster_data``), selects the
    rows for ``month``/``day`` and samples every weather variable from a
    normal distribution with those means and standard deviations.

    Parameters
    ----------
    month, day
        Values matched against the 'month' and 'day' columns.
    cluster_id
        Value matched against the 'cluster_id' column.

    Returns
    -------
    numpy.ndarray
        Five sampled values ordered as rain, precip, mean_temp,
        snow_on_grnd, snow_precip. These are raw normal draws, so any of
        them may be negative.

    Raises
    ------
    IndexError
        If no mean row or no standard-deviation row matches the request.
    """
    generate_cols = ['rain', 'precip', 'mean_temp', 'snow_on_grnd', 'snow_precip']

    means = filter_df(mean_data, month, day)
    means = means[means['cluster_id'] == cluster_id][generate_cols]

    stds = filter_df(stdev_data, month, day)
    # The means are cluster-specific, so the spread has to come from the same
    # cluster. Previously every cluster's row was kept and broadcasting plus
    # `[0]` silently used the first one. The guard keeps the old behaviour
    # for a stdev table that has no 'cluster_id' column.
    # NOTE(review): confirm the schema of the *-stdev.csv files.
    if 'cluster_id' in stds.columns:
        stds = stds[stds['cluster_id'] == cluster_id]
    stds = stds[generate_cols]

    if len(means) == 0 or len(stds) == 0:
        raise IndexError(
            'no weather statistics for month={}, day={}, cluster_id={}'.format(
                month, day, cluster_id
            )
        )

    # Row 0 is the single (month, day, cluster) draw.
    return np.random.normal(means, stds)[0]

def generate_weather_data(df_row, cluster_id):
    """Generate one row of synthetic weather for a calendar row.

    Calls ``genday`` with the row's 'month' and 'day' and folds negative
    draws back to positive for every variable except temperature.

    Parameters
    ----------
    df_row
        Mapping or Series with 'month' and 'day' entries.
    cluster_id
        Passed through to ``genday``.

    Returns
    -------
    pandas.Series
        Values in ``genday`` order (rain, precip, mean_temp, snow_on_grnd,
        snow_precip) with a default integer index.
    """
    # 'mean_temp' is at position 2 in genday's column order. The previous
    # code skipped position 3 (snow_on_grnd), which flipped sub-zero
    # temperatures positive and let snow depth stay negative.
    temp_position = 2

    sample = genday(df_row['month'], df_row['day'], cluster_id)

    # Only temperature may legitimately be negative.
    return pd.Series(
        [val if pos == temp_position else abs(val) for pos, val in enumerate(sample)]
    )

def generate_regression_input(start_year, start_month, start_day, days, cluster_id):
    """Build a frame of calendar features plus synthetic weather.

    Creates ``days`` consecutive daily rows starting at the given date,
    adds year / month / ordinal-date columns, generates weather for each row
    with ``generate_weather_data`` and labels the result with ``X_COLUMNS``.

    Parameters
    ----------
    start_year, start_month, start_day
        Components of the first date.
    days
        Number of consecutive days to generate.
    cluster_id
        Passed through to ``generate_weather_data``.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Date/Time', with columns named by ``X_COLUMNS``.
    """
    first_day = '{year}-{month}-{date}'.format(year=start_year, month=start_month, date=start_day)

    calendar = pd.DataFrame({'Date/Time': pd.date_range(start=first_day, periods=days)})
    calendar = calendar.set_index('Date/Time')
    calendar['year'] = calendar.index.year
    calendar['month'] = calendar.index.month
    # 'day' is only needed to look up the weather statistics; dropped below.
    calendar['day'] = calendar.index.day
    calendar['date'] = calendar.index.map(dt.datetime.toordinal)

    weather = calendar.apply(generate_weather_data, axis=1, cluster_id=cluster_id)

    features = calendar.join(weather).drop(columns='day')
    # Positional relabel: year, month, date, then the five weather values.
    features.columns = X_COLUMNS

    return features

In [45]:
# '[6:]' drops the 6-character 'hydro_' prefix, leaving the bare station id '08JA023'.
hydro_id = 'hydro_08JA023'[6:]
model = load_model(hydro_id)
# Rebinds the module-level mean_data / stdev_data that genday() reads, so this
# line must run before any generation call.
mean_data, stdev_data = get_weather_cluster_data(hydro_id)
# NOTE(review): the inputs are drawn with np.random and no seed is set, so the
# generated weather differs on every run.
model.predict(generate_regression_input(start_year=2000, start_month=8, start_day=2, cluster_id=1, days=100))

array([10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76289376,
       10.76289376, 10.76289376, 10.76289376, 10.76289376, 10.76

In [44]:
# Preview ten days of generated regression input starting 1994-07-02 for
# cluster 0; uses whichever mean_data / stdev_data are currently loaded.
generate_regression_input(start_year=1994, start_month=7, start_day=2, cluster_id=0, days=10)

Unnamed: 0_level_0,year,month,date,rain,precip,mean_temp,snow_on_grnd,snow_precip
Date/Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1994-07-02,1994,7,728111,4.837808,0.478239,7.349932,0.0,0.0
1994-07-03,1994,7,728112,2.165093,2.159186,13.100265,0.0,0.0
1994-07-04,1994,7,728113,6.712967,1.223777,10.160355,0.0,0.0
1994-07-05,1994,7,728114,5.804914,4.658343,11.638643,0.0,0.0
1994-07-06,1994,7,728115,1.926045,0.776511,11.33825,0.0,0.0
1994-07-07,1994,7,728116,12.025288,12.135808,13.243529,0.0,0.0
1994-07-08,1994,7,728117,2.464072,8.359698,11.888983,0.0,0.0
1994-07-09,1994,7,728118,12.671223,6.192085,11.193884,0.0,0.0
1994-07-10,1994,7,728119,0.432386,0.018296,7.015177,0.0,0.0
1994-07-11,1994,7,728120,2.941836,4.811169,9.441087,0.0,0.0


In [38]:
# Draw a single synthetic day for month 1, day 1, cluster 1. Values come back
# in the order rain, precip, mean_temp, snow_on_grnd, snow_precip.
genday(1,1,1)

array([  1.81058686,   0.64302128, -26.55385826,  21.32645567,
         1.24637177])

In [39]:
# NOTE(review): leftover scratch call. generate_regression_input has five
# required parameters and no defaults, so this always raises TypeError and
# stops a Restart & Run All. Remove the cell or supply the arguments.
generate_regression_input()

TypeError: generate_regression_input() missing 5 required positional arguments: 'start_year', 'start_month', 'start_day', 'days', and 'cluster_id'