In [35]:
import pandas as pd
import numpy as np
import csv
import statsmodels.api as sm
import warnings
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from itertools import combinations
from scipy import stats
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from datetime import datetime
from dateutil.relativedelta import relativedelta
warnings.filterwarnings("ignore")


file = 'rainfalldata.csv'
rd = pd.read_csv(file)
file2 = 'data/ncrainfalldata.csv'
ncrd = pd.read_csv(file2)
rd.Date = pd.to_datetime(rd.Date)
rd = rd.set_index('Date')
ncrd.Date = pd.to_datetime(ncrd.Date)
ncrd = ncrd.set_index('Date')

In [8]:
# this cell takes the stored exogen dictionary that is stored in the Data_Wrangling_CAP1 jupyter notebook
# that was imported above.
%store -r exogen


In [9]:
exogen.keys()

dict_keys(['Arcola, NC', 'Henderson 2 NNW, NC', 'Laurinburg, NC', 'Roanoke Rapids, NC', 'Murfreesboro, NC', 'Lumberton Area, NC', 'LONGWOOD, NC', 'WHITEVILLE 7 NW, NC', 'Charlotte Area, NC', 'Mount Mitchell Area, NC', 'ASHEVILLE AIRPORT, NC', 'BANNER ELK, NC', 'BEECH MOUNTAIN, NC', 'BRYSON CITY 4, NC', 'BREVARD, NC', 'CASAR, NC', 'COWEETA EXP STATION, NC', 'CULLOWHEE, NC', 'FOREST CITY 8 W, NC', 'FRANKLIN, NC', 'GASTONIA, NC', 'GRANDFATHER MTN, NC', ' HENDERSONVILLE 1 NE, NC', ' HIGHLANDS, NC', 'HOT SPRINGS, NC', 'LAKE LURE 2, NC', 'LAKE TOXAWAY 2 SW, NC', 'MARSHALL, NC', 'MONROE 2 SE, NC', ' MOUNT HOLLY 4 NE, NC', ' OCONALUFTEE, NC', 'PISGAH FOREST 3 NE, NC', 'ROBBINSVILLE AG 5 NE, NC', 'ROSMAN, NC', 'SHELBY 2 NW, NC', 'TAPOCO, NC', 'TRYON, NC', 'WAYNESVILLE 1 E, NC', 'Boone 1 SE, NC', 'DANBURY, NC', 'EDEN, NC', ' MOUNT AIRY 2 W, NC', 'REIDSVILLE 2 NW, NC', 'HAYESVILLE 1 NE, NC', 'MURPHY 4ESE, NC', ' KING, NC'])

In [7]:
def sarima_model_creation(data, p, d, q, P, D, Q, m, exog=None):
    my_order = [p,d,q]
    my_sorder = [P,D,Q,m]
    sarimamod = sm.tsa.statespace.SARIMAX(data, exog, order=my_order, seasonal_order=my_sorder, 
                                          enforce_stationarity=False, enforce_invertibility=False,
                                          initialization='approximate_diffuse')
    model_fit = sarimamod.fit()# start_params=[0, 0, 0, 0, 1])
    return(model_fit)

In [5]:
def hyperparameter_find(training_data, comb, testing_data, search = False, exogtr = None, exogtest = None):
    leastmae = 1000
    for com in tqdm(comb):
        li_one_step = []
        for i in tqdm(range(len(testing_data))):
            if i == 0:
                copytraining = training_data.copy()
                if exogtr is not None:
                    excopy = exogtr.copy()
                    mod_1 = sarima_model_creation(copytraining, com[0], 0, com[1], com[2], 0, 
                                                  com[3], 12, exog=excopy)
                    one_step_pred = mod_1.forecast(exog=excopy.iloc[[-1]]) #uses the data from the year before
                    excopy = pd.concat([excopy, exogtest.iloc[[i]]])
                else:
                    mod_1 = sarima_model_creation(copytraining, com[0], 0, com[1], com[2], 0, com[3], 12)
                    one_step_pred = mod_1.forecast()
                li_one_step.append(one_step_pred[0])
                copytraining = pd.concat([copytraining, testing_data[[i]]])
            else:
                if exogtr is not None:
                    mod_1 = sarima_model_creation(copytraining, com[0], 0, com[1], com[2], 0, 
                                                  com[3], 12, exog=excopy)
                    one_step_pred2 = mod_1.forecast(exog=excopy.iloc[[-1]])
                    excopy = pd.concat([excopy, exogtest.iloc[[i]]])
                else:
                    mod_1 = sarima_model_creation(copytraining, com[0], 0, com[1], com[2], 0, com[3], 12)
                    one_step_pred2 = mod_1.forecast()
                li_one_step.append(one_step_pred2[0])
                copytraining = pd.concat([copytraining, testing_data[[i]]])
        mae = mean_absolute_error(testing_data, li_one_step)
        if search is True:
            if mae < leastmae:
                leastmae = mae
                H_AR = com[0]
                H_MA = com[1]
                H_SAR = com[2]
                H_SMA = com[3]
            print(com,mae)            
    if search is True:
        return('AR: '+ str(H_AR), 'MA: ' +str(H_MA), 'SAR: '+str(H_SAR), 'SMA: '+str(H_SMA))
    else:
        return(mae)

In [6]:
def exog_combinations(df, exoe):
    lo_dfs = []
    if len(exoe) == 1:
        lo_dfs.append(df.loc[:,exoe])
    if len(exoe) > 1:
        lo_dfs.append(df.loc[:,exoe])
        for ex in exoe:
            lo_dfs.append(df.loc[:,[ex]])
        if len(exoe) >2:
            for i in range(2, len(exoe)):
                combolist = list(combinations(exoe,i))
                for c in combolist:
                    lo_dfs.append(df.loc[:,c])
    return(lo_dfs)


In [7]:
todokeys = ('TAPOCO, NC', 'TRYON, NC', 'WAYNESVILLE 1 E, NC', 'Boone 1 SE, NC', 'DANBURY, NC', 'EDEN, NC', ' MOUNT AIRY 2 W, NC', 'REIDSVILLE 2 NW, NC', 'HAYESVILLE 1 NE, NC', 'MURPHY 4ESE, NC', ' KING, NC')
sub_exogen = {k: exogen[k] for k in todokeys}

In [8]:
from collections import defaultdict
l_o_dfs = defaultdict(list)
for key,value in tqdm(sub_exogen.items()):
    lo_dfs2 = exog_combinations(rd, value)
    l_o_dfs[key] = lo_dfs2
# l_o_dfs['ROBBINSVILLE AG 5 NE, NC']

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




In [9]:
def exogenous_var(data, ncloc, l_exoloc, best_comb):
#     for key, value in tqdm(exo_dict.items()):
    dat = data[ncloc]
#         l_exog = exog_combinations(data, value)
    tr, test = train_test_split(dat, test_size = 0.2, shuffle=False)
    keymae = hyperparameter_find(tr, best_comb, test)
    print('keymae of: '+ key +' = '+str(keymae))
    bettermae = {}
    for exog in tqdm(l_exoloc):
        extr, extest = train_test_split(exog, test_size = 0.2, shuffle=False)
        exmae = hyperparameter_find(tr, best_comb, test, exogtr=extr, exogtest = extest)
        co = tuple(exog.columns)
        print('exmae = {}'.format(co) + ' '+ str(exmae))
        if exmae < keymae:
            bettermae[co] = exmae
            bettermae2 = {key: bettermae}
    return(co)

In [10]:
best_comb = [[4,3,3,4]]
warnings.filterwarnings("ignore")
for key,value in tqdm(l_o_dfs.items()):
    exogenous_var(rd, key, value, best_comb)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

keymae of: TAPOCO, NC = 0.9800673566131274


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('GATLINBURG 2 SW, TN', 'NEWFOUND GAP, TN', ' TOWNSEND 5S, TN') 1.5764560295913925


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('GATLINBURG 2 SW, TN',) 1.4972163335360191


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('NEWFOUND GAP, TN',) 1.5501456737516661


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = (' TOWNSEND 5S, TN',) 2.1402191932997416


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('GATLINBURG 2 SW, TN', 'NEWFOUND GAP, TN') 1.5489398080891763


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('GATLINBURG 2 SW, TN', ' TOWNSEND 5S, TN') 1.616671690504988


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('NEWFOUND GAP, TN', ' TOWNSEND 5S, TN') 2.436971933451168


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

keymae of: TRYON, NC = 2.6823507498407078


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CAESARS HEAD, SC', 'CHESNEE 7 WSW, SC', 'CLEVELAND 3S, SC', 'SPARTANBURG 3 SSE, SC') 3.731134415701127


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC',) 2.986931810013167


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CAESARS HEAD, SC',) 3.1072872524201767


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CHESNEE 7 WSW, SC',) 2.815911182694172


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CLEVELAND 3S, SC',) 3.077446176795452


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('SPARTANBURG 3 SSE, SC',) 2.812649001967126


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CAESARS HEAD, SC') 2.942549753368086


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CHESNEE 7 WSW, SC') 2.8608751566191


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CLEVELAND 3S, SC') 3.049632259088924


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'SPARTANBURG 3 SSE, SC') 2.9144776099202763


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CAESARS HEAD, SC', 'CHESNEE 7 WSW, SC') 2.9762863578022682


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CAESARS HEAD, SC', 'CLEVELAND 3S, SC') 3.20189310778487


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CAESARS HEAD, SC', 'SPARTANBURG 3 SSE, SC') 3.096819228892839


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CHESNEE 7 WSW, SC', 'CLEVELAND 3S, SC') 2.9903458656920554


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CHESNEE 7 WSW, SC', 'SPARTANBURG 3 SSE, SC') 2.890486851482266


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CLEVELAND 3S, SC', 'SPARTANBURG 3 SSE, SC') 3.0436085745720485


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CAESARS HEAD, SC', 'CHESNEE 7 WSW, SC') 2.9033859279655085


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CAESARS HEAD, SC', 'CLEVELAND 3S, SC') 3.154525441526979


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CAESARS HEAD, SC', 'SPARTANBURG 3 SSE, SC') 2.9314747667962813


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CHESNEE 7 WSW, SC', 'CLEVELAND 3S, SC') 2.9713486547381884


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CHESNEE 7 WSW, SC', 'SPARTANBURG 3 SSE, SC') 2.86854756656046


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CLEVELAND 3S, SC', 'SPARTANBURG 3 SSE, SC') 3.0161890393875477


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CAESARS HEAD, SC', 'CHESNEE 7 WSW, SC', 'CLEVELAND 3S, SC') 2.99567340879987


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CAESARS HEAD, SC', 'CHESNEE 7 WSW, SC', 'SPARTANBURG 3 SSE, SC') 2.9370572008592566


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CAESARS HEAD, SC', 'CLEVELAND 3S, SC', 'SPARTANBURG 3 SSE, SC') 3.277737214007516


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CHESNEE 7 WSW, SC', 'CLEVELAND 3S, SC', 'SPARTANBURG 3 SSE, SC') 2.9667226323555127


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CAESARS HEAD, SC', 'CHESNEE 7 WSW, SC', 'CLEVELAND 3S, SC') 3.190416268016882


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CAESARS HEAD, SC', 'CHESNEE 7 WSW, SC', 'SPARTANBURG 3 SSE, SC') 2.8771958771274124


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CAESARS HEAD, SC', 'CLEVELAND 3S, SC', 'SPARTANBURG 3 SSE, SC') 3.0292999192163945


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('Greenville-Spartanburg Area, SC', 'CHESNEE 7 WSW, SC', 'CLEVELAND 3S, SC', 'SPARTANBURG 3 SSE, SC') 2.975167876273045


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('CAESARS HEAD, SC', 'CHESNEE 7 WSW, SC', 'CLEVELAND 3S, SC', 'SPARTANBURG 3 SSE, SC') 3.258738835983667


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

keymae of: WAYNESVILLE 1 E, NC = 1.747482139423062


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('MT LECONTE, TN', 'NEWFOUND GAP, TN') 2.2339993455520246


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('MT LECONTE, TN',) 2.067047382901454


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

exmae = ('NEWFOUND GAP, TN',) 2.262076438940648


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

MemoryError: 

In [69]:
with_exogs = ['WHITEVILLE 7 NW, NC', 'CASAR, NC', 'FOREST CITY 8 W, NC', 'GASTONIA, NC', 'LAKE LURE 2, NC', 
                       'ELIZABETHTOWN, NC', ' MOUNT HOLLY 4 NE, NC','GRANDFATHER MTN, NC']
ncrd2 = ncrd.copy()
ncrd_less = ncrd2.drop(with_exogs,axis=1)

In [74]:
def prediction_fx(data, begin, end):
    base = datetime.strptime(begin,'%Y-%m-%d')
    date_list = [base + relativedelta(months=x) for x in range(600)]
    prediction1_df = pd.DataFrame(index=date_list)
    for col in tqdm(data.columns):
        loc = data[col]
        mod_fit1 = sarima_model_creation(loc, 4,0,3,3,0,4,12)
        point_predictions = pd.DataFrame(mod_fit1.predict(start=begin, end=end), columns=col)
        future_pred1 = mod_fit1.get_prediction(start=begin, end=end)
        future_pred1_ci = future_pred1.conf_int()
        point_predictions_df = pd.merge(point_predictions, future_pred1_ci, left_index=True, right_index=True)
        prediction1_df = pd.merge(prediction1_df, point_predictions_df, left_index=True, right_index=True)
    return(prediction1_df)

In [71]:
pre_df = prediction_fx(ncrd_less, '2019-05-01', '2069-05-01')
pre_df.head(10)

HBox(children=(IntProgress(value=0, max=104), HTML(value='')))




Unnamed: 0,0_x,"lower Raleigh, NC","upper Raleigh, NC",0_y,"lower Fayetteville, NC","upper Fayetteville, NC",0_x.1,"lower Albemarle, NC","upper Albemarle, NC",0_y.1,...,"upper YADKINVILLE 6 E, NC",0_y.2,"lower HAYESVILLE 1 NE, NC","upper HAYESVILLE 1 NE, NC",0_x.2,"lower MURPHY 4ESE, NC","upper MURPHY 4ESE, NC",0_y.3,"lower KING, NC","upper KING, NC"
2019-05-01,3.366287,-0.94759,7.680163,3.111657,-1.504349,7.727664,3.461614,-0.901392,7.824621,4.101742,...,8.403074,6.052865,2.067085,10.038645,6.054696,1.844234,10.265159,4.555015,0.729303,8.380726
2019-06-01,4.311068,-0.004202,8.626337,4.639369,-0.012169,9.290906,4.748512,0.334216,9.162809,4.453031,...,8.033802,5.717889,1.691579,9.7442,6.150096,1.908906,10.391286,3.477236,-0.366788,7.32126
2019-07-01,6.314961,1.995109,10.634814,4.736273,0.084019,9.388527,5.211461,0.767251,9.655671,4.030026,...,8.752115,6.071343,2.008941,10.133745,5.999118,1.743897,10.25434,4.060046,0.207505,7.912587
2019-08-01,4.145673,-0.191141,8.482487,5.171562,0.516924,9.826201,4.538334,0.074311,9.002357,3.948713,...,8.744874,5.126533,1.018007,9.235059,4.852656,0.570876,9.134435,4.301428,0.444422,8.158433
2019-09-01,4.875324,0.538447,9.212201,4.613723,-0.047083,9.27453,3.479564,-0.985128,7.944255,4.547796,...,8.041144,5.06393,0.946878,9.180982,4.718284,0.405907,9.030662,4.442798,0.584406,8.301189
2019-10-01,4.620644,0.275651,8.965637,3.449842,-1.211666,8.111351,3.654706,-0.812263,8.121676,4.270212,...,7.43548,4.244699,0.118656,8.370741,3.892131,-0.433895,8.218157,4.220072,0.360803,8.079341
2019-11-01,2.836573,-1.515683,7.188829,2.833024,-1.829652,7.4957,3.578847,-0.888818,8.046511,4.145416,...,7.125765,5.128604,1.001646,9.255561,4.46293,0.130424,8.795437,4.280093,0.420679,8.139506
2019-12-01,3.028145,-1.32567,7.38196,3.043657,-1.619592,7.706907,3.945386,-0.522327,8.413099,4.089217,...,7.805577,5.843728,1.715459,9.971997,5.297449,0.96355,9.631349,4.52121,0.661633,8.380788
2020-01-01,3.99628,-0.365668,8.358228,2.90744,-1.75607,7.57095,3.020206,-1.447848,7.488261,3.895951,...,7.420529,5.472307,1.343995,9.600618,4.975608,0.641069,9.310147,4.132021,0.272453,7.991588
2020-02-01,2.822907,-1.540627,7.18644,2.760683,-1.903013,7.424379,3.227472,-1.240312,7.695256,3.833445,...,6.962277,5.590564,1.46213,9.718997,5.443202,1.108752,9.777651,4.021765,0.162172,7.881357


In [73]:
list(pre_df.columns)

['0_x',
 'lower Raleigh, NC',
 'upper Raleigh, NC',
 '0_y',
 'lower Fayetteville, NC',
 'upper Fayetteville, NC',
 '0_x',
 'lower Albemarle, NC',
 'upper Albemarle, NC',
 '0_y',
 'lower Arcola, NC',
 'upper Arcola, NC',
 '0_x',
 'lower Asheboro, NC',
 'upper Asheboro, NC',
 '0_y',
 'lower Burlington, NC',
 'upper Burlington, NC',
 '0_x',
 'lower Carthage, NC',
 'upper Carthage, NC',
 '0_y',
 'lower Chapel Hill, NC',
 'upper Chapel Hill, NC',
 '0_x',
 'lower Clayton, NC',
 'upper Clayton, NC',
 '0_y',
 'lower Dunn, NC',
 'upper Dunn, NC',
 '0_x',
 'lower Durham, NC',
 'upper Durham, NC',
 '0_y',
 'lower Enfield, NC',
 'upper Enfield, NC',
 '0_x',
 'lower Fort Bragg, NC',
 'upper Fort Bragg, NC',
 '0_y',
 'lower Graham, NC',
 'upper Graham, NC',
 '0_x',
 'lower Greensboro AP, NC',
 'upper Greensboro AP, NC',
 '0_y',
 'lower Henderson 2 NNW, NC',
 'upper Henderson 2 NNW, NC',
 '0_x',
 'lower JACKSON SPRINGS 5 WNW, NC',
 'upper JACKSON SPRINGS 5 WNW, NC',
 '0_y',
 'lower Laurinburg, NC',
 

In [15]:
exo_var_dict = {
    'WHITEVILLE 7 NW, NC': [rd[[' LORIS 2 S, SC']]],
    'CASAR, NC': [rd[['CHESNEE 7 WSW, SC', 'GAFFNEY 6 E, SC']],
                rd[['CHESNEE 7 WSW, SC']],
                rd[['GAFFNEY 6 E, SC']]],
    'FOREST CITY 8 W, NC': [rd[['GAFFNEY 6 E, SC']], rd[['GAFFNEY 6 E, SC','SPARTANBURG 3 SSE, SC']]],
    'GASTONIA, NC': [rd[['FORT MILL 4 NW, SC','GAFFNEY 6 E, SC']], rd[['GAFFNEY 6 E, SC']]],
    'LAKE LURE 2, NC': [rd[['CHESNEE 7 WSW, SC']]],
    ' MOUNT HOLLY 4 NE, NC': [rd[['CHESNEE 7 WSW, SC']],rd[['CHESTER 1 SE, SC']],rd[['GAFFNEY 6 E, SC']], 
                              rd[['LOCKHART, SC']],
                             rd[['CATAWBA, SC','GAFFNEY 6 E, SC']],
                             rd[['CHESNEE 7 WSW, SC','GAFFNEY 6 E, SC']],
                             rd[['CHESNEE 7 WSW, SC', 'LOCKHART, SC']],
                             rd[['CHESTER 1 SE, SC', 'GAFFNEY 6 E, SC']],
                             rd[['CHESTER 1 SE, SC', 'LOCKHART, SC']],
                             rd[['FORT MILL 4 NW, SC', 'GAFFNEY 6 E, SC']],
                             rd[['GAFFNEY 6 E, SC', 'LOCKHART, SC']],
                             rd[['CATAWBA, SC', 'CHESNEE 7 WSW, SC', 'CHESTER 1 SE, SC']],
                             rd[['CATAWBA, SC', 'CHESNEE 7 WSW, SC', 'GAFFNEY 6 E, SC']],
                             rd[['CATAWBA, SC', 'CHESNEE 7 WSW, SC', 'LOCKHART, SC']],
                             rd[['CATAWBA, SC', 'CHESTER 1 SE, SC', 'LOCKHART, SC']],
                             rd[['CATAWBA, SC', 'CHESTER 1 SE, SC', 'GAFFNEY 6 E, SC']]]
}

In [16]:
exo_var_dict_12 = {
    'WHITEVILLE 7 NW, NC': [rd[' LORIS 2 S, SC']],
    'CASAR, NC': [rd['GAFFNEY 6 E, SC']],
    'FOREST CITY 8 W, NC': [rd['GAFFNEY 6 E, SC']],
    'GASTONIA, NC': [rd['GAFFNEY 6 E, SC']],
    'GRANDFATHER MTN, NC': [rd['ELIZABETHTON, TN']]
}

In [17]:
locs_w_exogs = rd[list(exo_var_dict.keys())]
print(locs_w_exogs.head())
keys_list12 = list(exo_var_dict_12.keys())
locs_w_exogs12 = rd[keys_list12]

            WHITEVILLE 7 NW, NC  CASAR, NC  FOREST CITY 8 W, NC  GASTONIA, NC  \
Date                                                                            
1980-01-01                 4.63       4.83             4.609293          4.96   
1980-02-01                 1.48       1.34             1.150000          0.97   
1980-03-01                 8.62       7.61             9.459499          8.46   
1980-04-01                 1.68       3.73             4.967260          2.35   
1980-05-01                 4.89       7.96             5.655974          5.32   

            LAKE LURE 2, NC   MOUNT HOLLY 4 NE, NC  
Date                                                
1980-01-01         4.570520                   4.84  
1980-02-01         1.238559                   1.27  
1980-03-01         9.868008                   7.67  
1980-04-01         5.735163                   1.83  
1980-05-01         6.112051                   4.25  


In [96]:
ff = 'A'
fb = 'A'
fbff= pd.DataFrame(
    1, index=['a','b','c'], columns= [ff, fb]
)
fbff

Unnamed: 0,A,A.1
a,1,1
b,1,1
c,1,1


In [64]:
def prediction_exog_fx(data, exog_dict, begin, end):
    base = datetime.strptime(begin,'%Y-%m-%d')
    date_list = [base + relativedelta(months=x) for x in range(600)]
    prediction_df = pd.DataFrame(index = date_list)
    pred_val_df = pd.DataFrame(index = date_list)
    exog_predictions_df = pd.DataFrame(index = date_list)
    for key,value in exog_dict.items():
        loc = data[key]
        for v in value:
            mod_fit1 = sarima_model_creation(loc, 4,0,3,3,0,4, 12,exog=v)
            if v.shape[1] > 1:
                shap = v.shape[1]
                for i in range(shap):
                    exog_mod_fit = sarima_model_creation(v.iloc[:,i],4,0,3,3,0,4,12)
                    e_preds2 = pd.DataFrame(exog_mod_fit.predict(start=begin, end=end))
                    if i is 0:
                        exog_predictions_df = e_preds2
                    else:
                        exog_predictions_df = pd.merge(exog_predictions_df, e_preds2, left_index=True, 
                                                       right_index=True)
            else:
                exog_mod_fit = sarima_model_creation(v, 4,0,3,3,0,4,12)
                exog_predictions_df = pd.DataFrame(exog_mod_fit.predict(start=begin, end=end))
            future_pred = mod_fit1.get_prediction(exog=exog_predictions_df,start=begin, end=end)
            future_pred_ci = future_pred.conf_int()
            future_pred_val= pd.DataFrame(mod_fit1.predict(exog=exog_predictions_df, start=begin, end=end), 
                                          columns = [key+[v.columns]])
#                 future_pred_full = pd.merge(future_pred_val, future_pred_ci, left_index=True, right_index=True)
            prediction_df = pd.merge(prediction_df, future_pred_ci, left_index=True, right_index=True)
            pred_val_df = pd.merge(pred_val_df, future_pred_val, left_index=True, right_index=True)
    return(pred_val_df, prediction_df)
        
        

In [65]:
values_df, ci_df = prediction_exog_fx(locs_w_exogs, exo_var_dict, '2019-05-01', '2069-05-01')

In [77]:
values_df.head(10)

Unnamed: 0,0_x,0_y,0_x.1,0_y.1,0_x.2,0_y.2,0_x.3,0_y.3,0_x.4,0_y.4,...,0_y.5,0_x.5,0_y.6,0_x.6,0_y.7,0_x.7,0_y.8,0_x.8,0_y.9,0
2019-05-01,5.753516,3.94613,4.62861,3.591515,3.899437,4.176669,4.178921,3.903083,6.951886,4.240776,...,3.833654,3.930699,3.71725,4.098503,2.999996,3.880575,3.704935,3.57674,3.492323,3.550743
2019-06-01,3.724518,4.600442,4.603507,4.899144,4.043647,4.184025,3.854757,3.514431,5.266286,3.709291,...,3.794374,3.824803,3.758859,3.777296,3.213914,3.678788,3.683364,3.77241,3.740924,3.675629
2019-07-01,5.557662,4.789477,4.620463,4.932119,4.229915,4.303735,3.854248,3.798161,5.135533,3.723821,...,3.712437,3.825703,3.605133,3.920627,3.273328,3.609286,3.694711,3.647864,3.579727,3.628567
2019-08-01,6.962032,4.307608,4.054748,4.510153,4.273738,3.978493,3.864488,3.734745,5.74063,3.557211,...,3.398863,3.712387,3.579898,3.772888,3.310991,3.499665,3.57376,3.447058,3.546051,3.635563
2019-09-01,5.759614,4.124589,4.482363,4.223978,4.151798,3.756654,3.730619,3.757905,5.67852,4.252898,...,4.113615,3.716474,4.029905,3.781442,3.720841,3.968845,4.069324,4.238167,4.212276,3.954684
2019-10-01,3.492725,3.784372,3.951204,3.624053,3.761492,3.343411,3.63001,3.972551,5.223536,3.720289,...,3.929549,3.750673,3.798698,3.397504,3.531279,3.669133,3.651791,3.760456,3.741603,3.632168
2019-11-01,3.845209,4.263838,3.88893,4.54598,4.187163,3.808662,3.811119,3.863836,4.768388,3.425112,...,3.70845,3.813984,3.924216,3.492598,3.598184,3.794542,3.732256,3.880909,4.030589,3.852863
2019-12-01,3.647705,4.222191,4.094379,4.362686,3.789722,4.518101,3.485347,3.769207,5.376335,3.349256,...,3.938125,3.783163,4.04265,3.575134,4.130114,3.65657,3.70634,3.907163,4.112145,3.936292
2020-01-01,2.551145,4.239287,4.101368,4.507149,4.227263,4.036171,3.782411,3.872347,4.898288,3.727392,...,3.748236,3.803494,3.73574,3.69962,3.185743,3.657504,3.665142,3.651758,3.668828,3.678475
2020-02-01,4.253857,4.139583,4.081032,4.019727,3.890641,3.574033,3.878353,3.717872,5.106642,3.55234,...,3.523303,3.640892,3.568224,3.590177,3.067745,3.558271,3.55193,3.589905,3.60404,3.485449


In [63]:
ci_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 600 entries, 2019-05-01 to 2069-04-01
Columns: 300 entries, lower WHITEVILLE 7 NW, NC_x to upper  MOUNT HOLLY 4 NE, NC
dtypes: float64(300)
memory usage: 1.4 MB


In [None]:
ci_merged = pd.merge(pre_df, ci_df, left_index=True, right_index=True)
all_merged = pd.merge(ci_merged, values_df, left_index=True, right_index=True)
all_merged.head()

In [None]:
all_merged.to_csv('predictions.csv')