In [94]:
import sys
sys.path.insert(0, '../')
from forecast_model import Forecast_model

import pandas as pd
import numpy as np
import copy
from tqdm import tqdm
from scipy import spatial
try:
    import cPickle as pickle
except:
    import pickle
    
import sys
sys.path.insert(0, '../../utils/')
from utils import *
from utils_date import *

class Ha_inverted_model(Forecast_model):
    """Lookup-table forecaster keyed on the rows of a feature matrix.

    ``fit`` groups the training targets by identical feature rows and stores,
    for every distinct row and every time series, the mean and the median of
    the target vectors. ``predict`` returns the stored vector for each
    requested row, falling back to the closest known row (as chosen by
    ``nearest_tuple``) when the exact row was never seen during training.

    All state lives in ``self.infos``.
    NOTE(review): ``self.infos`` (including its ``'name'`` entry) is presumably
    created by ``Forecast_model.__init__`` -- confirm against the base class.
    """

    def __init__(self, name: str):
        """Initialise the base model and declare the metadata keys used here."""
        Forecast_model.__init__(self, name)
        self.infos['start_date'] = ''
        self.infos['end_date'] = ''
        self.infos['features'] = []
        self.infos['features_day'] = []
        self.infos['time_series'] = []
        self.infos['observation_path'] = ''
        self.infos['context_path'] = ''

    def __str__(self):
        """Return a multi-line, human-readable summary of the model metadata."""
        return "Description of model: %s\n" \
                "Start date learing: %s\n" \
                "End date learing: %s\n" \
                "Features: %s\n" \
                "Features day: %s\n"\
                "Learned time series: %s\n" \
                "Training observation path: %s\n" \
                "Training context data path: %s" % (self.infos['name'],
                                                   self.infos['start_date'],
                                                   self.infos['end_date'],
                                                   self.infos['features'],
                                                   self.infos['features_day'],
                                                   self.infos['time_series'],
                                                   self.infos['observation_path'],
                                                   self.infos['context_path'])

    def fit(self, X, ylist):
        """Learn the mean/median target vector of every distinct feature row.

        Parameters
        ----------
        X : 2-D array
            One row of feature values per training sample.
        ylist : sequence of 2-D arrays
            ``ylist[ix]`` holds the targets of ``self.infos['time_series'][ix]``
            and must have the same number of rows as ``X``.

        Side effects
        ------------
        Stores ``self.infos['dict_pred_mean']`` and
        ``self.infos['dict_pred_median']``; both map
        ``feature_tuple -> {time_series_name -> 1-D float array}``.
        """
        n_features = X.shape[1]
        dict_pred_mean = {}
        dict_pred_median = {}
        for ix, ts in enumerate(tqdm(self.infos['time_series'])):
            data = np.concatenate([X, ylist[ix]], axis=1)
            dfXy = pd.DataFrame(data=data, columns=np.arange(data.shape[1]).astype(str))
            feature_cols = dfXy.columns.values[:n_features].tolist()

            # Group once and derive both statistics from the same grouping so
            # the two result frames share an identical row order.
            grouped = dfXy.groupby(feature_cols)
            df_mean = grouped.mean()
            df_median = grouped.median()
            mean_rows = df_mean.values.astype(float)
            median_rows = df_median.values.astype(float)

            for f, mean_row, median_row in zip(df_mean.index.values, mean_rows, median_rows):
                # With a single feature column the group key is a scalar, not
                # a tuple; wrap it so keys always match ``tuple(x)`` in predict.
                key = f if isinstance(f, tuple) else (f,)
                dict_pred_mean.setdefault(key, {})[ts] = mean_row
                dict_pred_median.setdefault(key, {})[ts] = median_row
        self.infos['dict_pred_mean'] = dict_pred_mean
        self.infos['dict_pred_median'] = dict_pred_median
        return

    def predict(self, X, choice='mean'):
        """Look up the learned target vectors for every row of ``X``.

        Parameters
        ----------
        X : iterable of feature rows
            Each row is converted to a tuple and used as a lookup key.
        choice : str
            ``'mean'`` selects the mean table; any other value selects the
            median table.

        Returns
        -------
        numpy.ndarray
            For every row of ``X``, the list of per-series vectors in the order
            of ``self.infos['time_series']``.
        """
        if choice == 'mean':
            dict_pred = self.infos['dict_pred_mean']
        else:
            dict_pred = self.infos['dict_pred_median']

        series = self.infos['time_series']
        possibles_day = list(set(dict_pred.keys()))
        pred_all = []
        for x in tqdm(X):
            key = tuple(x)
            try:
                # Built in one expression: a miss cannot leave a half-filled list.
                pred = [dict_pred[key][ts] for ts in series]
            except KeyError:
                # Unknown feature combination: reuse the closest known one.
                npd = nearest_tuple(possibles_day, key)
                pred = [dict_pred[npd][ts] for ts in series]
            pred_all.append(pred)
        return np.array(pred_all)

In [100]:
# ---------------------------------------------------------------------------
# Configuration of the training run
# ---------------------------------------------------------------------------
observation_path = [
    '/home/toque/data2/montreal/stm/data/valid_metro_15min_2015_2016_2017_sumpass_nodayfree.csv',
]
context_path = [
    '/home/toque/data2/date/2013-01-01-2019-01-01_new.csv',
    '/home/toque/data2/montreal/events/data/clean/events_2015_2018_start_period_event_stopid_aggdaily.csv',
]
path_to_save = '/home/toque/data2/forecast/model/ha_inverted/fit/'

# Features with one value per day.
features = [
    'Day_id', 'Mois_id', 'vac_noel_quebec', 'day_off_quebec',
    '24DEC', '31DEC', 'renov_beaubien', 'vac_udem1', 'vac_udem2',
]

# Features with one value per time step of the day. The list is kept for
# reference but is reset to an empty list right below, so none are used.
features_day = [
    '5-start_period_event', '11-start_period_event', '12-start_period_event',
    '13-start_period_event', '15-start_period_event', '16-start_period_event',
    '23-start_period_event', '24-start_period_event', '31-start_period_event',
    '32-start_period_event', '35-start_period_event', '43-start_period_event',
    '45-start_period_event', '61-start_period_event', '68-start_period_event',
]
features_day = []

# Columns of the observation file to learn.
time_series = [
    '11', '32', '34', '15', '44', '65', '31', '33', '35', '47', '13', '14',
    '1', '9', '5', '18', '36', '24', '68', '43', '8', '64', '10', '55',
    '3', '49', '51', '2', '19', '56', '7', '6', '4', '48', '66', '25',
    '23', '28', '39', '54', '60', '27', '20', '46', '12', '21', '62', '52',
    '41', '50', '30', '16', '37', '40', '26', '67', '57', '61', '42', '45',
    '38', '29', '58', '63', '22', '59', '53', '17',
]

model_name = 'ha_inverted'

start_date = '2015-01-01'
end_date = '2017-01-01'

# ---------------------------------------------------------------------------
# Load the observations and fill the missing time steps with 0
# ---------------------------------------------------------------------------
df_observation = read_csv_list(observation_path)
days = sorted({stamp[:10] for stamp in df_observation['Datetime'].values})
# One timestamp per 15-minute step for every day present in the observations.
timestamp_list = [
    stamp
    for day in days
    for stamp in build_timestamp_list(day + ' 00:00:00', day + ' 23:45:00', time_step_second=15 * 60)
]
df_date = pd.DataFrame(data=timestamp_list, columns=['Datetime']).set_index('Datetime')
df_observation = (
    df_date
    .join(df_observation.set_index('Datetime'))
    .fillna(0)
    .reset_index()
)

df_context = read_csv_list(context_path)

# ---------------------------------------------------------------------------
# Build the model, record its metadata, fit and save it
# ---------------------------------------------------------------------------
my_model = Ha_inverted_model(model_name)

for info_key, info_value in [
    ('start_date', start_date),
    ('end_date', end_date),
    ('time_series', time_series),
    ('features', features),
    ('features_day', features_day),
    ('observation_path', observation_path),
    ('context_path', context_path),
    ('path_to_save', path_to_save),
]:
    my_model.infos[info_key] = info_value

dfXy = (
    df_observation.set_index('Datetime')[start_date:end_date]
    .join(df_context.set_index('Datetime'))
    .reset_index()
)
# NOTE(review): build_Xylist is defined in a cell located further down in this
# notebook; that cell has to be executed before this one.
X, ylist, features_end = build_Xylist(dfXy, features, features_day, time_series, start_date, end_date)

my_model.fit(X, ylist)

save_pickle(path_to_save + model_name + '/' + model_name + '.pkl', my_model)

100%|██████████| 68/68 [00:03<00:00, 19.57it/s]


In [104]:
# `os` is used below but is not imported by any earlier cell of the notebook.
import os

model_path = '/home/toque/data2/forecast/model/ha_inverted/fit/ha_inverted/ha_inverted.pkl'
path_save_prediction = '/home/toque/data2/forecast/model/ha_inverted/prediction/'
# NOTE(review): presumably unpickles the file -- only load model files you trust.
my_model = load_pickle(model_path)

start_date = '2015-01-01'
end_date = '2018-01-01'

# Use the feature / series definitions stored with the model rather than the
# notebook globals: other cells reassign `features` and `time_series`, which
# would silently desynchronise the inputs from what the model was fitted on.
model_features = my_model.infos['features']
model_features_day = my_model.infos['features_day']
model_time_series = my_model.infos['time_series']

dfXy = df_observation.set_index('Datetime')[start_date: end_date].join(df_context.set_index('Datetime')).reset_index()
datetime_list = dfXy['Datetime'].values
X, features_end = build_X(dfXy, model_features, model_features_day, model_time_series, start_date, end_date)

pred_all = my_model.predict(X)

# predict() returns one entry per row of X holding one vector per series;
# reorder to series-first and flatten each series into one long vector.
pred_all = np.swapaxes(pred_all, 0, 1)
pred_all = pred_all.reshape(len(model_time_series), pred_all.shape[1] * pred_all.shape[2])
df_pred = pd.DataFrame(data=datetime_list, columns=['Datetime'])
for ts, series_values in zip(model_time_series, pred_all):
    df_pred[ts] = series_values

df_pred = df_pred.round(3)

# exist_ok avoids the check-then-create race of exists() + makedirs().
prediction_dir = os.path.join(path_save_prediction, my_model.infos['name'])
os.makedirs(prediction_dir, exist_ok=True)

df_pred.to_csv(os.path.join(prediction_dir, start_date + '_' + end_date + '.csv'), index=False)

100%|██████████| 1093/1093 [00:00<00:00, 1409.43it/s]


In [103]:
df_pred.round(3)

Unnamed: 0,Datetime,11,32,34,15,44,65,31,33,35,...,42,45,38,29,58,63,22,59,53,17
0,2015-01-01 00:00:00,140.0,44.0,90.0,62.0,21.0,25.0,42.0,47.0,50.0,...,6.0,15.0,7.0,16.0,6.0,2.0,36.0,4.0,9.0,2.0
1,2015-01-01 00:15:00,216.0,31.0,108.0,36.0,28.0,38.0,41.0,53.0,40.0,...,15.0,111.0,12.0,53.0,17.0,7.0,67.0,2.0,20.0,11.0
2,2015-01-01 00:30:00,614.0,39.0,100.0,42.0,13.0,4.0,128.0,62.0,49.0,...,6.0,38.0,26.0,60.0,0.0,0.0,40.0,0.0,6.0,9.0
3,2015-01-01 00:45:00,278.0,21.0,54.0,17.0,23.0,2.0,173.0,45.0,21.0,...,1.0,30.0,1.0,23.0,0.0,0.0,7.0,0.0,0.0,6.0
4,2015-01-01 01:00:00,44.0,5.0,17.0,16.0,0.0,1.0,65.0,18.0,7.0,...,0.0,2.0,1.0,7.0,0.0,0.0,3.0,0.0,1.0,0.0
5,2015-01-01 01:15:00,3.0,0.0,0.0,3.0,0.0,3.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
6,2015-01-01 01:30:00,2.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2015-01-01 01:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2015-01-01 02:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2015-01-01 02:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
def build_Xylist(df, features, features_day, time_series, start_date, end_date):
    """Turn a long, per-time-step frame into daily features and daily targets.

    ``df['Datetime']`` is sliced as text: the first 10 characters are taken as
    the day and everything from position 11 on as the time of day.

    Parameters
    ----------
    df : DataFrame with a ``'Datetime'`` column plus the requested columns.
    features : columns read once per day, from the ``<day> 00:00:00`` row.
    features_day : columns pivoted to one column per time of day, named
        ``'<feature>|<time>'``; may be empty.
    time_series : target columns.
    start_date, end_date : accepted but not used by this function.

    Returns
    -------
    X : 2-D array with one row per day.
    ylist : array stacking, for each entry of ``time_series``, its
        day-by-time-of-day table.
    features_end : list of the column names of ``X``.
    """
    stamps = df['Datetime'].values
    day_labels = [stamp[:10] + ' 00:00:00' for stamp in stamps]
    time_labels = [stamp[11:] for stamp in stamps]
    days = sorted(set(day_labels))

    # Targets: one (day x time-of-day) table per series.
    observations = copy.deepcopy(df[['Datetime']+time_series])
    observations['time'] = time_labels
    observations['Datetime'] = day_labels
    observations_wide = observations.pivot_table(values=time_series, index='Datetime', columns='time')
    ylist = np.array([observations_wide[series].values for series in time_series])

    # Daily features are read from the midnight row of each day.
    daily = df.set_index('Datetime')[features].loc[days]
    if len(features_day) > 0:
        context = copy.deepcopy(df[['Datetime']+features_day])
        context['time'] = time_labels
        context['Datetime'] = day_labels
        context_wide = context.pivot_table(values=features_day, index='Datetime', columns='time')
        # Flatten the (feature, time) column pairs into 'feature|time' names.
        context_wide.columns = context_wide.columns.map('|'.join)
        dfX = context_wide.join(daily)
    else:
        dfX = daily

    return dfX.values, ylist, dfX.columns.values.tolist()


def build_X(df, features, features_day, time_series, start_date, end_date):
    """Build the daily feature matrix from a long, per-time-step frame.

    ``df['Datetime']`` is sliced as text: the first 10 characters are taken as
    the day and everything from position 11 on as the time of day.

    Parameters
    ----------
    df : DataFrame with a ``'Datetime'`` column plus the requested columns.
    features : columns read once per day, from the ``<day> 00:00:00`` row.
    features_day : columns pivoted to one column per time of day, named
        ``'<feature>|<time>'``; may be empty.
    time_series, start_date, end_date : accepted but not used by this function.

    Returns
    -------
    X : 2-D array with one row per day.
    features_end : list of the column names of ``X``.
    """
    stamps = df['Datetime'].values
    day_labels = [stamp[:10] + ' 00:00:00' for stamp in stamps]
    midnight_features = df.set_index('Datetime')[features]

    if len(features_day) > 0:
        context = copy.deepcopy(df[['Datetime']+features_day])
        context['time'] = [stamp[11:] for stamp in stamps]
        context['Datetime'] = day_labels
        context_wide = context.pivot_table(values=features_day, index='Datetime', columns='time')
        # Flatten the (feature, time) column pairs into 'feature|time' names.
        context_wide.columns = context_wide.columns.map('|'.join)
        # Only the days that survived the pivot are looked up.
        dfX = context_wide.join(midnight_features.loc[context_wide.index.values])
    else:
        dfX = midnight_features.loc[sorted(set(day_labels))]

    return dfX.values, dfX.columns.values.tolist()


def learn(X, ylist, features_end, time_series):
    """Learn the mean/median target vector of every distinct feature row.

    Parameters
    ----------
    X : 2-D array
        One row of feature values per training sample.
    ylist : sequence of 2-D arrays
        ``ylist[ix]`` holds the targets of ``time_series[ix]`` and must have
        the same number of rows as ``X``. Any number of target columns is
        accepted (it used to be hard-coded to 96).
    features_end : list of str
        Names of the columns of ``X``; used as group-by keys.
    time_series : sequence
        Names under which the learned vectors are stored.

    Returns
    -------
    (dict_pred_mean, dict_pred_median)
        Both map ``feature_tuple -> {time_series_name -> 1-D float array}``.
    """
    feature_names = list(features_end)
    dict_pred_mean = {}
    dict_pred_median = {}
    for ix, ts in enumerate(tqdm(time_series)):
        targets = np.asarray(ylist[ix])
        data = np.concatenate([X, targets], axis=1)
        # Name the target columns from their real count instead of assuming 96.
        target_names = np.arange(targets.shape[1]).astype(str).tolist()
        dfXy = pd.DataFrame(data=data, columns=feature_names + target_names)

        # Group once and derive both statistics from the same grouping so the
        # two result frames share an identical row order.
        grouped = dfXy.groupby(feature_names)
        df_mean = grouped.mean()
        df_median = grouped.median()
        mean_rows = df_mean.values.astype(float)
        median_rows = df_median.values.astype(float)

        for f, mean_row, median_row in zip(df_mean.index.values, mean_rows, median_rows):
            # With a single feature the group key is a scalar, not a tuple;
            # wrap it so keys always match ``tuple(x)`` at lookup time.
            key = f if isinstance(f, tuple) else (f,)
            dict_pred_mean.setdefault(key, {})[ts] = mean_row
            dict_pred_median.setdefault(key, {})[ts] = median_row

    return dict_pred_mean, dict_pred_median


def forecast(X, dict_pred, time_series):
    """Look up the learned target vectors for every row of ``X``.

    Parameters
    ----------
    X : iterable of feature rows
        Each row is converted to a tuple and used as a lookup key.
    dict_pred : dict
        ``feature_tuple -> {time_series_name -> vector}``.
    time_series : sequence
        Names of the series to return, in output order.

    Returns
    -------
    numpy.ndarray
        For every row of ``X``, the list of per-series vectors. Rows whose
        exact feature tuple is unknown use the entry chosen by
        ``nearest_tuple`` among the known tuples.
    """
    possibles_day = list(set(dict_pred.keys()))
    pred_all = []
    for x in tqdm(X):
        key = tuple(x)
        try:
            # Built in one expression: a miss cannot leave a half-filled list.
            pred = [dict_pred[key][ts] for ts in time_series]
        except KeyError:
            # Only a missing key triggers the fallback; any other error is a
            # real problem and is allowed to propagate.
            npd = nearest_tuple(possibles_day, key)
            pred = [dict_pred[npd][ts] for ts in time_series]
        pred_all.append(pred)
    return np.array(pred_all)

In [42]:
# ---- Learn on the training window -----------------------------------------
start_date = '2015-01-01'
end_date = '2017-01-01'
dfXy = (
    df_observation.set_index('Datetime')[start_date:end_date]
    .join(df_context.set_index('Datetime'))
    .reset_index()
)
X, ylist, features_end = build_Xylist(dfXy, features, features_day, time_series, start_date, end_date)

dict_pred_mean, dict_pred_median = learn(X, ylist, features_end, time_series)

# ---- Predict on the longer window -----------------------------------------
start_date = '2015-01-01'
end_date = '2018-01-01'
dfXy = (
    df_observation.set_index('Datetime')[start_date:end_date]
    .join(df_context.set_index('Datetime'))
    .reset_index()
)
datetime_list = dfXy['Datetime'].values
X, features_end = build_X(dfXy, features, features_day, time_series, start_date, end_date)

pred_all = forecast(X, dict_pred_mean, time_series)

# Reorder to series-first, then flatten each series into one long vector.
pred_all = pred_all.swapaxes(0, 1)
n_rows, n_steps = pred_all.shape[1], pred_all.shape[2]
pred_all = pred_all.reshape(len(time_series), n_rows * n_steps)
df_pred = pd.DataFrame(data=datetime_list, columns=['Datetime'])
for ix, ts in enumerate(time_series):
    df_pred[ts] = pred_all[ix]

# NOTE(review): the file name says 2015-01-01_2017-01-01 and has no extension,
# while the prediction window above ends on 2018-01-01 -- confirm this is intended.
df_pred.to_csv("/home/toque/data2/forecast/model/ha_inverted/prediction/model_event_tmp2/2015-01-01_2017-01-01", index=False)

100%|██████████| 68/68 [00:13<00:00,  5.13it/s]
100%|██████████| 1093/1093 [00:02<00:00, 394.32it/s]


In [78]:
# Create a Ha_model instance named 'my_ha_model'; this rebinds the notebook-wide `my_model`.
# NOTE(review): Ha_model is neither defined nor imported in the cells visible here -- confirm its origin.
my_model = Ha_model('my_ha_model')

In [79]:
print(my_model)

Description of model: my_ha_model
Learning start date: 
Learning end date: 
Features: 
Learned time series: 
Training data path: 
Training exogenous data (date) path: 


In [25]:
# Read the station description file and print its `stop_id` column as a list of strings.
# The printed list is the one pasted into the `time_series` definitions of the other cells.
df_sta_info_path = "../../../../data2/montreal/stm/data/station_info.csv"
df_i = pd.read_csv(df_sta_info_path)
print(df_i['stop_id'].values.astype('str').tolist())

['11', '32', '34', '15', '44', '65', '31', '33', '35', '47', '13', '14', '1', '9', '5', '18', '36', '24', '68', '43', '8', '64', '10', '55', '3', '49', '51', '2', '19', '56', '7', '6', '4', '48', '66', '25', '23', '28', '39', '54', '60', '27', '20', '46', '12', '21', '62', '52', '41', '50', '30', '16', '37', '40', '26', '67', '57', '61', '42', '45', '38', '29', '58', '63', '22', '59', '53', '17']


In [80]:
# Arguments passed to `my_model.learn(...)` in the next cell.
# NOTE: `start_date`, `end_date`, `features` and `time_series` are notebook-wide names that
# other cells also read and assign; running this cell overwrites them (in particular
# `features` becomes a different list than the one used for the ha_inverted model).
df_observation_path = '/home/toque/data2/montreal/stm/data/valid_metro_15min_2015_2016_2017_sumpass.csv'
df_date_path = '/home/toque/data/data_clean/date/2013-01-01-2019-01-01_new.csv'
start_date = '2015-01-01'
end_date = '2017-01-01'
features = ["hms_int_15min","Day_id"]
# Same identifiers, in the same order, as printed from the station info file.
time_series = ['11', '32', '34', '15', '44', '65', '31', '33', '35', '47', '13', '14',
               '1', '9', '5', '18', '36', '24', '68', '43', '8', '64', '10', '55', '3',
               '49', '51', '2', '19', '56', '7', '6', '4', '48', '66', '25', '23', '28',
               '39', '54', '60', '27', '20', '46', '12', '21', '62', '52', '41', '50', '30',
               '16', '37', '40', '26', '67', '57', '61', '42', '45', '38', '29', '58', '63',
               '22', '59', '53', '17']

In [81]:
my_model.learn(df_observation_path, df_date_path, start_date, end_date, features, time_series)

Read data: observation and date


100%|██████████| 608/608 [00:00<00:00, 218408.43it/s]
100%|██████████| 608/608 [00:00<00:00, 219008.66it/s]

Progress Bar 1/2 : learning mean
Progress Bar 2/2 : learning median
End of Learning





In [113]:
import yaml
import argparse

import os
import sys
sys.path.insert(0, '../utils/')
from utils import *

"""
    Load configuration of the model from yaml file
"""

#parser = argparse.ArgumentParser(description='Process some integers.')
#parser.add_argument('--config', type=str, help='Yaml file containing the configuration of the model')
#config_file = parser.parse_args(['--config'])

config_file = 'config.yaml'
with open(config_file, 'r') as stream:
    try:
        # safe_load only builds plain Python objects; yaml.load without an
        # explicit Loader can construct arbitrary objects and is rejected by
        # recent PyYAML versions.
        config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
        # Without a parsed configuration nothing below can run: stop here
        # instead of failing later with an unrelated NameError on `config`.
        raise


df_observation_path = config['df_observation_path']
df_date_path = config['df_date_path']
features = config['features']
stations = config['stations']
model_name = config['model_name']
start_date = config['start_date']
end_date = config['end_date']
path_to_save = config['path_to_save']
path_directory_to_save = path_to_save + model_name + '/'


print("You are going to create the model: %s" % (model_name))

# Ask for confirmation before overwriting an existing model directory.
# NOTE(review): yes_or_no is defined in a later cell of this notebook (or comes
# from `utils`); it has to be available before this cell runs.
create_model = True
if os.path.exists(path_directory_to_save):
    create_model = yes_or_no(
        "WARNING !!!!!!!\nThe model %s saved in path: %s already exists.\nDo you want to erase and replace it?"
        % (model_name, path_directory_to_save)
    )

if create_model:
    my_model = Ha_model(model_name)
    print("Creation of the model done")

    print("Learning the model..")
    # Use the values read from the configuration: `df_valid_path` was never
    # defined, and the series to learn are the configured `stations`.
    # NOTE(review): another cell calls `my_model.learn(...)` with the same
    # argument list -- confirm which method name Ha_model actually exposes.
    my_model.fit(df_observation_path, df_date_path, start_date, end_date, features, stations)
    print("Learning done")

    print("Saving models..")
    my_model.save(path_directory_to_save, os.getcwd()+"/"+config_file)
    print("Saving models done")


AttributeError: 'dict' object has no attribute 'df_date_path'

In [115]:
def yes_or_no(question):
    """Prompt on stdin until the reply starts with 'y' or 'n'.

    The prompt shown is ``question`` followed by ``' (y/n): '``. The reply is
    lower-cased and stripped, and only its first character is examined, so
    ``'Yes'`` and ``'yn'`` both count as yes. Any other reply (including an
    empty one) repeats the prompt.

    Returns
    -------
    bool
        True for a yes reply, False for a no reply.
    """
    prompt = question + ' (y/n): '
    verdicts = {'y': True, 'n': False}
    while True:
        first_char = str(input(prompt)).lower().strip()[:1]
        if first_char in verdicts:
            return verdicts[first_char]

In [130]:

# Stop here unless the user confirms; in a notebook sys.exit() surfaces as SystemExit.
if not yes_or_no("Do you want to continue"):
    sys.exit()
print(' ok')

print('10')

Do you want to continue (y/n): n


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [124]:
test()

do you want to continue (y/n): yn
 ok
10


In [133]:
import os
dir_path = os.getcwd()

In [134]:
dir_path

'/home/toque/work/forecast/model/ha'

In [1]:
import pandas as pd

In [3]:
obs = pd.read_csv('/home/toque/data2/montreal/stm/data/valid_metro_15min_2015_2016_2017_sumpass.csv')
exo = pd.read_csv('/home/toque/data2/date/2013-01-01-2019-01-01_new.csv')

In [29]:
import numpy as np

In [40]:
# Build a 3-row sample of the observations whose values are divided by 3 and multiplied
# by a single random factor, rounded to 2 decimals, then display it with Datetime as a column.
# NOTE(review): np.random.rand() is called without any seed set in the visible cells, so the
# displayed numbers change on every run.
b = obs.head(3)
b = b.set_index("Datetime")
d = np.around((b.values/3*np.random.rand()), 2)
# Overwrite every cell of the sample in place with the scaled values.
b[:] = d
b.reset_index()

Unnamed: 0,Datetime,11,32,34,15,44,65,31,33,35,...,42,45,38,29,58,63,22,59,53,17
0,2015-01-01 00:00:00,19.15,6.02,12.31,8.48,2.87,3.42,5.74,6.43,6.84,...,0.82,2.05,0.96,2.19,0.82,0.27,4.92,0.55,1.23,0.27
1,2015-01-01 00:15:00,29.54,4.24,14.77,4.92,3.83,5.2,5.61,7.25,5.47,...,2.05,15.18,1.64,7.25,2.32,0.96,9.16,0.27,2.74,1.5
2,2015-01-01 00:30:00,83.97,5.33,13.68,5.74,1.78,0.55,17.51,8.48,6.7,...,0.82,5.2,3.56,8.21,0.0,0.0,5.47,0.0,0.82,1.23


In [47]:
for i in np.arange(1):
    print('hello')

hello


In [19]:
a = exo.head(3)[['Datetime','Day_id','hms_int_15min','Vacances','Ferie','Mois_id']]

In [21]:
a.columns=['Datetime','day_id','timestep_id','school_holiday','holiday','month_id']

In [23]:
a[['Datetime', 'month_id', 'day_id', 'timestep_id', 'school_holiday', 'holiday' ]]

Unnamed: 0,Datetime,month_id,day_id,timestep_id,school_holiday,holiday
0,2013-01-01 00:00:00,1,1,0,1,1
1,2013-01-01 00:15:00,1,1,1,1,1
2,2013-01-01 00:30:00,1,1,2,1,1


In [48]:
a

Unnamed: 0,Datetime,day_id,timestep_id,school_holiday,holiday,month_id
0,2013-01-01 00:00:00,1,0,1,1,1
1,2013-01-01 00:15:00,1,1,1,1,1
2,2013-01-01 00:30:00,1,2,1,1,1


In [50]:
a = a.set_index('Datetime')

In [52]:
a.join([])


Unnamed: 0_level_0,day_id,timestep_id,school_holiday,holiday,month_id
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-01 00:00:00,1,0,1,1,1
2013-01-01 00:15:00,1,1,1,1,1
2013-01-01 00:30:00,1,2,1,1,1
