In [13]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

In [14]:
# Exogenous weather data: one CSV per city. Column 1 of each file becomes the
# index and is parsed as a day/month/year date.
def parser(date):
    """Parse a 'dd/mm/YYYY' date string into a datetime."""
    return datetime.strptime(date, '%d/%m/%Y')

# Reader options shared by the three city files.
weather_csv_options = dict(index_col=[1], parse_dates=True, date_parser=parser)

milan_weather_df = pd.read_csv("../datasets/Eso_data/weather/milano_weather_data.csv", **weather_csv_options)
turin_weather_df = pd.read_csv("../datasets/Eso_data/weather/torino_weather_data.csv", **weather_csv_options)
rome_weather_df = pd.read_csv("../datasets/Eso_data/weather/roma_weather_data.csv", **weather_csv_options)

In [15]:
def augment_daily_store_data(store_df, loc_weather_df):
    """Add calendar and weather features to a store's daily time series.

    Parameters
    ----------
    store_df : pandas.DataFrame
        Daily store data indexed by dates (a ``DatetimeIndex``; it is expected
        to be sorted because date-range slices are taken on it). The frame is
        copied, so the caller's object is left untouched.
    loc_weather_df : pandas.DataFrame
        Daily weather for the store's location, indexed by date. It must
        contain the columns ``LOCALITA``, ``PRESSIONEMEDIA mb``, ``PIOGGIA mm``
        (all dropped) and ``FENOMENI`` (turned into ``conditions``). The
        caller's object is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the store columns, the calendar features
        ``sin_day``, ``cos_day``, ``sincos_day`` and ``day_of_week``, and the
        remaining weather columns plus ``conditions`` (-1 when the phenomena
        text mentions "pioggia" or "neve", otherwise 1), aligned on the store
        index.

    Notes
    -----
    Every row that still contains a NaN is patched, column by column, with the
    mean of that column over the three preceding days. Rows are processed in
    index order, so a value filled for one day is available to the following
    days. A gap with no usable data in its preceding window stays NaN.
    """
    # Work on a private copy so the feature columns never leak into store_df.
    store_daily_totals = store_df.copy(deep=True)

    ## TIME DATA
    # Exogenous variables that can be derived directly from the date index.
    day_yearly_nr = np.asarray(store_daily_totals.index.dayofyear)
    year_angle = (day_yearly_nr / 365) * 2 * np.pi
    store_daily_totals["sin_day"] = np.sin(year_angle)
    store_daily_totals["cos_day"] = np.cos(year_angle)
    store_daily_totals["sincos_day"] = np.sin(year_angle) * np.cos(year_angle)
    store_daily_totals["day_of_week"] = store_daily_totals.index.dayofweek

    ## WEATHER DATA
    # These columns are not used (too many NaN values).
    loc_weather_df = loc_weather_df.drop(["LOCALITA", "PRESSIONEMEDIA mb", "PIOGGIA mm"], axis=1)
    # Keep only the weather from the first store date onwards; copy so that the
    # column added below is written to our own frame and not to a slice.
    loc_weather_df = loc_weather_df.loc[store_daily_totals.index[0]:, :].copy()

    # Binary encode the weather phenomena: rain/snow (-1) or not (1).
    # Missing phenomena are treated as good weather.
    phenomena = loc_weather_df["FENOMENI"].fillna("good")
    loc_weather_df["conditions"] = [
        -1 if ("pioggia" in phenom or "neve" in phenom) else 1
        for phenom in phenomena.values
    ]
    loc_weather_df = loc_weather_df.drop("FENOMENI", axis=1)

    # Attach the exogenous weather columns, aligned on the store's date index.
    # Training with and without them shows whether they improve the forecast.
    store_daily_weather_augmented = store_daily_totals
    for w_col in loc_weather_df.columns:
        store_daily_weather_augmented[w_col] = loc_weather_df[w_col]

    # Patch each incomplete row with the per-column mean of its own three
    # preceding days (only the missing cells of that row are touched).
    has_nan = store_daily_weather_augmented.isnull().any(axis=1).values
    nan_indices = store_daily_weather_augmented.index[has_nan]
    for nan_idx in nan_indices:
        n_days_before = nan_idx - timedelta(days=3)
        before_nan = nan_idx - timedelta(days=1)
        window = store_daily_weather_augmented.loc[n_days_before:before_nan]
        window_means = window.mean(numeric_only=True)
        for col, mean_val in window_means.items():
            if pd.isna(store_daily_weather_augmented.at[nan_idx, col]) and pd.notna(mean_val):
                store_daily_weather_augmented.at[nan_idx, col] = mean_val

    return store_daily_weather_augmented

In [25]:
def augment_weekly_store_data(store_df, loc_weather_df):
    """Add calendar and weekly-aggregated weather features to a store's weekly series.

    Parameters
    ----------
    store_df : pandas.DataFrame
        Weekly store data indexed by dates (a ``DatetimeIndex``; it is expected
        to be sorted because date-range slices are taken on it). The weather is
        aggregated into ``W-MON`` bins, so the store index presumably uses the
        same Monday labels -- confirm against the weekly CSVs. The frame is
        copied, so the caller's object is left untouched.
    loc_weather_df : pandas.DataFrame
        Daily weather for the store's location, indexed by date. It must
        contain ``LOCALITA``, ``PRESSIONEMEDIA mb``, ``PIOGGIA mm`` (all
        dropped), ``FENOMENI`` (turned into ``conditions``) and
        ``PRESSIONESLM mb``. The caller's object is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the store columns, the calendar features
        ``sin_week``, ``cos_week`` and ``sincos_week``, and the weekly mean of
        each remaining weather column plus a binary ``conditions`` column,
        aligned on the store index.

    Notes
    -----
    Every row that still contains a NaN is patched, column by column, with the
    mean of that column over the three preceding weeks. Rows are processed in
    index order, so a value filled for one week is available to the following
    weeks. A gap with no usable data in its preceding window stays NaN.
    """
    # Work on a private copy so the feature columns never leak into store_df.
    store_weekly_totals = store_df.copy(deep=True)

    ## TIME DATA
    # ISO week number of every index entry (version-independent replacement
    # for the removed ``DatetimeIndex.week`` attribute).
    week_nr_values = np.array([stamp.isocalendar()[1] for stamp in store_weekly_totals.index])
    week_angle = (week_nr_values / 53) * 2 * np.pi
    store_weekly_totals["sin_week"] = np.sin(week_angle)
    store_weekly_totals["cos_week"] = np.cos(week_angle)
    store_weekly_totals["sincos_week"] = np.sin(week_angle) * np.cos(week_angle)

    ## WEATHER DATA
    # These columns are not used (too many NaN values). ``drop`` returns a new
    # frame, so the column added below never touches the caller's data.
    loc_weather_df = loc_weather_df.drop(["LOCALITA", "PRESSIONEMEDIA mb", "PIOGGIA mm"], axis=1)

    # Binary encode the daily weather phenomena: rain/snow (-1) or not (1).
    # Missing phenomena are treated as good weather.
    phenomena = loc_weather_df["FENOMENI"].fillna("good")
    loc_weather_df["conditions"] = [
        -1 if ("pioggia" in phenom or "neve" in phenom) else 1
        for phenom in phenomena.values
    ]
    loc_weather_df = loc_weather_df.drop("FENOMENI", axis=1)

    # Aggregate the daily weather into weekly (W-MON) bins using the mean.
    loc_weather_df = loc_weather_df.groupby(pd.Grouper(freq="W-MON")).mean()
    sea_level_pressure = loc_weather_df["PRESSIONESLM mb"]
    loc_weather_df["PRESSIONESLM mb"] = sea_level_pressure.fillna(sea_level_pressure.mean())
    # Re-binarise: a week counts as bad (-1) only when its mean condition is negative.
    loc_weather_df["conditions"] = [-1 if val < 0 else 1 for val in loc_weather_df["conditions"].values]
    loc_weather_df = loc_weather_df.loc[store_weekly_totals.index[0]:, :]

    # Attach the exogenous weather columns, aligned on the store's date index.
    # Training with and without them shows whether they improve the forecast.
    store_weekly_weather_augmented = store_weekly_totals
    for w_col in loc_weather_df.columns:
        store_weekly_weather_augmented[w_col] = loc_weather_df[w_col]

    # Patch each incomplete row with the per-column mean of its own three
    # preceding weeks (only the missing cells of that row are touched).
    has_nan = store_weekly_weather_augmented.isnull().any(axis=1).values
    nan_indices = store_weekly_weather_augmented.index[has_nan]
    for nan_idx in nan_indices:
        n_weeks_before = nan_idx - timedelta(weeks=3)
        before_nan = nan_idx - timedelta(weeks=1)
        window = store_weekly_weather_augmented.loc[n_weeks_before:before_nan]
        window_means = window.mean(numeric_only=True)
        for col, mean_val in window_means.items():
            if pd.isna(store_weekly_weather_augmented.at[nan_idx, col]) and pd.notna(mean_val):
                store_weekly_weather_augmented.at[nan_idx, col] = mean_val

    return store_weekly_weather_augmented

In [17]:
# Augment every daily store CSV with the weather of its city and export the result.
daily_directory = os.fsencode("../datasets/ts_data/newdaily/daily/")

for file in os.listdir(daily_directory):
    filename = os.fsdecode(file)
    # Only CSV files are store series; skip everything else in the folder.
    if not filename.endswith(".csv"):
        continue

    current_daily_data = pd.read_csv("../datasets/ts_data/newdaily/daily/"+filename, index_col=[0], parse_dates=True)

    # The city is taken from the file name; anything that is neither milan nor
    # turin is augmented with the rome weather.
    if "milan" in filename:
        daily_city_weather, daily_city_label = milan_weather_df, "milan weather"
    elif "turin" in filename:
        daily_city_weather, daily_city_label = turin_weather_df, "turin weather"
    else:
        daily_city_weather, daily_city_label = rome_weather_df, "rome weather"

    current_daily_data_aug = augment_daily_store_data(current_daily_data, daily_city_weather)
    print(filename, daily_city_label)

    # Written to the current working directory with an "aug_" prefix.
    current_daily_data_aug.to_csv("aug_"+filename)

rome_neg_tibur_data.csv rome weather
turin_cc_juve_data.csv turin weather
turin_cc_todor_data.csv turin weather
rome_neg_tusc1_data.csv rome weather
turin_cc_niche_data.csv turin weather
turin_cc_beina_data.csv turin weather
turin_cc_torin_data.csv turin weather
rome_neg_tusc2_data.csv rome weather
rome_cc_eur2_data.csv rome weather
milan_cc_arese_data.csv milan weather
milan_cc_busna_data.csv milan weather
rome_neg_rmnaz_data.csv rome weather


In [27]:
# Augment every weekly store CSV with the weather of its city and export the result.
weekly_directory = os.fsencode("../datasets/ts_data/newdaily/weekly/")

for file in os.listdir(weekly_directory):
    filename = os.fsdecode(file)
    # Only CSV files are store series; skip everything else in the folder.
    if not filename.endswith(".csv"):
        continue

    current_weekly_data = pd.read_csv("../datasets/ts_data/newdaily/weekly/"+filename, index_col=[0], parse_dates=True)

    # The city is taken from the file name; anything that is neither milan nor
    # turin is augmented with the rome weather.
    if "milan" in filename:
        weekly_city_weather, weekly_city_label = milan_weather_df, "milan weather"
    elif "turin" in filename:
        weekly_city_weather, weekly_city_label = turin_weather_df, "turin weather"
    else:
        weekly_city_weather, weekly_city_label = rome_weather_df, "rome weather"

    current_weekly_data_aug = augment_weekly_store_data(current_weekly_data, weekly_city_weather)
    print(filename, weekly_city_label)

    # Written to the current working directory with an "aug_" prefix.
    current_weekly_data_aug.to_csv("aug_"+filename)

rome_neg_rmnaz_data_w.csv rome weather
turin_cc_torin_data_w.csv turin weather
milan_cc_busna_data_w.csv milan weather
rome_neg_tusc2_data_w.csv rome weather
rome_cc_eur2_data_w.csv rome weather
rome_neg_tibur_data_w.csv rome weather
rome_neg_tusc1_data_w.csv rome weather
turin_cc_beina_data_w.csv turin weather
turin_cc_todor_data_w.csv turin weather
turin_cc_juve_data_w.csv turin weather
turin_cc_niche_data_w.csv turin weather
milan_cc_arese_data_w.csv milan weather
