In [1]:
import os 
# Step up to the parent directory; the relative 'data/...' paths used later are
# presumably rooted there. NOTE: not idempotent - every re-run of this cell
# climbs one more level, so run it exactly once per kernel session.
os.chdir(os.path.pardir)

In [2]:
# Display the current working directory to confirm where relative paths resolve.
os.getcwd()

'/Volumes/Samsung_T5/MacT5/research/multi-seq-learning/multi-seq-learning'

In [3]:
import warnings

In [4]:
# Silence every warning category for the rest of the session; note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [5]:
import pandas as pd 

In [6]:
import numpy as np

In [7]:
# Input CSV with the PM2.5 readings (path is relative to the working directory).
firestation_pm25_path = 'data/external/fire_station_pm25_2011_2018.csv'

In [8]:
# Load the readings and convert the 'Date' column to datetimes in one chained
# expression; column order is unchanged because 'Date' already exists.
firestation_pm25 = pd.read_csv(firestation_pm25_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

In [9]:
# Put the rows in chronological order (rebinding instead of sorting in place).
firestation_pm25 = firestation_pm25.sort_values(by='Date')

In [10]:
# Index the frame by date while keeping 'Date' available as a regular column.
firestation_pm25 = firestation_pm25.set_index('Date', drop=False)

In [11]:
# Reindex onto a continuous daily calendar; dates that were absent from the
# data become rows filled with NaN, which makes the gaps explicit.
firestation_pm25 = firestation_pm25.asfreq('D', fill_value=np.nan)

In [12]:
# Rebuild 'Date' from the index so that rows inserted by the daily reindexing
# carry their date instead of a missing value.
firestation_pm25['Date'] = firestation_pm25.index

In [13]:
# Swap the date index for a plain 0..n-1 integer index (the old index is discarded).
firestation_pm25 = firestation_pm25.reset_index(drop=True)

In [14]:
def get_index_range(df, start_date, end_date):
    '''
    Return the index labels of the rows whose 'Date' lies in
    [start_date, end_date], both endpoints included.

    df: DataFrame with a datetime-like 'Date' column
    start_date: str, format "%Y-%m-%d", e.g. "2011-01-01"
    end_date: str, same format as start_date
    returns: numpy array of index labels of the matching rows
    '''
    on_or_after_start = df['Date'] >= start_date
    on_or_before_end = df['Date'] <= end_date
    in_window = on_or_after_start & on_or_before_end
    return df.loc[in_window].index.values

In [15]:
# Index labels of the rows belonging to each split (train 2011-2016,
# validation 2017, test 2018); both endpoints of every range are inclusive.
train_index, valid_index, test_index = (
    get_index_range(firestation_pm25, first_day, last_day)
    for first_day, last_day in (
        ('2011-01-01', '2016-12-31'),
        ('2017-01-01', '2017-12-31'),
        ('2018-01-01', '2018-12-31'),
    )
)

In [16]:
import numpy as np 

In [17]:
# Double brackets select a one-column DataFrame, so .values is a 2-D array
# with one row per day and a single feature column.
firestation_pm25_arr = firestation_pm25[['Daily_Mean_PM2_5_Concentration']].values

In [18]:
# Observation mask: 1 where a reading is present, 0 where it is NaN.
masking = (~np.isnan(firestation_pm25_arr)).astype(int)

In [19]:
# delta[t, d] counts the days elapsed since feature d was last observed
# strictly before day t:
#   - first day: 0
#   - previous day observed: 1
#   - previous day missing: previous delta + 1
num_steps, num_features = masking.shape
delta = np.zeros((num_steps, num_features))
for col in range(num_features):
    # Row 0 keeps the initial 0; each later row depends only on the row above it.
    for row in range(1, num_steps):
        previous_observed = masking[row - 1, col] == 1
        delta[row, col] = 1 if previous_observed else delta[row - 1, col] + 1

In [20]:
# Tally how often each gap length occurs in delta.
unique_value, count = np.unique(delta, return_counts = True)

In [21]:
# {i:j for i, j in zip(unique_value, count)}

In [22]:
# Number of rows in the series (one row per calendar day after the daily reindexing).
total_days = len(firestation_pm25_arr)

In [23]:
# Length of every input window, in rows (i.e. days) of the series.
seq_len = 7

In [24]:
def get_feature_sequence(feature_matrix, seq_len):
    '''
    Cut feature_matrix into overlapping windows of seq_len consecutive rows.

    Window k covers rows k .. k+seq_len-1. The final possible window (the one
    ending on the last row) is intentionally not produced, because there is no
    following row left to serve as its label.

    feature_matrix: array-like indexed by row, length total_days
    seq_len: int, number of rows per window
    returns: numpy array holding total_days - seq_len windows
    '''
    window_count = len(feature_matrix) - seq_len
    windows = [
        feature_matrix[start:start + seq_len]
        for start in range(window_count)
    ]
    return np.array(windows)

def get_label_sequence(label_arr, seq_len):
    '''
    Build the label for every window of length seq_len: the entry of
    label_arr located immediately after the window, i.e. label k is
    label_arr[k + seq_len].

    label_arr: array-like indexed by row, length total_days
    seq_len: int, number of rows per window
    returns: numpy array holding total_days - seq_len labels
    '''
    label_count = len(label_arr) - seq_len
    labels = [label_arr[offset + seq_len] for offset in range(label_count)]
    return np.array(labels)

In [25]:
# Sliding windows over the raw readings; missing values are left as NaN here.
fea_seq = get_feature_sequence(firestation_pm25_arr, seq_len)

In [26]:
# fill na with previous value
def pd_fillna(feature_matrix, method = 'ffill'):
    '''
    Fill missing values along the rows (time axis) of feature_matrix.

    feature_matrix: 2-D array-like accepted by pd.DataFrame
    method: str, 'ffill'/'pad' propagates the last valid value forward,
            'bfill'/'backfill' propagates the next valid value backward
    returns: numpy array of the same shape; positions with no valid value
             in the fill direction stay NaN
    raises: ValueError if method is not one of the supported names
    '''
    frame = pd.DataFrame(feature_matrix)
    # Use the dedicated ffill/bfill methods: fillna(method=...) is deprecated
    # in recent pandas, and the argument was previously ignored altogether.
    if method in ('ffill', 'pad'):
        return frame.ffill().values
    if method in ('bfill', 'backfill'):
        return frame.bfill().values
    raise ValueError(
        "method must be one of 'ffill', 'pad', 'bfill', 'backfill'; got %r" % (method,)
    )

In [27]:
# Copy of the readings with gaps filled using the default method (forward fill);
# values before the first observation have nothing to copy from and stay NaN.
pm25_feature_matrix = pd_fillna(firestation_pm25_arr)

In [28]:
# Windows over the filled series: each entry is the most recent observed value,
# which is the reading itself on days where a reading exists.
last_observation_seq = get_feature_sequence(pm25_feature_matrix, seq_len)

In [None]:
# Inspect one raw window (it may contain NaN for missing days).
fea_seq[1]

In [29]:
# pm25_feature_matrix

In [30]:
# pd.DataFrame(firestation_pm25_arr)

In [31]:
# Label of each window is the raw reading on the row right after it; it comes
# from the unfilled array, so a label can itself be NaN.
label_seq = get_label_sequence(firestation_pm25_arr, seq_len)

In [32]:
# Same sliding windows, taken over the 0/1 observation mask.
masking_seq = get_feature_sequence(masking, seq_len)

In [33]:
# Same sliding windows, taken over the days-since-last-observation counter.
delta_seq = get_feature_sequence(delta, seq_len)

In [34]:
# Number of samples in each split. Sample i is labelled by row i + seq_len, so
# the first seq_len rows never act as labels and the training split is shorter
# by seq_len. NOTE(review): this assumes the series starts on the first
# training day and ends on the last test day - confirm against the data.
train_valid_test_split = np.array([len(train_index)-seq_len, len(valid_index), len(test_index)])

In [38]:
# Output directory for the generated .npy files (relative to the working directory).
save_folder = 'data/raw'

In [39]:
# Maps each output file name to the array that will be written under it.
save_files_dict = {
    'fea_seq.npy': fea_seq,
    'last_observation_seq.npy':last_observation_seq,
    'label_seq.npy': label_seq,
    'masking_seq.npy': masking_seq,
    'delta_seq.npy': delta_seq,
    'train_valid_test_split.npy': train_valid_test_split 
}

In [40]:
# Ensure the target directory exists first: np.save does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs(save_folder, exist_ok=True)
# Write every array to its own .npy file inside save_folder.
for file_name, array in save_files_dict.items():
    np.save(os.path.join(save_folder, file_name), array)

In [41]:
# Inspect a raw window: missing days appear as NaN between the observed readings.
fea_seq[2]

array([[nan],
       [9.7],
       [nan],
       [nan],
       [7.4],
       [nan],
       [nan]])