In [1]:
import os 
# Step up to the parent directory so that the relative 'data/...' paths used below resolve
# (presumably the notebook sits one level below the project root -- confirm).
# Guarded so that re-running this cell does not climb one more level each time:
# once a 'data' directory is visible from the working directory, the move is skipped.
if not os.path.isdir('data'):
    os.chdir(os.path.pardir)

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Search-trend features; the first CSV column is parsed as the date of each row below.
trend_fea = pd.read_csv('data/external/atlanta_search.csv')
seed_path = 'data/external/seed_common.txt'
# One seed term per line, with spaces replaced by underscores. Iterating over the file and
# dropping empty entries reads the whole file even when it contains blank lines
# (a readline()-until-empty loop silently stops at the first blank line).
with open(seed_path, 'r') as fi:
    seed_terms = [line.strip().replace(' ', '_') for line in fi]
seed_terms = sorted(term for term in seed_terms if term)
# Keep only the trend columns that are seed terms, in CSV column order.
seed_lookup = set(seed_terms)
merge_terms = [k for k in trend_fea.columns if k in seed_lookup]
# .copy() so the index assignment below acts on an independent frame rather than on a
# selection taken from trend_fea.
seed_trend_df = trend_fea[merge_terms].copy()
seed_trend_df.index = pd.to_datetime(trend_fea.iloc[:,0])
seed_trend_df = seed_trend_df.sort_index()
# Re-index onto a gap-free daily calendar; days absent from the CSV become all-NaN rows.
seed_trend_df = seed_trend_df.asfreq('D', fill_value=np.nan)
# Expose the calendar as a regular 'Date' column (the merge key) and drop the date index.
seed_trend_df['Date'] = seed_trend_df.index
seed_trend_df.reset_index(drop=True, inplace=True)
# Meteorology table: only its 'Date' column is parsed here, the rest is merged as-is.
meteo_data = pd.read_csv('data/external/atlanta_rh_temp.csv')
meteo_data['Date'] = pd.to_datetime(meteo_data['Date'])

# PM2.5 table: index by parsed date, order chronologically, then re-index onto a
# gap-free daily calendar so that missing days show up as NaN rows.
pm25_data = pd.read_csv('data/external/fire_station_pm25_2011_2018.csv')
pm25_data.index = pd.to_datetime(pm25_data['Date'])
pm25_data = pm25_data.sort_index().asfreq('D', fill_value=np.nan)
# Overwrite 'Date' with the full calendar and go back to a positional index.
pm25_data = pm25_data.assign(Date=pm25_data.index).reset_index(drop=True)

# The PM2.5 calendar is the left side of both joins, so it fixes the rows of merged_data.
pm25_daily = pm25_data[['Date', 'Daily_Mean_PM2_5_Concentration']]
merged_data = pm25_daily.merge(meteo_data, how='left', on='Date')
merged_data = merged_data.merge(seed_trend_df, how='left', on='Date')


In [5]:
# Make sure merged_data carries a plain positional index (the old index is discarded).
merged_data = merged_data.reset_index(drop=True)

In [6]:
def get_index_range(df, start_date, end_date):
    '''
    Return the index labels of the rows of `df` whose 'Date' falls inside
    [start_date, end_date], both ends included.

    df: DataFrame with a 'Date' column
    start_date: str, format "%Y-%m-%d", e.g. "2011-01-01"
    end_date: str, same format as start_date
    returns: numpy array of index labels, in row order
    '''
    dates = df['Date']
    in_range = dates.between(start_date, end_date)
    return df.index[in_range.values].values

In [7]:
# Calendar-year boundaries (inclusive) of the train / validation / test periods.
split_periods = (
    ('2011-01-01', '2016-12-31'),
    ('2017-01-01', '2017-12-31'),
    ('2018-01-01', '2018-12-31'),
)
train_index, valid_index, test_index = [
    get_index_range(merged_data, first_day, last_day)
    for first_day, last_day in split_periods
]

# Every column except 'Date', as a plain array.
merged_data_arr = merged_data.drop(columns=['Date']).values

In [9]:
# 1 where a value is present, 0 where it is NaN.
masking = (~np.isnan(merged_data_arr)).astype(int)

# delta[row, col] counts the rows since the last present value in that column:
#   - first row: 0
#   - previous row present: 1
#   - previous row missing: previous delta + 1
# The recurrence only looks one row back, so it is evaluated one whole row at a time
# instead of element by element in Python.
delta = np.zeros(masking.shape)
for row in range(1, masking.shape[0]):
    previous_row_present = masking[row - 1] == 1
    delta[row] = np.where(previous_row_present, 1, delta[row - 1] + 1)

total_days = len(merged_data_arr)
seq_len = 7

In [10]:
def get_feature_sequence(feature_matrix, seq_len):
    '''
    Stack the sliding windows of `seq_len` consecutive rows of `feature_matrix`.

    Window k covers rows k .. k+seq_len-1, for k = 0 .. len(feature_matrix)-seq_len-1,
    i.e. the last possible window (the one ending on the final row) is not produced.

    returns: numpy array holding the windows in order (empty when there are none)
    '''
    n_windows = len(feature_matrix) - seq_len
    windows = [feature_matrix[start:start + seq_len] for start in range(n_windows)]
    return np.array(windows)

def get_label_sequence(label_arr, seq_len):
    '''
    Collect the entries of `label_arr` from position `seq_len` to the end,
    one per window: entry k of the result is label_arr[k + seq_len].

    returns: numpy array of the collected labels (empty when seq_len >= len(label_arr))
    '''
    n_labels = len(label_arr)
    targets = [label_arr[position] for position in range(seq_len, n_labels)]
    return np.array(targets)

In [11]:
# Windows over the un-imputed feature array (NaNs are still present here).
fea_seq = get_feature_sequence(feature_matrix=merged_data_arr, seq_len=seq_len)

In [25]:
# Take a private copy: the label array is modified in place further down, and the array
# returned by .values may share memory with the DataFrame or be read-only
# (pandas Copy-on-Write), which would make those in-place writes unsafe or fail.
merged_label = merged_data[['Daily_Mean_PM2_5_Concentration']].values.copy()

In [26]:
# Turn the labels into one-step differences: each row's value minus the most recent
# non-missing value among the rows before it (forward-fill, then shift down by one row).
# Computed directly from merged_data with DataFrame.ffill(), so this cell
#   - does not rely on pd_fillna, which is only defined in a later cell, and
#   - yields the same result when re-run (an in-place update would difference the
#     already-differenced labels again).
# The first row has no predecessor and therefore comes out as NaN.
pm25_levels = merged_data[['Daily_Mean_PM2_5_Concentration']]
merged_label = (pm25_levels - pm25_levels.ffill().shift(1)).values.copy()

In [29]:
# The first row has no previous row to difference against, so its label is marked missing.
merged_label[0, :] = np.nan

In [30]:
# One label per window: the entry that comes right after the window's last row.
label_seq = get_label_sequence(label_arr=merged_label, seq_len=seq_len)

In [31]:
# fill na with a neighbouring value (previous one by default)
def pd_fillna(feature_matrix, method = 'ffill'):
    '''
    Fill NaNs column by column by propagating a neighbouring value.

    feature_matrix: array-like accepted by pd.DataFrame
    method: 'ffill' / 'pad' (carry the previous value forward, the default) or
            'bfill' / 'backfill' (carry the next value backward)
    returns: numpy array; NaNs with no value on the filling side stay NaN
    raises: ValueError if `method` is none of the names above
    '''
    frame = pd.DataFrame(feature_matrix)
    # Dispatch on `method` (it used to be ignored) through the dedicated ffill()/bfill()
    # methods, since fillna(method=...) is deprecated in recent pandas releases.
    if method in ('ffill', 'pad'):
        return frame.ffill().values
    if method in ('bfill', 'backfill'):
        return frame.bfill().values
    raise ValueError("method must be 'ffill'/'pad' or 'bfill'/'backfill', got %r" % (method,))

# replace every NaN by 0.0
def pd_fillna_zero(feature_matrix):
    '''
    Return `feature_matrix` as a numpy array with every NaN replaced by 0.0.

    feature_matrix: array-like accepted by pd.DataFrame
    '''
    frame = pd.DataFrame(feature_matrix)
    zero_filled = frame.fillna(value=0.)
    return zero_filled.values

# Forward-fill every column, then put 0 wherever a NaN is still left
# (i.e. where a column has no earlier value to carry forward).
merged_data_feature_matrix = pd_fillna_zero(pd_fillna(merged_data_arr))

# Windows over the imputed features, the presence mask and the time-gap matrix.
# In the imputed windows a present value is kept as is; a missing one shows the
# last value seen before it.
last_observation_seq = get_feature_sequence(merged_data_feature_matrix, seq_len)
masking_seq = get_feature_sequence(masking, seq_len)
delta_seq = get_feature_sequence(delta, seq_len)

# Sizes of the three consecutive splits; the train size is reduced by seq_len.
n_train = len(train_index) - seq_len
n_valid = len(valid_index)
n_test = len(test_index)
train_valid_test_split = np.array([n_train, n_valid, n_test])

In [34]:
# Output folder for the .npy files written below (relative to the current working directory).
save_folder = 'data/raw/predict-one-day-diff/pol-met-search'

In [37]:
# Create the output folder together with any missing parent folders. exist_ok=True keeps
# the cell from raising when the folder is already there (e.g. when the notebook is re-run).
os.makedirs(save_folder, exist_ok=True)

In [38]:
# Output file name -> array to store under that name.
save_files_dict = dict([
    ('fea_seq.npy', fea_seq),
    ('last_observation_seq.npy', last_observation_seq),
    ('label_seq.npy', label_seq),
    ('masking_seq.npy', masking_seq),
    ('delta_seq.npy', delta_seq),
    ('train_valid_test_split.npy', train_valid_test_split),
])

In [39]:
# Write every array to <save_folder>/<file name>.
for file_name, array_to_save in save_files_dict.items():
    target_path = os.path.join(save_folder, file_name)
    np.save(target_path, array_to_save)

In [43]:
# Sanity check: smallest label value, ignoring NaNs.
np.nanmin(label_seq)

-34.800000000000004