In [1]:
import os 
# Step the working directory up one level; the relative 'data/...' paths used
# in later cells are resolved against it.
# NOTE(review): not idempotent - re-running this cell climbs one more level.
os.chdir(os.pardir)

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the search-trend table; the path is relative to the current working directory.
trend_fea = pd.read_csv('data/external/atlanta_search.csv')

In [6]:
# Text file that the next cell reads line by line to build `seed_terms`.
seed_path = 'data/external/seed_common.txt'

In [18]:
# Read the seed terms, one per line, replacing spaces with underscores
# (presumably to match the column names of the search-trend table - confirm).
# Blank lines are skipped rather than treated as end-of-file, so an empty
# line in the middle of the file no longer drops every term that follows it.
seed_terms = []
with open(seed_path, 'r') as fi:
    for raw_line in fi:
        term = raw_line.strip().replace(' ', '_')
        if term:
            seed_terms.append(term)

In [33]:
# Put the seed terms in ascending order.
seed_terms = sorted(seed_terms)

In [41]:
# Collect the trend-table columns whose name is a seed term, keeping the
# column order of `trend_fea`.
merge_terms = []
for column_name in trend_fea.columns:
    if column_name in seed_terms:
        merge_terms.append(column_name)

In [52]:
# Restrict the trend table to the columns matched against the seed terms.
seed_trend_df = trend_fea[merge_terms]

In [53]:
# Use the first column of the raw trend table, parsed as datetimes, as the row index.
# NOTE(review): assumes that first column holds the observation date - confirm.
trend_dates = pd.to_datetime(trend_fea.iloc[:, 0])
seed_trend_df.index = trend_dates

In [54]:
# Order the rows chronologically by the datetime index.
seed_trend_df = seed_trend_df.sort_index()

In [57]:
# Reindex to a daily frequency; days absent from the source become NaN rows.
seed_trend_df = seed_trend_df.asfreq('D', fill_value=np.nan)

In [65]:
# Copy the datetime index into a 'Date' column, the key used by the later merge.
seed_trend_df['Date'] = seed_trend_df.index

In [66]:
# Discard the datetime index (the dates are kept in the 'Date' column) and
# fall back to a positional 0..n-1 index.
seed_trend_df = seed_trend_df.reset_index(drop=True)

In [68]:
# Meteorology table: parse 'Date' so it can act as the merge key.
meteo_data = pd.read_csv('data/external/atlanta_rh_temp.csv')
meteo_data['Date'] = pd.to_datetime(meteo_data['Date'])

# PM2.5 table: index by parsed date, sort chronologically and reindex to a
# daily frequency (days without a record become NaN rows).
pm25_data = pd.read_csv('data/external/fire_station_pm25_2011_2018.csv')
pm25_data.index = pd.to_datetime(pm25_data['Date'])
pm25_data = pm25_data.sort_index().asfreq('D', fill_value=np.nan)

# Put the daily dates back into the 'Date' column and restore a positional index.
pm25_data['Date'] = pm25_data.index
pm25_data = pm25_data.reset_index(drop=True)

# Left join: every day of the PM2.5 calendar is kept, meteorology is attached where available.
pm25_columns = ['Date', 'Daily_Mean_PM2_5_Concentration']
merged_data = pd.merge(pm25_data[pm25_columns], meteo_data, how='left', on='Date')

In [70]:
# Attach the search-trend columns by date, keeping every row of `merged_data`.
# NOTE(review): `merged_data` is overwritten with its own merge result, so
# re-running this cell merges the trend columns a second time.
merged_data = pd.merge(merged_data, seed_trend_df, how='left', on='Date')

In [74]:
# Preview the first rows of the merged table.
merged_data.head()

Unnamed: 0,Date,Daily_Mean_PM2_5_Concentration,rh_max,rh_mean,temp_max,temp_mean,air_pollutant,air_pollution,arrhythmia,asthma,...,smoggy,smoke,snoring,soot,sulfate,tailpipe,throat_irritation,traffic,wheezing,wildfires
0,2011-02-23,14.3,90.0,62.8125,64.0,51.541667,0.0,0.0,0.0,78.260694,...,,37.030573,48.143379,0.0,121.991522,0.0,0.0,19.803256,0.0,0.0
1,2011-02-24,,83.0,68.0,70.0,57.75,0.0,58.373206,0.0,89.440793,...,,35.224204,48.143379,0.0,85.948572,0.0,0.0,15.088195,19.86394,16.470588
2,2011-02-25,,98.0,73.875,67.0,58.0,0.0,0.0,78.255208,30.745273,...,,26.192357,97.791239,43.654485,37.429217,0.0,0.0,23.575305,20.599642,0.0
3,2011-02-26,9.7,100.0,68.708334,67.0,49.416667,0.0,0.0,0.0,25.155223,...,,54.191083,60.179224,0.0,45.746821,48.833333,0.0,14.145183,0.0,0.0
4,2011-02-27,,98.0,71.604166,75.0,60.645833,0.0,73.383459,0.0,75.465669,...,,66.835669,0.0,0.0,30.49788,0.0,0.0,12.259159,50.763402,0.0


In [75]:
# Make sure rows carry a positional 0..n-1 index; the split helpers below
# return index labels.
merged_data = merged_data.reset_index(drop=True)

In [76]:
def get_index_range(df, start_date, end_date):
    '''
    Return the index labels of the rows whose 'Date' lies in
    [start_date, end_date], both ends included.

    df: DataFrame with a 'Date' column
    start_date: str, format "%Y-%m-%d", e.g. "2011-01-01"
    end_date: str, same format as start_date
    return: np.ndarray of index labels
    '''
    in_range = df['Date'].between(start_date, end_date)
    return df.loc[in_range].index.values

In [77]:
# Chronological split by calendar year: 2011-2016 train, 2017 validation, 2018 test.
train_index, valid_index, test_index = (
    get_index_range(merged_data, first_day, last_day)
    for first_day, last_day in (
        ('2011-01-01', '2016-12-31'),
        ('2017-01-01', '2017-12-31'),
        ('2018-01-01', '2018-12-31'),
    )
)

In [81]:
# Plain array of every column except 'Date'.
merged_data_arr = merged_data.drop(columns=['Date']).values

In [84]:
# merged_data_arr[0].shape
# Expected columns per row: 1 (PM2.5) + 4 (meteorology) + 47 (search terms) = 52

In [86]:
# Observation mask with the same shape as merged_data_arr: 1 where a value is present, 0 where it is NaN.
masking = (~np.isnan(merged_data_arr)).astype(int)

In [89]:
# Number of rows since the previous observation, per (row, column) cell:
#   delta[0]   = 0
#   delta[row] = 1                   if the value in row-1 was observed
#   delta[row] = delta[row - 1] + 1  otherwise
# Each row depends only on the row above it, so the recurrence runs once per
# row over whole rows instead of once per element.
delta = np.zeros(masking.shape)
for row in range(1, masking.shape[0]):
    # Start from "previous value missing" everywhere ...
    gap = delta[row - 1] + 1
    # ... then reset to 1 wherever the previous row was actually observed.
    gap[masking[row - 1] == 1] = 1
    delta[row] = gap

In [90]:
# Number of rows in merged_data_arr.
total_days = len(merged_data_arr)

In [92]:
# Number of consecutive rows in each input window.
seq_len = 7

In [93]:
def get_feature_sequence(feature_matrix, seq_len):
    '''
    Cut `feature_matrix` into overlapping windows of `seq_len` consecutive rows.

    Window k holds rows k .. k+seq_len-1. Exactly len(feature_matrix) - seq_len
    windows are produced, so the window that would end on the final row is
    left out and every window has a following row at index k+seq_len.

    return: np.ndarray built from the list of windows
    '''
    n_windows = len(feature_matrix) - seq_len
    windows = [feature_matrix[start:start + seq_len] for start in range(n_windows)]
    return np.array(windows)

def get_label_sequence(label_arr, seq_len):
    '''
    Pick, for every window position k, the entry of `label_arr` at index
    k + seq_len, i.e. the one immediately after a window of `seq_len` rows
    that starts at k.

    return: np.ndarray with len(label_arr) - seq_len entries
    '''
    n_labels = len(label_arr) - seq_len
    targets = [label_arr[start + seq_len] for start in range(n_labels)]
    return np.array(targets)

In [94]:
# Windows over the un-imputed array, so NaNs are still present in `fea_seq`.
fea_seq = get_feature_sequence(merged_data_arr, seq_len)

In [95]:
# Double brackets select a one-column DataFrame, so `.values` is a 2-D array with a single column.
merged_label =  merged_data[['Daily_Mean_PM2_5_Concentration']].values

In [96]:
# One label per window: the PM2.5 row that follows each seq_len-row window.
label_seq = get_label_sequence(merged_label, seq_len)

In [97]:
# fill na with a neighbouring value (previous row by default)
def pd_fillna(feature_matrix, method = 'ffill'):
    '''
    Fill NaNs down each column by propagating a neighbouring observation.

    feature_matrix: array-like accepted by pd.DataFrame
    method: str, 'ffill' / 'pad' copies the last valid value forward (default),
            'bfill' / 'backfill' copies the next valid value backward
    return: np.ndarray of the filled values; NaNs with no valid value in the
            fill direction (e.g. leading NaNs under 'ffill') are left as NaN

    Raises ValueError for any other `method`.
    '''
    frame = pd.DataFrame(feature_matrix)
    # `method` used to be ignored ('ffill' was hard-coded). The dedicated
    # ffill()/bfill() calls replace the deprecated fillna(method=...) form.
    if method in ('ffill', 'pad'):
        return frame.ffill().values
    if method in ('bfill', 'backfill'):
        return frame.bfill().values
    raise ValueError("method must be 'ffill'/'pad' or 'bfill'/'backfill', got %r" % (method,))

In [101]:
# replace every remaining NaN with 0.0
def pd_fillna_zero(feature_matrix):
    '''
    Return `feature_matrix` as an array with each NaN replaced by 0.0.

    feature_matrix: array-like accepted by pd.DataFrame
    '''
    frame = pd.DataFrame(feature_matrix)
    zero_filled = frame.fillna(0.)
    return zero_filled.values

In [102]:
# Impute in two passes: pd_fillna first, then zero-fill whatever is still NaN.
merged_data_feature_matrix = pd_fillna_zero(pd_fillna(merged_data_arr))

# Windowed views of the imputed values (a missing entry holds its filled-in
# value, an observed entry holds itself), of the observation mask and of the
# time gaps.
last_observation_seq, masking_seq, delta_seq = (
    get_feature_sequence(matrix, seq_len)
    for matrix in (merged_data_feature_matrix, masking, delta)
)

# Sequence count per split. The windowing yields seq_len fewer sequences than
# rows, and that shortfall is taken off the training count.
# NOTE(review): assumes the training rows are the earliest ones - confirm.
train_valid_test_split = np.array([len(train_index) - seq_len, len(valid_index), len(test_index)])

In [103]:
# Remaining NaN count per column after imputation.
np.isnan(merged_data_feature_matrix).sum(axis=0)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [104]:
# Output directory for the .npy files written below (relative to the working directory).
save_folder = 'data/raw/met-search'

In [105]:
# Create the output folder together with any missing parent directories.
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
# on a second run and FileNotFoundError when the parent folder was absent.
os.makedirs(save_folder, exist_ok=True)

In [106]:
# Output file name -> array to store under that name.
save_files_dict = {
    # windows over the un-imputed array (NaNs kept)
    'fea_seq.npy': fea_seq,
    # windows over the forward-filled, then zero-filled array
    'last_observation_seq.npy':last_observation_seq,
    # label following each window
    'label_seq.npy': label_seq,
    # windows over the 1/0 observation mask
    'masking_seq.npy': masking_seq,
    # windows over the time-gap matrix
    'delta_seq.npy': delta_seq,
    # three counts: train, validation, test
    'train_valid_test_split.npy': train_valid_test_split 
}

In [107]:
# Write each array to <save_folder>/<file_name> with np.save.
for file_name, array in save_files_dict.items():
    target_path = os.path.join(save_folder, file_name)
    np.save(target_path, array)