In [1]:
import os
import pickle

import numpy as np

In [2]:
# Step the working directory up one level so that the relative paths used below
# ('config/...', 'data/...') resolve — presumably against the project root; confirm.
# NOTE(review): not idempotent — every re-run of this cell climbs one more directory.
os.chdir(os.path.pardir)

In [3]:
import configparser
from configparser import ExtendedInterpolation
config_path = 'config/parameters.ini'
pars = configparser.ConfigParser(interpolation=ExtendedInterpolation())

In [4]:
# Build a fresh parser and load the INI file. Rebinding `pars` already discards
# the previous parser, so no explicit `del` is needed (and `del pars` raises
# NameError whenever this cell runs before `pars` exists).
pars = configparser.ConfigParser(interpolation=ExtendedInterpolation())
loaded_configs = pars.read(config_path)
# ConfigParser.read() silently skips files it cannot open and returns only the
# names it actually parsed, so fail here instead of with a KeyError further down.
if not loaded_configs:
    raise FileNotFoundError(
        'could not read configuration file: {!r}'.format(config_path))
# Show which file(s) were loaded as the cell output.
loaded_configs

['config/parameters.ini']

In [5]:
from airpolnowcast.data.utils import read_query_from_file, read_raw_data
import ast

In [6]:
from airpolnowcast.features.build_features import FeatureEngineer

In [7]:
from airpolnowcast.evaluation.utils import get_feature_pars

Using TensorFlow backend.


In [8]:
# ---- global parameters taken from the parsed configuration ----

# Seed words: the query file is named by [extract_search_trend] term_list_path.
seed_path = pars['extract_search_trend']['term_list_path']
seed_word_list = read_query_from_file(seed_path)

# Integer options of the [train_model] section.
seq_length, search_lag = (
    int(pars['train_model'][option]) for option in ('seq_length', 'search_lag')
)

# `use_feature` is stored as a Python literal and parsed with ast.literal_eval.
# features_array = ast.literal_eval(pars['train_model']['FEATURE'])
use_feature = ast.literal_eval(pars['train_model']['use_feature'])

# Feature-engineering helper object used by the cells below.
feature_engineer = FeatureEngineer()

In [9]:
index = use_feature[0]

In [10]:
feature_pars = get_feature_pars(pars, index)

In [11]:
# get model_type
model_type = feature_pars['model_type']

In [12]:
save_model_path = feature_pars['save_model_path']

In [13]:
train_data_path = 'data/processed/train.csv'
valid_data_path = 'data/processed/valid.csv'
test_data_path = 'data/processed/test.csv'

In [14]:
# save input_data_path for dllstm model
feature_pars['input_data_path'] = valid_data_path


In [15]:
y_train, train_pol, train_phys, train_trend = feature_engineer.feature_from_file(train_data_path, seq_length, search_lag)



In [85]:
from airpolnowcast.features.build_features import process_data, get_feature_from_config

In [17]:
train_data = read_raw_data(train_data_path)


In [19]:
y_data, pol_val, trend_fea, phys_fea = process_data(train_data)


In [86]:
x_train, embedding_dim = get_feature_from_config(feature_pars, train_pol, train_phys, train_trend)



In [94]:
# generate sequence input features for LSTM training
def generate_input_sequence(input_array, seq_length):
    """
    Build, for every day, the window of the last ``seq_length`` days of features.

    Window ``i`` holds rows ``i - seq_length + 1 .. i`` of ``input_array`` in
    chronological order. Positions that fall before the first day are filled
    with zero vectors, so there is exactly one window per input row.

    :param input_array: np.array (or anything np.asarray turns into a 2-D array)
        shape: N*P
        N: number of days
        P: number of features for day i
    :param seq_length: int
        sequence length for LSTM model, must be >= 1
    :return: np.array
        shape: N*seq_length*P (an empty 0*seq_length*P array when N == 0)
    :raises ValueError: if ``input_array`` is not 2-D or ``seq_length`` < 1
    """
    input_array = np.asarray(input_array)
    if input_array.ndim != 2:
        raise ValueError(
            'input_array must be 2-D (days x features), got {} dimension(s)'
            .format(input_array.ndim))
    if seq_length < 1:
        raise ValueError('seq_length must be >= 1, got {}'.format(seq_length))

    n_days, embedding_dim = input_array.shape

    # The zero padding is floating point, so integer inputs are promoted to
    # float64 whenever padding rows are needed (seq_length > 1).
    if seq_length > 1:
        out_dtype = np.result_type(input_array.dtype, np.float64)
    else:
        out_dtype = input_array.dtype

    # Prepend seq_length - 1 all-zero days so every window is full length.
    padded = np.concatenate(
        (np.zeros((seq_length - 1, embedding_dim), dtype=out_dtype),
         input_array.astype(out_dtype, copy=False)),
        axis=0)

    # Row i of `window_rows` lists the padded-row indices i .. i + seq_length - 1,
    # i.e. original days i - seq_length + 1 .. i.
    window_rows = np.arange(n_days)[:, None] + np.arange(seq_length)[None, :]
    return padded[window_rows]

In [198]:
# trend_fea.isna().sum()

In [145]:
# Treat zeros as missing (existing NaNs are zero-filled first so both cases look
# alike), carry the last non-missing value forward, and put 0 wherever no earlier
# value exists. `.ffill()` replaces `fillna(method='ffill')`, which pandas has
# deprecated. Assumes `trend_fea` is a pandas DataFrame — confirm against process_data.
trend_fill_na = trend_fea.fillna(0.).replace(0., np.nan).ffill().fillna(0.)

In [154]:
# Standardize every column: subtract its mean and divide by its standard deviation.
# NOTE(review): a column with zero standard deviation presumably yields NaN here
# (0 / 0); the following cell fills NaNs with 0.
trend_fill_na_norm = (trend_fill_na - trend_fill_na.mean())/ trend_fill_na.std()

In [164]:
# Replace any NaN left in the normalized frame with 0; the name is rebound to the
# filled copy rather than mutating the frame in place.
trend_fill_na_norm = trend_fill_na_norm.fillna(0.)

In [170]:
x_mean_aft_nor = np.array(trend_fill_na_norm.mean())

In [172]:
x_mean_path = 'data/raw/x_mean_aft_nor.pkl'

In [174]:
import pickle

In [176]:
with open(x_mean_path, 'wb') as fi:
    pickle.dump(x_mean_aft_nor, fi)

In [177]:
x_median_aft_nor = np.array(trend_fill_na_norm.median())

In [178]:
x_median_path = 'data/raw/x_median_aft_nor.pkl'

In [179]:
with open(x_median_path, 'wb') as fi:
    pickle.dump(x_median_aft_nor, fi)

In [199]:
# x_mean_aft_nor

In [162]:
# trend_fill_na_norm.isna().sum()

In [152]:
# trend_fill_na.std()

In [183]:
trend_fill_na_norm.shape

(2922, 51)

In [184]:
# Window the normalized trend features into one fixed-length sequence per day.
# NOTE(review): the window length 7 is hard-coded here although `seq_length` is read
# from the configuration earlier in this notebook — confirm the two are meant to agree.
trend_seq = generate_input_sequence(np.array(trend_fill_na_norm), 7)

In [115]:
# Window the observation mask the same way as the trend features.
# NOTE(review): `masking` is only assigned in a later cell of this notebook, so this
# cell fails on a fresh top-to-bottom run. It is transposed here, presumably to get
# (days, features) rows for the windowing helper — confirm.
masking_seq = generate_input_sequence(masking.T, 7)

In [117]:
# Window the time-gap array the same way as the trend features.
# NOTE(review): `delta` is only assigned in a later cell of this notebook, so this
# cell fails on a fresh top-to-bottom run. It is transposed here, presumably to get
# (days, features) rows for the windowing helper — confirm.
delta_seq = generate_input_sequence(delta.T, 7)

In [188]:
trend_seq_path = 'data/raw/trend_seq.pkl'
masking_seq_path = 'data/raw/masking_seq.pkl'
delta_seq_path = 'data/raw/delta_seq.pkl'

In [194]:
y_out_path = 'data/raw/y_out.pkl'

In [None]:
with open(trend_seq_path, 'wb') as fi:
    pickle.dump(trend_seq, fi)

In [190]:
with open(masking_seq_path, 'wb') as fi:
    pickle.dump(masking_seq, fi)

In [191]:
with open(delta_seq_path, 'wb') as fi:
    pickle.dump(delta_seq, fi)

In [197]:
with open(y_out_path, 'wb') as fi:
    pickle.dump(np.array(y_data), fi)

array([0, 0, 0, ..., 0, 0, 0])

In [185]:
trend_seq.shape

(2922, 7, 51)

In [161]:
# delta.T

In [160]:
# masking_seq[0]

In [159]:
# masking_seq[0]

In [158]:
# x_train[0]

In [None]:
def get_delta(masking, delta):
    """
    Fill ``delta`` with the number of time steps since the previous observation.

    Both arrays are indexed ``[row, col]`` with row = aggregated series and
    col = time. Along the time axis of every row:

    * ``delta[:, 0] = 0``
    * ``delta[:, t] = 1 + delta[:, t - 1]`` when ``masking[:, t - 1] == 0``
      (the previous step counts as not observed)
    * ``delta[:, t] = 1`` otherwise

    :param masking: np.array, 2-D
        entries equal to 0 are treated as "not observed"
    :param delta: np.array with the same shape as ``masking``
        overwritten in place; its previous contents are ignored
    :return: np.array
        the same ``delta`` object, filled in
    """
    n_steps = masking.shape[1]
    if n_steps == 0:
        # no time steps, nothing to fill
        return delta

    delta[:, 0] = 0
    # Each column depends only on the previous one, so update all rows of one
    # time step at once instead of visiting every cell in Python.
    for step in range(1, n_steps):
        previous_missing = masking[:, step - 1] == 0
        delta[:, step] = np.where(previous_missing, 1 + delta[:, step - 1], 1)

    return delta

In [None]:
def trend_fea_to_delta(trend_fea):
    """
    Derive the observation mask and the time-gap array from the trend features.

    A cell counts as observed when it is neither NaN nor 0. The frame is
    transposed before the mask is built because ``get_delta`` walks axis 1 as
    time; without the transpose the gaps would be accumulated across columns
    of ``trend_fea`` instead of along its rows.

    :param trend_fea: pandas.DataFrame
        assumed to hold one row per time step and one column per series
        (TODO confirm against process_data)
    :return: tuple (masking, delta)
        masking: np.array of 0/1 ints, shape (n_columns, n_rows) of ``trend_fea``
        delta: np.array of floats with the same shape, filled by ``get_delta``
    """
    # NaN and 0 are both treated as "not observed".
    observed = trend_fea.fillna(0.0) != 0
    # Put the frame's rows on axis 1, the axis get_delta treats as time.
    masking = np.array(observed.T.astype(int))
    delta = get_delta(masking, np.zeros(masking.shape))

    return masking, delta


In [24]:
trend_for_grud = trend_fea.fillna(0.0)

In [200]:
# trend_for_grud.T.shape


In [38]:
import numpy as np

In [39]:
x = np.array(trend_for_grud.T)

In [45]:
# Swap the two axes of the frame.
# NOTE(review): rebinding the name to its own transpose is not idempotent — running
# this cell a second time flips the frame back to its previous orientation.
trend_for_grud = trend_for_grud.T

In [49]:
trend_for_grud = (trend_for_grud != 0)

In [52]:
masking = np.array(trend_for_grud.astype(int))

In [57]:
masking.shape

(51, 2922)

In [58]:
delta = np.zeros((masking.shape[0], masking.shape[1]))

In [61]:
# fill the delta vectors (rows = aggregated series, columns = time) through the
# get_delta helper defined in this notebook instead of repeating its loop here
delta = get_delta(masking, delta)

In [70]:
delta[10]

array([0.000e+00, 1.000e+00, 2.000e+00, ..., 2.919e+03, 2.920e+03,
       2.921e+03])

In [72]:
delta.shape

(51, 2922)

In [68]:
delta.shape

(51, 2922)

In [83]:
masking

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [84]:
delta

array([[ 0.,  1.,  2., ..., 74., 75., 76.],
       [ 0.,  1.,  2., ...,  2.,  3.,  4.],
       [ 0.,  1.,  2., ...,  1.,  2.,  1.],
       ...,
       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
       [ 0.,  1.,  2., ...,  1.,  1.,  2.],
       [ 0.,  1.,  2., ..., 36., 37., 38.]])

In [34]:
trend_for_grud.shape

(2922, 51)

In [82]:
trend_for_grud 

(51, 2922)

In [None]:
trend_fea.isna().sum()

In [75]:
x.shape

(51, 2922)