In [3]:
import configparser
import os
import pickle
from configparser import ExtendedInterpolation

import numpy as np

In [4]:
# All paths used below are relative to the project root, which is expected to be
# the parent of the directory the notebook starts in.
config_path = 'config/parameters.ini'
# Only step up when the config file is not reachable yet, so that re-running
# this cell does not climb one more directory each time.
if not os.path.exists(config_path):
    os.chdir(os.path.pardir)
# ExtendedInterpolation enables ${section:option} references inside the ini file.
pars = configparser.ConfigParser(interpolation=ExtendedInterpolation())
# read() returns the list of files it managed to parse; shown as the cell output.
pars.read(config_path)

['config/parameters.ini']

In [5]:
from airpolnowcast.data.utils import read_query_from_file, read_raw_data
import ast
from airpolnowcast.features.build_features import FeatureEngineer
from airpolnowcast.evaluation.utils import get_feature_pars
from airpolnowcast.features.build_features import process_data, get_feature_from_config

Using TensorFlow backend.


In [101]:
import pandas as pd

In [99]:
# apply lags to search features
def lag_search_features(input_df, lag):
    """
    Shift the search features so that day i carries the values of day i+lag.

    :param input_df: pd.DataFrame
        the search feature input data
        shape: N*M
        N: number of days
        M: number of search terms
    :param lag: int
        lag in days applied to the search data; a negative value shifts
        the other way (day i receives the values of day i-|lag|)
    :return: pd.DataFrame
        shape: N*M
        for day i, we have the info of day i+lag (later); rows whose
        source day falls outside the input are filled with 0.
        Column names are kept, the index is a fresh 0..N-1 range.
    """
    # record column names
    column_names = input_df.columns
    values = np.array(input_df)
    n_days = values.shape[0]

    lagged = np.empty_like(values)
    if lag >= 0:
        # Clamp so that a lag longer than the series gives all-zero rows
        # instead of indexing past the end of the array.
        n_kept = max(n_days - lag, 0)
        lagged[:n_kept] = values[lag:]
        lagged[n_kept:] = 0.
    else:
        # Negative lag: pad at the start, never wrap values around.
        n_pad = min(-lag, n_days)
        lagged[:n_pad] = 0.
        lagged[n_pad:] = values[:n_days - n_pad]
    return pd.DataFrame(lagged, columns=column_names)

In [6]:
# generate sequence input features for LSTM training
def generate_input_sequence(input_array, seq_length):
    """
    Build, for every day, the window of the last seq_length days of features.

    Window i holds days i-seq_length+1 .. i (oldest first). Days that would
    fall before the start of the series are represented by all-zero vectors.

    :param input_array: np.array
        shape: N*P
        N: number of days
        P: number of features for day i
    :param seq_length: int
        sequence length for LSTM model
    :return: np.array
        shape: N*seq_length*P (also when N is 0)
    """
    input_array = np.asarray(input_array)
    n_days, n_features = input_array.shape
    window_len = max(seq_length, 0)
    n_pad = max(seq_length - 1, 0)

    if n_pad:
        # Zero rows in front stand in for the days before the series starts.
        padding = np.zeros((n_pad, n_features))
        padded = np.concatenate([padding, input_array], axis=0)
    else:
        padded = input_array

    # Day d sits at padded position d + n_pad, so the window ending on day i
    # occupies padded positions i .. i + seq_length - 1.
    window_index = np.arange(n_days)[:, None] + np.arange(window_len)[None, :]
    return padded[window_index]

In [7]:
def get_delta(masking, delta):
    """
    Fill ``delta`` in place from the 2-D ``masking`` array and return it.

    Axis 0 is the row (agg), axis 1 is the column (time). For each row:
    the first column gets 0; any later column gets 1 when the previous
    column of ``masking`` is non-zero, otherwise 1 plus the previous
    column's delta.

    :param masking: np.array, 2-D, compared against 0
    :param delta: np.array of the same shape, overwritten in place
    :return: the same ``delta`` object that was passed in
    """
    n_rows, n_cols = masking.shape[0], masking.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            if col == 0:
                delta[row, col] = 0
                continue
            gap = 1
            if masking[row, col - 1] == 0:
                # previous step was masked out: keep accumulating
                gap = 1 + delta[row, col - 1]
            delta[row, col] = gap
    return delta

In [53]:
def trend_fea_to_delta(trend_fea):
    """
    Derive the masking and delta arrays from the trend features.

    A cell counts as observed (1) when it is neither NaN nor 0, otherwise 0.
    Both returned arrays are transposed relative to ``trend_fea``, i.e. one
    row per column of the input frame.

    :param trend_fea: pd.DataFrame
    :return: (masking, delta) tuple of np.array, delta computed by get_delta
    """
    observed = trend_fea.fillna(0.0) != 0
    masking = np.array(observed.astype(int).T)
    delta = get_delta(masking, np.zeros(masking.shape))
    return masking, delta

In [54]:
def trend_fea_to_x(trend_fea):
    """
    Forward-fill and standardise the trend features.

    NaN and 0 are both treated as missing: they are replaced by the last
    non-zero value above them in the same column, or by 0 when there is
    none. Each column is then standardised with its own mean and standard
    deviation; values that come out as NaN (e.g. constant columns, where
    the standard deviation is 0) are set to 0.

    :param trend_fea: pd.DataFrame
    :return: tuple of np.array
        the standardised values (same shape as the input), the per-column
        mean and the per-column median of the standardised values
    """
    # .ffill() replaces the deprecated fillna(method='ffill') spelling.
    filled = trend_fea.fillna(0.).replace(0., np.nan).ffill().fillna(0.)
    normalised = ((filled - filled.mean()) / filled.std()).fillna(0.)
    x_mean_aft_nor = np.array(normalised.mean())
    x_median_aft_nor = np.array(normalised.median())
    return np.array(normalised), x_mean_aft_nor, x_median_aft_nor


In [87]:
def generate_x_seq(x, masking, delta, seq_length=7):
    """
    Stack the value, masking and delta sequences into one array.

    ``masking`` and ``delta`` are transposed before windowing, so they are
    expected in the transposed layout relative to ``x``.

    :param x: np.array, 2-D feature values
    :param masking: np.array, 2-D, transposed layout of ``x``
    :param delta: np.array, 2-D, transposed layout of ``x``
    :param seq_length: int
        window length passed to generate_input_sequence; defaults to 7,
        the value that used to be hard-coded here
    :return: np.array, 4-D
        axis 1 indexes (values, masking, delta); the last two axes of the
        stacked windows are swapped. Assuming generate_input_sequence
        returns N*seq_length*P as documented, the result is N*3*P*seq_length.
    """
    trend_seq = generate_input_sequence(x, seq_length)
    masking_seq = generate_input_sequence(masking.T, seq_length)
    delta_seq = generate_input_sequence(delta.T, seq_length)
    t_dataset = np.stack((trend_seq, masking_seq, delta_seq), axis=1)
    # 'klij->klji' only swaps the last two axes
    t_dataset = np.einsum('klij->klji', t_dataset)

    return t_dataset
    

In [9]:
# global parameters, all read from the parsed ini file `pars`
# seed word list: path taken from [extract_search_trend] term_list_path
seed_path = pars['extract_search_trend']['term_list_path']
seed_word_list = read_query_from_file(seed_path)
# [train_model] values are stored as strings in the ini file, hence the int() casts
# NOTE(review): seq_length is not passed to any function in the cells shown here - confirm it is used
seq_length = int(pars['train_model']['seq_length'])
search_lag = int(pars['train_model']['search_lag'])
# features_array = ast.literal_eval(pars['train_model']['FEATURE'])
# use_feature is written as a Python literal in the ini file; it is indexed with [0] further down
use_feature = ast.literal_eval(pars['train_model']['use_feature'])

# create object for feature engineer
# NOTE(review): feature_engineer is not referenced by the cells shown here - confirm it is still needed
feature_engineer = FeatureEngineer()


In [10]:
# Select the first entry of use_feature and load its parameter set.
index = use_feature[0]
# feature_pars is used like a dict below (string keys, item assignment)
feature_pars = get_feature_pars(pars, index)
# get model_type
model_type = feature_pars['model_type']
# presumably where the trained model is written - TODO confirm against get_feature_pars
save_model_path = feature_pars['save_model_path']


In [11]:
# Processed data splits; relative paths, resolved against the current working directory.
train_data_path = 'data/processed/train.csv'
valid_data_path = 'data/processed/valid.csv'
test_data_path = 'data/processed/test.csv'
# save input_data_path for dllstm model
# NOTE(review): this points at the validation split, not the training split - confirm that is intended
feature_pars['input_data_path'] = valid_data_path


In [108]:
def _dump_pickle(obj, path):
    # Pickle obj to path, creating the parent directory first when it is missing.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def save_pkl_dataset(train_data_path, search_lag, x_mean_path, x_median_path, t_dataset_path, y_out_path):
    """
    Build the sequence dataset from one data file and pickle its parts.

    Steps: read the file, split it with process_data, lag the trend
    features, derive masking/delta and the standardised values, window
    them with generate_x_seq, then write four pickle files.

    :param train_data_path: path handed to read_raw_data
    :param search_lag: int
        lag in days applied to the trend features
    :param x_mean_path: output path for the per-column mean after normalisation
    :param x_median_path: output path for the per-column median after normalisation
    :param t_dataset_path: output path for the stacked sequence array
    :param y_out_path: output path for the target values as np.array
    :return: None; missing output directories are created
    """
    train_data = read_raw_data(train_data_path)
    # process_data returns four parts; only the target and the trend features are used here
    y_data, _pol_val, trend_fea, _phys_fea = process_data(train_data)
    trend_fea = lag_search_features(trend_fea, search_lag)
    masking, delta = trend_fea_to_delta(trend_fea)
    x, x_mean_aft_nor, x_median_aft_nor = trend_fea_to_x(trend_fea)
    t_dataset = generate_x_seq(x, masking, delta)

    _dump_pickle(x_mean_aft_nor, x_mean_path)
    _dump_pickle(x_median_aft_nor, x_median_path)
    _dump_pickle(t_dataset, t_dataset_path)
    _dump_pickle(np.array(y_data), y_out_path)




In [109]:
# Output locations for the four pickles written by save_pkl_dataset
# (relative paths, resolved against the current working directory).
x_mean_path = 'data/savepkl/x_mean_aft_nor.pkl'
x_median_path = 'data/savepkl/x_median_aft_nor.pkl'
t_dataset_path = 'data/savepkl/t_dataset.pkl'
y_out_path = 'data/savepkl/y_out.pkl'

In [110]:
# Build the dataset from the training split and write the four pickle files.
save_pkl_dataset(train_data_path, search_lag, x_mean_path, x_median_path, t_dataset_path, y_out_path)

In [106]:
# NOTE(review): `trend_fea` is not assigned at notebook level in any cell shown here
# (it only exists as a local inside save_pkl_dataset), so this cell relies on leftover kernel state.
# It also overwrites its own input: every re-run shifts the frame by another `search_lag` days.
trend_fea = lag_search_features(trend_fea, search_lag)

'train'

In [107]:
# Inspect the trend features after the lag has been applied.
trend_fea

Unnamed: 0,air pollutant,air pollution,arrhythmia,asthma,asthma attack,black carbon,bronchitis,cardiovascular disease,chest pain,chest tightness,...,smoggy,smoke,snoring,soot,sulfate,tailpipe,throat irritation,traffic,wheezing,wildfires
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,34.000000,0.000000,0.000000,0.000000,...,,22.000000,0.000000,0.000000,0.000000,0.000000,100.000000,14.000000,0.000000,0.0
1,0.0,0.000000,0.000000,37.000000,0.000000,0.000000,33.000000,0.000000,0.000000,0.000000,...,,32.000000,0.000000,0.000000,22.000000,0.000000,0.000000,20.000000,0.000000,0.0
2,0.0,0.000000,0.000000,25.000000,0.000000,0.000000,0.000000,0.000000,35.000000,0.000000,...,,16.000000,0.000000,0.000000,22.000000,0.000000,0.000000,26.000000,0.000000,0.0
3,0.0,0.000000,0.000000,37.000000,0.000000,0.000000,32.000000,0.000000,33.000000,0.000000,...,,31.000000,0.000000,0.000000,22.000000,0.000000,0.000000,38.000000,62.000000,0.0
4,0.0,0.000000,0.000000,37.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000,...,,24.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.000000,0.000000,0.0
5,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,48.000000,0.000000,0.000000,0.000000,...,,15.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.000000,92.000000,0.0
6,0.0,0.000000,0.000000,23.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,20.000000,0.000000,0.000000,21.000000,0.000000,0.000000,22.000000,0.000000,0.0
7,0.0,0.000000,0.000000,23.000000,0.000000,0.000000,31.000000,0.000000,0.000000,0.000000,...,,10.000000,0.000000,0.000000,0.000000,0.000000,0.000000,28.000000,0.000000,0.0
8,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,62.000000,0.000000,32.000000,63.000000,...,,30.000000,0.000000,0.000000,0.000000,0.000000,0.000000,22.000000,0.000000,0.0
9,0.0,0.000000,0.000000,24.000000,0.000000,0.000000,32.000000,54.000000,0.000000,0.000000,...,,31.000000,0.000000,0.000000,21.000000,0.000000,0.000000,37.000000,0.000000,0.0


In [105]:
# Inspect the trend features.
# NOTE(review): judging by the execution counts this cell was run before the lag cell,
# so its saved output shows the un-lagged frame even though it sits below the lagged one.
trend_fea

Unnamed: 0,air pollutant,air pollution,arrhythmia,asthma,asthma attack,black carbon,bronchitis,cardiovascular disease,chest pain,chest tightness,...,smoggy,smoke,snoring,soot,sulfate,tailpipe,throat irritation,traffic,wheezing,wildfires
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,57.000000,0.000000,59.000000,0.000000,...,,19.000000,0.000000,0.000000,39.000000,0.000000,0.000000,6.000000,0.000000,0.0
1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,34.000000,0.000000,0.000000,0.000000,...,,22.000000,0.000000,0.000000,0.000000,0.000000,100.000000,14.000000,0.000000,0.0
2,0.0,0.000000,0.000000,37.000000,0.000000,0.000000,33.000000,0.000000,0.000000,0.000000,...,,32.000000,0.000000,0.000000,22.000000,0.000000,0.000000,20.000000,0.000000,0.0
3,0.0,0.000000,0.000000,25.000000,0.000000,0.000000,0.000000,0.000000,35.000000,0.000000,...,,16.000000,0.000000,0.000000,22.000000,0.000000,0.000000,26.000000,0.000000,0.0
4,0.0,0.000000,0.000000,37.000000,0.000000,0.000000,32.000000,0.000000,33.000000,0.000000,...,,31.000000,0.000000,0.000000,22.000000,0.000000,0.000000,38.000000,62.000000,0.0
5,0.0,0.000000,0.000000,37.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000,...,,24.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.000000,0.000000,0.0
6,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,48.000000,0.000000,0.000000,0.000000,...,,15.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.000000,92.000000,0.0
7,0.0,0.000000,0.000000,23.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,20.000000,0.000000,0.000000,21.000000,0.000000,0.000000,22.000000,0.000000,0.0
8,0.0,0.000000,0.000000,23.000000,0.000000,0.000000,31.000000,0.000000,0.000000,0.000000,...,,10.000000,0.000000,0.000000,0.000000,0.000000,0.000000,28.000000,0.000000,0.0
9,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,62.000000,0.000000,32.000000,63.000000,...,,30.000000,0.000000,0.000000,0.000000,0.000000,0.000000,22.000000,0.000000,0.0
