In [442]:
import os
import math
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt  
import joblib

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


from skopt import BayesSearchCV
from skopt.space import Real, Categorical

from tsextract.feature_extraction.extract import build_features
# NOTE(review): the standard-library `statistics` module exposes `stdev`, not `std` — confirm which module this resolves to.
from statistics import mean, median, std

from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_absolute_percentage_error, r2_score

# from google.cloud import storage

In [395]:
def segregate_datetime(df, date_col):
    """Parse a date column and add ``year``, ``month`` and ``day`` columns.

    The frame is modified in place: ``date_col`` is overwritten with the
    result of ``pd.to_datetime(..., errors='coerce')`` (unparseable entries
    become ``NaT``) and the three calendar columns are derived from it.

    Args:
        df: DataFrame containing ``date_col``.
        date_col: Name of the column holding the dates.

    Returns:
        The same ``df`` object, or ``None`` when any step raised (the
        exception message is printed instead of propagated).
    """
    try:
        parsed_dates = pd.to_datetime(df[date_col], errors='coerce')
        df[date_col] = parsed_dates
        for part in ('year', 'month', 'day'):
            # Read the column back each time, exactly like the attribute
            # accesses df[date_col].dt.year / .month / .day.
            df[part] = getattr(df[date_col].dt, part)
        return df

    except Exception as err:
        print(str(err))

In [396]:
def aggregate_interval(df, target_feature, freq='D'):
    """Resample one column to a regular interval and fill gaps linearly.

    Two aggregations of ``df[target_feature]`` are produced: the per-interval
    sum and the per-interval mean. Intervals without any observation are
    dropped and then re-created by linear interpolation, so both results have
    a gap-free index at ``freq``.

    Args:
        df: DataFrame indexed by a ``DatetimeIndex``.
        target_feature: Name of the column to aggregate.
        freq: Any pandas resampling alias (``'D'``, ``'2D'``, ``'W'``, ...).

    Returns:
        ``(sum_df, ave_df)`` as single-column DataFrames, or ``None`` when
        any step raised (the exception message is printed).
    """
    try:
        resampler = df[target_feature].resample(freq)

        # min_count=1 makes an interval with no valid observation NaN instead
        # of 0.0, so the dropna/interpolate below actually repairs the gap
        # rather than leaving a spurious zero in the summed series.
        sum_df = pd.DataFrame(resampler.sum(min_count=1)).dropna()
        ave_df = pd.DataFrame(resampler.mean()).dropna()

        # The previous guard compared a list literal with None (always true)
        # and built it with np.datetime_as_string(unit=freq), which fails for
        # pandas aliases that are not numpy units. Interpolating
        # unconditionally is equivalent: with no gaps it is a no-op.
        sum_df = sum_df.resample(freq).interpolate(method='linear')
        ave_df = ave_df.resample(freq).interpolate(method='linear')

        return sum_df, ave_df

    except Exception as e:
        print(str(e))

In [397]:
def normalise_series(series):
    """Standardise a series to zero mean and unit standard deviation.

    Uses the series' own ``mean()`` and ``std()``; no guard is applied for a
    zero standard deviation.

    Returns:
        The standardised series, or ``None`` when an exception was raised
        (its message is printed).
    """
    try:
        centre = series.mean()
        spread = series.std()
        centred = series - centre
        return centred / spread

    except Exception as err:
        print(str(err))

In [431]:
def remove_increasing_vol(series, series_range='M'):
    """Divide each observation by the volatility of its calendar bucket.

    The series is grouped by a component of its datetime index (day of
    month for ``'D'``, month for ``'M'``, year for ``'Y'``), the standard
    deviation of every group is computed, and each value is divided by the
    standard deviation of the group it belongs to.

    Args:
        series: Series with a datetime index.
        series_range: ``'D'``, ``'M'`` or ``'Y'``; any other value leaves the
            series untouched.

    Returns:
        The rescaled series (or the input itself for an unrecognised
        ``series_range``); ``None`` when an exception was raised, in which
        case the message is printed.
    """
    component_by_range = (('D', 'day'), ('M', 'month'), ('Y', 'year'))
    rescaled = series
    try:
        for range_code, component in component_by_range:
            if series_range == range_code:
                bucket_std = series.groupby(getattr(series.index, component)).std()
                # One volatility value per observation, looked up by the
                # observation's own calendar component.
                per_row_std = series.index.map(
                    lambda stamp: bucket_std.loc[getattr(stamp, component)])
                rescaled = series / per_row_std
        return rescaled

    except Exception as err:
        print(str(err))

In [432]:
def difference(series):
    """Return the first-order difference of ``series``.

    The first element of the result is NaN because it has no predecessor.
    On any exception the message is printed and ``None`` is returned.
    """
    try:
        first_difference = series.diff()
        return first_difference
    except Exception as err:
        print(str(err))

In [433]:
def adfuller_test(values):
    """Run an Augmented Dickey-Fuller test and report whether it passes.

    Prints the test statistic, the p-value, the stationarity verdict and the
    critical values, then returns the verdict.

    Args:
        values: Observations passed straight to ``adfuller``.

    Returns:
        Truthy when the p-value is at most 0.05, falsy otherwise.
    """
    outcome = adfuller(values)
    statistic = outcome[0]
    p_value = outcome[1]
    critical_values = outcome[4]

    # The series is treated as stationary at the 5% significance level.
    is_stationary = p_value <= 0.05

    print('test statistic: ', statistic, '\np-value: ', p_value, '\nstationary: ', str(is_stationary))
    print('Critical Values:')
    for level in critical_values:
        print('\t%s: %.3f' % (level, critical_values[level]))
    return is_stationary

In [434]:
def get_optimal_pq(series, alpha, lag_size):
    """Pick the largest significant ACF and PACF lags of ``series``.

    Both correlation functions are evaluated up to
    ``ceil(len(series) * lag_size) - 1`` lags together with their confidence
    intervals at level ``alpha``. A lag counts as significant when its
    coefficient lies outside the interval re-centred on zero (interval minus
    the coefficient itself).

    Args:
        series: Observations forwarded to ``acf`` / ``pacf``.
        alpha: Significance level for the confidence intervals.
        lag_size: Fraction of the series length used as the lag horizon.

    Returns:
        ``(last_significant_acf_lag, last_significant_pacf_lag)``, or
        ``None`` when an exception was raised (its message is printed).
    """
    def _significant_lags(coefficients, intervals):
        # Collect plain-int lag numbers whose coefficient falls outside the
        # zero-centred confidence band.
        lags = []
        for lag in range(0, len(coefficients)):
            coefficient = coefficients[lag]
            band = np.array(intervals[lag]) - coefficient
            if coefficient < band[0] or coefficient > band[1]:
                lags.append(lag)
        return lags

    try:
        horizon = math.ceil(len(series) * lag_size) - 1
        acf_values, acf_intervals = acf(series, nlags=horizon, alpha=alpha)
        acf_lags = _significant_lags(acf_values, acf_intervals)
        pacf_values, pacf_intervals = pacf(series, nlags=horizon, alpha=alpha)
        pacf_lags = _significant_lags(pacf_values, pacf_intervals)
        return acf_lags[-1], pacf_lags[-1]
    except Exception as err:
        print(str(err))

In [435]:
def create_features_request(target_col_df, label, window_size=False, diff_window_size=False, alpha=0.05, lag_size=0.25):
    """Assemble the feature-request dictionary for one label.

    Lag sizes that are not supplied (falsy) are taken from
    ``get_optimal_pq``: the PACF lag becomes the window size and the ACF lag
    becomes the differencing window.

    Args:
        target_col_df: Series analysed by ``get_optimal_pq``.
        label: Stored under the ``'name'`` key.
        window_size: Explicit window size; falsy means "derive it".
        diff_window_size: Explicit differencing window; falsy means
            "derive it".
        alpha: Forwarded to ``get_optimal_pq``.
        lag_size: Forwarded to ``get_optimal_pq``.

    Returns:
        Dict with ``'window'`` and ``'name'`` keys plus ``'difference'`` when
        the differencing window is non-zero; ``None`` when an exception was
        raised (its message is printed).
    """
    try:
        acf_lag, pacf_lag = get_optimal_pq(target_col_df, alpha, lag_size)
        window_size = window_size or pacf_lag
        diff_window_size = diff_window_size or acf_lag

        request = {}
        if diff_window_size != 0:
            request["difference"] = [diff_window_size, 1]
        request["window"] = [window_size] if window_size != 0 else [0]
        request['name'] = label
        return request

    except Exception as err:
        print(str(err))

In [436]:
def create_lagged_df(target_col_df, features_request, target_lag=3, include_tzero=True):
    """Build the lagged feature frame by delegating to ``build_features``.

    Args:
        target_col_df: Series handed to ``build_features``.
        features_request: Feature-request dict handed to ``build_features``.
        target_lag: Forwarded as ``target_lag``.
        include_tzero: Forwarded as ``include_tzero``.

    Returns:
        Whatever ``build_features`` returns, or ``None`` when it raised (the
        exception message is printed).
    """
    try:
        return build_features(
            target_col_df,
            features_request,
            target_lag=target_lag,
            include_tzero=include_tzero,
        )

    except Exception as err:
        print(str(err))

In [437]:
def scale_lagged_df(df, label_count):
    """Fit standard scalers for the feature and label columns of ``df``.

    The last ``label_count`` columns are treated as labels and every column
    before them as features. With ``label_count == 0`` a single scaler is
    fitted on all columns and returned for both roles.

    Requires ``StandardScaler`` (``sklearn.preprocessing``) to be imported at
    module level; without it the NameError is swallowed by the handler below
    and the function returns ``None``.

    Args:
        df: DataFrame whose trailing columns are the labels.
        label_count: Number of trailing label columns.

    Returns:
        ``(scaler_features, scaler_label)`` — fitted scalers — or ``None``
        when an exception was raised (its message is printed).
    """
    try:
        if label_count != 0:
            feature_columns = df.columns.values[:-label_count]
            label_columns = df.columns.values[-label_count:]
            scaler_features = StandardScaler().fit(df[feature_columns])
            # Labels are passed as a 2-D array with one column per label.
            scaler_label = StandardScaler().fit(
                np.array(df[label_columns]).reshape(-1, label_count))
        else:
            scaler_features = StandardScaler().fit(df[df.columns.values])
            scaler_label = scaler_features

        return scaler_features, scaler_label

    except Exception as e:
        print(str(e))

In [438]:
def split_lagged_df(df, label_count, train_size=0.7):
    """Scale a lagged frame and split it chronologically into train/test.

    Scalers are obtained from ``scale_lagged_df`` (fitted on the whole
    frame), applied to the feature and label columns, and the rows are cut at
    ``int(n_rows * train_size)`` without shuffling.

    NOTE(review): the scaled feature block already excludes the trailing
    ``label_count`` label columns, yet ``:-label_count`` is applied to it
    again below, so the last ``label_count`` feature columns never reach
    ``X_train`` / ``X_test`` — confirm whether that is intended.

    Args:
        df: DataFrame whose last ``label_count`` columns are the labels.
        label_count: Number of trailing label columns.
        train_size: Fraction of rows assigned to the training split.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as arrays, or ``None`` when an
        exception was raised (its message is printed).
    """
    try:
        scaler_features, scaler_label = scale_lagged_df(df, label_count)
        column_names = df.columns.values
        feature_block = scaler_features.transform(df[column_names[:-label_count]])
        label_block = scaler_label.transform(
            np.array(df[column_names[-label_count:]]).reshape(-1, label_count))

        # Row index where the training split ends and the test split begins.
        cutoff = int(feature_block.shape[0] * train_size)

        X_train = feature_block[:cutoff, :-label_count]
        X_test = feature_block[cutoff:, :-label_count]
        y_train = label_block[:cutoff, :]
        y_test = label_block[cutoff:, :]

        return X_train, y_train, X_test, y_test

    except Exception as err:
        print(str(err))

In [470]:
def mlpregressor(X_train, y_train, X_test, y_test, h_cells):
    """Tune and fit an ``MLPRegressor`` with ``BayesSearchCV``.

    Only the L2 penalty ``alpha`` is searched, over the two categorical
    values 0.001 and 0.05. The mini-batch size is a tenth of the training
    set, rounded up.

    NOTE(review): per the scikit-learn documentation ``learning_rate`` is
    only used with ``solver='sgd'`` and ``validation_fraction`` only with
    ``early_stopping=True`` — confirm whether these settings are meant to
    have an effect here.

    Args:
        X_train: Training features.
        y_train: Training targets; flattened with ``ravel()`` before fitting.
        X_test: Accepted but not used by this function.
        y_test: Accepted but not used by this function.
        h_cells: Forwarded as ``hidden_layer_sizes``.

    Returns:
        The fitted ``BayesSearchCV`` object.
    """
    search_space = {
        "alpha": Categorical(categories=[0.001, 0.05]),
    }
    samples_per_batch = math.ceil(len(X_train)/10)

    base_estimator = MLPRegressor(
        hidden_layer_sizes=h_cells,
        activation='relu',
        solver='adam',
        learning_rate='adaptive',
        batch_size=samples_per_batch,
        max_iter=1000,
        validation_fraction=0.2,
        shuffle=False,
        random_state=10,
    )

    model = BayesSearchCV(base_estimator, search_space, n_jobs=3)
    model.fit(X_train, y_train.ravel())

    return model

In [471]:
def preprocess_series(df, labels, date_column, freq, training_range):
    """Turn raw label columns into one lagged training frame.

    For every label the column is aggregated to ``freq`` (mean), truncated to
    the first ``training_range`` rows, normalised, differenced when the ADF
    test reports non-stationarity, volatility-adjusted, and expanded into
    lagged features. Progress and intermediate frames are printed throughout.

    Args:
        df: Raw frame containing ``date_column`` and every entry of
            ``labels``; it is copied, not modified.
        labels: Names of the columns to process.
        date_column: Name of the date column.
        freq: Resampling alias passed to ``aggregate_interval`` and
            ``remove_increasing_vol``.
        training_range: Number of leading aggregated rows to keep.

    Returns:
        ``(train_df, features_requests)``: the per-label feature columns, the
        raw aggregated label columns and the ``<label>_target`` columns
        concatenated side by side with incomplete rows dropped, plus the list
        of feature-request dicts (one per label, including ``'name'``).
    """
    feature_dfs = []
    label_dfs = []
    features_requests = []
    df_data_copy = df.copy()
    for label in labels:
        print('processing label '+label+'...')
        print(df_data_copy)
        # Work on a two-column frame: the dates and this label only.
        df_data = pd.DataFrame({date_column: df_data_copy[date_column], label: df_data_copy[label]})
        print(df_data.head(10))
        initial_df = segregate_datetime(df_data, date_column)
        initial_df.set_index(date_column, inplace=True)
        print('created initial_df')
        print(initial_df.head(10))
        # Only the mean aggregation is used below; the sum frame is not read again.
        df_preprocessed_sum, df_preprocessed_ave = aggregate_interval(initial_df, label, freq)
        print('created df_preprocessed_ave')
        print(df_preprocessed_ave.head(10))
        df_preprocessed_ave = df_preprocessed_ave[:training_range]
        # target_series keeps the untransformed values; transformations are
        # applied to a separate copy.
        target_series = df_preprocessed_ave[label]
        target_series_for_transform = target_series.copy()
        target_series_for_transform = normalise_series(target_series_for_transform)
        print('normalised series')
        print(target_series_for_transform[10:])
        # Difference once when the ADF test does not report stationarity; the
        # leading NaN produced by differencing is replaced with 0.
        if not adfuller_test(pd.Series(target_series_for_transform.values)):
            target_series_for_transform = difference(target_series_for_transform)
            target_series_for_transform.fillna(0, inplace=True)
        print('tested stationarity of series')
        target_series_for_transform = remove_increasing_vol(target_series_for_transform, freq)
        print('transformed the series')
        print(target_series_for_transform[30:])
        features_request = create_features_request(target_series_for_transform, label)
        print('features_request:', features_request)
        print('created features request for series')
        # The lagging step receives the request without its "name" entry; the
        # full request (with name) is what gets returned to the caller.
        features_request_for_transform = features_request.copy()
        features_request_for_transform.pop("name")
        lagged_df = create_lagged_df(target_series_for_transform, features_request_for_transform)
        print('created ARMA features for series')
        print(lagged_df.head(10))
        # The last column of the lagged frame is taken as the prediction target.
        target_df = pd.DataFrame({label + '_target': lagged_df[lagged_df.columns.values[-1]].values},
                                 index=lagged_df.index)
        print('created dataframe for label')
        print(target_df.head(10))
        feature_df = lagged_df.drop(columns=lagged_df.columns.values[-1])
        print('dropped the label in series')
        # Prefix feature columns with the label unless the name already contains it.
        feature_df.rename(columns={col: label + '_' + col for col in feature_df.columns.values if label not in col}, inplace=True)
        print('renamed ARMA features for series')
        print(feature_df.head(10))
        # Append the raw (untransformed) aggregated label as an extra column.
        feature_df = pd.concat([feature_df, target_series], axis=1)
        features_requests.append(features_request)
        feature_dfs.append(feature_df)
        label_dfs.append(target_df)

    merged_feature_df = pd.concat(feature_dfs, axis=1)
    print('merged features from list')
    merged_label_df = pd.concat(label_dfs, axis=1)
    print('merged labels from list')
    train_df = pd.concat([merged_feature_df, merged_label_df], axis=1)
    print('created series for training')
    # Rows with any missing value after the column-wise concat are discarded.
    train_df.dropna(inplace=True, axis=0)
    print('dropped nas in the series')

    return train_df, features_requests

In [474]:
def train_model(df, labels, date_column):
    """Preprocess ``df``, fit the MLP search and attach pipeline metadata.

    The raw label columns are removed from the preprocessed frame before
    scaling and splitting, so only lagged features and ``*_target`` columns
    are used for fitting. The hidden-layer layout is ``label_count`` layers,
    each as wide as the number of non-target columns in the preprocessed
    frame.

    Args:
        df: Raw frame containing ``date_column`` and every entry of
            ``labels``.
        labels: Names of the target columns.
        date_column: Name of the date column.

    Returns:
        The fitted search object returned by ``mlpregressor`` with these
        attributes added: ``features_requests``, ``train_df``,
        ``scaler_label``, ``labels``, ``date_column`` and ``freq``.
    """
    label_count = len(labels)
    freq = 'D' #transfer to payload config
    training_range = len(df) #transfer to payload config
    train_df, features_requests = preprocess_series(df, labels, date_column, freq, training_range)

    # drop() returns a new frame, so train_df itself keeps the raw label columns.
    train_df_for_training = train_df.drop(columns=labels)
    # Only the label scaler is kept on the model; the feature scaler is unused here.
    _, scaler_label = scale_lagged_df(train_df_for_training, label_count)
    X_train, y_train, X_test, y_test = split_lagged_df(train_df_for_training, label_count)

    # Count columns directly: squeezing a one-column selection produced a 0-d
    # array on which len() raises TypeError.
    feature_count = len(train_df.columns[:-label_count])
    h_cells = tuple([feature_count] * label_count)
    model = mlpregressor(X_train, y_train, X_test, y_test, h_cells)

    model.features_requests = features_requests
    model.train_df = train_df
    model.scaler_label = scaler_label
    model.labels = labels
    model.date_column = date_column
    model.freq = freq

    return model

In [475]:
# Training configuration: the column to forecast and the column holding dates.
labels = ['dcoilwtico']
date_column = 'date'

# Load the raw data and fit the model on it.
df = pd.read_csv("oil.csv")
model = train_model(df=df, labels=labels, date_column=date_column)

processing label dcoilwtico...
            date  dcoilwtico
0     2013-01-01         NaN
1     2013-01-02       93.14
2     2013-01-03       92.97
3     2013-01-04       93.12
4     2013-01-07       93.20
...          ...         ...
1213  2017-08-25       47.65
1214  2017-08-28       46.40
1215  2017-08-29       46.46
1216  2017-08-30       45.96
1217  2017-08-31       47.26

[1218 rows x 2 columns]
         date  dcoilwtico
0  2013-01-01         NaN
1  2013-01-02       93.14
2  2013-01-03       92.97
3  2013-01-04       93.12
4  2013-01-07       93.20
5  2013-01-08       93.21
6  2013-01-09       93.08
7  2013-01-10       93.81
8  2013-01-11       93.60
9  2013-01-14       94.27
created initial_df
            dcoilwtico  year  month  day
date                                    
2013-01-01         NaN  2013      1    1
2013-01-02       93.14  2013      1    2
2013-01-03       92.97  2013      1    3
2013-01-04       93.12  2013      1    4
2013-01-07       93.20  2013      1    7
2013



In [476]:
# Persist the fitted model (with the attributes attached by train_model).
model_dir = "model_storage"
# joblib.dump opens the target path for writing and fails if the directory is
# missing, so create it first; exist_ok keeps re-runs of this cell harmless.
os.makedirs(model_dir, exist_ok=True)
local_model_storage_path = os.path.join(model_dir, "model-new.sav")
joblib.dump(model, local_model_storage_path)

['model_storage/model-new.sav']