In [3]:
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Flatten
from sklearn.metrics import accuracy_score
from keras import metrics
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

class NeuralNetTrainer:
    
    '''
    Loads a CSV time series, scales it, reframes it as a supervised-learning
    table of lagged inputs and forward targets, splits it into train/test
    arrays, and trains, saves, loads and queries a model produced by a
    user-supplied builder callable.

    Attributes populated as the methods are called:

    self.key_column <- user friendly key to the data.. Date in this case
    self.keys <- the key column as read from the file
    self.features <- the fields we are interested in
    self.dataset <- DF with selected features
    self.raw <- DF with selected features, indexed by the key field
    self.scaled <- DF of scaled data; after create_time_series() it holds the
                   scaled supervised (lagged / forward) table
    self.processed_data <- supervised table indexed by the key field
    self.y <- names of the columns to predict (optional, see set_y)
    self.scaledy <- DF of the y columns scaled with y_scaler
    self.time_steps_in <- Number of time steps in input
    self.time_steps_out <- Number of time steps in output
    self.X, self.Y <- input / target column subsets of self.scaled
    self.train_X, self.train_Y, self.test_X, self.test_Y <- numpy arrays
    self.x_scaler, self.y_scaler <- scalers (MinMaxScaler(0, 1) unless set)
    self.model_builder <- callable taking this trainer and returning a model
    self.model <- the trained model
    self.graph <- TensorFlow default graph captured by load_trained_model
                  (None when TensorFlow has no get_default_graph)
    '''
    
    # Print numpy floats with two decimals. This runs once, when the class
    # body is executed, and changes numpy's global print options.
    float_formatter = lambda x: "%.2f" % x
    np.set_printoptions(formatter={'float_kind':float_formatter})
    
    def load_data(self, training_file='F:\\New folder\\project_stock\\cl1_oil_2000.csv', features=['close', 'macd', 'macd_signal'], key_column='date'):
        """Read the CSV and keep the key column plus the selected features.

        Sets self.keys, self.key_column, self.features, self.dataset and
        self.raw. May be called again with a different file.

        Raises:
            KeyError: if key_column or any feature is not a column of the
                file; the message lists the columns that are available.
        """
        # Copy so neither the shared default list nor the caller's list is
        # aliased by self.features.
        features = list(features)
        frame = read_csv(training_file, header=0)

        wanted = [key_column] + features
        missing = [name for name in wanted if name not in frame.columns]
        if missing:
            raise KeyError("Column(s) %s not found in %s; available columns: %s"
                           % (missing, training_file, frame.columns.values.tolist()))

        self.keys = frame[key_column]
        self.key_column = key_column
        self.features = features
        print("Features Selected: " + str(features))
        # Selecting the ones I want
        self.dataset = frame[features]
        self.raw = concat([self.keys, self.dataset], axis=1).set_index(self.key_column)

    def set_x_scaler(self, scaler):
        """Use `scaler` for the input features instead of the default."""
        self.x_scaler = scaler
    
    def set_y_scaler(self, scaler):
        """Use `scaler` for the target columns instead of the default."""
        self.y_scaler = scaler
    
    def set_y(self, ycolumns):
        """Declare which dataset columns are prediction targets."""
        self.y = ycolumns
        
    def scale_(self):
        """Fit the scalers on self.dataset and store the scaled frames.

        Creates MinMaxScaler(feature_range=(0, 1)) for any scaler that was
        not set explicitly. self.scaledy is only produced when set_y() was
        called.
        """
        if not hasattr(self, 'x_scaler'):
            self.x_scaler = MinMaxScaler(feature_range=(0, 1))
        if not hasattr(self, 'y_scaler'):
            self.y_scaler = MinMaxScaler(feature_range=(0, 1))
        
        self.scaled = DataFrame(data=self.x_scaler.fit_transform(self.dataset.values), columns=self.dataset.columns)
        if hasattr(self, 'y'):
            self.scaledy = DataFrame(data=self.y_scaler.fit_transform(self.dataset[self.y].values), columns=self.y)
    
    def one_hot_encode(self, column_to_encode):
        """Replace a categorical column of self.dataset by one-hot columns.

        The new columns are named '<column>_<category>' so that every column
        name stays a string, which series_to_supervised relies on.
        """
        # Local import: the notebook's import cell does not bring in
        # OneHotEncoder.
        from sklearn.preprocessing import OneHotEncoder

        values = np.asarray(self.dataset[column_to_encode])
        # integer encode
        label_encoder = LabelEncoder()
        integer_encoded = label_encoder.fit_transform(values).reshape(-1, 1)
        # binary encode; the default output may be a sparse matrix, so
        # densify it when needed instead of relying on a version-specific
        # constructor flag.
        onehot_encoded = OneHotEncoder().fit_transform(integer_encoded)
        if hasattr(onehot_encoded, 'toarray'):
            onehot_encoded = onehot_encoded.toarray()

        # Column i of the one-hot matrix corresponds to integer code i,
        # i.e. to label_encoder.classes_[i].
        oh_columns = ['%s_%s' % (column_to_encode, cls) for cls in label_encoder.classes_]
        oh_df = DataFrame(data=onehot_encoded, columns=oh_columns, index=self.dataset.index)
        self.dataset = concat([self.dataset, oh_df], axis=1).drop([column_to_encode], axis=1)
        
        
    # convert series to supervised learning
    def series_to_supervised(self, n_in=1, n_out=1, dropnan=True):
        """Reframe self.scaled as lagged inputs followed by forward targets.

        Columns are named '<col>(t-k)' for k = n_in..1, then '<col>(t)',
        '<col>(t+1)', ... for the n_out forward steps. Also stores the same
        table, indexed by the key column, in self.processed_data.

        Returns:
            The supervised DataFrame; rows containing NaN (introduced by
            the shifts) are removed when dropnan is true.
        """
        col_names = self.scaled.columns.values.tolist()
        df = DataFrame(self.scaled.values)
        cols, names = [], []
        # input sequence (t-n, ... t-1)
        for i in range(n_in, 0, -1):
            cols.append(df.shift(i))
            names += ['%s(t-%d)' % (col, i) for col in col_names]
        # forecast sequence (t, t+1, ... t+n)
        for i in range(n_out):
            cols.append(df.shift(-i))
            suffix = '(t)' if i == 0 else '(t+%d)' % i
            names += ['%s%s' % (col, suffix) for col in col_names]
        # put it all together
        agg = concat(cols, axis=1)
        agg.columns = names
        self.processed_data = concat([self.keys, agg], axis=1).set_index(self.key_column)
        
        # drop rows with NaN values
        if dropnan:
            agg = agg.dropna()
            self.processed_data = self.processed_data.dropna()
        return agg

    def create_time_series(self, n_in=1, n_out=1):
        """Scale the dataset and replace self.scaled by its supervised form."""
        self.time_steps_in = n_in
        self.time_steps_out = n_out
        print('Time Steps In: ' + str(n_in))
        print('Time Steps Out: ' + str(n_out))
        self.scale_()
        self.scaled = DataFrame(data=self.series_to_supervised(n_in, n_out))
        
    
    def get_predict_cols(self):
        """Names of the target columns: each y column at every forward step."""
        names = []
        for i in range(self.time_steps_out):
            suffix = '(t)' if i == 0 else '(t+%d)' % i
            names += [col + suffix for col in self.y]
        return names
    
    def get_ts_forward_pattern(self):
        """Column-name suffixes that mark forward steps: '(t)', '(t+1)', ..."""
        return ['(t)' if i == 0 else '(t+%d)' % i for i in range(self.time_steps_out)]
    
    def get_train_cols(self):
        """Names of the input columns: every column that is not a forward step.

        Reads this instance's own supervised table. Target columns all carry
        a forward-step suffix, so filtering on the suffixes removes them too.
        """
        forward_patterns = self.get_ts_forward_pattern()
        return [col for col in self.scaled.columns.values.tolist()
                if not any(pattern in col for pattern in forward_patterns)]
    
    def vertical_split(self):      
        """Split self.scaled column-wise into inputs self.X and targets self.Y.

        With set_y() the split is by column name; otherwise the first
        n_features * time_steps_in columns are inputs and the rest targets.
        """
        if hasattr(self, 'y'):
            self.Y = self.scaled.loc[:, self.get_predict_cols()]
            self.X = self.scaled.loc[:, self.get_train_cols()]           
            print('Training Columns: ' + str(self.X.columns.values.tolist()))
            print('Prediction Columns: ' + str(self.Y.columns.values.tolist()))
        else:
            no_train_cols = len(self.dataset.columns.values.tolist()) * self.time_steps_in
            self.X = self.scaled.iloc[:, 0:no_train_cols]
            self.Y = self.scaled.iloc[:, no_train_cols:]
            print(self.Y.head())
        

    def horizontal_split(self, offset, n_train_records, n_test_records=-1):
        """Split self.X / self.Y row-wise into train and test arrays.

        Selection uses .loc, i.e. index labels with inclusive end points.
        n_test_records == -1 means "everything after the training range".
        NOTE(review): after the NaN rows are dropped the labels of
        self.scaled no longer start at 0, so offsets act as labels, not
        positions - confirm this is intended.
        """
        start_train = offset
        end_train = offset + n_train_records
        start_test = end_train + 1
        end_test = start_test + n_test_records
        
        self.train_X = self.X.loc[start_train:end_train, :].values        
        self.train_Y = self.Y.loc[start_train:end_train, :].values
        if n_test_records != -1:
            self.test_X = self.X.loc[start_test:end_test, :].values
            self.test_Y = self.Y.loc[start_test:end_test, :].values
        else:
            self.test_X = self.X.loc[start_test:, :].values
            self.test_Y = self.Y.loc[start_test:, :].values
        self.reshape()
        
    def reshape(self):
        """Reshape train_X / test_X to 3D [samples, timesteps, features]."""
        no_features = len(self.dataset.columns.values.tolist())
        self.train_X = self.train_X.reshape((self.train_X.shape[0], self.time_steps_in, no_features))
        self.test_X = self.test_X.reshape((self.test_X.shape[0], self.time_steps_in, no_features))
        
    def split_by_factor_linear(self, train_factor):
        """Chronological split: the leading train_factor share is for training."""
        self.vertical_split()
        n_train_records = int(self.X.shape[0] * train_factor)
        self.horizontal_split(0, n_train_records)
    
    def split_by_factor_random(self, train_factor, random_state=None):
        """Random split with train_factor of the rows used for training.

        random_state is forwarded to train_test_split; pass an int to make
        the split reproducible. The default keeps the unseeded behaviour.
        """
        self.vertical_split()
        self.train_X, self.test_X, self.train_Y, self.test_Y = train_test_split(
            self.X.values, self.Y.values, test_size=(1-train_factor), random_state=random_state)
        self.reshape()
        
    def split_by_count(self, n_train_records, n_test_records, offset=0):
        """Split by explicit record counts starting at offset."""
        self.vertical_split()
        self.horizontal_split(offset, n_train_records, n_test_records)        
        
    def split_by_key(self, train_start_key, test_start_key):
        """Split using key values to mark the start of train and test data.

        NOTE(review): the positions found in self.processed_data are handed
        to horizontal_split, which treats them as index labels of
        self.scaled - confirm the two line up for the data in use.
        """
        self.vertical_split()
        train_start_ind =  self.processed_data.index.get_loc(train_start_key)
        test_start_ind =  self.processed_data.index.get_loc(test_start_key)
        print('train_start_ind ', train_start_ind)
        print('test_start_ind', test_start_ind)
        n_test_records = self.processed_data.shape[0] - test_start_ind
        if train_start_ind == 0:
            offset = 0
        else:
            offset = train_start_ind - 1
        self.horizontal_split(offset, (test_start_ind - train_start_ind), n_test_records)
            
    def set_model_builder(self, mb):
        """Register the callable that builds a model from this trainer."""
        self.model_builder = mb

    def _fit(self, epochs, batch_size):
        """Fit self.model on the training arrays, validating on the test arrays."""
        self.model.fit(self.train_X, self.train_Y, epochs=epochs, batch_size=batch_size, validation_data=(self.test_X, self.test_Y))
        
    def build(self, epochs=12, batch_size=18, model_file='F:\\New folder\\project_stock\\cl1_oil_model.h5'):
        """Build a fresh model, train it and save it to model_file."""
        self.model = self.model_builder(self)
        self._fit(epochs, batch_size)
        # Honour the caller's path rather than always using save()'s default.
        self.save(model_file)
    
    def retrain(self, epochs=12, batch_size=18, model_file='F:\\New folder\\project_stock\\cl1_oil_model.h5'):
        """Continue training the current model (building one if absent) and save it."""
        if not hasattr(self, 'model'):
            self.model = self.model_builder(self)
        self._fit(epochs, batch_size)
        self.save(model_file)
        
    def save(self, model_file='F:\\New folder\\project_stock\\cl1_oil_model.h5'):
        """Save the model to model_file; does nothing when no model exists."""
        if hasattr(self, 'model'):
            self.model.save(model_file)
    
    def load_trained_model(self, file='F:\\New folder\\project_stock\\cl1_oil_model.h5'):
        """Load a saved model and remember the graph it should run in."""
        model = load_model(file)
        #https://github.com/fchollet/keras/issues/2397
        # Both calls below only exist in older Keras / TensorFlow releases;
        # skip them when they are unavailable instead of failing.
        if hasattr(model, '_make_predict_function'):
            model._make_predict_function()
        self.graph = tf.get_default_graph() if hasattr(tf, 'get_default_graph') else None
        self.model = model
        
    def get_reshaped_data(self, index, should_print=True):
        """Return the scaled model input at row position `index`.

        The result is 2D with shape (time_steps_in, n_features); the sample
        axis is dropped.
        """
        row = self.processed_data.iloc[index:index+1, :]
        n_features = len(self.dataset.columns.values.tolist())
        if hasattr(self, 'y'):
            x = row[self.get_train_cols()]
        else:               
            x = row.iloc[:, 0:n_features * self.time_steps_in]
           
        x = x.values.reshape(1, self.time_steps_in, n_features)[0]
        
        if should_print:
            print("<-----------Reshaped Data------------->")
            print(x)
        return x
    
    
    def predict(self, td):
        """Run the model on td and return the prediction in original units.

        Uses y_scaler to undo the scaling when set_y() was called, otherwise
        x_scaler. Runs inside the captured graph when there is one.
        """
        graph = getattr(self, 'graph', None)
        if graph is not None:
            with graph.as_default():
                yhat = self.model.predict(td)
        else:
            yhat = self.model.predict(td)
        if hasattr(self, 'y'):
            inv_yhat = self.y_scaler.inverse_transform(yhat)
        else:
            inv_yhat = self.x_scaler.inverse_transform(yhat)
        return inv_yhat
 
    def make_prediction(self, key):
        """Predict for the row identified by `key`, loading the saved model if needed."""
        if not hasattr(self, 'model') or not hasattr(self, 'graph'):
            self.load_trained_model()                
        ind = self.processed_data.index.get_loc(key)
        td = self.get_reshaped_data(ind)
        # get_reshaped_data drops the sample axis; put it back so the model
        # receives [samples, timesteps, features].
        return self.predict(td[np.newaxis, ...])

    def get_raw_data(self, key, num_record=1, offset=-1, should_print=True):
        """Return num_record unscaled rows starting `offset` rows from `key`.

        A window that begins before the first row is clipped to the start
        of the data instead of wrapping around to the end.
        """
        ind =  self.raw.index.get_loc(key)
        start = ind + offset
        stop = start + num_record
        raw_data = self.raw.iloc[max(start, 0):max(stop, 0), :]
        if should_print:
            print('<---Raw Data[Key:%s, num_records:%d, offset:%d] -------->' % (key, num_record, offset))
            print(raw_data)
        return raw_data
  
    
    def get_processed_data(self, key, transform=True, should_print=True):
        """Return the model input for `key`, unscaled when transform is true."""
        ind =  self.processed_data.index.get_loc(key)
        # Pass should_print through so a silent call really prints nothing.
        x = self.get_reshaped_data(ind, should_print)
        if transform:
            x = self.x_scaler.inverse_transform(x)
       
        if should_print:
            print("<-----------Prediction Input------------->")
            print(x)
        return x

In [2]:
def model_builder(trainer):
    """Build and compile the stacked-LSTM regression model for `trainer`.

    Args:
        trainer: object exposing `train_X` shaped
            [samples, timesteps, features] and a 2D `train_Y`; only their
            shapes are read here.

    Returns:
        The compiled Sequential model (its summary is printed first).
    """
    model = Sequential()  
    model.add(LSTM(2, input_shape=trainer.train_X.shape[1:], return_sequences=True, activation='relu'))
    # Only the first layer needs input_shape; later layers infer it.
    model.add(LSTM(2, return_sequences=False, activation='relu'))
    # The last LSTM emits 2 values per sample, but the targets have one
    # column per predicted value; project onto that width so fitting does
    # not fail on a shape mismatch.
    model.add(Dense(trainer.train_Y.shape[1]))
    # Classification accuracy says nothing about a regression trained with
    # MSE; report mean absolute error instead.
    model.compile(loss='mse',optimizer='adam', metrics=['mae'])
    model.summary()
    return model

# Wire the model builder into a fresh trainer, prepare the data and train.
mb = model_builder
trainer = NeuralNetTrainer()
trainer.set_model_builder(mb)

# Load the three feature columns; all three are also prediction targets.
trainer.load_data(
    training_file='F:\\New folder\\project_stock\\cl1_oil_2000.csv',
    features=['close', 'macd', 'macd_signal'],
)
trainer.set_y(ycolumns=['close', 'macd', 'macd_signal'])

# Two input time steps predict one output step; 90% of rows train the model.
trainer.create_time_series(n_in=2, n_out=1)
trainer.split_by_factor_random(train_factor=.90)
# Alternative splits:
#trainer.split_by_factor_linear(.95)
#trainer.split_by_key(train_start_key='1/11/2000', test_start_key='9/1/2017')

trainer.build(epochs=50, batch_size=11)



KeyError: 'date'

In [None]:
# Reload the model from the same path that NeuralNetTrainer.save() writes to by default.
savedModel = load_model('F:\\New folder\\project_stock\\cl1_oil_model.h5')

In [None]:
# Predict on trainer's test inputs; the output is still in scaled units.
pred=savedModel.predict(trainer.test_X)
pred

In [None]:
# Undo the target scaling so the predictions are in the original units.
inversePred=trainer.y_scaler.inverse_transform(pred)
inversePred

In [None]:
# Second trainer on the same file: three input time steps and a
# chronological 90/10 split. No model is trained here.
trainer1 = NeuralNetTrainer()
trainer1.set_model_builder(mb)
trainer1.load_data(
    training_file='F:\\New folder\\project_stock\\cl1_oil_2000.csv',
    features=['close', 'macd', 'macd_signal'],
)
trainer1.set_y(ycolumns=['close', 'macd', 'macd_signal'])
trainer1.create_time_series(n_in=3, n_out=1)
trainer1.split_by_factor_linear(train_factor=.90)
# Alternative splits:
#trainer1.split_by_factor_linear(.95)
#trainer1.split_by_key(train_start_key='1/11/2000', test_start_key='9/1/2017')


In [None]:
# Predict on trainer1's chronological test inputs (scaled units).
# NOTE(review): savedModel was built from `trainer` with 2 input time steps,
# while trainer1 uses 3 - confirm the model accepts this input shape.
pred1=savedModel.predict(trainer1.test_X)
pred1

In [None]:
# pred1 was produced from trainer1's data, so undo the scaling with
# trainer1's own target scaler (the same one used for the actual values).
inversePred1=trainer1.y_scaler.inverse_transform(pred1)
inversePred1

In [None]:
# Unscaled 'close' column of trainer's dataset, shown only for its length.
# NOTE: `actual` is reassigned to trainer1.test_Y in the following cell.
actual=trainer.dataset['close']
actual.shape

In [None]:
# Scaled targets of trainer1's test split.
actual=trainer1.test_Y
actual

In [None]:
# Undo the target scaling so the actual values are in the original units.
actualTestY=trainer1.y_scaler.inverse_transform(actual)
actualTestY

In [None]:
# Keep only the first target column ('close', given the set_y order used for
# trainer1) as 2D column vectors, and check that both shapes agree.
actualPlot=actualTestY[:,0:1]
predictPlot=inversePred1[:,0:1]
print(actualPlot.shape)
print(predictPlot.shape)

In [None]:
# Display the actual values of the first target column (original units).
actualPlot

In [None]:
# Display the predicted values of the first target column (original units).
predictPlot

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Derive the x axis from the data instead of a hard-coded length, so the
# plot keeps working when the size of the test split changes.
t = np.arange(len(actualPlot))
s1=actualPlot
s2=predictPlot
plt.figure(1)
plt.subplot(211)
p1=plt.plot(t, s1,label='Actual')
p2=plt.plot(t,s2,label='Predicted')
# The labels above are only shown once a legend is drawn.
plt.legend()
plt.xlabel('Test sample')
plt.ylabel('close')
plt.title('Actual vs predicted')

plt.show()

In [None]:
def mean_absolute_percentage_error(y_true, y_pred): 
    """Return the mean absolute percentage error between two sequences.

    Both arguments are converted to numpy arrays. The error of each element
    is taken relative to the matching y_true value, and the mean of the
    absolute relative errors is returned as a percentage.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean() * 100

In [None]:
 # MAPE (in percent) between actual and predicted values of the first target column.
 mean_absolute_percentage_error(actualPlot,predictPlot)

In [None]:
# Mean squared error in original units. mean_squared_error is already
# imported in the first cell; the re-import here is redundant but harmless.
from sklearn.metrics import mean_squared_error
mean_squared_error(actualPlot, predictPlot)

In [None]:
# Mean absolute error in original units.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(actualPlot,predictPlot)

In [None]:
# Root mean squared error: the square root of the MSE, so it is in the same
# units as the plotted values.
from sklearn.metrics import mean_squared_error
from math import sqrt
rms = sqrt(mean_squared_error(actualPlot, predictPlot))
print(rms)