In [230]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os 
from sklearn.preprocessing import MinMaxScaler
# Build model
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.recurrent import LSTM
import keras
from sklearn.metrics import confusion_matrix
np.set_printoptions(precision=3, suppress=True)

In [231]:
# Folder (relative to the working directory) that holds one CSV file per stock.
PER_STOCK_DATA_FOLDER_NAME = 'per_stock_data'
# File-name template; fill it with STOCK_FILE_NAME.format(STOCK_ID=...).
STOCK_FILE_NAME = '{STOCK_ID}.csv'
# CSV columns used as model features. Order matters: downstream code
# addresses columns by position.
REQUIRED_COLS_FOR_TRAINING = [
    '成交股數',  # traded volume, in shares
    '開盤價',    # opening price
    '最高價',    # highest price
    '最低價',    # lowest price
    '收盤價',    # closing price
]

In [232]:
def read_csv(stock_id):
    """Load the CSV file of one stock into a DataFrame.

    The path is ``PER_STOCK_DATA_FOLDER_NAME/<stock_id>.csv`` (built from the
    ``STOCK_FILE_NAME`` template), relative to the current working directory.

    Parameters
    ----------
    stock_id : value substituted for ``{STOCK_ID}`` in the file-name template.

    Returns
    -------
    pandas.DataFrame with the file's content, as parsed by ``pd.read_csv``.

    Raises
    ------
    FileNotFoundError if the file does not exist.
    """
    csv_path = os.path.join(
        PER_STOCK_DATA_FOLDER_NAME,
        STOCK_FILE_NAME.format(STOCK_ID=stock_id),
    )
    # The file is opened in text mode with the platform default encoding.
    with open(csv_path) as csv_file:
        return pd.read_csv(csv_file)

In [233]:
def data_clean(data_df, required_cols, volume_col='成交股數', shares_per_lot=1000):
    """Select the feature columns, make them numeric and drop incomplete rows.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Raw per-stock data. It is not modified.
    required_cols : list of str
        Columns to keep, in the order they should appear in the result.
    volume_col : str, optional
        Name of the traded-volume column (expressed in shares). When it is one
        of the selected columns it is converted to lots; when it is absent the
        conversion is skipped instead of raising ``KeyError``.
    shares_per_lot : int or float, optional
        Number of shares in one lot (1000 by default).

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``required_cols``, all numeric, without any
        row that contained a missing or non-numeric value. The original index
        labels of the surviving rows are kept.
    """
    # Work on a copy so the caller's frame is never touched.
    selected = data_df[required_cols].copy(deep=True)
    # Convert to numeric; values that cannot be converted become NaN ...
    numeric = selected.apply(pd.to_numeric, errors='coerce')
    # ... and every row containing at least one NaN is dropped.
    numeric = numeric.dropna(how='any')
    # Convert shares to lots, rounded to a whole number of lots.
    # ``assign`` returns a new frame, which avoids writing into the result of
    # ``dropna`` (a source of SettingWithCopyWarning on some pandas versions).
    if volume_col in numeric.columns:
        lots = (numeric[volume_col] / shares_per_lot).round(0)
        numeric = numeric.assign(**{volume_col: lots})
    return numeric


In [234]:
# Task: from n days of data, predict whether the price is up or down two weeks later.
# Every sliding window therefore spans (n + 10) days: n days of features followed by
# 10 days that are only used to derive the label.
# The return value is a 2-dimensional ndarray: the rows of one window are
# concatenated into a single flat row.
# Scaling is done per window to avoid distortion across price regimes.
def sliding_window(data_array, train_duration, label_duration, step_size, scaler):
    """Cut ``data_array`` into scaled, flattened sliding windows.

    Parameters
    ----------
    data_array : 2-D array-like, shape (n_rows, n_features)
        Rows are consecutive observations.
    train_duration : int
        Number of leading rows of each window that are used as features
        (must be at least 1).
    label_duration : int
        Number of trailing rows of each window reserved for the label.
    step_size : int
        Offset between the starts of two consecutive windows.
    scaler : object or class
        Scaler exposing ``fit(X)`` and ``transform(X)`` (scikit-learn style).
        A scaler *class* is accepted too and is instantiated with no arguments.

    Returns
    -------
    numpy.ndarray, shape (n_windows, (train_duration + label_duration) * n_features)
        One row per window, the window's rows laid out one after another.
        When the input is shorter than one window the result has zero rows.

    Notes
    -----
    The scaler is fitted on the first ``train_duration`` rows of each window
    only, and then applied to the whole window. Fitting on the full window
    would let the label period's min/max leak into the features (look-ahead
    bias). As a consequence, scaled values in the label part may fall outside
    the fitted range.
    """
    data_array = np.asarray(data_array)
    # Calling ``fit`` on the class itself would bind the data as ``self`` and
    # fail, so turn a class into an instance first.
    if isinstance(scaler, type):
        scaler = scaler()

    window_size = train_duration + label_duration
    result = []
    for start in range(0, len(data_array) - window_size + 1, step_size):
        window = data_array[start:start + window_size, :]
        # Fit on the feature rows only; transform features and label rows alike.
        scaler.fit(window[:train_duration])
        scaled_window = np.asarray(scaler.transform(window))
        # Row-major flattening: row 0, then row 1, ...
        result.append(scaled_window.ravel())

    if not result:
        # Keep the result 2-D even when no window fits.
        n_features = data_array.shape[1] if data_array.ndim == 2 else 0
        return np.empty((0, window_size * n_features))
    return np.array(result)

In [235]:
# Use about one month of data as features (train_duration = 20 days) and the following
# two weeks for the label (label_duration = 10 days); 90% of the windows form the
# training set (split_percent = 0.9).
def split_train_test(data_df, train_duration, label_duration, data_dimension, split_percent, step_size, scaler,
                     close_col_index=4):
    """Build sliding-window samples with up/down labels and split them chronologically.

    Parameters
    ----------
    data_df : pandas.DataFrame or 2-D array-like, shape (n_rows, data_dimension)
        Cleaned numeric data, one row per day.
    train_duration : int
        Number of rows per sample used as model input.
    label_duration : int
        Number of rows after the input rows; the last of them defines the label.
    data_dimension : int
        Number of feature columns in ``data_df``.
    split_percent : float
        Fraction (0..1) of the samples, taken from the start, used for training.
    step_size : int
        Offset between consecutive windows.
    scaler : object
        Scaler handed through to ``sliding_window``.
    close_col_index : int, optional
        Position of the closing-price column among the features (default 4).

    Returns
    -------
    list
        ``[X_train, y_train, X_test, y_test]`` where the ``X`` arrays have shape
        (n_samples, train_duration, data_dimension) and the ``y`` arrays hold
        1 when the close of the window's last row is above the close of the
        last input row, else 0.

    Raises
    ------
    ValueError
        If ``split_percent`` is outside [0, 1].
    """
    if not 0.0 <= split_percent <= 1.0:
        raise ValueError("split_percent must be between 0 and 1, got %r" % (split_percent,))

    # Use the argument, not a notebook global; ``as_matrix`` no longer exists
    # in current pandas, ``np.asarray`` works for frames and arrays alike.
    data_array = sliding_window(np.asarray(data_df), train_duration, label_duration, step_size, scaler)

    window_size = train_duration + label_duration
    total_len = data_array.shape[0]
    # Undo the flattening done by sliding_window: (windows, days, features).
    windows = data_array.reshape(total_len, window_size, data_dimension)

    inputs = windows[:, :train_duration, :]
    previous_close_price = windows[:, train_duration - 1, close_col_index]
    latest_close_price = windows[:, -1, close_col_index]
    # 1 = price went up over the label period, 0 = flat or down.
    labels = ((latest_close_price - previous_close_price) > 0).astype(int)

    # Chronological split: the earliest windows train, the latest ones test.
    # NOTE(review): when step_size is smaller than the window length, the last
    # training windows share rows with the first test windows.
    number_train = int(round(split_percent * total_len))
    X_train = inputs[:number_train]
    y_train = labels[:number_train]

    X_test = inputs[number_train:]
    y_test = labels[number_train:]

    return [X_train, y_train, X_test, y_test]

In [228]:
# Build model
def build_model(input_length, input_dim):
    """Create and compile the two-layer LSTM up/down classifier.

    Parameters
    ----------
    input_length : int
        Number of time steps per sample.
    input_dim : int
        Number of features per time step.

    Returns
    -------
    A compiled Keras ``Sequential`` model: LSTM(256, sequences) -> Dropout(0.3)
    -> LSTM(256) -> Dropout(0.2) -> Dense(1, sigmoid), trained with MSE loss,
    the Adam optimizer and accuracy as metric.
    """
    first_dropout_rate = 0.3
    layer_stack = [
        # First recurrent layer returns the full sequence so a second LSTM can follow.
        LSTM(256, input_shape=(input_length, input_dim), return_sequences=True),
        Dropout(first_dropout_rate),
        # Second recurrent layer keeps only its final output.
        LSTM(256, return_sequences=False),
        Dropout(0.2),
        # Disabled intermediate layer:
        # Dense(16,kernel_initializer="uniform",activation='relu'),
        # Single sigmoid unit: output in (0, 1), thresholded by the caller.
        Dense(1, kernel_initializer="uniform", activation='sigmoid'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    # NOTE(review): MSE on a sigmoid output works, but 'binary_crossentropy'
    # is the usual loss for a binary target — consider switching.
    model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
    return model

In [236]:
# Window / split settings, defined once so that the model input shape always
# matches the samples produced by split_train_test.
TRAIN_DURATION = 20   # days used as model input
LABEL_DURATION = 10   # days ahead used to derive the up/down label
SPLIT_PERCENT = 0.9   # share of samples used for training
STEP_SIZE = 1         # offset between consecutive windows
N_FEATURES = len(REQUIRED_COLS_FOR_TRAINING)

stock_df_orig = read_csv('1419')
stock_df = data_clean(stock_df_orig, REQUIRED_COLS_FOR_TRAINING)

X_train, y_train, X_test, y_test = split_train_test(
    data_df=stock_df,
    train_duration=TRAIN_DURATION,
    label_duration=LABEL_DURATION,
    data_dimension=N_FEATURES,
    split_percent=SPLIT_PERCENT,
    step_size=STEP_SIZE,
    # Pass a scaler *instance*: calling fit/transform on the class itself fails.
    scaler=MinMaxScaler())

model = build_model(TRAIN_DURATION, N_FEATURES)
model.fit(X_train, y_train, batch_size=128, epochs=50, validation_split=0.1, verbose=1)

# Threshold the sigmoid output at 0.5 and flatten to 1-D integer labels so the
# predictions have the same form as y_test.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
cm = confusion_matrix(y_test, y_pred)
# Last expression of the cell: display the confusion matrix.
cm

FileNotFoundError: [Errno 2] No such file or directory: 'per_stock_data/1419.csv'

In [189]:
# Sanity check of the training inputs; split_train_test builds them as
# (n_samples, train_duration, data_dimension).
X_train.shape

(2718, 20, 5)

In [190]:
# Sanity check of the test inputs; same layout as X_train:
# (n_samples, train_duration, data_dimension).
X_test.shape

(302, 20, 5)