In [340]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
np.set_printoptions(precision=2, suppress=True)

In [341]:
# Folder that holds one CSV file per stock; joined with the file name in read_csv.
PER_STOCK_DATA_FOLDER_NAME = 'stock_data'
# File-name template; the STOCK_ID placeholder is filled in by read_csv.
STOCK_FILE_NAME = '{STOCK_ID}.csv'
# Columns kept as features, in this exact order. split_train_test reads the
# closing price positionally at index 4, so reordering this list changes labels.
REQUIRED_COLS_FOR_TRAINING = [
    '成交股數',  # trade volume (number of shares)
    '開盤價',    # opening price
    '最高價',    # highest price
    '最低價',    # lowest price
    '收盤價',    # closing price (index 4)
    '日期',      # date
]

In [374]:
def read_csv(stock_id):
    """Load the CSV file of a single stock into a pandas DataFrame.

    The file path is built by filling ``stock_id`` into ``STOCK_FILE_NAME``
    and joining the result onto ``PER_STOCK_DATA_FOLDER_NAME``.

    Parameters
    ----------
    stock_id : value substituted for ``{STOCK_ID}`` in the file-name template.

    Returns
    -------
    pandas.DataFrame
        The parsed content of the CSV file.
    """
    csv_path = os.path.join(
        PER_STOCK_DATA_FOLDER_NAME,
        STOCK_FILE_NAME.format(STOCK_ID=stock_id),
    )
    # The file is opened explicitly so it is decoded with the platform's
    # default text encoding and closed as soon as parsing finishes.
    with open(csv_path) as csv_file:
        frame = pd.read_csv(csv_file)
    return frame

# Problem statement: given n days of data, predict the price rise/fall two weeks later.

# Each sliding window must span (n + 10) days: n days for training, 10 days for the label.
def sliding_window(data_array, train_duration, label_duration, step_size):
    """Cut a 2-D time series into flattened, overlapping windows.

    Each window covers ``train_duration + label_duration`` consecutive rows
    of ``data_array`` and is flattened row by row into a single vector.

    Parameters
    ----------
    data_array : array-like with rows as time steps and columns as features.
    train_duration : int, number of rows used as model input.
    label_duration : int, number of rows that follow and are used for the label.
    step_size : int, number of rows the window start advances each time.

    Returns
    -------
    numpy.ndarray
        Shape ``(n_windows, window_size * n_features)``. When the series is
        shorter than one window the result is an empty array that still has
        two dimensions, ``(0, window_size * n_features)``, so downstream
        reshaping and slicing keep working.
    """
    data_array = np.asarray(data_array)
    window_size = train_duration + label_duration
    window_starts = range(0, len(data_array) - window_size + 1, step_size)

    flattened_windows = [
        # hstack over the rows of the slice concatenates them row-major.
        np.hstack(data_array[start:start + window_size, :])
        for start in window_starts
    ]

    if not flattened_windows:
        # np.array([]) would collapse to shape (0,); keep the column dimension.
        n_features = int(np.prod(data_array.shape[1:]))
        return np.empty((0, window_size * n_features), dtype=data_array.dtype)
    return np.array(flattened_windows)


# Use one month of data (train_duration = 20 days) to predict two weeks ahead (label_duration = 10 days); 90% of the data is the training set (split_percent = 0.9).
def split_train_test(data_array, train_duration, label_duration, data_dimension, split_percent,
                     close_price_index=4):
    """Turn flattened windows into model inputs/labels and split them chronologically.

    Every row of ``data_array`` is reshaped to
    ``(train_duration + label_duration, data_dimension)``. The first
    ``train_duration`` rows become the input sample. The label is the value in
    column ``close_price_index`` on the last row of the window minus the value
    in the same column on the last training row.

    Parameters
    ----------
    data_array : numpy.ndarray of flattened windows, one window per row.
    train_duration : int, number of leading rows of each window used as input.
    label_duration : int, number of trailing rows of each window.
    data_dimension : int, number of features per row.
    split_percent : float, fraction of windows (taken from the start) that go
        into the training set; the remainder forms the test set.
    close_price_index : int, column used to compute the label. Defaults to 4,
        the position of the closing price in the notebook's feature order.

    Returns
    -------
    list
        ``[X_train, y_train, X_test, y_test]`` where the X arrays have shape
        ``(n, train_duration, data_dimension)`` and the y arrays shape ``(n,)``.

    Raises
    ------
    ValueError
        If the rows cannot be reshaped to ``(window_size, data_dimension)``.
    """
    data_array = np.asarray(data_array)
    window_size = train_duration + label_duration
    total_len = data_array.shape[0]

    # One reshape for all windows; also well-defined when there are none.
    windows = data_array.reshape(total_len, window_size, data_dimension)

    # Copy so the returned inputs do not alias the caller's array.
    inputs = np.array(windows[:, :train_duration, :])
    previous_close_price = windows[:, train_duration - 1, close_price_index]
    latest_close_price = windows[:, -1, close_price_index]
    labels = latest_close_price - previous_close_price

    # Honour the caller's split fraction (previously hard-coded to 0.9).
    number_train = int(round(split_percent * total_len))

    X_train = inputs[:number_train]
    y_train = labels[:number_train]
    X_test = inputs[number_train:]
    y_test = labels[number_train:]

    return [X_train, y_train, X_test, y_test]

In [375]:
# Load the raw per-stock data.
stock_df_orig = read_csv('1419')

# Coerce every column to numeric (unparseable cells become NaN) and drop any
# row that has at least one missing value.
stock_df_processed = (
    stock_df_orig
    .apply(pd.to_numeric, errors='coerce')
    .dropna(how='any')
)

# Express the trade volume in units of 1000 shares, rounded to whole numbers.
stock_df_processed['成交股數'] = (stock_df_processed['成交股數'] / 1000).round(0)

# Keep only the feature columns, in the order the training code expects.
stock_df = stock_df_processed[REQUIRED_COLS_FOR_TRAINING]

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and exists in every pandas version.
stock_array = stock_df.values

In [376]:
# Flatten every 30-row window (20 training rows + 10 label rows),
# moving the window start forward one row at a time.
sliding_array = sliding_window(
    stock_array,
    train_duration=20,
    label_duration=10,
    step_size=1,
)
sliding_array.shape



(3020, 180)

# Build inputs/labels from the flattened windows; the first 90% of the
# windows are used for training and the rest for testing.
X_train, y_train, X_test, y_test = split_train_test(
    sliding_array,
    train_duration=20,
    label_duration=10,
    data_dimension=len(REQUIRED_COLS_FOR_TRAINING),
    split_percent=0.9,
)

In [360]:
X_train.shape

(2718, 20, 6)

In [357]:
stock_df.head(n = 30)

Unnamed: 0,成交股數,開盤價,最高價,最低價,收盤價,日期
0,2435.0,19.45,20.4,19.15,19.8,20080102
1,468.0,19.4,19.8,19.2,19.6,20080103
2,1804.0,19.4,20.3,19.2,20.05,20080104
3,1644.0,19.5,20.15,19.05,19.6,20080107
4,1203.0,19.5,20.3,19.4,19.9,20080108
5,577.0,19.5,19.8,19.35,19.75,20080109
6,1298.0,19.9,20.35,19.55,19.75,20080110
7,560.0,19.7,19.85,19.4,19.85,20080111
8,5680.0,21.2,21.2,20.55,21.2,20080114
9,5078.0,22.4,22.65,22.0,22.65,20080115


In [358]:
X_test.shape

(302, 20, 6)

In [349]:
y_train

array([ 2.65,  4.85,  3.8 , ...,  0.4 ,  0.35,  1.  ])

In [335]:
X_train.shape

(2718, 20, 6)

In [336]:
y_train.shape

(2718,)

In [337]:
y_train

array([ 2.6,  4.9,  3.8, ...,  0.4,  0.4,  1. ])

In [338]:
# NOTE(review): `test_set` is not defined anywhere in this notebook, so the
# original line fails on a fresh kernel. The recorded output (2.6499...)
# matches the first training label, so y_train is used here; confirm that
# this is the value that was meant to be inspected.
temp = y_train[0]
temp

2.6499999999999986

In [339]:
stock_df_orig.head()

Unnamed: 0.1,Unnamed: 0,證券代號,成交股數,開盤價,最高價,最低價,收盤價,日期
0,69,1419,2434795,19.45,20.4,19.15,19.8,20080102
1,785,1419,467867,19.4,19.8,19.2,19.6,20080103
2,1500,1419,1804486,19.4,20.3,19.2,20.05,20080104
3,2216,1419,1643662,19.5,20.15,19.05,19.6,20080107
4,2932,1419,1202742,19.5,20.3,19.4,19.9,20080108
