# Data Cleaning and Preprocessing for Market Predictions

> Copyright 2019 Dave Fernandes. All Rights Reserved.
> 
> Licensed under the Apache License, Version 2.0 (the "License");
> you may not use this file except in compliance with the License.
> You may obtain a copy of the License at
>
> http://www.apache.org/licenses/LICENSE-2.0
>  
> Unless required by applicable law or agreed to in writing, software
> distributed under the License is distributed on an "AS IS" BASIS,
> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> See the License for the specific language governing permissions and
> limitations under the License.

Data files can be downloaded from: https://www.kaggle.com/aaron7sun/stocknews

In [None]:
import numpy as np
import tensorflow as tf
import os
import csv
import datetime

Load news content and clean up strings.

In [None]:
NEWS_CSV = './data/RedditNews.csv'

def clean_string(s):
    """Return `s` collapsed onto one line, without a bytes-literal wrapper.

    Line breaks inside `s` are replaced by single spaces. If the result looks
    like the text of a Python bytes literal, i.e. b'...' or b"...", the
    leading b, the opening quote and the closing quote are removed.

    Any other string (including the empty string and ordinary headlines that
    merely start with the letter 'b') is returned without further changes.
    """
    s = ' '.join(s.splitlines())

    # Require the full b'...' / b"..." shape: checking only s[0] == 'b' would
    # mangle headlines such as "bomb ..." and fail on empty strings.
    if len(s) >= 3 and s[0] == 'b' and s[1] in ('"', "'") and s[-1] == s[1]:
        return s[2:-1]

    return s

# Create dictionary of lists of news articles keyed by date string.
# Column 0 of each row is the date, column 1 is the headline text.
news_by_date = {}

# newline='' is what the csv module expects, so that line breaks embedded in
# quoted fields are handled by the csv parser rather than by the file object.
with open(NEWS_CSV, newline='') as csvfile:
    for row in csv.reader(csvfile):
        # Skip blank lines and the header row
        if not row or row[0] == 'Date':
            continue

        date, string = row[0], row[1]
        news_by_date.setdefault(date, []).append(clean_string(string))

Load stock index values and compute derived quantities to be used for training.

In [None]:
STOCKS_CSV = './data/DJIA_table.csv'

# Positions of the columns in each row of the original data
(
    DATE,
    OPEN,
    HIGH,
    LOW,
    CLOSE,
    VOLUME,
    ADJ_CLOSE,
) = range(7)

# Positions of the derived quantities in each per-day vector
(
    DAYS_SINCE_LAST_TRADE,  # Days elapsed since the previous trading day
    DAYS_UNTIL_NEXT_TRADE,  # Days remaining until the next trading day
    OPEN_LOG_RATIO,         # log(adjusted open / previous day's adjusted close)
    CLOSE_LOG_RATIO,        # log(adjusted close / previous day's adjusted close)
    HIGH_LOG_RATIO,         # log(adjusted high / previous day's adjusted close)
    LOW_LOG_RATIO,          # log(adjusted low / previous day's adjusted close)
    VOLUME_LOG,             # log(volume)
    DID_INCREASE_AT_OPEN,   # 1 if adjusted open > previous day's adjusted close; else 0
    DID_INCREASE_AT_CLOSE,  # 1 if close price > open price; else 0
) = range(9)

# stats[i] is the derived-value vector for the trading day dates[i]
stats = []
dates = []

# newline='' is what the csv module expects of the file objects it reads
with open(STOCKS_CSV, newline='') as csvfile:
    reader = csv.reader(csvfile)

    for row in reader:
        # Skip blank lines and the header row
        if not row or row[DATE] == 'Date':
            continue

        date_string = row[DATE]
        date = datetime.datetime.strptime(date_string, "%Y-%m-%d").date()
        open_val = float(row[OPEN])
        high_val = float(row[HIGH])
        low_val = float(row[LOW])
        close_val = float(row[CLOSE])
        volume_val = float(row[VOLUME])
        adj_close = float(row[ADJ_CLOSE])

        # Report days whose adjusted close differs from the raw close
        if close_val != adj_close:
            print(date_string, ' Adjustment =', adj_close / close_val)

        values = np.zeros((DID_INCREASE_AT_CLOSE + 1))
        values[VOLUME_LOG] = np.log(volume_val)
        values[DID_INCREASE_AT_CLOSE] = 1.0 if close_val > open_val else 0.0

        # Previous index is later date: the row read just before this one is
        # the following trading day, and its "relative to previous day" slots
        # can only be filled in now that this day's adjusted close is known.
        if stats:
            later_day = stats[-1]
            delta_date = (next_date - date).days
            values[DAYS_UNTIL_NEXT_TRADE] = delta_date
            later_day[DAYS_SINCE_LAST_TRADE] = delta_date
            later_day[OPEN_LOG_RATIO] = np.log(next_adj_open / adj_close)
            later_day[CLOSE_LOG_RATIO] = np.log(next_adj_close_val / adj_close)
            later_day[HIGH_LOG_RATIO] = np.log(next_adj_high_val / adj_close)
            later_day[LOW_LOG_RATIO] = np.log(next_adj_low_val / adj_close)
            later_day[DID_INCREASE_AT_OPEN] = 1.0 if next_adj_open > adj_close else 0.0

        stats.append(values)
        dates.append(date_string)

        # Remember this day (prices rescaled by adj_close / close) so the next
        # row read, which is the preceding trading day, can refer to it.
        next_date = date
        next_adj_open = open_val * adj_close / close_val
        next_adj_high_val = high_val * adj_close / close_val
        next_adj_low_val = low_val * adj_close / close_val
        next_adj_close_val = adj_close

# Number of trading days read
index = len(stats)

# The first 20% of the rows (the latest dates) form the test split, the rest
# the training split. The first row is dropped because it has no following
# day (DAYS_UNTIL_NEXT_TRADE unset) and the last row because it has no
# preceding day (ratios unset). Each split is then reversed.
n = int(index * 0.2)
test_stats = stats[1:n][::-1]
train_stats = stats[n:-1][::-1]
test_dates = dates[1:n][::-1]
train_dates = dates[n:-1][::-1]

Save the stock index stats as TFRecord files, one for the test split and one for the training split.

In [None]:
# Output paths for the serialized stock statistics
STOCKS_TEST_OUT = './data/stock_test.tfrecords'
STOCKS_TRAIN_OUT = './data/stock_train.tfrecords'

def _int64_feature(value):
    """Wrap a single integer in a tf.train.Feature backed by an Int64List."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def _float_vector_feature(values):
    """Wrap a sequence of floats in a tf.train.Feature backed by a FloatList."""
    float_list = tf.train.FloatList(value=values)
    return tf.train.Feature(float_list=float_list)

def _bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature backed by a BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _string_feature(value):
    """Encode a text string as UTF-8 and wrap it via _bytes_feature."""
    encoded = value.encode('utf-8')
    return _bytes_feature(encoded)

def write_stats(filename, dates_list, stats_list):
    """Serialize per-day statistics to the TFRecord file `filename`.

    One tf.train.Example is written per entry of `stats_list`, with features:
      'statistics' - floats in slots DAYS_SINCE_LAST_TRADE..VOLUME_LOG
      'date'       - the entry of `dates_list` at the same position
      'open_inc'   - the DID_INCREASE_AT_OPEN slot as an integer
      'close_inc'  - the DID_INCREASE_AT_CLOSE slot as an integer

    `dates_list` is indexed by position, so it must be at least as long as
    `stats_list`.
    """
    with tf.python_io.TFRecordWriter(filename) as writer:
        for position, day_stats in enumerate(stats_list):
            day_label = dates_list[position]
            inputs = day_stats[DAYS_SINCE_LAST_TRADE : VOLUME_LOG + 1]
            rose_at_open = int(day_stats[DID_INCREASE_AT_OPEN])
            rose_at_close = int(day_stats[DID_INCREASE_AT_CLOSE])

            record = {
                'statistics': _float_vector_feature(inputs),
                'date': _string_feature(day_label),
                'open_inc': _int64_feature(rose_at_open),
                'close_inc': _int64_feature(rose_at_close),
            }
            example = tf.train.Example(features=tf.train.Features(feature=record))
            writer.write(example.SerializeToString())
        
# Write the test split first, then the training split
for _out_path, _split_dates, _split_stats in (
    (STOCKS_TEST_OUT, test_dates, test_stats),
    (STOCKS_TRAIN_OUT, train_dates, train_stats),
):
    write_stats(_out_path, _split_dates, _split_stats)

Save normalization information (per-column mean, standard deviation, maximum and minimum of the training data).

In [None]:
import pickle
# NOTE: despite the '.py' suffix, this file receives pickled data, not source code
STOCKS_NORM = './data/stock_norm.py'

# Per-column statistics of the training split, in the fixed order
# [mean, standard deviation, maximum, minimum]
normalization = [
    reduce_fn(train_stats, axis=0)
    for reduce_fn in (np.mean, np.std, np.max, np.min)
]

with open(STOCKS_NORM, 'wb') as norm_file:
    pickle.dump(normalization, norm_file)

Create the input text files (test and training) for the BERT feature extractor.

In [None]:
EXPECTED_COUNT = 25
NEWS_TEST_TXT = './data/news_test.txt'
NEWS_TRAIN_TXT = './data/news_train.txt'

def write_news(filename, expected_count, dates_list):
    """Write exactly `expected_count` lines of news per date to `filename`.

    For every date in `dates_list` (looked up in the module-level
    `news_by_date` dictionary) the news items are written one per line.
    Dates with fewer items are padded with lines holding a single space;
    dates with more items are cut down to the first `expected_count`. Either
    case is reported on stdout.

    Keeping the per-date line count fixed matters because the file is later
    read in groups of `expected_count` lines (see the `group_count` argument
    passed to the feature extractor), so one oversized date would shift every
    date after it.

    Raises KeyError if a date has no entry in `news_by_date`.
    """
    with open(filename, 'w') as file:
        for date_id in dates_list:
            news_list = news_by_date[date_id]

            if len(news_list) < expected_count:
                print(date_id, 'news count:', len(news_list), '- padding to', expected_count)
            elif len(news_list) > expected_count:
                print(date_id, 'news count:', len(news_list), '- truncating to', expected_count)
                # Slice into a new list so news_by_date itself is not modified
                news_list = news_list[:expected_count]

            for news_item in news_list:
                file.write(news_item)
                file.write('\n')

            # Fill the remaining slots of this date's group with blank entries
            for _ in range(len(news_list), expected_count):
                file.write(' \n')
    
# Produce the test text file first, then the training text file
for _txt_path, _split_dates in (
    (NEWS_TEST_TXT, test_dates),
    (NEWS_TRAIN_TXT, train_dates),
):
    write_news(_txt_path, EXPECTED_COUNT, _split_dates)

Run the feature extractor.

In [None]:
from extract_features import extract
MODEL_DIR = './uncased_L-12_H-768_A-12'
NEWS_TEST_OUT = './data/news_test.tfrecords'
NEWS_TRAIN_OUT = './data/news_train.tfrecords'

# Run the extractor on the test text first, then on the training text
for _news_txt, _news_out in (
    (NEWS_TEST_TXT, NEWS_TEST_OUT),
    (NEWS_TRAIN_TXT, NEWS_TRAIN_OUT),
):
    extract(
        input_file=_news_txt,
        output_file=_news_out,
        bert_model_dir=MODEL_DIR,
        group_count=EXPECTED_COUNT,
    )

## Next
Run the `NewsfeedTraining.ipynb` notebook next...