# Regression Analysis for Sentiment

> Copyright 2019 Dave Fernandes. All Rights Reserved.
> 
> Licensed under the Apache License, Version 2.0 (the "License");
> you may not use this file except in compliance with the License.
> You may obtain a copy of the License at
>
> http://www.apache.org/licenses/LICENSE-2.0
>  
> Unless required by applicable law or agreed to in writing, software
> distributed under the License is distributed on an "AS IS" BASIS,
> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> See the License for the specific language governing permissions and
> limitations under the License.

## Overview
This notebook performs regression to predict sentiment scores from text reviews. Helpfulness values are also stored in the score records, but they are not currently used as a regression target.
- Data for this analysis should be prepared using the `Preprocessing.ipynb` notebook from this project.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import datetime

import tensorflow as tf
# Eager mode is relied on by the plotting cell further down, which iterates a
# tf.data.Dataset with a plain Python loop and calls .numpy() on each batch.
tf.enable_eager_execution()

# Where the estimator keeps its checkpoints, and where the input TFRecord files live.
MODEL_DIR = './data/Regression'
INPUT_DIR = './data/TFRecords'

# Base names (no extension) of the training record files.
TRAIN_REVIEW = 'train_text_512'
TRAIN_SUMMARY = 'train_summary'
TRAIN_TEXT = TRAIN_REVIEW  # text source actually fed to the model
TRAIN_SCORES = 'train_values'

# Base names (no extension) of the test record files.
TEST_REVIEW = 'test_text_512'
TEST_SUMMARY = 'test_summary'
TEST_TEXT = TEST_REVIEW  # text source actually fed to the model
TEST_SCORES = 'test_values'


def rec_path(filename):
    """Return the path of the '.tfrec' file called `filename` inside INPUT_DIR.

    `filename` is the base name without extension, e.g. TRAIN_SCORES.
    """
    record_file = filename + '.tfrec'
    return os.path.join(INPUT_DIR, record_file)

### Mapping function for scores dataset
- Includes normalization

In [None]:
def parse_score_record(example):
    """Parse one serialized scores record and return its normalized review score.

    Args:
        example: One serialized record read from the scores TFRecord file.

    Returns:
        A scalar float tensor equal to `(score - 3) * 0.5`. For scores on the
        documented 1-5 scale this lies in [-1, 1], with 3 mapping to 0.
    """
    # Features in scores TFRecord file
    F_SCORE = 'score'              # Review score (1-5)
    F_VOTES = 'votes'              # Number of up/down votes
    F_HELPFULNESS = 'helpfulness'  # Fraction of up-votes

    # Votes and helpfulness stay in the schema so every record is still required
    # to carry them, but only the score is used as a label.
    features_desc = {
        F_SCORE: tf.FixedLenFeature([], tf.int64),
        F_VOTES: tf.FixedLenFeature([], tf.int64),
        F_HELPFULNESS: tf.FixedLenFeature([], tf.float32),
        }
    features = tf.parse_single_example(example, features=features_desc)

    score = tf.to_float(features[F_SCORE])

    # Center on the midpoint of the rating scale (3) and halve the spread.
    # This is a fixed rescaling; it does not use the mean of the data.
    return (score - 3.0) * 0.5

### Mapping function for review text dataset
- BERT feature vectors for each review or summary

In [None]:
# Layout of the stored text features for one review: FEATURE_VECTOR_COUNT
# vectors, each FEATURE_VECTOR_LENGTH floats long.
# NOTE(review): 4 * 768 presumably means four concatenated 768-wide BERT
# layers -- confirm against Preprocessing.ipynb.
FEATURE_VECTOR_COUNT = 1
FEATURE_VECTOR_LENGTH = 4 * 768

def parse_review_record(example):
    """Parse one serialized review record into the feature dict fed to the model.

    Args:
        example: One serialized record read from a reviews TFRecord file.

    Returns:
        A dict with the single key 'news_embedding' whose value is a float32
        tensor of shape [FEATURE_VECTOR_COUNT, FEATURE_VECTOR_LENGTH]. The key
        must match the name of the numeric feature column given to the
        estimator.
    """
    # Features in reviews TFRecord file
    F_LENGTH = 'vector_length' # Length of each feature vector
    F_COUNT = 'vector_count'   # Count of feature vectors in list
    F_VECTORS = 'vector_list'  # List of feature vectors

    # The stored length/count fields stay in the schema so every record is still
    # required to carry them; the tensor shape itself comes from the
    # module-level constants, not from these per-record values.
    features_desc = {
        F_LENGTH: tf.FixedLenFeature([], tf.int64),
        F_COUNT: tf.FixedLenFeature([], tf.int64),
        F_VECTORS: tf.FixedLenFeature([FEATURE_VECTOR_COUNT, FEATURE_VECTOR_LENGTH], tf.float32),
        }
    features = tf.parse_single_example(example, features=features_desc)

    return {'news_embedding': features[F_VECTORS]}

### Input functions for Estimator

In [None]:
def train_input_fn():
    """Input function for training.

    Pairs each parsed review record with the parsed score record at the same
    position, then repeats indefinitely, shuffles with a 10000-element buffer
    and emits batches of 200.
    Assumes the text and score files list the reviews in the same order -- the
    pairing is purely positional.
    """
    features = tf.data.TFRecordDataset([rec_path(TRAIN_TEXT)])
    features = features.map(parse_review_record)
    labels = tf.data.TFRecordDataset([rec_path(TRAIN_SCORES)])
    labels = labels.map(parse_score_record)

    paired = tf.data.Dataset.zip((features, labels))
    return paired.repeat().shuffle(10000).batch(200)

# Plots sample PLOT_BATCH_COUNT batches of PLOT_BATCH_SIZE records each.
PLOT_BATCH_SIZE = 1000
PLOT_BATCH_COUNT = 5
# Total number of records taken for one plot.
PLOT_POINT_COUNT = PLOT_BATCH_COUNT * PLOT_BATCH_SIZE

def train_plot_input_fn():
    """Input function yielding a fixed sample of the training data for plotting.

    Produces the first PLOT_BATCH_COUNT batches of PLOT_BATCH_SIZE
    (features, label) pairs, in file order, with no shuffling or repetition.
    """
    features = tf.data.TFRecordDataset([rec_path(TRAIN_TEXT)]).map(parse_review_record)
    labels = tf.data.TFRecordDataset([rec_path(TRAIN_SCORES)]).map(parse_score_record)

    batched = tf.data.Dataset.zip((features, labels)).batch(PLOT_BATCH_SIZE)
    return batched.take(PLOT_BATCH_COUNT)

def test_plot_input_fn():
    """Input function yielding a fixed sample of the test data for plotting.

    Produces the first PLOT_BATCH_COUNT batches of PLOT_BATCH_SIZE
    (features, label) pairs, in file order, with no shuffling or repetition.
    """
    test_features = tf.data.TFRecordDataset([rec_path(TEST_TEXT)]).map(parse_review_record)
    test_labels = tf.data.TFRecordDataset([rec_path(TEST_SCORES)]).map(parse_score_record)

    batched = tf.data.Dataset.zip((test_features, test_labels)).batch(PLOT_BATCH_SIZE)
    return batched.take(PLOT_BATCH_COUNT)

def test_input_fn():
    """Input function for evaluation: the whole test set, in batches of 1000.

    Records are paired positionally (text record i with score record i) and
    are neither shuffled nor repeated.
    """
    test_features = tf.data.TFRecordDataset([rec_path(TEST_TEXT)])
    test_features = test_features.map(parse_review_record)
    test_labels = tf.data.TFRecordDataset([rec_path(TEST_SCORES)])
    test_labels = test_labels.map(parse_score_record)

    return tf.data.Dataset.zip((test_features, test_labels)).batch(1000)

### Define model
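This custom model function is for regression against aggregated scores. It is currently unused: the training cell below builds a canned `tf.estimator.DNNRegressor` instead, and the code that would construct an `Estimator` from this function is commented out.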

In [None]:
def news_model_fn(features, labels, mode, params):
    """Custom Estimator model function: 1-D conv stack -> max-pool -> dense stack -> linear output.

    Args:
        features: Feature dict consumed through `params['feature_columns']`.
        labels: Regression targets (unused in PREDICT mode).
            NOTE(review): the loss and metric below look like they expect labels
            shaped like the output, i.e. (batch, label_count). The score parser in
            this notebook yields one scalar per record -- confirm the shapes before
            enabling this model.
        mode: One of the `tf.estimator.ModeKeys` values.
        params: Dict with
            'feature_columns': feature columns for the input layer,
            'conv_filters': list of filter counts, one per conv1d layer,
            'hidden_units': list of unit counts, one per dense layer,
            'label_count' (optional): number of regression outputs.

    Returns:
        A `tf.estimator.EstimatorSpec` for the requested mode.
    """
    # Number of regression outputs. An explicit params entry wins; otherwise use the
    # notebook-level REGRESSION_LABEL_COUNT if that cell has already run, else 1.
    # (The bare global used to be read directly, which raised NameError when this
    # model was trained before the plotting cell that defines it had been executed.)
    label_count = params.get('label_count', globals().get('REGRESSION_LABEL_COUNT', 1))

    current_layer = tf.feature_column.input_layer(features, params['feature_columns'])
    # Flatten to (batch, n) and append a channel axis so conv1d can slide along n.
    current_layer = tf.expand_dims(tf.layers.flatten(current_layer, name='flatten_input'), -1)

    for i, units in enumerate(params['conv_filters']):
        if i == 0:
            # First layer: window and stride both span FEATURE_VECTOR_LENGTH inputs,
            # so the windows do not overlap (one output position per vector,
            # assuming the flattened input is the vectors laid end to end).
            kernel_size = (FEATURE_VECTOR_LENGTH,)
            strides = (FEATURE_VECTOR_LENGTH,)
        else:
            # Later layers are position-wise (kernel and stride of 1).
            kernel_size = (1,)
            strides = (1,)

        current_layer = tf.layers.conv1d(current_layer,
            name='conv1d_' + str(i+1),
            filters=units,
            data_format='channels_last',
            kernel_size=kernel_size,
            strides=strides,
            padding='valid',
            kernel_initializer=tf.glorot_normal_initializer(),
            kernel_regularizer=tf.keras.regularizers.l2(l=0.01),
            bias_regularizer=tf.keras.regularizers.l2(l=0.01),
            activation=tf.nn.tanh)

    # Max-pool over a window of FEATURE_VECTOR_COUNT positions.
    current_layer = tf.layers.max_pooling1d(current_layer,
        pool_size=FEATURE_VECTOR_COUNT,
        strides=1,
        padding='valid',
        data_format='channels_last')

    current_layer = tf.layers.flatten(current_layer)

    for i, units in enumerate(params['hidden_units']):
        current_layer = tf.layers.dense(current_layer,
            name='dense_' + str(i+1),
            units=units,
            kernel_initializer=tf.glorot_normal_initializer(),
            kernel_regularizer=tf.keras.regularizers.l2(l=0.01),
            bias_regularizer=tf.keras.regularizers.l2(l=0.01),
            activation=tf.nn.tanh)

    # Linear (no activation) output layer, one unit per regression target.
    regression_layer = tf.layers.dense(current_layer,
        name='linear_output',
        units=label_count,
        kernel_initializer=tf.glorot_normal_initializer(),
        kernel_regularizer=tf.keras.regularizers.l2(l=0.01),
        bias_regularizer=tf.keras.regularizers.l2(l=0.01),
        activation=None)

    # For prediction, exit here
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'predictions': regression_layer,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # For training and evaluation, compute the loss (MSE)
    # NOTE(review): only the MSE term is minimized here. The L2 regularizers
    # configured on the layers above do not appear to be added to `loss` --
    # confirm whether the total loss including regularization was intended.
    loss = tf.losses.mean_squared_error(labels, regression_layer)

    # Per-output mean absolute error; tf.metrics.* returns a (value, update_op) pair.
    abs_error = tf.metrics.mean_tensor(tf.reduce_mean(tf.abs(labels - regression_layer), axis=0))
    metrics = {'abs_error': abs_error}
    # NOTE(review): the whole (value, update_op) pair is handed to the summary op.
    tf.summary.tensor_summary('abs_error', abs_error)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    # For training...
    optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

### Train model

In [None]:
# Single dense input: the 'news_embedding' feature produced by parse_review_record.
feature_columns = [
    tf.feature_column.numeric_column(
        'news_embedding',
        shape=[FEATURE_VECTOR_COUNT, FEATURE_VECTOR_LENGTH]),
]

# Alternative (disabled): the custom convolutional model defined above.
#
# regressor = tf.estimator.Estimator(
#     model_fn=news_model_fn,
#     model_dir=MODEL_DIR,
#     params={
#         'feature_columns': feature_columns,
#         'conv_filters': [300, 30],
#         'hidden_units': [30, 8],
#     })

# Canned fully-connected regressor with a single output; checkpoints go to MODEL_DIR.
regressor = tf.estimator.DNNRegressor(
    hidden_units=[300, 30, 8],
    feature_columns=feature_columns,
    label_dimension=1,
    model_dir=MODEL_DIR,
    optimizer='Adagrad',
    activation_fn=tf.nn.relu,
    dropout=None,
    batch_norm=False,
)

# Train for 6000 steps, then evaluate once over the whole test set.
regressor.train(input_fn=train_input_fn, steps=6000)

info = regressor.evaluate(input_fn=test_input_fn)

### Plot the predictions

In [None]:
# Scatter-plot predicted against actual values for a sample of the data.
# Switch dataset_fn to train_plot_input_fn to plot the training sample instead.
dataset_fn = test_plot_input_fn
REGRESSION_LABEL_COUNT = 1

# Model outputs: predict() yields one dict per record.
predictions = regressor.predict(input_fn=dataset_fn)
predicted = [value['predictions'] for value in predictions]

# Labels for the same records, read by iterating the dataset directly.
dataset = dataset_fn()
actual_batches = []
for _, labels in dataset:
    batch = labels.numpy()
    # Add Gaussian jitter (sigma 0.15) to the labels before plotting them.
    noise = np.random.normal(0.0, 0.15, np.shape(batch))
    actual_batches.append(batch + noise)

actual = np.concatenate(actual_batches, axis=0)

# One column per regression output.
actual = np.reshape(actual, (len(actual), REGRESSION_LABEL_COUNT))
predicted = np.reshape(predicted, (len(predicted), REGRESSION_LABEL_COUNT))

# NOTE(review): these titles and the ' Log Ratio' suffix look like leftovers from
# another notebook; with REGRESSION_LABEL_COUNT == 1 only titles[0] is used.
titles = ['Open', 'Close', 'High', 'Low']

for offset in range(REGRESSION_LABEL_COUNT):
    actual_column = actual[:, offset]
    predicted_column = predicted[:, offset]

    # Reference diagonal (predicted == actual) spanning the combined range of both
    # series. These are NumPy arrays, so `actual + predicted` would add them
    # element-wise rather than pool their values.
    maximum = max(actual_column.max(), predicted_column.max())
    minimum = min(actual_column.min(), predicted_column.min())
    plt.plot([minimum, maximum], [minimum, maximum])

    plt.scatter(actual_column, predicted_column, marker='.', s=1)

    plt.ylabel('predicted')
    plt.xlabel('actual')
    plt.title(titles[offset] + ' Log Ratio')
    plt.show()

In [None]:
# Same scatter as the previous cell, but the reference diagonal spans only the
# range of the predicted values.
for offset in range(REGRESSION_LABEL_COUNT):
    predicted_column = predicted[:, offset]
    maximum = max(predicted_column)
    minimum = min(predicted_column)
    diagonal = [minimum, maximum]
    plt.plot(diagonal, diagonal)

    plt.scatter(actual[:, offset], predicted_column, marker='.', s=1)

    plt.xlabel('actual')
    plt.ylabel('predicted')
    plt.title(titles[offset] + ' Log Ratio')
    plt.show()