# Data Cleaning and Preprocessing for Sentiment Analysis

> Copyright 2019 Dave Fernandes. All Rights Reserved.
> 
> Licensed under the Apache License, Version 2.0 (the "License");
> you may not use this file except in compliance with the License.
> You may obtain a copy of the License at
>
> http://www.apache.org/licenses/LICENSE-2.0
>  
> Unless required by applicable law or agreed to in writing, software
> distributed under the License is distributed on an "AS IS" BASIS,
> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> See the License for the specific language governing permissions and
> limitations under the License.

Data files can be downloaded from: https://www.kaggle.com/snap/amazon-fine-food-reviews/version/2

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import re
import datetime

# Directory that holds the input and output files of this notebook.
BASE_PATH = './data'

# Base names (WITHOUT extension) of the intermediate files. txt_path() and
# rec_path() append '.txt' / '.tfrecords' themselves, so an extension here
# would be doubled (e.g. 'train_summary.txt.txt').
TRAIN_TEXT = 'train_text'
TRAIN_SUMMARY = 'train_summary'
TEST_TEXT = 'test_text'
TEST_SUMMARY = 'test_summary'

def txt_path(filename):
    """Return the path of the text file `filename` inside BASE_PATH.

    The '.txt' extension is appended to `filename`.
    """
    text_name = filename + '.txt'
    return os.path.join(BASE_PATH, text_name)

def rec_path(filename):
    """Return the path of the TFRecord file `filename` inside BASE_PATH.

    The '.tfrecords' extension is appended to `filename`.
    """
    record_name = filename + '.tfrecords'
    return os.path.join(BASE_PATH, record_name)

### Load and clean review content

In [None]:
REVIEWS_CSV = './data/amazon-fine-food-reviews/Reviews.csv'

# Read the raw reviews table.
reviews = pd.read_csv(REVIEWS_CSV)
print('Initial count:', reviews.shape)

# Discard columns that are not used below, then every row with a missing value.
reviews = reviews.drop(['Id', 'ProfileName', 'Time'], axis=1)
reviews = reviews.dropna(axis=0)
print('Has all data:', reviews.shape)

# Keep only the first row for each (product, user) pair; the two key columns
# are not needed once the duplicates are gone.
reviews = reviews.drop_duplicates(subset=['ProductId', 'UserId'], keep='first')
reviews = reviews.drop(['ProductId', 'UserId'], axis=1)
print('No duplicates:', reviews.shape)

### Balance the scores
- Scores at the extremes should be equally represented.
- Somewhat lower counts for the middle scores are OK.

In [None]:
# Collect one frame per score (1 through 5). The size of the score-1 group is
# the cap: any later group with more rows is randomly down-sampled to it.
score_groups = []
max_count = None
for score in range(1, 6):
    group = reviews[reviews['Score'] == score]

    if max_count is None:
        # First iteration (score 1): this group defines the cap.
        max_count = group.shape[0]
    elif group.shape[0] > max_count:
        group = group.sample(max_count)

    score_groups.append(group)

balanced = pd.concat(score_groups, axis=0)

del reviews
print(balanced.groupby('Score').size())

### Create test and train sets

In [None]:
# Fraction of the balanced rows that goes into the test set.
TEST_FRACTION = 0.2

# Shuffle every row so that the positional split below is random.
shuffled = balanced.sample(frac=1, axis=0)
del balanced

# The first `n` shuffled rows form the test set and ALL remaining rows form
# the training set. (A stop index of -1 would silently drop the last row.)
n = int(shuffled.shape[0] * TEST_FRACTION)
test_frame = shuffled.iloc[:n]
train_frame = shuffled.iloc[n:]
del shuffled

Save intermediate text files for processing into BERT feature vectors.

In [None]:
def write_column(column, file_path):
    """Write the cleaned strings of `column` to `file_path`, one per line.

    Each value has anything that looks like an HTML tag ('<...>') removed.
    Runs of carriage returns / line feeds inside a value are replaced by a
    single space so that every value occupies exactly one line and line i of
    the file corresponds to row i of `column`.

    Args:
        column: pandas Series of strings.
        file_path: destination path; the file is overwritten and written as
            UTF-8 regardless of the platform's default encoding.
    """
    # Compile once per call rather than once per value.
    tag_pattern = re.compile('<.*?>')
    line_break_pattern = re.compile(r'[\r\n]+')

    def clean_html(s):
        without_tags = tag_pattern.sub('', s)
        return line_break_pattern.sub(' ', without_tags)

    # Clean before opening so a failure does not truncate an existing file.
    text_list = column.apply(clean_html).values

    with open(file_path, 'w', encoding='utf-8') as file:
        for item in text_list:
            file.write(item)
            file.write('\n')

# Training split: review body, then summary.
write_column(column=train_frame['Text'], file_path=txt_path(TRAIN_TEXT))
write_column(column=train_frame['Summary'], file_path=txt_path(TRAIN_SUMMARY))

# Test split: review body, then summary.
write_column(column=test_frame['Text'], file_path=txt_path(TEST_TEXT))
write_column(column=test_frame['Summary'], file_path=txt_path(TEST_SUMMARY))

Save numerical columns in a TFRecord file.

In [None]:
# Base names of the TFRecord files that hold the numerical columns; they are
# passed to rec_path() to build the full file paths.
VALUES_TRAIN = 'values_train'
VALUES_TEST = 'values_test'

def _int64_feature(value):
    """Wrap the single integer `value` in a `tf.train.Feature` (int64 list)."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def _float_vector_feature(values):
    """Wrap the sequence `values` in a `tf.train.Feature` (float list)."""
    float_list = tf.train.FloatList(value=values)
    return tf.train.Feature(float_list=float_list)

def _float_feature(value):
    """Wrap the single number `value` in a `tf.train.Feature` (float list)."""
    single_item_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=single_item_list)

def _bytes_feature(value):
    """Wrap the single bytes object `value` in a `tf.train.Feature` (bytes list)."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _string_feature(value):
    """Encode the string `value` as UTF-8 and wrap it via `_bytes_feature`."""
    encoded = value.encode('utf-8')
    return _bytes_feature(encoded)

def write_values(filename, data_frame):
    """Write the numerical columns of `data_frame` to a TFRecord file.

    One `tf.train.Example` is written per row, with three features:
      - 'score': the row's 'Score' value,
      - 'votes': the row's 'HelpfulnessDenominator' value,
      - 'helpfulness': 'HelpfulnessNumerator' divided by
        'HelpfulnessDenominator', or 0.0 when there are no votes.

    Args:
        filename: path of the TFRecord file to create.
        data_frame: DataFrame providing the three columns named above.
    """
    with tf.python_io.TFRecordWriter(filename) as writer:
        for _, record in data_frame.iterrows():
            score = record['Score']
            total_votes = record['HelpfulnessDenominator']
            helpful_votes = record['HelpfulnessNumerator']

            # Guard against division by zero for reviews nobody voted on.
            if total_votes > 0:
                helpful_ratio = float(helpful_votes) / float(total_votes)
            else:
                helpful_ratio = 0.0

            feature_map = {
                'score': _int64_feature(score),
                'votes': _int64_feature(total_votes),
                'helpfulness': _float_feature(helpful_ratio),
            }
            features = tf.train.Features(feature=feature_map)
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())
        
# Serialize the numerical columns of each split to its own TFRecord file.
write_values(filename=rec_path(VALUES_TEST), data_frame=test_frame)
write_values(filename=rec_path(VALUES_TRAIN), data_frame=train_frame)

# Drop the names bound to the two frames now that both files are written.
del test_frame, train_frame

- First download the BERT model from: https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
- Unzip this file into the same directory as the `extract_features.py` script.
- Then either run the feature extractor from the cell below, or
- run it from the command line (repeat the command for each of the 4 text files to be processed):
```
python extract_features.py \
    --input_file=./data/train_text.txt \
    --output_file=./data/train_text.tfrecords \
    --bert_model_dir=./uncased_L-12_H-768_A-12
```

In [None]:
from extract_features import extract

MODEL_DIR = './uncased_L-12_H-768_A-12'

# Run the extractor on each text file: the two test files, then the two
# training files. Each output path is built from the same base name as its input.
for base_name in (TEST_TEXT, TEST_SUMMARY, TRAIN_TEXT, TRAIN_SUMMARY):
    extract(
        input_file=txt_path(base_name),
        output_file=rec_path(base_name),
        bert_model_dir=MODEL_DIR,
    )

## Next
Run the `Regression.ipynb` notebook next...