In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib
import seaborn
import re
import statsmodels.formula.api

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Configure how graphs will show up in this notebook
%matplotlib inline
seaborn.set_context('notebook', rc={'figure.figsize': (10, 6)}, font_scale=1.5)

In [10]:
#reviews = []
def load_review_category(filename):
    reviews = []
    with open(filename, encoding="utf8") as fd:
        rd = csv.reader(fd, delimiter="\t", quotechar='"', quoting=csv.QUOTE_NONE)
        for row in rd:
            #Row 13 is the product review body. Row 7 is the star rating.
            review = []
            review.append(row[13])
            review.append(row[7])
            reviews.append(review)
    return np.asarray(reviews)
            
def load_all_reviews():
    #load_review_category('D:/fyp-data/amazon_reviews/amazon_reviews_us_Gift_Card_v1_00.tsv')
    load_review_category('../fyp-data/amazon_reviews/amazon_reviews_us_Gift_Card_v1_00.tsv')
    #load_review_category('D:/fyp-data/amazon_reviews/amazon_reviews_us_Digital_Software_v1_00.tsv')

In [11]:
training_data = load_review_category('../fyp-data/amazon_reviews/amazon_reviews_us_Gift_Card_v1_00.tsv')

In [12]:
training_data.shape

(149087, 2)

In [50]:
reviews[300001][0]

"Zero Stars...the idea of unlimited cloud storage for my photos is very attractive and more than worth the cost of Prime membership.  What beggars belief, however, is the execution of this...this app would fail a programming 101 course it's so inept.  No desktop integration, constantly redirecting to the browser (where the functionality is equally poor), limited coverage of industry RAW file formats...who knew Olympus was a major camera manufacturer?  clearly not Amazon...slow upload speeds, poor performance on rendering image previews on web interface.  Once the photos are on there, the interface is also awful.  Very disappointed so far...really want to like this, but the software and basic functionality is undermining a terrific idea....can only hope Amazon has some grand plan waiting in the wings, otherwise I'll not be hanging around for very long."

In [None]:
def load_embeddings(filename):
    """
    Load a DataFrame from the generalized text format used by word2vec, GloVe,
    fastText, and ConceptNet Numberbatch. The main point where they differ is
    whether there is an initial line with the dimensions of the matrix.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for i, line in enumerate(infile):
            items = line.rstrip().split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            values = np.array([float(x) for x in items[1:]], 'f')
            rows.append(values)
    
    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')

embeddings = load_embeddings('D:/fyp-data/word_embeddings/glove.42B.300d.txt')
embeddings.shape
embeddings

In [None]:
def load_lexicon(filename):
    """
    Load a file from Bing Liu's sentiment lexicon
    (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html), containing
    English words in Latin-1 encoding.
    
    One file contains a list of positive words, and the other contains
    a list of negative words. The files contain comment lines starting
    with ';' and blank lines, which should be skipped.
    """
    lexicon = []
    with open(filename, encoding='latin-1') as infile:
        for line in infile:
            line = line.rstrip()
            if line and not line.startswith(';'):
                lexicon.append(line)
    return lexicon

pos_words = load_lexicon('D:/fyp-data/sentiment_lexicons/positive-words.txt')
neg_words = load_lexicon('D:/fyp-data/sentiment_lexicons/negative-words.txt')

In [None]:
pos_vectors = embeddings.reindex(pos_words).dropna()
neg_vectors = embeddings.reindex(neg_words).dropna()

vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1 for entry in pos_vectors.index] + [-1 for entry in neg_vectors.index])
labels = list(pos_vectors.index) + list(neg_vectors.index)

train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)

model = SGDClassifier(loss='log', random_state=0, max_iter=100)
model.fit(train_vectors, train_targets)
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=100, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=0, shuffle=True, verbose=0,
       warm_start=False)

In [None]:
accuracy_score(model.predict(test_vectors), test_targets)

In [None]:
def vecs_to_sentiment(vecs):
    # predict_log_proba gives the log probability for each class
    predictions = model.predict_log_proba(vecs)

    # To see an overall positive vs. negative classification in one number,
    # we take the log probability of positive sentiment minus the log
    # probability of negative sentiment.
    return predictions[:, 1] - predictions[:, 0]


def words_to_sentiment(words):
    vecs = embeddings.loc[words].dropna()
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)


# Show 20 examples from the test set
words_to_sentiment(test_labels).iloc[:]

In [None]:
import re
TOKEN_RE = re.compile(r"\w.*?\b")
# The regex above finds tokens that start with a word-like character (\w), and continues
# matching characters (.+?) until the next word break (\b). It's a relatively simple
# expression that manages to extract something very much like words from text.


def text_to_sentiment(text):
    tokens = [token.casefold() for token in TOKEN_RE.findall(text)]
    sentiments = words_to_sentiment(tokens)
    return sentiments['sentiment'].mean()

def text_to_rating(text):
    return (text_to_sentiment(text) + 10) / 20 * 5

In [None]:
import spacy
nlp = spacy.load('en_core_web_lg')

example_review = (u"It's a truly great phone, at a grossly inflated price. You "
                 u"would be foolish if you didn't think a large amount of "
                 u"what you are paying for is branding. Yes - you do get a lot "
                 u"of tech with this. And I find that even though some other phones "
                 u"are technically superior the iPhone seems to run better HOWEVER "
                 u"this is nearly one thousand pounds. That is a big investment - "
                 u"more than a laptop. If you can afford it, it won't disappoint but "
                 u"it's defiantly not that much better than the older versions which "
                 u"are 2/3 of the price.")

tokens = nlp(example_review)

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)