In [None]:
import csv
import random
import time

from nltk.tokenize import RegexpTokenizer

import numpy as np #Linear algebra.
import pandas as pd #Data processing.

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn import neighbors, svm #K-nearest neighbours and Support-vector machine
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
from sklearn.decomposition import PCA, TruncatedSVD

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#Directory holding the Amazon review TSV dumps.
REVIEWS_DIR = "D:/fyp-data/amazon_reviews"


def sample_reviews(category, keep_fraction):
    """Read a random sample of one category's reviews.

    Only the 'review_body' and 'star_rating' columns are loaded. The header
    row (row 0) is always kept; every other row is skipped unless a fresh
    random.random() draw is <= keep_fraction, so roughly that fraction of
    the file is read.

    Args:
        category: category part of the file name, e.g. "Books".
        keep_fraction: approximate fraction of data rows to keep (0..1).

    Returns:
        A DataFrame with the sampled rows.
    """
    path = "{}/amazon_reviews_us_{}_v1_00.tsv".format(REVIEWS_DIR, category)
    return pd.read_csv(path, usecols=['review_body', 'star_rating'], sep='\t',
                       skiprows=lambda i: i > 0 and random.random() > keep_fraction)


#Seed the sampler so that re-running the notebook draws the same rows.
random.seed(42)

book_reviews = sample_reviews("Books", 0.0002)
music_reviews = sample_reviews("Music", 0.0004)
electronics_reviews = sample_reviews("Electronics", 0.0007)
kitchen_reviews = sample_reviews("Kitchen", 0.0005)
automotive_reviews = sample_reviews("Automotive", 0.0008)

In [None]:
#Write each sampled category to its own CSV file in the working directory.
for frame, csv_name in (
    (book_reviews, "book_reviews.csv"),
    (music_reviews, "music_reviews.csv"),
    (electronics_reviews, "electronics_reviews.csv"),
    (kitchen_reviews, "kitchen_reviews.csv"),
    (automotive_reviews, "automotive_reviews.csv"),
):
    frame.to_csv(csv_name)

In [None]:
#Stack the five per-category samples into one pandas DataFrame and save it.
category_frames = [book_reviews, music_reviews, electronics_reviews, kitchen_reviews, automotive_reviews]
mixed_reviews = pd.concat(category_frames)
mixed_reviews.to_csv("mixed_reviews.csv")

In [None]:
#Read the combined sample back in.
reviews = pd.read_csv("mixed_reviews.csv")
#The leading column only holds the index that to_csv wrote out; keep every column after it.
reviews = reviews.iloc[:, 1:]

In [None]:
#Alternative input: work with the kitchen reviews only (this rebinds `reviews`).
#kitchen_reviews.csv is written by to_csv with the index as its first column; read that
#column back as the index so it does not show up as a stray "Unnamed: 0" column.
reviews = pd.read_csv("kitchen_reviews.csv", index_col=0)

In [None]:
#Drop reviews with a missing body first: astype(str) would turn NaN into the literal
#text "nan", which hides the gap from null checks and lets it be scored like a real review.
reviews = reviews.dropna(subset=["review_body"])
#Convert all remaining review bodies from object to string.
reviews = reviews.assign(review_body=reviews["review_body"].astype(str))

In [None]:
#Count the missing values in each column.
reviews.isnull().sum()

In [None]:
#Add a pandas column holding the number of upper-case characters in each review body.
reviews["capital_letters"] = [
    sum(1 for char in body if char.isupper())
    for body in reviews["review_body"]
]

In [None]:
def text_preprocessing(frame, text):
    """Clean the string column ``text`` of ``frame`` and return ``frame``.

    The column is rewritten on the frame that is passed in (the same object
    is returned). Steps, in order:

    1. Remove URLs: ``http`` followed by any run of non-whitespace characters.
       This runs first, while the ':' '/' '.' characters of the URL are still
       present; otherwise only the leading "http" could be matched.
    2. Replace every character outside letters, digits, ``( ) , ! ? @ ' ` " _``
       and newline with a single space.
    3. Lower-case the text.

    Args:
        frame: DataFrame containing the column to clean.
        text: name of the string column.

    Returns:
        ``frame``, with the cleaned column.
    """
    # regex=True is passed explicitly: without it newer pandas treats the
    # pattern as a literal string and nothing is replaced.
    cleaned = frame[text].str.replace(r"http\S+", "", regex=True)
    cleaned = cleaned.str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    frame[text] = cleaned.str.lower()
    return frame

In [None]:
#Clean the review bodies (text_preprocessing rewrites the column on the frame it is given).
reviews = text_preprocessing(reviews, "review_body")

In [None]:
#Tokenise the review body with the \w+ pattern and store the token list of each review in "tokens".
tokeniser = RegexpTokenizer(r"\w+")
reviews["tokens"] = reviews["review_body"].apply(tokeniser.tokenize)

In [None]:
#Store the number of tokens in each review body in the "sentence_length" column.
reviews["sentence_length"] = reviews["tokens"].map(len)

In [None]:
#Apply the sentiment score function to each review's tokens and drop the rows whose score is NaN.
#NOTE: get_sentiment_score is only defined in a later cell of this notebook, so that cell
#(and the cells it depends on) must already have been run before this one.
reviews["sentiment_score"] = reviews["tokens"].apply(get_sentiment_score)
reviews = reviews.dropna(subset=["sentiment_score"])

In [None]:
#Preview the first rows of the prepared frame.
reviews.head()

In [None]:
#Summary statistics of the numeric columns.
reviews.describe()

In [None]:
#Number of non-null entries per column for each star rating.
reviews.groupby("star_rating").count()

Sentiment Score

In [None]:
#Use the sentiment score as the only feature and the star rating as the target.
features = reviews[["sentiment_score"]]
labels = reviews["star_rating"].tolist()
#Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [None]:
#Scaling: fit the scaler on the training rows only, then apply the same fitted
#transform to the test rows so that both sets are on the same scale.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Bag of Words and TF-IDF

In [None]:
#Use the cleaned review text as the feature and the star rating as the target.
features = reviews["review_body"].tolist()
labels = reviews["star_rating"].tolist()
#Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [None]:
def get_bow_embeddings(data):
    """Fit a new CountVectorizer on ``data``.

    Returns a ``(embeddings, vectoriser)`` pair: the result of
    ``fit_transform(data)`` and the fitted vectoriser, so the same
    vocabulary can later be applied to other data.
    """
    bow = CountVectorizer()
    return bow.fit_transform(data), bow

In [None]:
#Fit the bag-of-words vocabulary on the training text only, then encode the test text with it.
X_train, vectoriser = get_bow_embeddings(X_train)
X_test = vectoriser.transform(X_test)

In [None]:
def tfidf(data):
    """Fit a new TfidfTransformer on ``data``.

    Returns a ``(transformed, transformer)`` pair: the result of
    ``fit_transform(data)`` and the fitted transformer, so the same
    weighting can later be applied to other data.
    """
    weighting = TfidfTransformer()
    return weighting.fit_transform(data), weighting

In [None]:
#Re-weight the bag-of-words counts with TF-IDF. The fitted transformer gets its own name so
#that it does not overwrite `vectoriser` (the CountVectorizer fitted on the training text),
#which is still the object needed to turn raw text into counts.
X_train, tfidf_transformer = tfidf(X_train)
X_test = tfidf_transformer.transform(X_test)

In [None]:
#Project the training features onto two components for plotting.
#TruncatedSVD is used because it accepts scipy sparse matrices (what the bag-of-words /
#TF-IDF steps produce) directly, whereas PCA in older scikit-learn releases rejects sparse
#input. Unlike PCA it does not centre the data before decomposing it.
#The variable keeps the name `pca` so that anything referring to it still resolves.
pca = TruncatedSVD(n_components=2, random_state=42)
X = pca.fit_transform(X_train)

In [None]:
#Shape of the two-component projection.
X.shape

In [None]:
#Scatter the two projected components, coloured by the training labels.
#plt.get_cmap replaces plt.cm.get_cmap, which newer Matplotlib releases no longer provide.
plt.scatter(X[:, 0], X[:, 1],
            c=y_train, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('Spectral', 10))
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar();

In [None]:
#Time how long it takes to fit a 50-tree random forest regressor and predict the test set.
#time.perf_counter() is a monotonic clock intended for measuring elapsed time, so the
#result cannot be skewed by wall-clock adjustments the way time.time() can.
start = time.perf_counter()

rf = RandomForestRegressor(n_estimators = 50, random_state = 42)
model = rf.fit(X_train, y_train)

predictions = rf.predict(X_test)

end = time.perf_counter()
#Elapsed seconds.
print(end - start)

Linear regression 

In [None]:
#Fit a linear regression on the training data and predict the test targets.
lr = LinearRegression()
model = lr.fit(X_train, y_train)

predictions = model.predict(X_test)

In [None]:
# Absolute error of every prediction.
errors = np.abs(predictions - y_test)
# Mean absolute error, rounded to two decimals.
mae = round(errors.mean(), 2)
# Absolute percentage error per sample; the reported score is 100 minus its mean.
mape = errors / y_test * 100
accuracy = 100 - mape.mean()

In [None]:
#Show the mean absolute error.
mae

In [None]:
#Show 100 minus the mean absolute percentage error.
accuracy

Logistic regression

In [None]:
#Multinomial logistic regression with balanced class weights; despite the variable name this
#is a classifier, and `predictions` holds the predicted labels for the test set.
regressor = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', multi_class='multinomial',
                         n_jobs=-1, random_state=40)
model = regressor.fit(X_train, y_train)

predictions = regressor.predict(X_test)

In [None]:
#Fraction of test labels predicted exactly.
print(accuracy_score(y_test, predictions))

In [None]:
#Per-class metrics report for the test set.
print(classification_report(y_test, predictions))

K-nearest neighbours

In [None]:
#K-nearest-neighbours classifier using the 5 nearest training samples.
classifier = neighbors.KNeighborsClassifier(n_neighbors = 5)
model = classifier.fit(X_train, y_train)

predictions = model.predict(X_test)

In [None]:
#Fraction of test labels predicted exactly.
print(accuracy_score(y_test, predictions))

In [None]:
#Per-class metrics report for the test set.
print(classification_report(y_test, predictions))

In [None]:
#Confusion matrix of true labels against the current predictions.
cm = confusion_matrix(y_test, predictions)
print(cm)

In [None]:
#Draw the confusion matrix as an annotated heatmap; the title is the accuracy score.
star_labels = ['1-star', '2-star', '3-star', '4-star', '5-star']
plt.figure(figsize=(6, 6))
ax = sns.heatmap(
    cm,
    annot=True, fmt='g',
    cmap='Greens', linewidths=.5, cbar=False,
    xticklabels=star_labels, yticklabels=star_labels,
)
ax.set_title(accuracy_score(y_test, predictions))
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

Support Vector Machines

In [None]:
#Support-vector classifier with a linear kernel, fitted on the training data.
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

#Predict the labels of the test dataset.
predictions = clf.predict(X_test)

In [None]:
#Fraction of test labels predicted exactly.
print(accuracy_score(y_test, predictions))

In [None]:
#Per-class metrics report for the test set.
print(classification_report(y_test, predictions))

Random Forest Classifier

In [None]:
#Random forest classifier with 50 trees; random_state fixed for reproducibility.
rf = RandomForestClassifier(n_estimators = 50, random_state = 42)
model = rf.fit(X_train, y_train)

predictions = model.predict(X_test)

In [None]:
#Fraction of test labels predicted exactly.
print(accuracy_score(y_test, predictions))

In [None]:
#Per-class metrics report for the test set.
print(classification_report(y_test, predictions))

Random Forest Regressor

In [None]:
#Random forest regressor with 50 trees; random_state fixed for reproducibility.
rf = RandomForestRegressor(n_estimators = 50, random_state = 42)
model = rf.fit(X_train, y_train)

predictions = rf.predict(X_test)

In [None]:
# Absolute error of every prediction.
errors = np.abs(predictions - y_test)
# Mean absolute error, rounded to two decimals.
mae = round(errors.mean(), 2)
# Absolute percentage error per sample; the reported score is 100 minus its mean.
mape = errors / y_test * 100
accuracy = 100 - mape.mean()

In [None]:
#Show 100 minus the mean absolute percentage error.
accuracy

In [None]:
#Show the mean absolute error.
mae

In [None]:
def load_glove_embeddings(path):
    """Read a GloVe-style text file into a DataFrame of word vectors.

    Every line is split on single spaces: the first field is the word and the
    remaining fields are parsed as floats. Lines with exactly two fields are
    skipped (presumably a "<rows> <dimensions>" header line; TODO confirm).

    Working inside a function keeps the temporary word and vector lists out
    of the notebook namespace, so the global `labels` used for star ratings
    elsewhere in this notebook is no longer overwritten.

    Args:
        path: location of the embeddings text file (read as UTF-8).

    Returns:
        A float32 DataFrame indexed by word, one row per vector.
    """
    words = []
    vectors = []
    with open(path, encoding='utf-8') as infile:
        for line in infile:
            items = line.rstrip().split(' ')
            if len(items) == 2:
                continue
            words.append(items[0])
            vectors.append(np.array([float(x) for x in items[1:]], 'f'))
    return pd.DataFrame(np.vstack(vectors), index=words, dtype='f')


word_embeddings = load_glove_embeddings("D:/fyp-data/word_embeddings/glove.6B.300d.txt")

In [None]:
#(number of words, vector dimensions) of the loaded embeddings.
word_embeddings.shape

In [None]:
#Preview the first few word vectors.
word_embeddings.head()

In [None]:
def sentiment_lexicon(filename, encoding='utf-8'):
    """Load a word list from a text file, one entry per line.

    Trailing whitespace is stripped from every line; empty lines and lines
    starting with ';' are skipped.

    Args:
        filename: path of the lexicon file.
        encoding: text encoding used to open the file (default 'utf-8').
            NOTE(review): if a lexicon file is not valid UTF-8, pass its real
            encoding here; confirm against the actual files.

    Returns:
        A list of the remaining lines, in file order.
    """
    lexicon = []
    with open(filename, encoding=encoding) as infile:
        for line in infile:
            line = line.rstrip()
            if line and not line.startswith(';'):
                lexicon.append(line)
    return lexicon


#`load_lexicon` is a second name for the same loader, so calls using either name resolve.
load_lexicon = sentiment_lexicon

#Load the positive and negative word lists with sentiment_lexicon, the loader defined in this notebook.
pos_words = sentiment_lexicon('D:/fyp-data/sentiment_lexicons/positive-words.txt')
neg_words = sentiment_lexicon('D:/fyp-data/sentiment_lexicons/negative-words.txt')

In [None]:
#Look up the embedding of every lexicon word; rows containing NaN (e.g. words absent from
#word_embeddings) are dropped.
pos_vectors = word_embeddings.reindex(pos_words).dropna()
neg_vectors = word_embeddings.reindex(neg_words).dropna()

In [None]:
#Positive word vectors first, then the negative ones.
vectors = pd.concat([pos_vectors, neg_vectors])
#Target is 1 for every positive word and -1 for every negative word, in the same order as `vectors`.
targets = np.array([1] * len(pos_vectors.index) + [-1] * len(neg_vectors.index))
#The words themselves, in the same order.
labels = [*pos_vectors.index, *neg_vectors.index]

In [None]:
#Split vectors, targets and words together so the three stay aligned; 20% is held out for testing.
train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.2, random_state=42)

In [None]:
#Linear classifier trained with SGD on the lexicon word vectors (targets 1 / -1).
#NOTE(review): recent scikit-learn releases spell this loss 'log_loss' and reject 'log';
#confirm against the installed version.
#NOTE(review): words_to_sentiment reads the global `model`, and other cells in this notebook
#also assign to `model`; this cell must be the last one to have set it before scoring.
model = SGDClassifier(loss='log', random_state=0, max_iter=200)
model.fit(train_vectors, train_targets)


In [None]:
#Accuracy of `model` on the held-out lexicon words.
accuracy_score(model.predict(test_vectors), test_targets)

In [None]:
#Random forest with 100 trees fitted on the same lexicon word vectors, for comparison.
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf_model = rf.fit(train_vectors, train_targets)

In [None]:
#Accuracy of the random forest on the held-out lexicon words.
accuracy_score(rf_model.predict(test_vectors), test_targets)

In [None]:
def words_to_sentiment(words, classifier=None, embeddings=None):
    """Score each word that has an embedding.

    Words are looked up in the embedding table; words without a complete
    embedding row are discarded. For the remaining words the score is
    column 1 minus column 0 of ``predict_log_proba``. With the 1 / -1 targets
    used to train ``model`` in this notebook that is log P(positive) minus
    log P(negative), assuming the classifier orders its classes ascending.

    Args:
        words: iterable of word strings.
        classifier: object with a ``predict_log_proba`` method. Defaults to the
            notebook-level ``model``. Pass it explicitly when other cells have
            rebound ``model`` to a different estimator.
        embeddings: DataFrame of word vectors indexed by word. Defaults to the
            notebook-level ``word_embeddings``.

    Returns:
        A DataFrame with one 'sentiment' column, indexed by the words that
        were found. It is empty when none of the words has an embedding, so
        its mean is NaN rather than 0.
    """
    scorer = model if classifier is None else classifier
    table = word_embeddings if embeddings is None else embeddings

    #Find embeddings for each word and discard words that are not in the embedding table.
    found = table.reindex(words).dropna()

    if found.shape[0] == 0:
        #Nothing to score: return an empty, correctly typed frame.
        scores = np.array([], dtype='float64')
    else:
        log_proba = scorer.predict_log_proba(found)
        scores = log_proba[:, 1] - log_proba[:, 0]

    return pd.DataFrame({'sentiment': scores}, index=found.index)

In [None]:
# Show examples from the test set
words_to_sentiment(test_labels).iloc[:15]

In [None]:
def get_sentiment_score(text):
    """Return the mean word sentiment of ``text``.

    ``text`` is handed unchanged to words_to_sentiment; the result is the mean
    of the 'sentiment' column it returns. When no word could be scored that
    column is empty and the mean is NaN.
    """
    return words_to_sentiment(text)['sentiment'].mean()

In [None]:
#Example: score a short, already tokenised sentence.
#NOTE(review): review text is lower-cased before scoring elsewhere in this notebook, whereas
#'These' is capitalised here; confirm the embedding vocabulary contains it.
get_sentiment_score(['These', 'headphones', 'are', 'awful'])