In [None]:
import csv
import random
import numpy as np # Linear algebra.
import pandas as pd # Data processing.
import itertools #Used in plot_confusion_matrix
from nltk.tokenize import RegexpTokenizer

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.linear_model import SGDClassifier, LogisticRegression, LinearRegression
from sklearn import neighbors
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, mean_squared_error
from gensim import models

import matplotlib.pyplot as plt
import seaborn as sns
import codecs

In [None]:
# Draw a small random sample from each raw Amazon review dump and cache it as CSV,
# so later cells can work from the small files instead of the full TSVs.

# Seed the sampler so that re-running this cell selects the same rows.
random.seed(42)

AMAZON_REVIEWS_DIR = "D:/fyp-data/amazon_reviews"

# (category as spelled in the TSV file name, output CSV, probability of keeping a data row)
# NOTE(review): "autmotive_reviews.csv" looks like a typo for "automotive"; the name is
# kept unchanged in case another notebook already reads that file.
SAMPLE_SPECS = [
    ("Books", "book_reviews.csv", 0.0001),
    ("Music", "music_reviews.csv", 0.0002),
    ("Electronics", "electronics_reviews.csv", 0.0004),
    ("Kitchen", "kitchen_reviews.csv", 0.0003),
    ("Automotive", "autmotive_reviews.csv", 0.00045),
]


def sample_reviews(category, keep_probability):
    """Read a random subset of one category's review TSV.

    Parameters
    ----------
    category : str
        Category token used in the file name ``amazon_reviews_us_<category>_v1_00.tsv``.
    keep_probability : float
        Chance that any individual data row is kept. Row 0 (the header) is always kept.

    Returns
    -------
    pandas.DataFrame
        Frame with the ``review_body`` and ``star_rating`` columns of the sampled rows.
    """
    path = "%s/amazon_reviews_us_%s_v1_00.tsv" % (AMAZON_REVIEWS_DIR, category)
    return pd.read_csv(
        path,
        usecols=['review_body', 'star_rating'],
        sep='\t',
        # skiprows receives the row number and returns True for rows to skip.
        skiprows=lambda i: i > 0 and random.random() > keep_probability,
    )


for category, out_csv, keep_probability in SAMPLE_SPECS:
    reviews = sample_reviews(category, keep_probability)
    # index=False: otherwise the row index is written and reappears as an extra
    # unnamed column when the CSV is read back.
    reviews.to_csv(out_csv, index=False)

In [None]:
# Load the cached Books sample (book_reviews.csv is written by the sampling cell above).
reviews = pd.read_csv("book_reviews.csv")

In [None]:
# Alternative data source: sample Gift Card reviews straight from the raw TSV.
# The header row (i == 0) is always kept; each data row is kept with probability 0.001.
# Running this cell replaces the `reviews` frame loaded from book_reviews.csv.
reviews = pd.read_csv("D:/fyp-data/amazon_reviews/amazon_reviews_us_Gift_Card_v1_00.tsv",
                      usecols=['review_body', 'star_rating'], sep='\t', skiprows=lambda i: i>0 and random.random() > 0.001)

In [None]:
# Convert all review bodies from object to string.
# Missing bodies are read as NaN; blank them first, otherwise astype(str) turns
# them into the literal text "nan", which would later be tokenized as a real word.
reviews["review_body"] = reviews["review_body"].fillna("").astype(str)

In [None]:
def standardize_text(df, text_field):
    """Clean and lower-case a text column of ``df`` in place.

    Applied in order: remove URLs (``http`` followed by non-whitespace), remove
    any leftover ``http``, remove ``@mentions``, replace every character outside
    the allowed set (letters, digits, ``(),!?@'`"_`` and newline) with a space,
    then lower-case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    text_field : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    substitutions = [
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
    ]
    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        # regex=True is required: pandas 2.x treats the pattern as a literal
        # string by default, which would silently leave the text uncleaned.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    df[text_field] = cleaned.str.lower()
    return df

# Clean the review text; standardize_text modifies the frame it is given and returns that same frame.
reviews = standardize_text(reviews, "review_body")

In [None]:
# Split every cleaned review into word tokens (maximal runs of \w characters).
tokenizer = RegexpTokenizer(r"\w+")
reviews["tokens"] = reviews["review_body"].map(tokenizer.tokenize)

In [None]:
# Number of tokens in each review.
reviews["sentence_length"] = list(map(len, reviews["tokens"]))

In [None]:
# Class balance: non-null entries per column for each star rating.
reviews.groupby("star_rating").count()

In [None]:
# Summary statistics of the frame's columns.
reviews.describe()

In [None]:
# Peek at the last rows to sanity-check the cleaning and tokenization.
reviews.tail()

In [None]:
# Snapshot the processed frame to disk.
# NOTE(review): the target name has no ".csv" extension — confirm this is intended.
reviews.to_csv("test_csv")

In [None]:
#TODO:
# NOTE(review): get_sentiment_score is defined in the last cell of this notebook and
# relies on the `embeddings` and `model` objects built in cells further down; on a
# fresh kernel this cell cannot run until those cells have been executed.
reviews["sentiment_score"] = reviews["tokens"].apply(get_sentiment_score)

In [None]:
# Corpus-level statistics: per-review token counts, the flat token stream, and
# the sorted set of distinct tokens.
sentence_lengths = list(map(len, reviews["tokens"]))
all_words = list(itertools.chain.from_iterable(reviews["tokens"]))
VOCAB = sorted(set(all_words))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))

In [None]:
# Histogram (no density curve) with rug marks of the number of tokens per review.
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm against the
# installed version before upgrading.
sns.distplot(sentence_lengths, kde=False, rug=True);

In [None]:
#Scaling
# NOTE(review): X_train is first assigned by the train/test split in the next cell,
# where it holds raw review strings. As ordered, this cell cannot run on a fresh
# kernel; it looks like a placeholder for scaling numeric features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
#TODO: with sentiment score

In [None]:
# Model inputs are the cleaned review strings; targets are the star ratings.
features = reviews["review_body"].tolist()
labels = reviews["star_rating"].tolist()
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [None]:
def get_bow_embeddings(data):
    """Fit a fresh CountVectorizer on ``data`` and encode it.

    Returns a ``(matrix, vectorizer)`` pair: the result of ``fit_transform`` on
    ``data`` and the fitted vectorizer, so the same vocabulary can be applied to
    other text with ``vectorizer.transform``.
    """
    bow_vectorizer = CountVectorizer()
    return bow_vectorizer.fit_transform(data), bow_vectorizer

In [None]:
def tfidf(data):
    """Fit a fresh TfidfTransformer on ``data`` and re-weight it.

    Returns a ``(matrix, transformer)`` pair: the result of ``fit_transform`` on
    ``data`` and the fitted transformer, which can re-weight further data via
    ``transformer.transform``.
    """
    transformer = TfidfTransformer()
    weighted = transformer.fit_transform(data)
    return weighted, transformer

In [None]:
# Fit the bag-of-words vectorizer on the training text only, then encode the test text with it.
# NOTE: X_train / X_test are overwritten here, so the split cell must be re-run
# before this cell can be executed a second time.
X_train, count_vectorizer = get_bow_embeddings(X_train)
X_test = count_vectorizer.transform(X_test)

In [None]:
# Re-weight the training matrix with TF-IDF (fitted on train only) and apply the same
# weighting to the test matrix. Presumably meant to run after the bag-of-words cell,
# since X_train / X_test are overwritten again here.
X_train, vectorizer = tfidf(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Load the pretrained GoogleNews word2vec vectors in binary word2vec format.
# NOTE(review): the path is a remote URL, so this presumably downloads the archive on
# every run — consider caching it locally.
vectors = models.KeyedVectors.load_word2vec_format("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz", binary=True)

In [None]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the embedding vectors of the tokens in one document.

    Parameters
    ----------
    tokens_list : sequence of str
        Tokens of a single document.
    vector : mapping
        Anything supporting ``token in vector`` and ``vector[token]``.
    generate_missing : bool
        If true, tokens absent from ``vector`` contribute a random vector drawn
        with ``np.random.rand(k)``; otherwise they contribute zeros.
    k : int
        Length of the filler vectors, and of the zero vector returned for an
        empty token list.

    Returns
    -------
    numpy.ndarray
        Element-wise mean over all tokens. Missing tokens count towards the
        denominator.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    make_filler = np.random.rand if generate_missing else np.zeros
    word_vectors = []
    for token in tokens_list:
        word_vectors.append(vector[token] if token in vector else make_filler(k))

    return np.sum(word_vectors, axis=0) / len(word_vectors)

def get_word2vec_embeddings(vectors, reviews, generate_missing=False):
    """Build one averaged word vector per review.

    ``reviews['tokens']`` is read row by row and each token list is passed to
    ``get_average_word2vec`` together with ``vectors`` and ``generate_missing``.
    Returns a plain list of the resulting vectors, in row order.
    """
    return [
        get_average_word2vec(tokens, vectors, generate_missing=generate_missing)
        for tokens in reviews['tokens']
    ]

In [None]:
# Alternative feature set: one averaged word2vec vector per review.
# `labels` is reused from the earlier split cell, and `vectors` must still be the
# word2vec model (a later cell rebinds that name to a DataFrame).
# NOTE(review): this split uses random_state=40 while the text split uses 42.
features = get_word2vec_embeddings(vectors, reviews)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=40)

In [None]:
def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Returns the tuple ``(accuracy, precision, recall, f1)``. Precision, recall
    and F1 are requested from scikit-learn with ``average='weighted'`` and
    ``pos_label=None``.
    """
    weighted = dict(pos_label=None, average='weighted')
    return (
        accuracy_score(y_test, y_predicted),
        precision_score(y_test, y_predicted, **weighted),
        recall_score(y_test, y_predicted, **weighted),
        f1_score(y_test, y_predicted, **weighted),
    )

In [None]:
def print_metrics(y_test, y_predicted):
    """Print accuracy, precision, recall and F1 (three decimals) on one line."""
    accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted)
    scores = (accuracy, precision, recall, f1)
    print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % scores)

In [None]:
# Linear regression: treats the star rating as a continuous target.
lr = LinearRegression()
model = lr.fit(X_train, y_train)

predicted = model.predict(X_test)

# A notebook cell only displays its last expression, so the MSE used to be
# computed and then dropped. Print it explicitly.
mse = mean_squared_error(y_test, predicted)
print("mean squared error = %.3f" % mse)

# The regressor's score on the held-out set, shown as the cell output.
model.score(X_test,y_test)

In [None]:
#Logistic regression
# Multinomial logistic regression with balanced class weights.
# Note: despite the variable name `regressor`, this is a classifier.
# NOTE(review): the `multi_class` argument is deprecated in recent scikit-learn
# releases — confirm against the installed version.
regressor = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', multi_class='multinomial',
                         n_jobs=-1, random_state=40)
model = regressor.fit(X_train, y_train)

predicted = regressor.predict(X_test)

print_metrics(y_test, predicted)

In [None]:
#K-nearest neighbours
# Classify each test review by its 6 nearest training neighbours.
classifier = neighbors.KNeighborsClassifier(n_neighbors = 6)
model = classifier.fit(X_train, y_train)

predicted = model.predict(X_test)
print_metrics(y_test, predicted)

In [None]:
#Support Vector Machines
# Support vector classifier with a linear kernel.
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

#Predict the response for test dataset
predicted = clf.predict(X_test)
print_metrics(y_test, predicted)

In [None]:
#Random Forest Classifier
# 50 trees; the fixed seed makes the fitted forest repeatable.
rf = RandomForestClassifier(n_estimators = 50, random_state = 42)
model = rf.fit(X_train, y_train)

predicted = model.predict(X_test)
print_metrics(y_test, predicted)

In [None]:
#Random Forest Regressor
# Treats the star rating as a continuous target. Rebinds `rf`, which the
# classifier cell above also assigns.
rf = RandomForestRegressor(n_estimators = 50, random_state = 42)
model = rf.fit(X_train, y_train)

# The next cell calls rf.predict again and works from its own `predictions`.
predicted = model.predict(X_test)

In [None]:
# Score `rf` on the held-out set. `rf` is whichever random-forest cell ran last;
# this is presumably meant to follow the regressor cell.
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

In [None]:
# Calculate mean absolute percentage error (MAPE)
# Uses `errors` from the previous cell; assumes no entry of y_test is zero — TODO confirm.
mape = 100 * (errors / y_test)
# Calculate and display accuracy
# "Accuracy" here is defined as 100 minus the mean percentage error, not as a
# classification accuracy.
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
def load_embeddings(filename):
    """Load a space-separated text embedding file into a DataFrame.

    Each data line is ``<label> <v1> <v2> ...``. A line with exactly two fields
    is taken to be a header giving the matrix shape and is skipped, which also
    means one-dimensional embeddings cannot be loaded. Blank lines are ignored.

    Parameters
    ----------
    filename : str
        Path of a UTF-8 encoded embedding file.

    Returns
    -------
    pandas.DataFrame
        One float32 row per label, indexed by label.

    Raises
    ------
    ValueError
        If the file has no data rows, if rows differ in length, or if a value
        cannot be parsed as a float.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            items = line.rstrip().split(' ')
            if items == ['']:
                # Blank line (e.g. at end of file): without this guard it would
                # add an empty row and make np.vstack fail on a shape mismatch.
                continue
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))

    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')

# Load the GloVe word vectors (the file name suggests 300 dimensions) into a
# DataFrame indexed by word.
embeddings = load_embeddings('D:/fyp-data/word_embeddings/glove.6B.300d.txt')

In [None]:
def load_lexicon(filename):
    """Read a word list, one entry per line, from a latin-1 encoded file.

    Trailing whitespace is stripped from every line. Empty lines and lines that
    start with ';' (comments) are dropped. Returns the remaining entries as a
    list, in file order.
    """
    with open(filename, encoding='latin-1') as infile:
        stripped_lines = (raw_line.rstrip() for raw_line in infile)
        return [
            entry
            for entry in stripped_lines
            if entry and not entry.startswith(';')
        ]

# Word lists used as sentiment training labels: positive and negative lexicons.
pos_words = load_lexicon('D:/fyp-data/sentiment_lexicons/positive-words.txt')
neg_words = load_lexicon('D:/fyp-data/sentiment_lexicons/negative-words.txt')

In [None]:
# Look up the embedding of every lexicon word. reindex yields an all-NaN row for a
# word missing from `embeddings`, and dropna then removes those rows.
pos_vectors = embeddings.reindex(pos_words).dropna()
neg_vectors = embeddings.reindex(neg_words).dropna()

In [None]:
# Assemble the word-level sentiment training set: inputs are the word vectors,
# targets are +1 for positive words and -1 for negative words.
# NOTE: `vectors` and `labels` rebind names used earlier in the notebook (the word2vec
# model and the star-rating labels); the earlier cells must be re-run before the
# review-level cells are used again.
vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1 for entry in pos_vectors.index] + [-1 for entry in neg_vectors.index])
labels = list(pos_vectors.index) + list(neg_vectors.index)

In [None]:
# Split vectors, targets and the words themselves consistently, holding out 20% for testing.
train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.2, random_state=42)

In [None]:
# Logistic-loss linear classifier over word vectors (positive vs. negative word).
# `model` is rebound here; vecs_to_sentiment below reads this global.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' — confirm
# against the installed version.
model = SGDClassifier(loss='log', random_state=0, max_iter=50)
model.fit(train_vectors, train_targets)

In [None]:
# Held-out accuracy of the word-sentiment classifier.
# Arguments follow the scikit-learn (y_true, y_pred) convention; accuracy is
# symmetric, so the value is the same as before.
accuracy_score(test_targets, model.predict(test_vectors))

In [None]:
def vecs_to_sentiment(vecs):
    """Score word vectors with the global ``model``.

    Returns, for each row of ``vecs``, the log-probability in column 1 of
    ``model.predict_log_proba`` minus the log-probability in column 0.
    Presumably column 1 is the positive class (+1) and column 0 the negative
    class (-1), so larger values mean more positive — verify against
    ``model.classes_``.
    """
    log_probs = model.predict_log_proba(vecs)
    first_class, second_class = log_probs[:, 0], log_probs[:, 1]
    return second_class - first_class

def words_to_sentiment(words):
    """Look up sentiment scores for the given words.

    Parameters
    ----------
    words : list-like of str
        Words to score. Words with no row in the global ``embeddings`` frame are
        dropped.

    Returns
    -------
    pandas.DataFrame
        A single ``sentiment`` column indexed by the words that were found. The
        frame is empty when none of the words has an embedding.
    """
    vecs = embeddings.reindex(words).dropna()
    if len(vecs) == 0:
        # Nothing to score: return an empty result rather than handing the
        # classifier zero rows, which scikit-learn estimators reject.
        return pd.DataFrame({'sentiment': np.array([], dtype=float)}, index=vecs.index)
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index = vecs.index)

# Show examples from the test set
# (sentiment scores of the first ten held-out lexicon words that have an embedding)
words_to_sentiment(test_labels).iloc[:10]

In [None]:
def get_sentiment_score(text):
    """Mean sentiment of the words in ``text`` that have an embedding.

    ``text`` is passed straight to ``words_to_sentiment``, so despite its name
    it should be a list of tokens rather than a raw string.
    """
    scores = words_to_sentiment(text)['sentiment']
    return scores.mean()