In [None]:
# Use amazon api for getting products and reviews
import requests, json, math, os

# Endpoints and host header value of the RapidAPI service used for scraping.
PRODUCT_SEARCH_URL = "https://amazon-product-reviews-keywords.p.rapidapi.com/product/search"
PRODUCT_REVIEWS_URL = "https://amazon-product-reviews-keywords.p.rapidapi.com/product/reviews"
API_HOST = "amazon-product-reviews-keywords.p.rapidapi.com"
# SECURITY: the API key must never be committed with the notebook. It is read
# from the RAPIDAPI_KEY environment variable; set it before running this cell
# (e.g. `os.environ["RAPIDAPI_KEY"] = getpass.getpass()` in a scratch cell).
# The key that used to be hard-coded here should be treated as leaked and rotated.
# With no key set the value is an empty string and the API calls will be rejected.
API_KEY = os.environ.get("RAPIDAPI_KEY", "")

# Search results whose title starts with this prefix are adverts and are skipped.
AD_PREFIX = "Sponsored Ad"
# Stop requesting further review pages for a brand once this many reviews were collected.
BRAND_REVIEWS_TARGET = 10000

def get_products_page(brand, page):
  """Request one page of product-search results for a keyword.

  Args:
    brand: search keyword sent as the `keyword` query parameter.
    page: page number sent as the `page` query parameter.

  Returns:
    The response body decoded from JSON. The HTTP status is not inspected,
    so an error payload is returned as-is.
  """
  query = dict(keyword=brand, country="GB", category="aps", page=page)
  auth_headers = {
      'x-rapidapi-host': API_HOST,
      'x-rapidapi-key': API_KEY}
  raw_response = requests.request(
      "GET", PRODUCT_SEARCH_URL, headers=auth_headers, params=query)
  payload = json.loads(raw_response.text)
  return payload

def filter_out_ads_and_parse_products(page, brand, index):
  """Reduce a search-result page to its non-advert products.

  Args:
    page: decoded search response; must contain a "products" list whose
      items have "title" and "asin" keys.
    brand: unused; kept so existing callers keep working.
    index: unused; kept so existing callers keep working.

  Returns:
    A list of {"asin": ..., "name": ...} dicts, one per product whose title
    does not start with AD_PREFIX, in the order they appear on the page.
  """
  return [
      {"asin": item["asin"], "name": item["title"]}
      for item in page["products"]
      # adverts are not the products we are looking for
      if not item["title"].startswith(AD_PREFIX)]

def get_brand_to_products(brands=None):
  """Collect the non-advert search results for every brand keyword.

  Pages are requested one after another, starting at page 1, until the
  response has no "products" key or its "products" list is empty.

  Args:
    brands: optional iterable of search keywords. Defaults to the five
      smartphone brands this notebook was written for.

  Returns:
    A dict mapping each brand to its own list of {"asin", "name"} dicts.
  """
  if brands is None:
    brands = ["Apple Iphone", "Samsung Galaxy", "Google pixel", "Oppo", "Huawei Smartphone"]
  else:
    brands = list(brands)

  # One fresh list per brand: dict.fromkeys(brands, list()) would make every
  # brand share the very same list object.
  brand_to_products = {brand: [] for brand in brands}

  for brand in brands:
    page_index = 1
    while True:
      current_page = get_products_page(brand, page_index)
      # An empty "products" list also ends the pagination; testing only for
      # the key would keep requesting pages forever in that case.
      if "products" not in current_page or len(current_page["products"]) == 0:
        break
      current_page_products = filter_out_ads_and_parse_products(current_page, brand, page_index)
      brand_to_products[brand].extend(current_page_products)

      page_index += 1

  return brand_to_products

def get_product_reviews(asin, page):
  """Request one page of reviews for a product.

  Args:
    asin: product identifier sent as the `asin` query parameter.
    page: page number sent as the `page` query parameter.

  Returns:
    The response body decoded from JSON. The HTTP status is not inspected,
    so an error payload is returned as-is.
  """
  query = dict(asin=asin, page=page, country="GB", variants="1")
  auth_headers = {
      'x-rapidapi-host': API_HOST,
      'x-rapidapi-key': API_KEY}
  raw_response = requests.request(
      "GET", PRODUCT_REVIEWS_URL, headers=auth_headers, params=query)
  payload = json.loads(raw_response.text)
  return payload

def parse_reviews(page, asin, name, index, brand):
  """Turn a review page into flat review records.

  Args:
    page: decoded reviews response; must contain a "reviews" list whose
      items have "review", "id", "title" and "rating" keys.
    asin: stored on every record as "product_asin".
    name: stored on every record as "product_name".
    index: unused; kept so existing callers keep working.
    brand: unused; kept so existing callers keep working.

  Returns:
    A list with one dict per review, in page order.
  """
  copied_fields = ("review", "id", "title", "rating")
  parsed = []
  for entry in page["reviews"]:
    record = {field: entry[field] for field in copied_fields}
    record["product_asin"] = asin
    record["product_name"] = name
    parsed.append(record)
  return parsed


def get_reviews(brand_to_products):
  """Download reviews for every product of every brand.

  For each product, review pages are requested from page 1 upwards until a
  page has no "reviews" key or an empty "reviews" list. No further pages are
  requested for a brand once BRAND_REVIEWS_TARGET reviews were collected for
  it; the page that crosses the target is kept whole.

  Args:
    brand_to_products: dict mapping a brand to a list of dicts with "asin"
      and "name" keys.

  Returns:
    One flat list with the parsed reviews of all brands.
  """
  collected = []
  for brand, brand_products in brand_to_products.items():
    brand_total = 0

    for item in brand_products:
      page_number = 1
      while brand_total < BRAND_REVIEWS_TARGET:
        review_page = get_product_reviews(item["asin"], page_number)
        has_reviews = "reviews" in review_page and len(review_page["reviews"]) > 0
        if not has_reviews:
          break
        page_reviews = parse_reviews(review_page, item["asin"], item["name"], page_number, brand)
        collected += page_reviews

        brand_total += len(page_reviews)
        page_number += 1

  return collected

brand_to_products = get_brand_to_products()
reviews = get_reviews(brand_to_products)

# Each review is stored as "<review id>.txt" with exactly five lines:
# product name, product asin, rating, title, review text.
reviews_path = '/content/drive/MyDrive/reviews'
os.makedirs(reviews_path, exist_ok=True)

def _single_line(value):
  """Render `value` as text with every line break replaced by a space.

  The later cells address the fields by line number (lines 0-4), so a line
  break inside a title or review would shift or truncate the fields.
  """
  return " ".join(str(value).splitlines())

for review in reviews:
  file_name = "%s.txt" %review["id"]
  complete_path_name = os.path.join(reviews_path, file_name)

  file_content = "%s\n%s\n%s\n%s\n%s\n" %(
      _single_line(review["product_name"]),
      _single_line(review["product_asin"]),
      _single_line(review["rating"]),
      _single_line(review["title"]),
      _single_line(review["review"]))
  # `with` closes the file even when the write fails.
  with open(complete_path_name, "w") as review_file:
    review_file.write(file_content)

In [None]:
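# The original idea was to keep only the reviews that compare two phones
# (i.e. that mention at least two distinct products), but in the end all
# reviews were used: the later cells in this notebook read from `reviews`,
# not from `chosen_reviews`.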
!python -m spacy download en_core_web_lg
!python -m spacy validate

import shutil, spacy, os, random
from spacy.util import minibatch, compounding
from pathlib import Path

def should_include_review(entities):
  """Tell whether a review mentions at least two different products.

  Args:
    entities: iterable of named entities exposing `.label_` and `.text`
      (the caller passes `doc.ents`).

  Returns:
    True when two or more distinct texts are labelled "PRODUCT", else False.
  """
  # Use the argument rather than a global `doc`, so the result depends only
  # on what the caller passed in.
  unique_product_names = {
      entity.text for entity in entities if entity.label_ == "PRODUCT"}
  return len(unique_product_names) >= 2

reviews_directory = '/content/drive/MyDrive/reviews'
chosen_reviews_directory = '/content/drive/MyDrive/chosen_reviews'
nlp = spacy.load("en_core_web_lg")

# shutil.copy needs the destination directory to exist.
os.makedirs(chosen_reviews_directory, exist_ok=True)

for filename in os.listdir(reviews_directory):
  maybe_file = os.path.join(reviews_directory, filename)
  if os.path.isfile(maybe_file):
    # `with` closes every review file; the loop used to leave one open
    # handle behind per review.
    with open(maybe_file, 'r') as file_content:
      file_lines = file_content.readlines()
    # Lines 3 and 4 are the title and the review text.
    # NOTE(review): the title line still carries its trailing newline, so the
    # '.' lands at the start of the next line — confirm this is intended.
    review = '%s.%s' %(file_lines[3], file_lines[4])

    doc = nlp(review)

    # Copy the review when it names at least two distinct products.
    if should_include_review(doc.ents):
      review_to_include = os.path.join(chosen_reviews_directory, filename)
      shutil.copy(maybe_file, review_to_include)

In [None]:
# Preprocess the reviews by adding a space after a dot if the following character is a letter.
# This will help the sentence tokenisation
def preprocess(review):
  """Insert a space after every '.' that is directly followed by a letter.

  Args:
    review: the review text.

  Returns:
    The text with "word.Word" turned into "word. Word". A '.' followed by
    anything else (digit, space, another '.', end of text) is left alone.
  """
  # Build the result in one pass over the original text. Growing the string
  # while looping over its original length left the tail unscanned, and
  # peeking at index+1 raised IndexError when the text ended with '.'.
  pieces = []
  last_index = len(review) - 1
  for index, char in enumerate(review):
    pieces.append(char)
    if char == '.' and index < last_index and review[index + 1].isalpha():
      pieces.append(' ')
  return ''.join(pieces)

reviews_directory = '/content/drive/MyDrive/reviews'
preprocessed_directory = '/content/drive/MyDrive/preprocessed_reviews'

# Writing below fails if the output directory is missing.
os.makedirs(preprocessed_directory, exist_ok=True)

for filename in os.listdir(reviews_directory):
  maybe_file = os.path.join(reviews_directory, filename)
  if os.path.isfile(maybe_file):
    # `with` closes the handles even when reading or writing fails.
    with open(maybe_file, 'r') as review_file:
      file_lines = review_file.readlines()

    # Line 4 is the review text; the four lines before it are copied unchanged.
    review = file_lines[4]
    preprocessed_review = preprocess(review)

    file_content = "%s%s%s%s%s" %(file_lines[0], file_lines[1], file_lines[2], file_lines[3], preprocessed_review)

    preprocessed_review_path = os.path.join(preprocessed_directory, filename)
    with open(preprocessed_review_path, 'w') as review_file:
      review_file.write(file_content)


In [None]:
!pip install transformers

In [None]:
# This cell does sentiment analysis at a sentence level
# and builds a json file for each review consisting of the sentiments found
import torch, nltk, json
nltk.download('punkt')
from transformers import pipeline

classifier = pipeline("sentiment-analysis")

preprocessed_directory = '/content/drive/MyDrive/preprocessed_reviews'
reviews_with_sentiments_directory = '/content/drive/MyDrive/sentimental_reviews'

# Writing below fails if the output directory is missing.
os.makedirs(reviews_with_sentiments_directory, exist_ok=True)

for filename in os.listdir(preprocessed_directory):
  maybe_file = os.path.join(preprocessed_directory, filename)
  if os.path.isfile(maybe_file):
    with open(maybe_file, 'r') as review_file:
      file_lines = review_file.readlines()

    # Lines 0-4 are: product name, asin, rating, title, review text.
    review_json = {
        'name': file_lines[0].replace('\n', ''),
        'asin': file_lines[1].replace('\n', ''),
        'rating': int(file_lines[2].replace('\n', '')),
        'title': file_lines[3].replace('\n', ''),
        'review': file_lines[4].replace('\n', ''),
        'sentiment_to_sentences': {'POSITIVE': [], 'NEUTRAL': [], 'NEGATIVE': []}
    }

    review_sentences = nltk.tokenize.sent_tokenize(review_json['review'])

    for sentence in review_sentences:
      sentiment_result = classifier(sentence)[0]

      # if the certainty is not above 75% then assign a NEUTRAL sentiment
      if sentiment_result['score'] > 0.75:
        label = sentiment_result['label']
      else:
        label = 'NEUTRAL'

      review_json['sentiment_to_sentences'][label].append(sentence)

    # Write the JSON once per review, after all sentences are classified.
    # Inside the sentence loop the file was rewritten for every sentence and a
    # review without any sentence produced no file, so its rating was lost.
    review_json_path = os.path.join(reviews_with_sentiments_directory, '%s.json' %filename.split('.')[0])
    with open(review_json_path, 'w') as review_json_file:
      json.dump(review_json, review_json_file)



In [None]:
import os, nltk
import pandas as pd

from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')


from nltk.corpus import wordnet
nltk.download('wordnet')

from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from nltk.corpus import stopwords
nltk.download('stopwords')
from itertools import chain

In [None]:
# This cell prepares the data for training the LDA model for topic extraction
reviews = []
preprocessed_directory = '/content/drive/MyDrive/preprocessed_reviews'
preprocessed_files = os.listdir(preprocessed_directory)
for filename in preprocessed_files:
  review_file_path = os.path.join(preprocessed_directory, filename)
  # Skip sub-directories, as the other cells that walk these folders do.
  if not os.path.isfile(review_file_path):
    continue
  with open(review_file_path, 'r') as review_file:
    file_lines = review_file.readlines()

  # Line 4 is the review text.
  review_text = file_lines[4]
  reviews.append(review_text)

# One row per review; each column below is derived from the previous one:
# review -> sentences -> word tokens per sentence -> (token, POS tag) pairs per sentence.
data = pd.DataFrame(reviews, columns=['reviews'])
data['sentences'] = data['reviews'].apply(sent_tokenize)
data['tokens_sentences'] = data['sentences'].apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])
data['tokens_with_pos'] = data['tokens_sentences'].apply(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])

# Map a Treebank POS tag to a WordNet POS constant; only adjectives and nouns
# are mapped, every other tag yields ''.
def get_pos(treebank_tag):
    """Return wordnet.ADJ for 'J...' tags, wordnet.NOUN for 'N...' tags, else ''."""
    initial = treebank_tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'N':
        return wordnet.NOUN
    return ''

# Lemmatise the tokens tagged as adjective or noun; all other tokens are kept unchanged.
def _lemmatise_tagged_token(tagged_token):
  """Lemmatise one (token, treebank_tag) pair, looking the POS up only once."""
  pos = get_pos(tagged_token[1])
  return lemmatizer.lemmatize(tagged_token[0], pos) if pos != '' else tagged_token[0]

data['lemmatised_tokens'] = data['tokens_with_pos'].apply(
    lambda tokens_with_pos: [[_lemmatise_tagged_token(el) for el in token_pos] for token_pos in tokens_with_pos])

# Remove stopwords
custom_stopwords = ['can', 'come', 'get', 'go', 'know', 'like', 'make', 'may', 'need', 'say', 'see', 'take', 'use', 'want', 'would',
                    'good', 'really', 'new', 'bad', 'even', 'also', 'well', 'great', 'excellent', 
                    'fantastic', 'happy', 'however', 'easy', 'nice', 'perfect', 'love', 'samsung',
                    'one', 'also', 'ok', 'amazing', 'brilliant', 'best', 'smart', 'awesome', 'liked', 'pleased']
all_stopwords = stopwords.words('english') + custom_stopwords
# Membership is tested once per token, so look words up in a set instead of
# scanning the whole list every time.
stopword_lookup = frozenset(all_stopwords)
# Flatten the per-sentence token lists into one token list per review.
data['tokens'] = data['lemmatised_tokens'].map(lambda sentences: list(chain.from_iterable(sentences)))
# Keep lower-cased, purely alphabetic tokens longer than one character that are not stopwords.
data['tokens'] = data['tokens'].map(lambda tokens: [token.lower() for token in tokens if token.isalpha() and token.lower() not in stopword_lookup and len(token)>1])

In [None]:
# Train the LDA model for topic extraction
from gensim.models import Phrases
from gensim import corpora
from gensim import models
import numpy as np

tokens = data['tokens'].tolist()
# Detect frequent word pairs, then run phrase detection again on the result
# to pick up three-word phrases.
bigram_model = Phrases(tokens)
trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(trigram_model[bigram_model[tokens]])


dictionary_LDA = corpora.Dictionary(tokens)
# Drop tokens that occur in fewer than 3 reviews.
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]


# The topic ids produced here are mapped to aspects by hand in a later cell,
# so the model has to come out the same on every run. Passing the seed to the
# model itself keeps it independent of NumPy's global generator state.
LDA_SEED = 123456
np.random.seed(LDA_SEED)
num_topics = 10
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary_LDA, passes=4, alpha=[0.01]*num_topics, eta=[0.01]*len(dictionary_LDA.keys()), random_state=LDA_SEED)

# Display every topic with its 20 highest-weighted words
for index, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20):
  print('%s: %s' %(str(index), topic))

In [None]:
# This cell initialises the asins final data
import os, json
reviews_with_sentiments_directory = '/content/drive/MyDrive/sentimental_reviews'

# Aspect assigned to each LDA topic id; the position in the list is the topic id.
topic_id_to_aspect = dict(enumerate([
    'overall',                         # 0
    'battery',                         # 1
    'camera',                          # 2
    'overall',                         # 3
    'value for price',                 # 4
    'service & warranty',              # 5
    'shopping experience / delivery',  # 6
    'screen',                          # 7
    'brand & os',                      # 8
    'screen',                          # 9
]))

def init_aspect_to_sentiments():
  """Return a fresh aspect -> {sentiment: 0} table with all counters at zero.

  Every call builds new dicts, so the result can be mutated freely.
  """
  aspects = (
      'overall',
      'battery',
      'camera',
      'screen',
      'value for price',
      'service & warranty',
      'brand & os',
      'shopping experience / delivery')
  sentiments = ('POSITIVE', 'NEUTRAL', 'NEGATIVE')
  return {aspect: dict.fromkeys(sentiments, 0) for aspect in aspects}

def get_all_asins(directory=None):
  """Collect the distinct product asins found in a folder of review JSON files.

  Args:
    directory: folder to scan. Defaults to the module-level
      `reviews_with_sentiments_directory`.

  Returns:
    A set with the 'asin' value of every JSON file directly inside the folder.
  """
  if directory is None:
    directory = reviews_with_sentiments_directory

  asins = set()
  for filename in os.listdir(directory):
    review_file_path = os.path.join(directory, filename)
    # Skip sub-directories, as the other cells that walk these folders do.
    if not os.path.isfile(review_file_path):
      continue
    # `with` closes the handle even when the JSON cannot be parsed.
    with open(review_file_path, 'r') as review_file:
      review_dict = json.load(review_file)
    asins.add(review_dict['asin'])

  return asins

# One empty record per product: no name yet, no ratings, all sentiment counters at zero.
asins = get_all_asins()
asin_to_data = {}
for asin in asins:
  asin_to_data[asin] = dict(
      asin=asin,
      name='',
      ratings=[],
      aspect_to_sentiments=init_aspect_to_sentiments())


In [None]:
# This cell builds the final per-phone data: it takes the sentences already
# labelled with a sentiment and works out which aspect each of them describes
import json, os

reviews_with_sentiments_directory = '/content/drive/MyDrive/sentimental_reviews'

def build_aspect_to_sentiments(sentiment_to_sentences):
  """Count, per aspect, how many sentences of each sentiment a review has.

  Each sentence is assigned to the LDA topic with the highest score, and
  that topic id is translated to an aspect through `topic_id_to_aspect`.

  Args:
    sentiment_to_sentences: dict mapping a sentiment label to the list of
      sentences that received that label.

  Returns:
    An aspect -> {sentiment: count} table as created by
    `init_aspect_to_sentiments`.
  """
  aspect_to_sentiments = init_aspect_to_sentiments()
  for sentiment, sentences in sentiment_to_sentences.items():
    for sentence in sentences:
      # The dictionary was built from lower-cased tokens, so "Battery" would
      # not be found unless the sentence is lower-cased as well.
      tokens = [token.lower() for token in word_tokenize(sentence)]
      topic_scores = lda_model[dictionary_LDA.doc2bow(tokens)]
      if not topic_scores:
        # The model reported no topic for this sentence; nothing to count.
        continue
      # max() returns the first of several equally scored topics, the same
      # one a stable descending sort would put first.
      topic_id = max(topic_scores, key=lambda result: result[1])[0]
      aspect = topic_id_to_aspect[topic_id]

      aspect_to_sentiments[aspect][sentiment] += 1

  return aspect_to_sentiments

# NOTE: this loop accumulates into `asin_to_data` in place, so running it a
# second time without re-running the cell that initialises `asin_to_data`
# counts every review twice.
reviews_with_sentiments_files = os.listdir(reviews_with_sentiments_directory)
for filename in reviews_with_sentiments_files:
  review_file_path = os.path.join(reviews_with_sentiments_directory, filename)
  # `with` closes the handle even when the JSON cannot be parsed.
  with open(review_file_path, 'r') as review_file:
    review_dict = json.load(review_file)

  review_aspect_to_sentiments = build_aspect_to_sentiments(review_dict['sentiment_to_sentences'])
  asin = review_dict['asin']

  asin_to_data[asin]['ratings'].append(int(review_dict['rating']))
  asin_to_data[asin]['name'] = review_dict['name']

  # Add this review's per-aspect sentiment counts to the product's totals.
  for aspect, aspect_sentiments in review_aspect_to_sentiments.items():
    for sentiment, sentence_count in aspect_sentiments.items():
      asin_to_data[asin]['aspect_to_sentiments'][aspect][sentiment] += sentence_count

# Finally store the data into json files that are going to be read by the web platform
asin_data_directory = '/content/drive/MyDrive/asin_data'
# Writing below fails if the output directory is missing.
os.makedirs(asin_data_directory, exist_ok=True)
for asin in asin_to_data:
  ratings = asin_to_data[asin]['ratings']
  # Mean rating rounded to two decimals. A product without any rating gets
  # None (JSON null) instead of raising ZeroDivisionError.
  asin_to_data[asin]['rating'] = round(sum(ratings) / len(ratings), 2) if ratings else None

  asin_data_json_path = os.path.join(asin_data_directory, '%s.json' %asin)
  with open(asin_data_json_path, 'w') as asin_data_json_file:
    json.dump(asin_to_data[asin], asin_data_json_file)


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): every other cell in this notebook reads or writes below
# /content/drive/MyDrive, yet this cell comes last. Presumably it has to be
# run before all of them on a fresh runtime — consider moving it to the top.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
