In [50]:
import time
import random
from typing import Optional, List, Dict, Union
from itertools import chain

from selectorlib import Extractor
import requests
import pandas as pd
import plotly.express as px

In [2]:
# Build the selectorlib extractor from the YAML file 'selectors.yml'.
# scrape() reads this module-level object to parse the HTML it downloads.
extractor = Extractor.from_yaml_file('selectors.yml')

In [10]:
def scrape(url: str):
    """Download one page and run the module-level ``extractor`` on its HTML.

    The request is sent with browser-like headers. Any 5xx response is
    treated as "blocked": a message is printed and ``None`` is returned.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The result of ``extractor.extract`` for the page HTML, or ``None``
        when the server answered with a 5xx status code.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests. The timeout keeps a stalled
    # connection from hanging the whole crawl indefinitely.
    print(f"Downloading {url}")
    r = requests.get(url, headers=headers, timeout=30)

    # Simple check for a blocked page (usually 503). The comparison is
    # inclusive so that a plain 500 is not passed on to the extractor.
    if r.status_code >= 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print(f"Page {url} was blocked by Amazon. Please try using better proxies\n")
        else:
            print(f"Page {url} must have been blocked by Amazon as the status code was {r.status_code}")
        return None

    # Pass the HTML of the page to the extractor and return the structured data.
    return extractor.extract(r.text)


def crawl(url: str, n_pages: Optional[int] = None) -> List[Dict[str, str]]:
    """Scrape consecutive pages by following each page's ``next_page`` link.

    Starting at ``url``, every page is fetched with ``scrape``. The crawl
    stops when ``n_pages`` pages have been collected, when a page could not
    be scraped (``scrape`` returned ``None`` or raised), when a page has no
    ``next_page`` link, or when the user interrupts the kernel. Whatever was
    collected up to that point is returned.

    Args:
        url: URL of the first page to scrape.
        n_pages: Maximum number of pages to collect; ``None`` means no limit.

    Returns:
        One entry per successfully scraped page, in crawl order. Failed
        pages are never included, so the list contains no ``None`` entries.
    """
    if n_pages is None:
        n_pages = float('inf')

    pre_url = "https://www.amazon.com"
    all_data = []
    counter = 0

    try:
        while counter < n_pages:
            try:
                data = scrape(url)
            except Exception as exc:
                # Report the failure instead of swallowing it silently, but
                # still return the pages gathered so far.
                print(f"Stopping crawl: scraping {url} failed ({exc!r})")
                break

            if data is None:
                # Blocked page: do not store a None placeholder that would
                # break downstream formatting.
                break

            all_data.append(data)
            counter += 1

            next_page = data.get('next_page')
            if not next_page or counter >= n_pages:
                # Last page reached or page budget used up: no need to wait.
                break

            url = pre_url + next_page
            # Random pause between requests.
            time.sleep(random.randint(2, 10))
    except KeyboardInterrupt:
        # Interrupting the kernel ends the crawl but keeps the partial result.
        print("Crawl interrupted; returning the pages collected so far")

    return all_data


def format_reviews(data: List[Dict[str, str]]) -> pd.DataFrame:
    """Flatten per-page review lists into one DataFrame, one row per review.

    Args:
        data: Page results; each usable entry is a mapping whose
            ``'reviews'`` value is a list of review records. Entries that
            are ``None``/empty, and pages whose ``'reviews'`` value is
            missing or ``None``, are skipped instead of raising.

    Returns:
        A DataFrame built from all review records in page order; empty when
        no page contributes any review.
    """
    per_page_reviews = (page.get('reviews') for page in data if page)
    return pd.DataFrame(list(chain.from_iterable(
        reviews for reviews in per_page_reviews if reviews
    )))

In [4]:
# Start page for the crawl: the product's "all reviews" listing.
URL = "https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/product-reviews/B07YB32H52/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"

In [5]:
# Crawl the review pages starting at URL. n_pages is left at its default (None),
# so there is no page limit; crawl() pauses a random 2-10 seconds between
# requests, which makes this cell slow.
data = crawl(URL)

Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/product-reviews/B07YB32H52/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/product-reviews/B07YB32H52/ref=cm_cr_arp_d_paging_btm_2?ie=UTF8&pageNumber=2&reviewerType=all_reviews
Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/product-reviews/B07YB32H52/ref=cm_cr_arp_d_paging_btm_3?ie=UTF8&pageNumber=3&reviewerType=all_reviews
Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/product-reviews/B07YB32H52/ref=cm_cr_arp_d_paging_btm_4?ie=UTF8&pageNumber=4&reviewerType=all_reviews
Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/product-reviews/B07YB32H52/ref=cm_cr_arp_d_paging_btm_5?ie=UTF8&pageNumber=5&reviewerType=all_reviews
Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/produc

Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/product-reviews/B07YB32H52/ref=cm_cr_arp_d_paging_btm_45?ie=UTF8&pageNumber=45&reviewerType=all_reviews
Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/product-reviews/B07YB32H52/ref=cm_cr_arp_d_paging_btm_46?ie=UTF8&pageNumber=46&reviewerType=all_reviews
Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/product-reviews/B07YB32H52/ref=cm_cr_arp_d_paging_btm_47?ie=UTF8&pageNumber=47&reviewerType=all_reviews
Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/product-reviews/B07YB32H52/ref=cm_cr_arp_d_paging_btm_48?ie=UTF8&pageNumber=48&reviewerType=all_reviews
Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Mechanical-Keyboard/product-reviews/B07YB32H52/ref=cm_cr_arp_d_paging_btm_49?ie=UTF8&pageNumber=49&reviewerType=all_reviews
Downloading https://www.amazon.com/Keychron-Wireless-Bluetooth-Me

In [11]:
# Flatten the per-page 'reviews' lists into a single DataFrame (one row per review).
df = format_reviews(data)

In [18]:
# Persist the scraped reviews next to the notebook.
# NOTE(review): to_csv writes the index by default, which shows up as an extra
# unnamed first column when the file is read back; pass index=False if that
# column is not wanted.
df.to_csv('keychron_K2_reviews.csv')

In [16]:
# Product title as extracted from the first crawled page.
data[0]['product_title']

'Keychron K2 Wireless Bluetooth/USB Wired Gaming Mechanical Keyboard, Compact 84 Keys RGB LED Backlit N-Key Rollover Aluminum Frame for Mac Windows, Gateron Brown Switch, Version 2'

In [17]:
# Display the raw review frame (rendered as a truncated head/tail preview).
df

Unnamed: 0,title,content,date,variant,images,verified,author,rating
0,This is the best compromise I can find,"First off, unlike the other review currently u...","Reviewed in the United States on September 23,...",,[https://images-na.ssl-images-amazon.com/image...,Verified Purchase,Dan,4.0 out of 5 stars
1,better than Gateron Brown -,Althgouh still prone to typos. I am goign to w...,"Reviewed in the United States on September 20,...",,,Verified Purchase,Isra Ignateous,4.0 out of 5 stars
2,"OK for the price, but not ""the one""",This keyboard has a lot to offer in a competit...,"Reviewed in the United States on January 6, 2020",Color: Gateron Red Switch v2,,Verified Purchase,Carl,3.0 out of 5 stars
3,Simple... IT'S AMAZING!,"Well, there's not to much reviews about this k...","Reviewed in the United States on September 25,...",,[https://images-na.ssl-images-amazon.com/image...,Verified Purchase,Arturo,5.0 out of 5 stars
4,I was not expecting to fall in love with the k...,Here's the story from the beginning. When I wa...,"Reviewed in the United States on May 21, 2020",,,Verified Purchase,Christ,5.0 out of 5 stars
...,...,...,...,...,...,...,...,...
568,,Excelente teclado. El Bluetooth se conecta e i...,"Reviewed in Mexico on October 15, 2020",Color: Gateron Red Switch v2,,Verified Purchase,Edwin,
569,,1,"Reviewed in Canada on July 23, 2020",Color: Gateron Red Switch v2,,Verified Purchase,Miguel Wang,
570,,"It's great. Not perfect, but great.","Reviewed in Australia on February 23, 2021",Color: Gateron Red Switch v2,,Verified Purchase,Samuel Nielsen,
571,,Love it,"Reviewed in Australia on August 15, 2020",Color: Gateron Brown Switch v2,,Verified Purchase,wei cui,


In [43]:
def format_rating(rating: str) -> Union[float, None]:
    """Parse the leading number of a rating string such as "4.0 out of 5 stars".

    The first whitespace-separated token is converted with ``float``. The
    text comes from scraped pages, so it is never passed to ``eval``.

    Args:
        rating: Raw rating text; non-string values (e.g. ``None`` or NaN)
            are accepted and yield ``None``.

    Returns:
        The rating as a float, or ``None`` when ``rating`` is not a string,
        is empty/blank, or does not start with a number.
    """
    try:
        return float(rating.split()[0])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: not a string; IndexError: no tokens at all;
        # ValueError: the first token is not numeric.
        return None

In [24]:
# Work on a deep copy so the raw scraped frame `df` is not modified by the
# transformations applied to `df_copy`.
df_copy = df.copy(deep=True)

In [46]:
# Parse the raw "<n> out of 5 stars" strings into numbers. The input is read
# from the untouched `df` rather than from `df_copy`, so re-running this cell
# is idempotent: feeding already-parsed floats back into format_rating would
# turn every rating into None.
df_copy['rating'] = df.rating.apply(format_rating)

Unnamed: 0,title,content,date,variant,images,verified,author,rating
0,This is the best compromise I can find,"First off, unlike the other review currently u...","Reviewed in the United States on September 23,...",,[https://images-na.ssl-images-amazon.com/image...,Verified Purchase,Dan,4.0
1,better than Gateron Brown -,Althgouh still prone to typos. I am goign to w...,"Reviewed in the United States on September 20,...",,,Verified Purchase,Isra Ignateous,4.0
2,"OK for the price, but not ""the one""",This keyboard has a lot to offer in a competit...,"Reviewed in the United States on January 6, 2020",Color: Gateron Red Switch v2,,Verified Purchase,Carl,3.0
3,Simple... IT'S AMAZING!,"Well, there's not to much reviews about this k...","Reviewed in the United States on September 25,...",,[https://images-na.ssl-images-amazon.com/image...,Verified Purchase,Arturo,5.0
4,I was not expecting to fall in love with the k...,Here's the story from the beginning. When I wa...,"Reviewed in the United States on May 21, 2020",,,Verified Purchase,Christ,5.0
...,...,...,...,...,...,...,...,...
568,,Excelente teclado. El Bluetooth se conecta e i...,"Reviewed in Mexico on October 15, 2020",Color: Gateron Red Switch v2,,Verified Purchase,Edwin,
569,,1,"Reviewed in Canada on July 23, 2020",Color: Gateron Red Switch v2,,Verified Purchase,Miguel Wang,
570,,"It's great. Not perfect, but great.","Reviewed in Australia on February 23, 2021",Color: Gateron Red Switch v2,,Verified Purchase,Samuel Nielsen,
571,,Love it,"Reviewed in Australia on August 15, 2020",Color: Gateron Brown Switch v2,,Verified Purchase,wei cui,


In [58]:
# Distribution of the parsed ratings, with a box plot drawn in the margin.
px.histogram(df_copy.rating, marginal='box')

In [64]:
import nltk
import ssl

# Workaround for certificate errors during the NLTK download (presumably a
# local Python install without a configured CA bundle — TODO confirm): use an
# unverified HTTPS context, but only for the duration of the download, so
# certificate verification is restored for any later HTTPS traffic that relies
# on the ssl module's default context.
# NOTE(review): the download itself is still unverified; fixing the local
# certificate store is the safer long-term option.
_default_https_context = ssl._create_default_https_context
try:
    ssl._create_default_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python without the helper: keep the default context.
    pass

try:
    vader_lexicon_ready = nltk.download('vader_lexicon')
finally:
    ssl._create_default_https_context = _default_https_context

# Keep the download status as the cell's displayed value.
vader_lexicon_ready

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/d.e.magno/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [65]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [66]:
# Module-level VADER analyzer; label_sentiment() reads it as a global.
vds = SentimentIntensityAnalyzer()

In [75]:
def label_sentiment(text: str) -> int:
    """Turn a text into a sentiment label based on its VADER compound score.

    The text is scored with the module-level ``vds`` analyzer and the
    ``'compound'`` value is bucketed:

    * ``1``  when the score is at least 0.05,
    * ``0``  when it lies strictly between -0.05 and 0.05,
    * ``-1`` in every other case.
    """
    compound = vds.polarity_scores(text)['compound']

    if compound >= 0.05:
        return 1
    if -0.05 < compound < 0.05:
        return 0
    return -1

In [78]:
# Label every review body with -1 / 0 / 1 via label_sentiment (VADER compound score).
df_copy['sentiment'] = df_copy.content.apply(label_sentiment)

In [79]:
# Display the working frame, now including the 'sentiment' column.
df_copy

Unnamed: 0,title,content,date,variant,images,verified,author,rating,sentiment
0,This is the best compromise I can find,"First off, unlike the other review currently u...","Reviewed in the United States on September 23,...",,[https://images-na.ssl-images-amazon.com/image...,Verified Purchase,Dan,4.0,1
1,better than Gateron Brown -,Althgouh still prone to typos. I am goign to w...,"Reviewed in the United States on September 20,...",,,Verified Purchase,Isra Ignateous,4.0,-1
2,"OK for the price, but not ""the one""",This keyboard has a lot to offer in a competit...,"Reviewed in the United States on January 6, 2020",Color: Gateron Red Switch v2,,Verified Purchase,Carl,3.0,1
3,Simple... IT'S AMAZING!,"Well, there's not to much reviews about this k...","Reviewed in the United States on September 25,...",,[https://images-na.ssl-images-amazon.com/image...,Verified Purchase,Arturo,5.0,1
4,I was not expecting to fall in love with the k...,Here's the story from the beginning. When I wa...,"Reviewed in the United States on May 21, 2020",,,Verified Purchase,Christ,5.0,1
...,...,...,...,...,...,...,...,...,...
568,,Excelente teclado. El Bluetooth se conecta e i...,"Reviewed in Mexico on October 15, 2020",Color: Gateron Red Switch v2,,Verified Purchase,Edwin,,1
569,,1,"Reviewed in Canada on July 23, 2020",Color: Gateron Red Switch v2,,Verified Purchase,Miguel Wang,,0
570,,"It's great. Not perfect, but great.","Reviewed in Australia on February 23, 2021",Color: Gateron Red Switch v2,,Verified Purchase,Samuel Nielsen,,1
571,,Love it,"Reviewed in Australia on August 15, 2020",Color: Gateron Brown Switch v2,,Verified Purchase,wei cui,,1


In [87]:
# Bar chart of how many reviews received each sentiment label.
px.bar(df_copy.sentiment.value_counts())