<a href="https://colab.research.google.com/github/gidee725/webscraping-Amazon-reviews/blob/main/Amazon_reviews_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required packages (uncomment and run once, e.g. on a fresh Colab runtime)
# !pip install selenium
# !apt-get update  # refresh the apt package index so the apt install below works
# !apt install -y chromium-chromedriver
# !cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [None]:
import sys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime
from selenium.common.exceptions import TimeoutException

In [None]:
# NOTE(review): this puts the chromedriver location on Python's module search path;
# presumably a leftover from a Colab setup snippet -- confirm it is still needed.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

# Chrome flags shared by every driver that scrape_reviews creates.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)


In [None]:
def scrape_reviews(url, pages):
    """
    Scrape title, text, rating markup and date line of the reviews on paginated listing pages.

    Parameters:
    url (str): Review-listing URL prefix; the page number is appended directly to it.
    pages (int): Exclusive upper bound of the page numbers: pages 1 .. pages - 1 are requested.
        NOTE(review): if `pages` is meant to be a page count, the last page is never
        fetched -- confirm the intent before widening the range.

    Returns:
    pandas.DataFrame: One row per review with the columns 'title', 'review',
        'product_ratings' (raw innerHTML of the rating icon element) and 'Date'
        (text of the span directly under the review container).

    Pages on which no review container appears within 3 seconds are reported on stdout
    and skipped. Any other error propagates to the caller, but the browser is always closed.
    """
    driver = webdriver.Chrome(options=chrome_options)  # or whichever browser driver you prefer
    review_text = []
    Review_date = []
    product_ratings = []
    Review_title = []

    try:
        for page_number in range(1, pages):
            try:
                driver.get(f'{url}{page_number}')
                containers = wait(driver, 3).until(
                    EC.presence_of_all_elements_located(
                        (By.XPATH, "//*[contains(@id,'customer_review-')]")
                    )
                )
            except TimeoutException:
                print(f"TimeoutException occurred on page {page_number}.")
                continue  # skip to the next page

            # Read all ids up front, then look every field up again by id.
            review_ids = [container.get_attribute('id') for container in containers]

            for review_id in review_ids:
                # Review text
                review = driver.find_element(By.XPATH, f'//*[@id="{review_id}"]/div[4]/span/span')
                review_text.append(review.text)

                # Date line
                date = driver.find_element(By.XPATH, f'//*[@id="{review_id}"]/span')
                Review_date.append(date.text)

                # Title
                title = driver.find_element(By.XPATH, f'//*[@id="{review_id}"]/div[2]/a/span[2]')
                Review_title.append(title.text)

                # Rating icon; its innerHTML is stored raw and parsed later
                ratings_box = driver.find_element(By.XPATH, f'//*[@id="{review_id}"]/div[2]/a/i')
                product_ratings.append(ratings_box.get_attribute('innerHTML'))
    finally:
        # Always release the browser, even when an element lookup raises.
        driver.quit()

    return pd.DataFrame({
        'title': Review_title,
        'review': review_text,
        'product_ratings': product_ratings,
        'Date': Review_date,
    })


In [None]:
def process_ratings(data, column_name):
    """
    Convert raw rating markup in a dataframe column to floats.

    Each value is expected to contain a rating of the form '<number> out of ...'
    (for example '<span class="a-icon-alt">5.0 out of 5 stars</span>'); the number in
    front of 'out' is extracted. Values that are already numeric are passed through,
    so running the function a second time on the same frame is harmless.

    Parameters:
    data (pandas.DataFrame): The dataframe to process. The column is overwritten in place.
    column_name (str): The name of the column containing the ratings.

    Returns:
    pandas.DataFrame: The same dataframe object, with the column converted to floats.

    Raises:
    ValueError: If the specified column does not exist in the dataframe, or if a value
        contains no recognisable rating.
    """
    import numbers
    import re

    if column_name not in data.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the dataframe.")

    # First number that is directly followed by the word 'out'.
    rating_pattern = re.compile(r'(\d+(?:\.\d+)?)\s*out\b')

    def _to_rating(raw):
        # Already converted (e.g. the cell was re-run): keep the value.
        if isinstance(raw, numbers.Real):
            return float(raw)
        found = rating_pattern.search(str(raw))
        if found is None:
            raise ValueError(f"Could not find a rating in {raw!r}.")
        return float(found.group(1))

    data[column_name] = [_to_rating(raw) for raw in data[column_name]]

    return data


In [None]:
from datetime import datetime

def extract_dates(df, column_name, date_formats=('%B %d, %Y', '%d %B %Y', '%Y-%m-%d')):
    """
    Extract the date at the end of each string in a column and format it as 'YYYY-MM-DD'.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the dates.
    column_name (str): The name of the column containing the dates.
    date_formats (tuple of str): strptime formats tried in order on the extracted date
        text. The defaults cover 'October 13, 2023', '13 October 2023' and an already
        formatted '2023-10-13', so re-running on processed data is harmless.

    Returns:
    list: A list of strings representing the extracted dates in 'YYYY-MM-DD' format.

    Raises:
    ValueError: If a value matches none of the formats.
    """
    dates = []

    for row in df[column_name]:
        # The date is whatever follows the last ' on ' in the string
        # (the whole string when there is no ' on ').
        date_string = row.split(' on ')[-1].strip()

        for date_format in date_formats:
            try:
                parsed = datetime.strptime(date_string, date_format)
            except ValueError:
                continue
            break
        else:
            raise ValueError(f"Could not parse a date from {row!r}")

        dates.append(parsed.strftime('%Y-%m-%d'))

    return dates


In [None]:
import re

def preprocess_tweet(review):
    """
    Normalise a single review string.

    The steps run in this order: strip URLs, strip @mentions and #hashtags, drop every
    character that is neither a word character nor whitespace, drop digit runs,
    lowercase, then collapse whitespace runs to one space and trim both ends.

    Parameters:
    review (str): The review to preprocess.

    Returns:
    str: The preprocessed review.
    """
    # Patterns whose matches are deleted, applied in this order.
    removal_patterns = (
        r'http\S+|www\S+',  # URLs
        r'[@#]\w+',         # mentions (@) and hashtags (#)
        r'[^\w\s]',         # special characters
        r'\d+',             # numbers
    )

    cleaned = review
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    lowered = cleaned.lower()

    # Squeeze whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', lowered).strip()


In [None]:
def preprocess_review(df, column_name='review'):
    """
    Preprocess the review text in a DataFrame column.

    Every value has URLs, mentions, hashtags, special characters and numbers removed,
    is converted to lowercase, and has whitespace runs collapsed to single spaces.
    All steps are applied in one pass over the column.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the reviews. The column is overwritten in place.
    column_name (str): The column holding the review text. Defaults to 'review'.

    Returns:
    pd.DataFrame: The same DataFrame object with the preprocessed reviews.
    """
    # Patterns whose matches are deleted, applied in this order.
    removal_patterns = (
        r'http\S+|www\S+',  # URLs
        r'[@#]\w+',         # mentions (@) and hashtags (#)
        r'[^\w\s]',         # special characters
        r'\d+',             # numbers
    )

    def _clean(text):
        for pattern in removal_patterns:
            text = re.sub(pattern, '', text)
        # Lowercase, then squeeze whitespace runs and trim the ends.
        return re.sub(r'\s+', ' ', text.lower()).strip()

    df[column_name] = df[column_name].apply(_clean)

    return df


In [None]:
from transformers import pipeline

def analyze_sentiments(df, column_name,
                       model_name="distilbert-base-uncased-finetuned-sst-2-english",
                       max_words=400):
    """
    Label every review in a dataframe column with a sentiment using the transformers library.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the reviews.
    column_name (str): The name of the column containing the reviews.
    model_name (str): Model passed to the pipeline. The default is the model the
        pipeline reported falling back to when none was given, now named explicitly.
    max_words (int): Reviews are cut to this many whitespace-separated words before
        being sent to the model.

    Returns:
    pandas.DataFrame: The same dataframe with an additional column 'sentiments'
        holding the predicted label for each review.
    """

    # Name the model explicitly instead of relying on the library default.
    sentiment_pipeline = pipeline("sentiment-analysis", model=model_name)

    sentiments = []
    for review in df[column_name]:
        # Cheap word-level cut first ...
        shortened = " ".join(review.split()[:max_words])
        # ... then let the tokenizer truncate as well: a word count is not a token
        # count, so the word cut alone can still exceed the model's input limit.
        prediction = sentiment_pipeline(shortened, truncation=True)
        sentiments.append(prediction[0]['label'])

    # Add the sentiments to the dataframe
    df['sentiments'] = sentiments

    return df


In [None]:
# Review-listing URL prefix for the Monster Energy product; scrape_reviews appends the page number.
url = 'https://www.amazon.com/Monster-Energy-Drink-Green-Original/product-reviews/B019AKA6YU/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber='
# scrape_reviews loops over range(1, pages), so page numbers 1-14 are requested.
pages = 15
monster_reviews = scrape_reviews(url, pages)
# Preview the raw scrape (ratings and dates are still unparsed strings at this point).
monster_reviews.head()


TimeoutException occurred on page 11.
TimeoutException occurred on page 12.
TimeoutException occurred on page 13.
TimeoutException occurred on page 14.


Unnamed: 0,title,review,product_ratings,Date
0,Good,"I very much like this one, just get tired of w...","<span class=""a-icon-alt"">5.0 out of 5 stars</s...","Reviewed in the United States on October 13, 2023"
1,Exactly as advertised,Best value and exactly what I expected,"<span class=""a-icon-alt"">5.0 out of 5 stars</s...","Reviewed in the United States on October 7, 2023"
2,The best,The best drink ever,"<span class=""a-icon-alt"">5.0 out of 5 stars</s...","Reviewed in the United States on September 20,..."
3,Good alternative when I can't hit a big box store,I order online or use delivery when I can't go...,"<span class=""a-icon-alt"">4.0 out of 5 stars</s...","Reviewed in the United States on August 8, 2023"
4,"Great Taste, But Pricey",I recently tried Monster Energy Drink in the g...,"<span class=""a-icon-alt"">4.0 out of 5 stars</s...","Reviewed in the United States on March 29, 2023"


In [None]:
# monster_reviews = monster.copy()

In [None]:
# Clean the scraped Monster reviews step by step: numeric ratings, ISO dates,
# normalised review text, then a sentiment label per review.
monster_reviews = process_ratings(monster_reviews, 'product_ratings')
monster_reviews['Date'] = extract_dates(monster_reviews, 'Date')
monster_reviews = preprocess_review(monster_reviews)
monster_reviews = analyze_sentiments(monster_reviews, 'review')
monster_reviews.to_csv('monster_reviews.csv')
# A notebook only displays the value of a cell's last expression, so the preview goes last.
monster_reviews.head()

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
# Review-listing URL prefix for the Red Bull product; scrape_reviews appends the page number.
url = 'https://www.amazon.com/Red-Bull-Energy-Drink-Pack/product-reviews/B000MTST70/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber='
# scrape_reviews loops over range(1, pages), so page numbers 1-14 are requested.
pages = 15
Redbull_reviews = scrape_reviews(url, pages)
# Preview the raw scrape (ratings and dates are still unparsed strings at this point).
Redbull_reviews.head()

TimeoutException occurred on page 11.
TimeoutException occurred on page 12.
TimeoutException occurred on page 13.
TimeoutException occurred on page 14.


Unnamed: 0,title,review,product_ratings,Date
0,I love Red Bull.,My case of Red Bull came undamaged. No cans we...,"<span class=""a-icon-alt"">5.0 out of 5 stars</s...","Reviewed in the United States on November 7, 2023"
1,Pairs Great with Jager!,Most definitely buy this to mix with alcohal.....,"<span class=""a-icon-alt"">5.0 out of 5 stars</s...","Reviewed in the United States on November 16, ..."
2,Arrived on time and in perfect condition,I love me some Red Bull and I really love that...,"<span class=""a-icon-alt"">5.0 out of 5 stars</s...","Reviewed in the United States on October 20, 2023"
3,Good buy,Good deal cheaper than at the stores,"<span class=""a-icon-alt"">5.0 out of 5 stars</s...","Reviewed in the United States on November 4, 2023"
4,Red Bull Energy Drink,nice drink,"<span class=""a-icon-alt"">5.0 out of 5 stars</s...","Reviewed in the United States on November 2, 2023"


In [None]:
# (rows, columns) of the Red Bull review frame.
Redbull_reviews.shape

(100, 5)

In [None]:
# Clean the scraped Red Bull reviews step by step: numeric ratings, ISO dates,
# normalised review text, then a sentiment label per review.
Redbull_reviews = process_ratings(Redbull_reviews, 'product_ratings')
Redbull_reviews['Date'] = extract_dates(Redbull_reviews, 'Date')
Redbull_reviews = preprocess_review(Redbull_reviews)
Redbull_reviews = analyze_sentiments(Redbull_reviews, 'review')
Redbull_reviews.to_csv('Redbull_reviews.csv')
# A notebook only displays the value of a cell's last expression, so the preview goes last.
Redbull_reviews.head()