### Importing libraries and declaring headers

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Browser-like request headers sent with every page fetch.
headers = {
    'Accept-Language': 'en-GB,en;q=0.9,en-US;q=0.8',
    # 'br' (Brotli) is deliberately not advertised: requests only decodes it
    # when an optional Brotli package is installed, and without one a
    # Brotli-compressed reply would leave response.text undecoded.
    'Accept-Encoding': 'gzip, deflate',
    # Adjacent string literals are concatenated into a single User-Agent value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 '
        'Safari/537.36 Edg/107.0.1418.62'
    ),
    'Connection': 'keep-alive',
}

### Fetching the web page and parsing it into a soup

In [7]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to ``requests.get``.
            Defaults to 30 so a stalled connection cannot hang the notebook.

    Returns:
        The ``BeautifulSoup`` object built from the response text with the
        ``lxml`` parser.

    Raises:
        requests.exceptions.HTTPError: If the server replies with any status
            code other than 200.
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection failures or timeouts.
    """
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        # Raise instead of calling exit(): exit() is an interactive helper and
        # is not a reliable way to abort from inside a notebook kernel, whereas
        # an exception stops the pipeline here and reports the actual status.
        raise requests.exceptions.HTTPError(
            f"Error in accessing webpage: HTTP {response.status_code} for {url}",
            response=response,
        )

    return BeautifulSoup(response.text, "lxml")

### Using the soup to extract the following fields from each Amazon review:
#### Author name, Rating, Title, Content, Date, Verified, Image Source 

In [8]:
def _select_text(parent, selector):
    """Return the stripped text of the first ``selector`` match under ``parent``.

    Yields None when ``parent`` itself is None or nothing matches the selector.
    """
    if parent is None:
        return None
    element = parent.select_one(selector)
    return element.text.strip() if element is not None else None


def get_reviews(soup):
    """Extract one record per ``div.review`` element found in ``soup``.

    Args:
        soup: Parsed page exposing ``select``; each review element it returns
            must expose ``select_one``.

    Returns:
        A list of dicts with the keys ``author``, ``rating``, ``title``,
        ``content``, ``date``, ``verified`` and ``image_url`` (in that order).
        A field is None when its element (or, for the image, its ``src``
        attribute) is absent. Text fields are stripped of surrounding
        whitespace. The list is empty when no review elements are found.
    """
    scraped_reviews = []

    for review in soup.select("div.review"):
        rating = _select_text(review, "i.review-rating")
        if rating is not None:
            # Removing the suffix leaves a dangling space ("4.0 "), so strip again.
            rating = rating.replace("out of 5 stars", "").strip()

        # The title text sits in the class-less <span> inside the title link.
        title_link = review.select_one("a.review-title")

        image = review.select_one("img.review-image-tile")
        # .get() avoids a KeyError when the image tile carries no src attribute.
        image_url = image.attrs.get("src") if image is not None else None

        scraped_reviews.append({
            "author": _select_text(review, "span.a-profile-name"),
            "rating": rating,
            "title": _select_text(title_link, "span:not([class])"),
            "content": _select_text(review, "span.review-text"),
            "date": _select_text(review, "span.review-date"),
            "verified": _select_text(review, "span.a-size-mini"),
            "image_url": image_url,
        })

    return scraped_reviews

### Providing the URL for the extraction pipeline and saving the reviews to CSV

In [11]:
# Defaults used when main() is called without arguments.
DEFAULT_REVIEWS_URL = "https://www.amazon.in/Sony-Ragnarok-Standard-Game-PlayStation/product-reviews/B0B6FGSKCQ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
DEFAULT_OUTPUT_CSV = "amzn_review.csv"


def main(search_url=DEFAULT_REVIEWS_URL, output_path=DEFAULT_OUTPUT_CSV):
    """Run the pipeline: fetch the reviews page, extract reviews, save as CSV.

    Args:
        search_url: Reviews page to scrape. Defaults to the product reviews
            URL stored in ``DEFAULT_REVIEWS_URL``.
        output_path: Destination passed to ``DataFrame.to_csv``. Defaults to
            ``"amzn_review.csv"``.

    Returns:
        The ``pandas.DataFrame`` built from the scraped reviews, so it can be
        inspected in the notebook after the CSV has been written.
    """
    soup = get_soup(search_url)
    acquired_data = get_reviews(soup)

    df = pd.DataFrame(data=acquired_data)
    df.to_csv(output_path)
    return df


# Run the pipeline with the defaults when this code is executed directly.
if __name__ == "__main__":
    main()