## Web Scraping an Amazon page

In this project we are going to explore the basics of
web scraping and the packages needed to do it.

We are going to pull information (specifically the name of the 
product and its price) directly from this link:
https://www.amazon.es/Apple-2023-MacBook-Port%C3%A1til-Chip/dp/B0CM66W4YT/ref=sr_1_3?dib=eyJ2IjoiMSJ9._jo4OKaENbblzd-M4KhXK3ZG4b69ZnUcgyHF9stQ9OH0Rga97Ck6KxIBBZca2iCvgCRiwsg8Zwa5L5e6OVcuCnWfDavA4MPkOaz3FLp3Hwa3E3WSr5-je4Y1Evoi4CrPhZlm0PV9W5OOnRVgNgYEXo_w37TEpIDNDsyHQP3fhNNjLwMBFpq2g9MPMTS2HWjdv8CUtyaK4OCKiDJE8pYpzK1ZsNqK2zfObivFl1QJIbr-aYF_yz8Ye7CWVdB4R_R2eyn7pp1wYvn9dQC5WOnr-9QcT9QmMR6hE96iXrvsGSQ.T11mhfbKDSFalrPaYaWW0qizxMjOdi97WMu1KHNncf0&dib_tag=se&keywords=macbook+pro&qid=1709839898&s=computers&sr=1-3

Then we are going to save the information to a CSV file on our local machine.

In [1]:
from bs4 import BeautifulSoup
import requests
import time
import datetime


In [2]:
# Address of the product page to scrape
url = "https://www.amazon.es/Apple-2023-MacBook-Port%C3%A1til-Chip/dp/B0CM66W4YT/ref=sr_1_3?dib=eyJ2IjoiMSJ9._jo4OKaENbblzd-M4KhXK3ZG4b69ZnUcgyHF9stQ9OH0Rga97Ck6KxIBBZca2iCvgCRiwsg8Zwa5L5e6OVcuCnWfDavA4MPkOaz3FLp3Hwa3E3WSr5-je4Y1Evoi4CrPhZlm0PV9W5OOnRVgNgYEXo_w37TEpIDNDsyHQP3fhNNjLwMBFpq2g9MPMTS2HWjdv8CUtyaK4OCKiDJE8pYpzK1ZsNqK2zfObivFl1QJIbr-aYF_yz8Ye7CWVdB4R_R2eyn7pp1wYvn9dQC5WOnr-9QcT9QmMR6hE96iXrvsGSQ.T11mhfbKDSFalrPaYaWW0qizxMjOdi97WMu1KHNncf0&dib_tag=se&keywords=macbook+pro&qid=1709839898&s=computers&sr=1-3"

# Send a browser-like User-Agent so the request looks like it comes from a
# regular browser. The value for your own browser is shown at
# https://httpbin.org/get
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"}

# Send a GET request for the url. Without a timeout, requests waits
# indefinitely on a stalled connection, so one is set explicitly.
page = requests.get(url, headers=headers, timeout=30)

# Stop here on an HTTP error status (4xx/5xx) instead of parsing an error page
page.raise_for_status()

In [3]:
# Parse the downloaded HTML with BeautifulSoup
parsed_page = BeautifulSoup(page.content, "html.parser")

# Re-parse the prettified (indented) markup so the HTML is easier to read
# when the soup is inspected
soup = BeautifulSoup(parsed_page.prettify(), 'html.parser')

# The product name sits in the element whose id is "productTitle" (found by
# inspecting the page in the browser). get_text() gets rid of the HTML tags
# around the title and strip() removes the surrounding whitespace.
title_element = soup.find(id='productTitle')
title = title_element.get_text().strip()
print(title)

Apple 2023 MacBook Pro Portátil con Chip M3 Pro: CPU de 11 núcleos, GPU de 14 núcleos, Pantalla Liquid Retina XDR de 14,2 Pulgadas, 18 GB de Memoria unificada, 512 GB de SSD, Plata, Teclado portugués


In [4]:
# Pull the price the same way as the title: take the first <span> with class
# "a-price-whole" and keep only its text
price_spans = soup.find_all('span', class_='a-price-whole')
price_text = price_spans[0].get_text().strip()

# Drop the trailing ',' and every '.' so the remaining text converts to int
price_text = price_text.rstrip(',').replace('.', '')
price = int(price_text)

print(price)

2599


In [5]:
# Notebook check: confirm the price was converted to an integer
type(price)

int

In [6]:
# Today's date (a datetime.date), recorded with each price reading in the CSV row
today = datetime.date.today()

print(today)

2024-03-09


In [7]:
# Current time of day as HH:MM:SS, recorded with each price reading.
# strftime is used rather than slicing the ISO string: a [10:] slice of
# "YYYY-MM-DD HH:MM:SS" keeps the separator, leaving a leading space in
# every value.
now = datetime.datetime.now().strftime("%H:%M:%S")
print(now)

 11:38:18


### Create csv and insert data into it

In [8]:
# Column names for the CSV file and the matching row of scraped values
header = ['Title', 'Price EUR', 'Date','Time']
data = [title, price, today, now]

# Notebook check: show the type of the row container
type(data)

list

In [9]:
# Create the CSV file with the csv module. mode='w' creates the file (or
# overwrites an existing one), newline='' leaves line endings to the csv
# writer, and the text is encoded as UTF-8.
import csv

with open('AmazonWebScraping.csv', mode='w', newline='', encoding='UTF8') as csv_file:
    # Header row first, then the single row of scraped values
    csv.writer(csv_file).writerows([header, data])
    

In [10]:
# Read the generated CSV back with pandas, using the first column (Title)
# as the index. The file is opened by the same relative name it is written
# under, so the notebook does not depend on one user's absolute folder path.
import pandas as pd
import os
df = pd.read_csv('AmazonWebScraping.csv', index_col=0)
print(df)

                                                    Price EUR        Date  \
Title                                                                       
Apple 2023 MacBook Pro Portátil con Chip M3 Pro...       2599  2024-03-09   

                                                         Time  
Title                                                          
Apple 2023 MacBook Pro Portátil con Chip M3 Pro...   11:38:18  


In [11]:
# The earlier write always starts a new file holding a single data row.
# Opening the file in append mode ('a+') adds further rows to the end of the
# existing file instead.
with open('AmazonWebScraping.csv', mode='a+', newline='', encoding='UTF8') as csv_file:
    row_writer = csv.writer(csv_file)
    row_writer.writerow(data)
    

In [12]:
# We can automate the process of checking the price with a function

def check_price(url=None, csv_path='AmazonWebScraping.csv', timeout=30):
    """Scrape the product title and price once and append them to a CSV file.

    Args:
        url: Product page to scrape. Defaults to the MacBook Pro page used
            throughout this notebook.
        csv_path: CSV file the reading is appended to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the title or price element is missing from the page,
            or the price text cannot be converted to an integer.
    """
    import os

    if url is None:
        url = "https://www.amazon.es/Apple-2023-MacBook-Port%C3%A1til-Chip/dp/B0CM66W4YT/ref=sr_1_3?dib=eyJ2IjoiMSJ9._jo4OKaENbblzd-M4KhXK3ZG4b69ZnUcgyHF9stQ9OH0Rga97Ck6KxIBBZca2iCvgCRiwsg8Zwa5L5e6OVcuCnWfDavA4MPkOaz3FLp3Hwa3E3WSr5-je4Y1Evoi4CrPhZlm0PV9W5OOnRVgNgYEXo_w37TEpIDNDsyHQP3fhNNjLwMBFpq2g9MPMTS2HWjdv8CUtyaK4OCKiDJE8pYpzK1ZsNqK2zfObivFl1QJIbr-aYF_yz8Ye7CWVdB4R_R2eyn7pp1wYvn9dQC5WOnr-9QcT9QmMR6hE96iXrvsGSQ.T11mhfbKDSFalrPaYaWW0qizxMjOdi97WMu1KHNncf0&dib_tag=se&keywords=macbook+pro&qid=1709839898&s=computers&sr=1-3"

    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"}

    # A timeout keeps a stalled connection from blocking the caller forever;
    # raise_for_status reports HTTP errors instead of parsing an error page.
    page = requests.get(url, headers=headers, timeout=timeout)
    page.raise_for_status()

    # One parse is enough: the elements are looked up by id/class, so there
    # is no need to prettify the markup and parse it a second time.
    soup = BeautifulSoup(page.content, "html.parser")

    title_tag = soup.find(id='productTitle')
    price_tag = soup.find('span', class_='a-price-whole')
    if title_tag is None or price_tag is None:
        # Fail with a clear message rather than an AttributeError on None
        raise ValueError('Product title or price not found in the page')

    title = title_tag.get_text().strip()

    # Drop the trailing ',' and every '.' so the remaining text converts to int
    price = int(price_tag.get_text().strip().rstrip(',').replace('.', ''))

    # Read the clock once so the date and time of a row always belong together
    moment = datetime.datetime.now()
    today = moment.date()
    now = moment.strftime('%H:%M:%S')

    header = ['Title', 'Price EUR', 'Date', 'Time']
    data = [title, price, today, now]

    # A missing or empty file gets the header row first, so the CSV stays
    # readable on its own even when this function creates it.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

    with open(csv_path, mode='a+', newline='', encoding='UTF8') as scraped_data:
        writer = csv.writer(scraped_data)
        if needs_header:
            writer.writerow(header)
        writer.writerow(data)
    

In [15]:
# Poll the price repeatedly until the user interrupts the kernel (Ctrl+C /
# "Interrupt"). The pause sits outside the inner try so that a failed attempt
# is also followed by a wait; otherwise a persistent error would be retried
# immediately in a tight loop.
try:
    while True:
        try:
            check_price()
        except Exception as e:
            # Report the failure and keep polling
            print(f'An error occurred: {e}')
        time.sleep(4)
except KeyboardInterrupt:
    print('Timer interrupted by user')
       

Timer interrupted by user


In [16]:
# Check that the file holds one row per reading taken while the timer ran.
# The file is opened by the same relative name it is written under, so the
# notebook does not depend on one user's absolute folder path.
df = pd.read_csv('AmazonWebScraping.csv')
display(df)

Unnamed: 0,Title,Price EUR,Date,Time
0,Apple 2023 MacBook Pro Portátil con Chip M3 Pr...,2599,2024-03-09,11:38:18
1,Apple 2023 MacBook Pro Portátil con Chip M3 Pr...,2599,2024-03-09,11:38:18
2,Apple 2023 MacBook Pro Portátil con Chip M3 Pr...,2599,2024-03-09,11:38:23
3,Apple 2023 MacBook Pro Portátil con Chip M3 Pr...,2599,2024-03-09,11:38:29
4,Apple 2023 MacBook Pro Portátil con Chip M3 Pr...,2599,2024-03-09,11:38:36
5,Apple 2023 MacBook Pro Portátil con Chip M3 Pr...,2599,2024-03-09,11:38:42
6,Apple 2023 MacBook Pro Portátil con Chip M3 Pr...,2599,2024-03-09,11:38:49
7,Apple 2023 MacBook Pro Portátil con Chip M3 Pr...,2599,2024-03-09,11:38:55
8,Apple 2023 MacBook Pro Portátil con Chip M3 Pr...,2599,2024-03-09,11:39:01
9,Apple 2023 MacBook Pro Portátil con Chip M3 Pr...,2599,2024-03-09,11:39:29
