In [1]:
# Imports
# Reference: https://www.youtube.com/watch?v=gzLIJZkOGYQ

from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import date

In [2]:
# URL for searching Android tablets on Newegg.ca
url = "https://www.newegg.ca/p/pl?N=100165903%20600502553"

# Get Request
response = requests.get(url)

In [3]:
response.status_code

200

In [4]:
# Get the soup
soup = BeautifulSoup(response.content, 'html.parser')

## Obtain Results & Some Testing

In [5]:
# Results
results = soup.find_all('div', {'class': 'item-cell'})

In [6]:
# Number of items on page
len(results)

36

In [7]:
# Obtain link

results[0].find('a', {'class': 'item-img'})['href']

'https://www.newegg.ca/p/0EJ-000P-005K7'

In [8]:
# Product Company:
results[0].find('a', {'class': 'item-brand'}).find('img')['title']

'SAMSUNG'

In [9]:
# Product Name:
results[0].find('a', {'class': 'item-title'}).get_text()

'Samsung Galaxy Tab S6 Lite 10.4", 64GB WiFi Tablet Oxford Gray - SM-P610NZAAXAR - S Pen Included'

In [10]:
# Current Price of Product:
results[0].find('li', {'class': 'price-current'}).find('strong').get_text() 

'469'

In [11]:
results[0].find('li', {'class': 'price-current'}).find('sup').get_text() 

'.99'

In [12]:
# Combine price:
results[0].find('li', {'class': 'price-current'}).find('strong').get_text() + results[0].find('li', {'class': 'price-current'}).find('sup').get_text() 

'469.99'

In [13]:
# Shipping Cost:
results[0].find('li', {'class': 'price-ship'}).get_text() 

'$27.02 Shipping'

## Obtain Data

Extract Product Name, Price, Shipping Cost, Link

In [14]:
product_name = [item.find('a', {'class': 'item-title'}).get_text() for item in results]

In [15]:
price = [item.find('li', {'class': 'price-current'}).find('strong').get_text() +
         item.find('li', {'class': 'price-current'}).find('sup').get_text() 
         for item in results]

In [16]:
shipping_costs = [item.find('li', {'class': 'price-ship'}).get_text() for item in results]

In [17]:
links = [item.find('a', {'class': 'item-img'})['href'] for item in results]

## Make Dataframe

In [18]:
df = pd.DataFrame({
     'Product': product_name,
     'Price': price,
     'Shipping Costs': shipping_costs,
     'URL Link': links
})

In [19]:
# Sample of dataframe:
df.head(12)

Unnamed: 0,Product,Price,Shipping Costs,URL Link
0,"Samsung Galaxy Tab S6 Lite 10.4"", 64GB WiFi Ta...",469.99,$27.02 Shipping,https://www.newegg.ca/p/0EJ-000P-005K7
1,"Samsung Galaxy Tab A8 10.5"" (2021) 32GB ROM + ...",275.0,Free Shipping,https://www.newegg.ca/samsung-galaxy-tab-a8-10...
2,"Lenovo Tab P11, 11.0"" IPS Touch 400 nits, 4GB...",269.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-003V6
3,"Lenovo Tab P12 Pro, 12.6"" Touch 400 nits, 6GB...",959.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-00486
4,"Lenovo Tab P11, 11.0"" IPS Touch 400 nits, 4GB,...",289.99,Free Shipping,https://www.newegg.ca/lenovo-za7r0144us-tab-p1...
5,"Lenovo Smart Tab M10 Plus, 10.3"" FHD IPS Touch...",199.99,Free Shipping,https://www.newegg.ca/lenovo-za5w0029us-smart-...
6,"Lenovo Tab P11 + pen + keyboard bundle, 11.0"" ...",369.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-00487
7,"Samsung Galaxy Tab S7 11"" T870 128 GB Wi-Fi Ta...",839.99,Free Shipping,https://www.newegg.ca/samsung-sm-t870nzsaxar-g...
8,"Lenovo Yoga Tab 13, 13.0"" Touch 60Hz 60Hz, 8...",769.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-003Z0
9,"Lenovo Tab P11, 11.0"" IPS Touch 400 nits, 6GB...",399.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-003U6


# Multiple Pages Case

The previous work was extracting data from one page. What if I want to extract data from multiple pages?

For the multiple pages case, I want to extract more pages from the Android tablets. Just 3 pages

In [20]:
# Initalize lists:

tablet_names = []
price_list = []
shipping_costs_list = []
tablet_urls = []

# Scrape 3 pages for android tablets on newegg.ca

for i in range(1, 4, 1):
    tablet_url = f'https://www.newegg.ca/p/pl?N=100165903%20600502553&page={i}'
    # Get Request, soup and results for page
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    results = soup.find_all('div', {'class': 'item-cell'})
    
    # Loop through results:
    for item in results:
        # Tablet Name
        try:
            tablet_names.append(item.find('a', {'class': 'item-title'}).get_text())
        except:
            tablet_names.append('N/A')
        # Price (CAD):
        try:
            price_list.append(item.find('li', {'class': 'price-current'}).find('strong').get_text() +
                              item.find('li', {'class': 'price-current'}).find('sup').get_text())
        except:
            price_list.append('N/A')
        # Shipping Costs:
        try:
            shipping_costs_list.append(item.find('li', {'class': 'price-ship'}).get_text())
        except:
            shipping_costs_list.append('N/A')
        # Tablet URL Links for more description.
        try:
            tablet_urls.append(item.find('a', {'class': 'item-img'})['href'])
        except:
            tablet_urls.append('N/A')
                                       

In [21]:
# Make dataframe:

newegg_tablets_df = pd.DataFrame({
     'Product': tablet_names,
     'Price (CAD)': price_list,
     'Shipping Costs': shipping_costs_list,
     'URL Link': tablet_urls
})

In [22]:
# Preview newegg_tablets dataframe:

newegg_tablets_df.head(12)

Unnamed: 0,Product,Price (CAD),Shipping Costs,URL Link
0,"Samsung Galaxy Tab S6 Lite 10.4"", 64GB WiFi Ta...",469.99,$27.02 Shipping,https://www.newegg.ca/p/0EJ-000P-005K7
1,"Samsung Galaxy Tab A8 10.5"" (2021) 32GB ROM + ...",275.0,Free Shipping,https://www.newegg.ca/samsung-galaxy-tab-a8-10...
2,"Lenovo Tab P11, 11.0"" IPS Touch 400 nits, 4GB...",269.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-003V6
3,"Lenovo Tab P12 Pro, 12.6"" Touch 400 nits, 6GB...",959.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-00486
4,"Lenovo Tab P11, 11.0"" IPS Touch 400 nits, 4GB,...",289.99,Free Shipping,https://www.newegg.ca/lenovo-za7r0144us-tab-p1...
5,"Lenovo Smart Tab M10 Plus, 10.3"" FHD IPS Touch...",199.99,Free Shipping,https://www.newegg.ca/lenovo-za5w0029us-smart-...
6,"Lenovo Tab P11 + pen + keyboard bundle, 11.0"" ...",369.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-00487
7,"Samsung Galaxy Tab S7 11"" T870 128 GB Wi-Fi Ta...",839.99,Free Shipping,https://www.newegg.ca/samsung-sm-t870nzsaxar-g...
8,"Lenovo Yoga Tab 13, 13.0"" Touch 60Hz 60Hz, 8...",769.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-003Z0
9,"Lenovo Tab P11, 11.0"" IPS Touch 400 nits, 6GB...",399.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-003U6


## Save Raw Data

In [23]:
# Save raw data to .csv file

newegg_tablets_df.to_csv('newegg_android_tablets_' + str(date.today()) + '.csv')

## Optional: Data Cleaning & Data Filtering

Now that the data has been obtained, we have to do a little bit of data cleaning before doing some data analysis.

In [28]:
# Check data types:

newegg_tablets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Product         108 non-null    object 
 1   Price (CAD)     108 non-null    float64
 2   Shipping Costs  108 non-null    object 
 3   URL Link        108 non-null    object 
dtypes: float64(1), object(3)
memory usage: 3.5+ KB


In [27]:
# Convert price into numeric while removing commas too:
newegg_tablets_df['Price (CAD)'] = pd.to_numeric(newegg_tablets_df['Price (CAD)'].str.replace(',', ""))

In [29]:
# Check out shipping costs column:
newegg_tablets_df['Shipping Costs'].unique()

array(['$27.02 Shipping', 'Free Shipping', '$9.99 Shipping',
       'Special Shipping'], dtype=object)

The original plan was to convert the shipping values into numeric values. Having Special Shipping in there does cause a problem. Not sure what numeric value can be associated with Special Shipping. The column will be left alone.

In [46]:
# Sorting the tablets by price, Cheapest tablets
newegg_tablets_df.sort_values(['Price (CAD)']).head(8)

Unnamed: 0,Product,Price (CAD),Shipping Costs,URL Link
28,Contixo V8-2 7 inch Kids Tablets - Tablet for ...,99.99,Free Shipping,https://www.newegg.ca/p/0EJ-003T-00099
62,Contixo V8-2 7 inch Kids Tablets - Tablet for ...,99.99,Free Shipping,https://www.newegg.ca/p/0EJ-003T-00099
98,Contixo V8-2 7 inch Kids Tablets - Tablet for ...,99.99,Free Shipping,https://www.newegg.ca/p/0EJ-003T-00099
51,"Lenovo Tab M8 HD, 8.0"" IPS Touch 350 nits, 2G...",129.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-002M7
16,"Lenovo Tab M8 HD, 8.0"" IPS Touch 350 nits, 2G...",129.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-002M7
87,"Lenovo Tab M8 HD, 8.0"" IPS Touch 350 nits, 2G...",129.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-002M7
81,"Lenovo Tab M8 FHD, 8.0"" FHD IPS Touch 350 nit...",154.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-002P9
12,"Lenovo Tab M8 FHD, 8.0"" FHD IPS Touch 350 nit...",154.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-002P9


In [47]:
# Sorting the tablets by price, Most expensive tablets
newegg_tablets_df.sort_values(['Price (CAD)'], ascending = False).head(8)

Unnamed: 0,Product,Price (CAD),Shipping Costs,URL Link
65,Samsung Galaxy Tab S6 (Wi-Fi + 4G LTE) SM-T865...,1699.0,Free Shipping,https://www.newegg.ca/samsung-sm-t865n-grey/p/...
101,Samsung Galaxy Tab S6 (Wi-Fi + 4G LTE) SM-T865...,1699.0,Free Shipping,https://www.newegg.ca/samsung-sm-t865n-grey/p/...
29,Samsung Galaxy Tab S6 (Wi-Fi + 4G LTE) SM-T865...,1699.0,Free Shipping,https://www.newegg.ca/samsung-sm-t865n-grey/p/...
24,Samsung Galaxy Tablet S7+ Mystic Black - 256 GB,1199.99,Free Shipping,https://www.newegg.ca/p/2RC-0034-001S7
68,Samsung Galaxy Tablet S7+ Mystic Black - 256 GB,1199.99,Free Shipping,https://www.newegg.ca/p/2RC-0034-001S7
104,Samsung Galaxy Tablet S7+ Mystic Black - 256 GB,1199.99,Free Shipping,https://www.newegg.ca/p/2RC-0034-001S7
84,Samsung Galaxy Tab S7 11-in 128GB Tablet - Mys...,1064.14,Free Shipping,https://www.newegg.ca/p/0EJ-000P-00658
48,Samsung Galaxy Tab S7 11-in 128GB Tablet - Mys...,1064.14,Free Shipping,https://www.newegg.ca/p/0EJ-000P-00658


In [48]:
## Searching Android tablets that is from Samsung:
newegg_tablets_df[newegg_tablets_df['Product'].str.contains('Samsung')].head(8)

Unnamed: 0,Product,Price (CAD),Shipping Costs,URL Link
0,"Samsung Galaxy Tab S6 Lite 10.4"", 64GB WiFi Ta...",469.99,$27.02 Shipping,https://www.newegg.ca/p/0EJ-000P-005K7
1,"Samsung Galaxy Tab A8 10.5"" (2021) 32GB ROM + ...",275.0,Free Shipping,https://www.newegg.ca/samsung-galaxy-tab-a8-10...
7,"Samsung Galaxy Tab S7 11"" T870 128 GB Wi-Fi Ta...",839.99,Free Shipping,https://www.newegg.ca/samsung-sm-t870nzsaxar-g...
13,Samsung Galaxy Tab S7 11-in 128GB Tablet - Mys...,1064.14,Free Shipping,https://www.newegg.ca/p/0EJ-000P-00658
24,Samsung Galaxy Tablet S7+ Mystic Black - 256 GB,1199.99,Free Shipping,https://www.newegg.ca/p/2RC-0034-001S7
29,Samsung Galaxy Tab S6 (Wi-Fi + 4G LTE) SM-T865...,1699.0,Free Shipping,https://www.newegg.ca/samsung-sm-t865n-grey/p/...
35,Samsung Galaxy Tab A7 10.4 Wi-Fi 32GB Tablet -...,368.13,Free Shipping,https://www.newegg.ca/p/0EJ-000P-00673
36,"Samsung Galaxy Tab S6 Lite 10.4"", 64GB WiFi Ta...",469.99,$27.02 Shipping,https://www.newegg.ca/p/0EJ-000P-005K7


In [49]:
## Searching Android tablet products that is from Lenovo
newegg_tablets_df[newegg_tablets_df['Product'].str.contains('Lenovo')].head(8)

Unnamed: 0,Product,Price (CAD),Shipping Costs,URL Link
2,"Lenovo Tab P11, 11.0"" IPS Touch 400 nits, 4GB...",269.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-003V6
3,"Lenovo Tab P12 Pro, 12.6"" Touch 400 nits, 6GB...",959.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-00486
4,"Lenovo Tab P11, 11.0"" IPS Touch 400 nits, 4GB,...",289.99,Free Shipping,https://www.newegg.ca/lenovo-za7r0144us-tab-p1...
5,"Lenovo Smart Tab M10 Plus, 10.3"" FHD IPS Touch...",199.99,Free Shipping,https://www.newegg.ca/lenovo-za5w0029us-smart-...
6,"Lenovo Tab P11 + pen + keyboard bundle, 11.0"" ...",369.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-00487
8,"Lenovo Yoga Tab 13, 13.0"" Touch 60Hz 60Hz, 8...",769.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-003Z0
9,"Lenovo Tab P11, 11.0"" IPS Touch 400 nits, 6GB...",399.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-003U6
10,"Lenovo Tab P12 Pro, 12.6"" Touch 400 nits, 8GB...",989.99,Free Shipping,https://www.newegg.ca/p/0EJ-001J-00498


In [50]:
## Searching Android tablet products that is from Huawei
newegg_tablets_df[newegg_tablets_df['Product'].str.contains('Huawei')]

Unnamed: 0,Product,Price (CAD),Shipping Costs,URL Link
31,Original Huawei MediaPad M5 Lite Tablet with 1...,578.0,Free Shipping,https://www.newegg.ca/p/0EJ-00BW-000N7
33,"Huawei MediaPad M5 Lite 64GB + 4GB RAM 10.1"" (...",909.0,Special Shipping,https://www.newegg.ca/p/0EJ-00BW-000T0
70,Original Huawei MediaPad M5 Lite Tablet with 1...,578.0,Free Shipping,https://www.newegg.ca/p/0EJ-00BW-000N6
106,Original Huawei MediaPad M5 Lite Tablet with 1...,578.0,Free Shipping,https://www.newegg.ca/p/0EJ-00BW-000N6
