<b><font size='6'>Import Packages</font></b><br>
<font size='4'>Setup Notebook Environment.</font>

In [2]:
# WebScraping
from bs4 import BeautifulSoup
import requests

# Google Cloud BigQuery
from google.cloud import bigquery

# Data Manipulation & Exploration
import pandas as pd 
import numpy as np

# Time Intervals
import time     

# Set DateTime Values
from datetime import datetime

# WebScraping Progress Bar
from tqdm import trange

# Import OS
import os

<b><font size='6'>Initialize Client Object</font></b><br>
<font size='4'>Provide path to google cloud credentials and initialize client object.</font>

In [3]:
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'adrianjuliusaluoch.json'

In [4]:
# Initialize Client Object
client = bigquery.Client(project='project-adrian-julius-aluoch')

<b><font size='6'>Build Webscraper : Appliances</font></b><br>
<font size='4'>Scrape Jumia Products and Load data into Google BigQuery.</font>

In [9]:
stores = [
    'https://www.jumia.co.ke/',
    'https://www.jumia.com.ng/',
]

for store in stores:
    print(f'E-Commerce ====> {store}')
    bigdata = pd.DataFrame()
    for i in trange(1,51):
        url = str(store) + 'home-office-appliances/?page=' + str(i) + '#catalog-listing'
            
        content = requests.get(url).text
        soup = BeautifulSoup(content,'lxml')
            
        pages = soup.find_all('div',class_='info')
        data = pd.DataFrame()
        for page in pages:
            try:
                item_name = page.find('h3',class_='name').text    
                ratings = page.find('div',class_='stars _s').text   
                try:
                    initial_price = page.find('div',class_='old').text
                except Exception as e:
                    initial_price = np.NAN
                final_price = page.find('div',class_='prc').text
                last_scraped = datetime.now()
                item_category = 'Appliances'
                
                # Create DataFrame to Temporarily store each listing
                dataframe = pd.DataFrame({
                                        'item_name':[item_name],
                                        'item_category':[item_category],
                                        'ratings':[ratings],
                                        'initial_price':[initial_price],
                                        'final_price':[final_price],
                                        'last_scraped':[last_scraped]
                                        })
                data = pd.concat([data,dataframe],ignore_index=True)
                    
            except Exception as e:
                pass
                
        bigdata = pd.concat([bigdata,data],ignore_index=True)

    table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'
    job = client.load_table_from_dataframe(bigdata,table_id)

    while job.state != 'DONE':
        time.sleep(1)
        job.reload()
        print(f'{store} ----> {job.state}')      

E-Commerce ====> https://www.jumia.co.ke/


100%|██████████| 4/4 [00:06<00:00,  1.64s/it]


https://www.jumia.co.ke/ ----> RUNNING
https://www.jumia.co.ke/ ----> DONE
E-Commerce ====> https://www.jumia.ug/


100%|██████████| 4/4 [00:08<00:00,  2.11s/it]


https://www.jumia.ug/ ----> RUNNING
https://www.jumia.ug/ ----> DONE
E-Commerce ====> https://www.jumia.com.ng/


100%|██████████| 4/4 [00:06<00:00,  1.50s/it]


https://www.jumia.com.ng/ ----> RUNNING
https://www.jumia.com.ng/ ----> DONE


<b><font size='6'>Build Webscraper : Baby Products</font></b><br>
<font size='4'>Scrape Jumia Products and Load data into Google BigQuery.</font>

In [None]:
stores = [
    'https://www.jumia.co.ke/',
    'https://www.jumia.com.ng/'
]

for store in stores:
    print(f'E-Commerce ====> {store}')
    bigdata = pd.DataFrame()
    for i in trange(1,51):
        url = str(store) + 'baby-products/?page=' + str(i) + '#catalog-listing'
            
        content = requests.get(url).text
        soup = BeautifulSoup(content,'lxml')
            
        pages = soup.find_all('div',class_='info')
        data = pd.DataFrame()
        for page in pages:
            try:
                item_name = page.find('h3',class_='name').text    
                ratings = page.find('div',class_='stars _s').text   
                try:
                    initial_price = page.find('div',class_='old').text
                except Exception as e:
                    initial_price = np.NAN
                final_price = page.find('div',class_='prc').text
                last_scraped = datetime.now()
                item_category = 'Baby Products'
                
                # Create DataFrame to Temporarily store each listing
                dataframe = pd.DataFrame({
                                        'item_name':[item_name],
                                        'item_category':[item_category],
                                        'ratings':[ratings],
                                        'initial_price':[initial_price],
                                        'final_price':[final_price],
                                        'last_scraped':[last_scraped]
                                        })
                data = pd.concat([data,dataframe],ignore_index=True)
                    
            except Exception as e:
                pass
                
        bigdata = pd.concat([bigdata,data],ignore_index=True)

    table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'
    job = client.load_table_from_dataframe(bigdata,table_id)

    while job.state != 'DONE':
        time.sleep(1)
        job.reload()
        print(f'{store} ----> {job.state}')      

E-Commerce ====> https://www.jumia.co.ke/


100%|██████████| 50/50 [01:21<00:00,  1.63s/it]


https://www.jumia.co.ke/ ----> RUNNING
https://www.jumia.co.ke/ ----> DONE
E-Commerce ====> https://www.jumia.ug/


 18%|█▊        | 9/50 [00:13<01:04,  1.58s/it]

<b><font size='6'>Build Webscraper : Computing</font></b><br>
<font size='4'>Scrape Jumia Products and Load data into Google BigQuery.</font>

In [None]:
stores = [
    'https://www.jumia.co.ke/',
    'https://www.jumia.com.ng/'
]

for store in stores:
    print(f'E-Commerce ====> {store}')
    bigdata = pd.DataFrame()
    for i in trange(1,51):
        url = str(store) + 'computing/?page=' + str(i) + '#catalog-listing'
            
        content = requests.get(url).text
        soup = BeautifulSoup(content,'lxml')
            
        pages = soup.find_all('div',class_='info')
        data = pd.DataFrame()
        for page in pages:
            try:
                item_name = page.find('h3',class_='name').text    
                ratings = page.find('div',class_='stars _s').text   
                try:
                    initial_price = page.find('div',class_='old').text
                except Exception as e:
                    initial_price = np.NAN
                final_price = page.find('div',class_='prc').text
                last_scraped = datetime.now()
                item_category = 'Computing'
                
                # Create DataFrame to Temporarily store each listing
                dataframe = pd.DataFrame({
                                        'item_name':[item_name],
                                        'item_category':[item_category],
                                        'ratings':[ratings],
                                        'initial_price':[initial_price],
                                        'final_price':[final_price],
                                        'last_scraped':[last_scraped]
                                        })
                data = pd.concat([data,dataframe],ignore_index=True)
                    
            except Exception as e:
                pass
                
        bigdata = pd.concat([bigdata,data],ignore_index=True)

    table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'
    job = client.load_table_from_dataframe(bigdata,table_id)

    while job.state != 'DONE':
        time.sleep(1)
        job.reload()
        print(f'{store} ----> {job.state}')       

<b><font size='6'>Build Webscraper : Electronics</font></b><br>
<font size='4'>Scrape Jumia Products and Load data into Google BigQuery.</font>

In [None]:
stores = [
    'https://www.jumia.co.ke/',
    'https://www.jumia.ug/',
    'https://www.jumia.com.ng/'
]

for store in stores:
    print(f'E-Commerce ====> {store}')
    bigdata = pd.DataFrame()
    for i in trange(1,51):
        url = str(store) + 'electronics/?page=' + str(i) + '#catalog-listing'
            
        content = requests.get(url).text
        soup = BeautifulSoup(content,'lxml')
            
        pages = soup.find_all('div',class_='info')
        data = pd.DataFrame()
        for page in pages:
            try:
                item_name = page.find('h3',class_='name').text    
                ratings = page.find('div',class_='stars _s').text   
                try:
                    initial_price = page.find('div',class_='old').text
                except Exception as e:
                    initial_price = np.NAN
                final_price = page.find('div',class_='prc').text
                last_scraped = datetime.now()
                item_category = 'Electronics'
                
                # Create DataFrame to Temporarily store each listing
                dataframe = pd.DataFrame({
                                        'item_name':[item_name],
                                        'item_category':[item_category],
                                        'ratings':[ratings],
                                        'initial_price':[initial_price],
                                        'final_price':[final_price],
                                        'last_scraped':[last_scraped]
                                        })
                data = pd.concat([data,dataframe],ignore_index=True)
                    
            except Exception as e:
                pass
                
        bigdata = pd.concat([bigdata,data],ignore_index=True)

    table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'
    job = client.load_table_from_dataframe(bigdata,table_id)

    while job.state != 'DONE':
        time.sleep(1)
        job.reload()
        print(f'{store} ----> {job.state}')      

<b><font size='6'>Build Webscraper : Fashion</font></b><br>
<font size='4'>Scrape Jumia Products and Load data into Google BigQuery.</font>

In [None]:
stores = [
    'https://www.jumia.co.ke/',
    'https://www.jumia.com.ng/'
]

for store in stores:
    print(f'E-Commerce ====> {store}')
    bigdata = pd.DataFrame()
    for i in trange(1,51):
        url = str(store) + 'category-fashion-by-jumia/?page=' + str(i) + '#catalog-listing'
            
        content = requests.get(url).text
        soup = BeautifulSoup(content,'lxml')
            
        pages = soup.find_all('div',class_='info')
        data = pd.DataFrame()
        for page in pages:
            try:
                item_name = page.find('h3',class_='name').text    
                ratings = page.find('div',class_='stars _s').text   
                try:
                    initial_price = page.find('div',class_='old').text
                except Exception as e:
                    initial_price = np.NAN
                final_price = page.find('div',class_='prc').text
                last_scraped = datetime.now()
                item_category = 'Fashion'
                
                # Create DataFrame to Temporarily store each listing
                dataframe = pd.DataFrame({
                                        'item_name':[item_name],
                                        'item_category':[item_category],
                                        'ratings':[ratings],
                                        'initial_price':[initial_price],
                                        'final_price':[final_price],
                                        'last_scraped':[last_scraped]
                                        })
                data = pd.concat([data,dataframe],ignore_index=True)
                    
            except Exception as e:
                pass
                
        bigdata = pd.concat([bigdata,data],ignore_index=True)

    table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'
    job = client.load_table_from_dataframe(bigdata,table_id)

    while job.state != 'DONE':
        time.sleep(1)
        job.reload()
        print(f'{store} ----> {job.state}')     

<b><font size='6'>Build Webscraper : Grocery</font></b><br>
<font size='4'>Scrape Jumia Products and Load data into Google BigQuery.</font>

In [None]:
stores = [
    'https://www.jumia.co.ke/',
    'https://www.jumia.com.ng/'
]

for store in stores:
    print(f'E-Commerce ====> {store}')
    bigdata = pd.DataFrame()
    for i in trange(1,51):
        url = str(store) + 'groceries/?page=' + str(i) + '#catalog-listing'
            
        content = requests.get(url).text
        soup = BeautifulSoup(content,'lxml')
            
        pages = soup.find_all('div',class_='info')
        data = pd.DataFrame()
        for page in pages:
            try:
                item_name = page.find('h3',class_='name').text    
                ratings = page.find('div',class_='stars _s').text   
                try:
                    initial_price = page.find('div',class_='old').text
                except Exception as e:
                    initial_price = np.NAN
                final_price = page.find('div',class_='prc').text
                last_scraped = datetime.now()
                item_category = 'Grocery'
                
                # Create DataFrame to Temporarily store each listing
                dataframe = pd.DataFrame({
                                        'item_name':[item_name],
                                        'item_category':[item_category],
                                        'ratings':[ratings],
                                        'initial_price':[initial_price],
                                        'final_price':[final_price],
                                        'last_scraped':[last_scraped]
                                        })
                data = pd.concat([data,dataframe],ignore_index=True)
                    
            except Exception as e:
                pass
                
        bigdata = pd.concat([bigdata,data],ignore_index=True)

    table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'
    job = client.load_table_from_dataframe(bigdata,table_id)

    while job.state != 'DONE':
        time.sleep(1)
        job.reload()
        print(f'{store} ----> {job.state}')      

<b><font size='6'>Build Webscraper : Health and Beauty</font></b><br>
<font size='4'>Scrape Jumia Products and Load data into Google BigQuery.</font>

In [None]:
stores = [
    'https://www.jumia.co.ke/',
    'https://www.jumia.com.ng/'
]

for store in stores:
    print(f'E-Commerce ====> {store}')
    bigdata = pd.DataFrame()
    for i in trange(1,51):
        url = str(store) + 'health-beauty/?page=' + str(i) + '#catalog-listing'
            
        content = requests.get(url).text
        soup = BeautifulSoup(content,'lxml')
            
        pages = soup.find_all('div',class_='info')
        data = pd.DataFrame()
        for page in pages:
            try:
                item_name = page.find('h3',class_='name').text    
                ratings = page.find('div',class_='stars _s').text   
                try:
                    initial_price = page.find('div',class_='old').text
                except Exception as e:
                    initial_price = np.NAN
                final_price = page.find('div',class_='prc').text
                last_scraped = datetime.now()
                item_category = 'Health & Beauty'
                
                # Create DataFrame to Temporarily store each listing
                dataframe = pd.DataFrame({
                                        'item_name':[item_name],
                                        'item_category':[item_category],
                                        'ratings':[ratings],
                                        'initial_price':[initial_price],
                                        'final_price':[final_price],
                                        'last_scraped':[last_scraped]
                                        })
                data = pd.concat([data,dataframe],ignore_index=True)
                    
            except Exception as e:
                pass
                
        bigdata = pd.concat([bigdata,data],ignore_index=True)

    table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'
    job = client.load_table_from_dataframe(bigdata,table_id)

    while job.state != 'DONE':
        time.sleep(1)
        job.reload()
        print(f'{store} ----> {job.state}')      

<b><font size='6'>Build Webscraper : Home and Office</font></b><br>
<font size='4'>Scrape Jumia Products and Load data into Google BigQuery.</font>

In [None]:
stores = [
    'https://www.jumia.co.ke/',
    'https://www.jumia.com.ng/'
]

for store in stores:
    print(f'E-Commerce ====> {store}')
    bigdata = pd.DataFrame()
    for i in trange(1,51):
        url = str(store) + 'home-office/?page=' + str(i) + '#catalog-listing'
            
        content = requests.get(url).text
        soup = BeautifulSoup(content,'lxml')
            
        pages = soup.find_all('div',class_='info')
        data = pd.DataFrame()
        for page in pages:
            try:
                item_name = page.find('h3',class_='name').text    
                ratings = page.find('div',class_='stars _s').text   
                try:
                    initial_price = page.find('div',class_='old').text
                except Exception as e:
                    initial_price = np.NAN
                final_price = page.find('div',class_='prc').text
                last_scraped = datetime.now()
                item_category = 'Home & Office'
                
                # Create DataFrame to Temporarily store each listing
                dataframe = pd.DataFrame({
                                        'item_name':[item_name],
                                        'item_category':[item_category],
                                        'ratings':[ratings],
                                        'initial_price':[initial_price],
                                        'final_price':[final_price],
                                        'last_scraped':[last_scraped]
                                        })
                data = pd.concat([data,dataframe],ignore_index=True)
                    
            except Exception as e:
                pass
                
        bigdata = pd.concat([bigdata,data],ignore_index=True)

    table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'
    job = client.load_table_from_dataframe(bigdata,table_id)

    while job.state != 'DONE':
        time.sleep(1)
        job.reload()
        print(f'{store} ----> {job.state}')      

<b><font size='6'>Build Webscraper : Phones and Tablets</font></b><br>
<font size='4'>Scrape Jumia Products and Load data into Google BigQuery.</font>

In [None]:
stores = [
    'https://www.jumia.co.ke/',
    'https://www.jumia.com.ng/'
]

for store in stores:
    print(f'E-Commerce ====> {store}')
    bigdata = pd.DataFrame()
    for i in trange(1,51):
        url = str(store) + 'phones-tablets/?page=' + str(i) + '#catalog-listing'
            
        content = requests.get(url).text
        soup = BeautifulSoup(content,'lxml')
            
        pages = soup.find_all('div',class_='info')
        data = pd.DataFrame()
        for page in pages:
            try:
                item_name = page.find('h3',class_='name').text    
                ratings = page.find('div',class_='stars _s').text   
                try:
                    initial_price = page.find('div',class_='old').text
                except Exception as e:
                    initial_price = np.NAN
                final_price = page.find('div',class_='prc').text
                last_scraped = datetime.now()
                item_category = 'Phones & Tablets'
                
                # Create DataFrame to Temporarily store each listing
                dataframe = pd.DataFrame({
                                        'item_name':[item_name],
                                        'item_category':[item_category],
                                        'ratings':[ratings],
                                        'initial_price':[initial_price],
                                        'final_price':[final_price],
                                        'last_scraped':[last_scraped]
                                        })
                data = pd.concat([data,dataframe],ignore_index=True)
                    
            except Exception as e:
                pass
                
        bigdata = pd.concat([bigdata,data],ignore_index=True)

    table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'
    job = client.load_table_from_dataframe(bigdata,table_id)

    while job.state != 'DONE':
        time.sleep(1)
        job.reload()
        print(f'{store} ----> {job.state}')      

<b><font size='6'>Build Webscraper : Sporting</font></b><br>
<font size='4'>Scrape Jumia Products and Load data into Google BigQuery.</font>

In [None]:
stores = [
    'https://www.jumia.co.ke/',
    'https://www.jumia.com.ng/'
]

for store in stores:
    print(f'E-Commerce ====> {store}')
    bigdata = pd.DataFrame()
    for i in trange(1,51):
        url = str(store) + 'sporting-goods/?page=' + str(i) + '#catalog-listing'
            
        content = requests.get(url).text
        soup = BeautifulSoup(content,'lxml')
            
        pages = soup.find_all('div',class_='info')
        data = pd.DataFrame()
        for page in pages:
            try:
                item_name = page.find('h3',class_='name').text    
                ratings = page.find('div',class_='stars _s').text   
                try:
                    initial_price = page.find('div',class_='old').text
                except Exception as e:
                    initial_price = np.NAN
                final_price = page.find('div',class_='prc').text
                last_scraped = datetime.now()
                item_category = 'Sporting'
                
                # Create DataFrame to Temporarily store each listing
                dataframe = pd.DataFrame({
                                        'item_name':[item_name],
                                        'item_category':[item_category],
                                        'ratings':[ratings],
                                        'initial_price':[initial_price],
                                        'final_price':[final_price],
                                        'last_scraped':[last_scraped]
                                        })
                data = pd.concat([data,dataframe],ignore_index=True)
                    
            except Exception as e:
                pass
                
        bigdata = pd.concat([bigdata,data],ignore_index=True)

    table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'
    job = client.load_table_from_dataframe(bigdata,table_id)

    while job.state != 'DONE':
        time.sleep(1)
        job.reload()
        print(f'{store} ----> {job.state}')      

<b><font size='6'>Query data from Google Cloud Database.</font></b><br>
<font size='4'>Run SQL query to get data from database.</font>

In [None]:
# !pip install db-dtypes

In [None]:
table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'

In [None]:
# query data from google cloud bigquery
sql = (
'SELECT *'
'FROM `project-adrian-julius-aluoch.cronjobs.jumia_data`'
)

data = client.query(sql).to_dataframe()

# Check Total Number of Duplicate Records
duplicated = data.duplicated(subset=['item_name','item_category','ratings',
                                     'initial_price','final_price'
                                    ]).sum()
print(f"Initial Shape of Dataset : {data.shape}\nTotal Duplicate Records : {duplicated:,.0f}")

# Remove Duplicate Records
data.drop_duplicates(subset=['item_name','item_category','ratings',
                             'initial_price','final_price'
                            ],inplace=True)

print(f"Final Shape of Dataset : {data.shape}")

# Drop Original Real Estate Table 
table_id = 'project-adrian-julius-aluoch.cronjobs.jumia_data'
client.delete_table(table_id)

# Upload Final Real Estate Table
job = client.load_table_from_dataframe(data,table_id)
while job.state != 'DONE':
    time.sleep(1)
    job.reload()
    print(job.state)

In [None]:
display = data.iloc[:50].sort_values(by='last_scraped').reset_index(drop=True)
display