In [2]:

import requests
from bs4 import BeautifulSoup
import csv
import time
import re

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

BASE_DOMAIN = "https://www.jumia.com.ng"


def parse_price(text):
    if not text:
        return None
    cleaned = re.sub(r"[^\d]", "", text)
    return float(cleaned) if cleaned else None


def scrape_products_from_page(url):
    resp = requests.get(url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    products = []

    for card in soup.select("article.prd"):
        name = card.select_one("h3.name")
        name = name.text.strip() if name else None

        link = card.find("a", href=True)
        product_url = BASE_DOMAIN + link["href"] if link else None

        price = card.select_one("div.prc")
        price = parse_price(price.text) if price else None

        old_price = card.select_one("div.old")
        old_price = parse_price(old_price.text) if old_price else None

        discount_pct = None
        if price and old_price and old_price > 0:
            discount_pct = round((old_price - price) / old_price * 100, 2)

        # rating from CSS variable
        rating = None
        stars = card.select_one("div.stars")
        if stars and "style" in stars.attrs:
            match = re.search(r"--rating:(\d+(\.\d+)?)", stars["style"])
            if match:
                rating = float(match.group(1))

        products.append({
            "name": name,
            "url": product_url,
            "price": price,
            "old_price": old_price,
            "discount_pct": discount_pct,
            "rating": rating
        })

    # Pagination
    next_page = soup.select_one('a[aria-label="Next"]')
    next_url = BASE_DOMAIN + next_page["href"] if next_page else None

    return products, next_url


def scrape_jumia_smartphones(max_pages=5, delay=2):
    url = "https://www.jumia.com.ng/catalog/?q=smartphones"
    page = 1
    all_products = []

    while url and page <= max_pages:
        print(f"Scraping page {page}: {url}")
        products, url = scrape_products_from_page(url)
        all_products.extend(products)
        page += 1
        time.sleep(delay)

    return all_products


if __name__ == "__main__":
    products = scrape_jumia_smartphones(max_pages=3)

    with open("jumia_smartphones_full.csv", "w", newline="", encoding="utf-8") as f:
        fieldnames = products[0].keys() if products else []
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(products)

    print(f"✅ Total products scraped: {len(products)}")



Scraping page 1: https://www.jumia.com.ng/catalog/?q=smartphones


ConnectionError: HTTPSConnectionPool(host='www.jumia.com.ng', port=443): Max retries exceeded with url: /catalog/?q=smartphones (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000230DA2BB250>: Failed to resolve 'www.jumia.com.ng' ([Errno 11001] getaddrinfo failed)"))

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 

In [4]:
df=pd.read_csv('jumia_smartphones_full_data.csv')

In [5]:
df

Unnamed: 0,name,brand,price,old_price,discount_percent,rating,specs,product_link
0,XIAOMI REDMI A5 - 6.88 4GB RAM/128GB ROM -...,XIAOMI,102000,111742.0,8.7,,—,https://www.jumia.com.ng/xiaomi-redmi-a5-6.88-...
1,XIAOMI Redmi 15C 6.9'' 4GB RAM/128GB ROM Andro...,XIAOMI,123643,136125.0,9.2,,—,https://www.jumia.com.ng/xiaomi-redmi-15c-6.9-...
2,XIAOMI REDMI A5 - 6.88 3GB RAM/64GB ROM --...,XIAOMI,91634,98036.0,6.5,,—,https://www.jumia.com.ng/xiaomi-redmi-a5-6.88-...
3,"itel 2165 Wireless FM, Torch, Dual SIM Phone +...",itel,8950,18000.0,50.3,,—,https://www.jumia.com.ng/customer/account/logi...
4,XIAOMI Redmi 15C 6.9'' 4GBRAM/128GB ROM Androi...,XIAOMI,123643,144235.0,14.3,,—,https://www.jumia.com.ng/xiaomi-redmi-15c-6.9-...
5,XIAOMI Redmi 15C 6.9'' 8GBRAM/256GB ROM Androi...,XIAOMI,161778,175642.0,7.9,,—,https://www.jumia.com.ng/redmi-15c-6.9-8gbram2...
6,"Samsung Galaxy A07 6.7"" 4GB RAM/64GBGB ROM - B...",Samsung,121011,135476.0,10.7,,—,https://www.jumia.com.ng/samsung-galaxy-a07-6....
7,"Sunelan S8 1.8"" Screen Phone, Dual SIM, Game, ...",Sunelan,6646,7188.0,7.5,,—,https://www.jumia.com.ng/customer/account/logi...
8,"itel City 100 7.65mm"" Slim Design 5200mah And...",itel,109900,120910.0,9.1,,—,https://www.jumia.com.ng/itel-city-100-7.65mm-...
9,"Poco C71 6.88"" 3GB RAM / 64GB ROM Android 15 ...",Poco,81316,92732.0,12.3,,—,https://www.jumia.com.ng/poco-c71-6.88-3gb-ram...


In [6]:
df.shape

(40, 8)

In [7]:
df = df.drop('product_link',axis=1)


In [8]:
df

Unnamed: 0,name,brand,price,old_price,discount_percent,rating,specs
0,XIAOMI REDMI A5 - 6.88 4GB RAM/128GB ROM -...,XIAOMI,102000,111742.0,8.7,,—
1,XIAOMI Redmi 15C 6.9'' 4GB RAM/128GB ROM Andro...,XIAOMI,123643,136125.0,9.2,,—
2,XIAOMI REDMI A5 - 6.88 3GB RAM/64GB ROM --...,XIAOMI,91634,98036.0,6.5,,—
3,"itel 2165 Wireless FM, Torch, Dual SIM Phone +...",itel,8950,18000.0,50.3,,—
4,XIAOMI Redmi 15C 6.9'' 4GBRAM/128GB ROM Androi...,XIAOMI,123643,144235.0,14.3,,—
5,XIAOMI Redmi 15C 6.9'' 8GBRAM/256GB ROM Androi...,XIAOMI,161778,175642.0,7.9,,—
6,"Samsung Galaxy A07 6.7"" 4GB RAM/64GBGB ROM - B...",Samsung,121011,135476.0,10.7,,—
7,"Sunelan S8 1.8"" Screen Phone, Dual SIM, Game, ...",Sunelan,6646,7188.0,7.5,,—
8,"itel City 100 7.65mm"" Slim Design 5200mah And...",itel,109900,120910.0,9.1,,—
9,"Poco C71 6.88"" 3GB RAM / 64GB ROM Android 15 ...",Poco,81316,92732.0,12.3,,—


In [9]:
df = df.drop(['rating',	'specs'], axis=1) # drop multiple column 


In [10]:
df

Unnamed: 0,name,brand,price,old_price,discount_percent
0,XIAOMI REDMI A5 - 6.88 4GB RAM/128GB ROM -...,XIAOMI,102000,111742.0,8.7
1,XIAOMI Redmi 15C 6.9'' 4GB RAM/128GB ROM Andro...,XIAOMI,123643,136125.0,9.2
2,XIAOMI REDMI A5 - 6.88 3GB RAM/64GB ROM --...,XIAOMI,91634,98036.0,6.5
3,"itel 2165 Wireless FM, Torch, Dual SIM Phone +...",itel,8950,18000.0,50.3
4,XIAOMI Redmi 15C 6.9'' 4GBRAM/128GB ROM Androi...,XIAOMI,123643,144235.0,14.3
5,XIAOMI Redmi 15C 6.9'' 8GBRAM/256GB ROM Androi...,XIAOMI,161778,175642.0,7.9
6,"Samsung Galaxy A07 6.7"" 4GB RAM/64GBGB ROM - B...",Samsung,121011,135476.0,10.7
7,"Sunelan S8 1.8"" Screen Phone, Dual SIM, Game, ...",Sunelan,6646,7188.0,7.5
8,"itel City 100 7.65mm"" Slim Design 5200mah And...",itel,109900,120910.0,9.1
9,"Poco C71 6.88"" 3GB RAM / 64GB ROM Android 15 ...",Poco,81316,92732.0,12.3


In [11]:
df.describe()

Unnamed: 0,price,old_price,discount_percent
count,40.0,39.0,39.0
mean,103731.45,123110.230769,15.982051
std,45359.886834,52297.989021,13.25717
min,6646.0,7188.0,0.9
25%,85358.25,105000.0,7.4
50%,108945.0,127990.0,9.5
75%,127990.0,153797.5,21.45
max,206513.0,216839.0,52.4


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              40 non-null     object 
 1   brand             40 non-null     object 
 2   price             40 non-null     int64  
 3   old_price         39 non-null     float64
 4   discount_percent  39 non-null     float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.7+ KB


In [13]:
 df['name'].str.split('-', n=1).str[0].str.strip()


0                                       XIAOMI REDMI A5
1     XIAOMI Redmi 15C 6.9'' 4GB RAM/128GB ROM Andro...
2                                       XIAOMI REDMI A5
3     itel 2165 Wireless FM, Torch, Dual SIM Phone +...
4     XIAOMI Redmi 15C 6.9'' 4GBRAM/128GB ROM Androi...
5     XIAOMI Redmi 15C 6.9'' 8GBRAM/256GB ROM Androi...
6            Samsung Galaxy A07 6.7" 4GB RAM/64GBGB ROM
7     Sunelan S8 1.8" Screen Phone, Dual SIM, Game, ...
8     itel City 100  7.65mm" Slim Design 5200mah And...
9         Poco C71  6.88" 3GB RAM / 64GB ROM Android 15
10    Samsung Galaxy A06 6.7" 4GB RAM/64GB ROM Andro...
11            Samsung Galaxy A07 6.7" 4GB RAM/128GB ROM
12    itel City 100  7.65mm" Slim Design 5200mah And...
13    itel City 100  7.65mm" Slim Design 5200mah And...
14       Poco C71  6.88" 4GB RAM / 128GB ROM Android 15
15    Tecno Spark 40 6.67" 4GB RAM/128GB ROM Android 15
16    XIAOMI Redmi A3 Pro 6.88" 4GB RAM / 128GB ROM ...
17    itel A100 6.75'' 90hz Bright Display IP65 

In [14]:
df

Unnamed: 0,name,brand,price,old_price,discount_percent
0,XIAOMI REDMI A5 - 6.88 4GB RAM/128GB ROM -...,XIAOMI,102000,111742.0,8.7
1,XIAOMI Redmi 15C 6.9'' 4GB RAM/128GB ROM Andro...,XIAOMI,123643,136125.0,9.2
2,XIAOMI REDMI A5 - 6.88 3GB RAM/64GB ROM --...,XIAOMI,91634,98036.0,6.5
3,"itel 2165 Wireless FM, Torch, Dual SIM Phone +...",itel,8950,18000.0,50.3
4,XIAOMI Redmi 15C 6.9'' 4GBRAM/128GB ROM Androi...,XIAOMI,123643,144235.0,14.3
5,XIAOMI Redmi 15C 6.9'' 8GBRAM/256GB ROM Androi...,XIAOMI,161778,175642.0,7.9
6,"Samsung Galaxy A07 6.7"" 4GB RAM/64GBGB ROM - B...",Samsung,121011,135476.0,10.7
7,"Sunelan S8 1.8"" Screen Phone, Dual SIM, Game, ...",Sunelan,6646,7188.0,7.5
8,"itel City 100 7.65mm"" Slim Design 5200mah And...",itel,109900,120910.0,9.1
9,"Poco C71 6.88"" 3GB RAM / 64GB ROM Android 15 ...",Poco,81316,92732.0,12.3


In [15]:
df['RAM'] = df['name'].str.extract(r'(\d+)\s*GB\s*RAM', expand=False) # create new column for RAM 

In [16]:
df

Unnamed: 0,name,brand,price,old_price,discount_percent,RAM
0,XIAOMI REDMI A5 - 6.88 4GB RAM/128GB ROM -...,XIAOMI,102000,111742.0,8.7,4.0
1,XIAOMI Redmi 15C 6.9'' 4GB RAM/128GB ROM Andro...,XIAOMI,123643,136125.0,9.2,4.0
2,XIAOMI REDMI A5 - 6.88 3GB RAM/64GB ROM --...,XIAOMI,91634,98036.0,6.5,3.0
3,"itel 2165 Wireless FM, Torch, Dual SIM Phone +...",itel,8950,18000.0,50.3,
4,XIAOMI Redmi 15C 6.9'' 4GBRAM/128GB ROM Androi...,XIAOMI,123643,144235.0,14.3,4.0
5,XIAOMI Redmi 15C 6.9'' 8GBRAM/256GB ROM Androi...,XIAOMI,161778,175642.0,7.9,8.0
6,"Samsung Galaxy A07 6.7"" 4GB RAM/64GBGB ROM - B...",Samsung,121011,135476.0,10.7,4.0
7,"Sunelan S8 1.8"" Screen Phone, Dual SIM, Game, ...",Sunelan,6646,7188.0,7.5,
8,"itel City 100 7.65mm"" Slim Design 5200mah And...",itel,109900,120910.0,9.1,
9,"Poco C71 6.88"" 3GB RAM / 64GB ROM Android 15 ...",Poco,81316,92732.0,12.3,3.0


In [17]:
df['ROM'] = df['name'].str.extract(r'(\d+)\s*GB\s*(?:ROM|Storage)', expand=False) # create a new column for ROM


In [18]:
df

Unnamed: 0,name,brand,price,old_price,discount_percent,RAM,ROM
0,XIAOMI REDMI A5 - 6.88 4GB RAM/128GB ROM -...,XIAOMI,102000,111742.0,8.7,4.0,128.0
1,XIAOMI Redmi 15C 6.9'' 4GB RAM/128GB ROM Andro...,XIAOMI,123643,136125.0,9.2,4.0,128.0
2,XIAOMI REDMI A5 - 6.88 3GB RAM/64GB ROM --...,XIAOMI,91634,98036.0,6.5,3.0,64.0
3,"itel 2165 Wireless FM, Torch, Dual SIM Phone +...",itel,8950,18000.0,50.3,,
4,XIAOMI Redmi 15C 6.9'' 4GBRAM/128GB ROM Androi...,XIAOMI,123643,144235.0,14.3,4.0,128.0
5,XIAOMI Redmi 15C 6.9'' 8GBRAM/256GB ROM Androi...,XIAOMI,161778,175642.0,7.9,8.0,256.0
6,"Samsung Galaxy A07 6.7"" 4GB RAM/64GBGB ROM - B...",Samsung,121011,135476.0,10.7,4.0,
7,"Sunelan S8 1.8"" Screen Phone, Dual SIM, Game, ...",Sunelan,6646,7188.0,7.5,,
8,"itel City 100 7.65mm"" Slim Design 5200mah And...",itel,109900,120910.0,9.1,,
9,"Poco C71 6.88"" 3GB RAM / 64GB ROM Android 15 ...",Poco,81316,92732.0,12.3,3.0,64.0


In [19]:
total_nan = df.isna().sum().sum()  # total NaN in the allcoulumns and row 
print(total_nan)


32


In [21]:
df['name'] = (
    df['name']
    .str.replace(r'\s*\d+(\.\d+)?\".*', '', regex=True)
    .str.replace(r'\s*\d+GB.*', '', regex=True)
    .str.strip()
)


In [22]:
df

Unnamed: 0,name,brand,price,old_price,discount_percent,RAM,ROM
0,XIAOMI REDMI A5 - 6.88,XIAOMI,102000,111742.0,8.7,4.0,128.0
1,XIAOMI Redmi 15C 6.9'',XIAOMI,123643,136125.0,9.2,4.0,128.0
2,XIAOMI REDMI A5 - 6.88,XIAOMI,91634,98036.0,6.5,3.0,64.0
3,"itel 2165 Wireless FM, Torch, Dual SIM Phone +...",itel,8950,18000.0,50.3,,
4,XIAOMI Redmi 15C 6.9'',XIAOMI,123643,144235.0,14.3,4.0,128.0
5,XIAOMI Redmi 15C 6.9'',XIAOMI,161778,175642.0,7.9,8.0,256.0
6,Samsung Galaxy A07,Samsung,121011,135476.0,10.7,4.0,
7,Sunelan S8,Sunelan,6646,7188.0,7.5,,
8,"itel City 100 7.65mm"" Slim Design 5200mah And...",itel,109900,120910.0,9.1,,
9,Poco C71,Poco,81316,92732.0,12.3,3.0,64.0


In [23]:
df['name'] = (
    df['name']
    .str.upper()
    .str.replace(r'\s*\d+(\.\d+)?\".*', '', regex=True)  # remove screen size
    .str.replace(r'\s*\d+GB.*', '', regex=True)         # remove RAM/ROM
    .str.replace(r'\s*ANDROID.*', '', regex=True)       # remove Android text
    .str.replace(r'\s*-.*', '', regex=True)             # remove hyphen text
    .str.strip()
)


In [24]:
df

Unnamed: 0,name,brand,price,old_price,discount_percent,RAM,ROM
0,XIAOMI REDMI A5,XIAOMI,102000,111742.0,8.7,4.0,128.0
1,XIAOMI REDMI 15C 6.9'',XIAOMI,123643,136125.0,9.2,4.0,128.0
2,XIAOMI REDMI A5,XIAOMI,91634,98036.0,6.5,3.0,64.0
3,"ITEL 2165 WIRELESS FM, TORCH, DUAL SIM PHONE +...",itel,8950,18000.0,50.3,,
4,XIAOMI REDMI 15C 6.9'',XIAOMI,123643,144235.0,14.3,4.0,128.0
5,XIAOMI REDMI 15C 6.9'',XIAOMI,161778,175642.0,7.9,8.0,256.0
6,SAMSUNG GALAXY A07,Samsung,121011,135476.0,10.7,4.0,
7,SUNELAN S8,Sunelan,6646,7188.0,7.5,,
8,"ITEL CITY 100 7.65MM"" SLIM DESIGN 5200MAH",itel,109900,120910.0,9.1,,
9,POCO C71,Poco,81316,92732.0,12.3,3.0,64.0


In [27]:
df=df.dropna(how='any')

In [28]:
df

Unnamed: 0,name,brand,price,old_price,discount_percent,RAM,ROM
0,XIAOMI REDMI A5,XIAOMI,102000,111742.0,8.7,4,128
1,XIAOMI REDMI 15C 6.9'',XIAOMI,123643,136125.0,9.2,4,128
2,XIAOMI REDMI A5,XIAOMI,91634,98036.0,6.5,3,64
4,XIAOMI REDMI 15C 6.9'',XIAOMI,123643,144235.0,14.3,4,128
5,XIAOMI REDMI 15C 6.9'',XIAOMI,161778,175642.0,7.9,8,256
9,POCO C71,Poco,81316,92732.0,12.3,3,64
10,SAMSUNG GALAXY A06,Samsung,97632,118264.0,17.4,4,64
11,SAMSUNG GALAXY A07,Samsung,135000,150150.0,10.1,4,128
14,POCO C71,Poco,94789,104778.0,9.5,4,128
15,TECNO SPARK 40,Tecno,144210,151422.0,4.8,4,128


In [32]:
df.loc[:, 'name'] = (
    df['name']
    .astype(str)
    .str.replace('"', '', regex=False)
    .str.strip()
)


In [33]:
df

Unnamed: 0,name,brand,price,old_price,discount_percent,RAM,ROM
0,XIAOMI REDMI A5,XIAOMI,102000,111742.0,8.7,4,128
1,XIAOMI REDMI 15C 6.9'',XIAOMI,123643,136125.0,9.2,4,128
2,XIAOMI REDMI A5,XIAOMI,91634,98036.0,6.5,3,64
4,XIAOMI REDMI 15C 6.9'',XIAOMI,123643,144235.0,14.3,4,128
5,XIAOMI REDMI 15C 6.9'',XIAOMI,161778,175642.0,7.9,8,256
9,POCO C71,Poco,81316,92732.0,12.3,3,64
10,SAMSUNG GALAXY A06,Samsung,97632,118264.0,17.4,4,64
11,SAMSUNG GALAXY A07,Samsung,135000,150150.0,10.1,4,128
14,POCO C71,Poco,94789,104778.0,9.5,4,128
15,TECNO SPARK 40,Tecno,144210,151422.0,4.8,4,128


In [40]:
df = df.drop_duplicates(subset=['name'], keep='first')

In [41]:
df

Unnamed: 0,name,brand,price,old_price,discount_percent,RAM,ROM
0,XIAOMI REDMI A5,XIAOMI,102000,111742.0,8.7,4,128
1,XIAOMI REDMI 15C 6.9'',XIAOMI,123643,136125.0,9.2,4,128
9,POCO C71,Poco,81316,92732.0,12.3,3,64
10,SAMSUNG GALAXY A06,Samsung,97632,118264.0,17.4,4,64
11,SAMSUNG GALAXY A07,Samsung,135000,150150.0,10.1,4,128
15,TECNO SPARK 40,Tecno,144210,151422.0,4.8,4,128
16,XIAOMI REDMI A3 PRO,XIAOMI,116895,127990.0,8.7,4,128
21,ITEL POWER 70 6.67'' HD,itel,123287,135997.0,9.3,8,128
27,XIAOMI REDMI 15 6.9'',XIAOMI,206513,216839.0,4.8,8,256
35,POCO C61,Poco,81476,92732.0,12.1,3,64


In [42]:
df.to_csv('cleaned_phones.csv', index=False)

In [43]:
df.to_excel('clean_phone.xlsx',index=False)

In [44]:
df.describe()

Unnamed: 0,price,old_price,discount_percent
count,11.0,11.0,11.0
mean,122613.727273,134805.545455,9.590909
std,35129.198021,34401.057589,3.533116
min,81316.0,92732.0,4.8
25%,99816.0,115003.0,8.4
50%,123287.0,135997.0,9.2
75%,135889.5,149509.0,11.1
max,206513.0,216839.0,17.4


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22 entries, 0 to 38
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              22 non-null     object 
 1   brand             22 non-null     object 
 2   price             22 non-null     int64  
 3   old_price         22 non-null     float64
 4   discount_percent  22 non-null     float64
 5   RAM               22 non-null     object 
 6   ROM               22 non-null     object 
dtypes: float64(2), int64(1), object(4)
memory usage: 1.4+ KB
