In [184]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [185]:
files = os.listdir('../data/raw')[1:]

all_data = []
for file in files:
    file_path = '../data/raw/' + file
    df = pd.read_csv(file_path, sep=',')
    all_data.append(df)

df = pd.concat(all_data, ignore_index=True)

In [186]:
df.head()

Unnamed: 0,product_image,product_name,product_brand,product_price_after_discount,product_price_before_discount,discount,rating,product_url,category_name,sub_category_title
0,https://hanoutedz.com/wp-content/uploads/2016/...,ipsa quae ab,,,152.0,,,https://hanoutedz.com/produit/ipsa-quae-ab/,Bathroom,
1,https://hanoutedz.com/wp-content/uploads/2016/...,eaque ipsa quae,,70.0,60.0,,,https://hanoutedz.com/produit/eaque-ipsa-quae/,Bathroom,
2,https://hanoutedz.com/wp-content/uploads/2016/...,quia voluptas sit,,150.0,200.0,,,https://hanoutedz.com/produit/quia-voluptas-si...,Bathroom,
3,https://hanoutedz.com/wp-content/uploads/2016/...,beatae vitae dicta,,,150.0,,,https://hanoutedz.com/produit/beatae-vitae-dic...,Bathroom,
4,https://hanoutedz.com/wp-content/uploads/2016/...,qui ratione volup,,,100.0,,,https://hanoutedz.com/produit/qui-ratione-volu...,Bathroom,


In [187]:
df.shape

(2774, 10)

In [188]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774 entries, 0 to 2773
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   product_image                  2774 non-null   object 
 1   product_name                   2774 non-null   object 
 2   product_brand                  2347 non-null   object 
 3   product_price_after_discount   2603 non-null   object 
 4   product_price_before_discount  1734 non-null   object 
 5   discount                       1287 non-null   float64
 6   rating                         2325 non-null   float64
 7   product_url                    2774 non-null   object 
 8   category_name                  2774 non-null   object 
 9   sub_category_title             2325 non-null   object 
dtypes: float64(2), object(8)
memory usage: 216.8+ KB


In [189]:
df.product_price_before_discount

0       152.0
1        60.0
2       200.0
3       150.0
4       100.0
        ...  
2769      NaN
2770      NaN
2771    3,500
2772      NaN
2773      NaN
Name: product_price_before_discount, Length: 2774, dtype: object

In [190]:
df.product_price_after_discount

0         NaN
1        70.0
2       150.0
3         NaN
4         NaN
        ...  
2769    3,500
2770    3,000
2771    2,990
2772    3,000
2773    1,950
Name: product_price_after_discount, Length: 2774, dtype: object

After diplaying the general information of the data, we can notice that the columns **product_price_before_discount** and **product_price_after_discount** are stored as object types instead of nmerical types, even though they represent prices. This indicates that they should be converted to numeric values for proper analysis 

In [191]:
%%html
<!doctype html>
<html lang="en">

<body>
  <article class="card">
    <div class="eyebrow"><span class="dot"></span>EDA Finding</div>
    <h1>Price columns stored as text instead of numbers</h1>
    <p>
      After displaying the dataset’s general information, we noticed that
      the columns <span class="chip">product_price_before_discount</span>
      and <span class="chip">product_price_after_discount</span> are stored as
      <span class="kbd">object</span> types even though they represent prices.
      These should be converted to numeric values for accurate analysis and modeling.
    </p>
    <p class="hint">Tip: strip currency symbols and thousand separators, then cast with
      <span class="kbd">pd.to_numeric(errors="coerce")</span>.
    </p>
    <div class="chips">
      <span class="chip">Convert to float</span>
      <span class="chip">Handle missing</span>
      <span class="chip">Normalize format</span>
    </div>
  </article>
</body>
</html>


In [192]:
df['product_price_before_discount'].map(type).value_counts()

product_price_before_discount
<class 'float'>    1487
<class 'str'>      1287
Name: count, dtype: int64

In [193]:
df['product_price_after_discount'].map(type).value_counts()

product_price_after_discount
<class 'str'>      2325
<class 'float'>     449
Name: count, dtype: int64

we notice that the two columns contain mixed types (float and str) so we need to change only the str

In [194]:
df[df['product_price_before_discount'].apply(lambda x: isinstance(x, str))]

Unnamed: 0,product_image,product_name,product_brand,product_price_after_discount,product_price_before_discount,discount,rating,product_url,category_name,sub_category_title
449,https://dz.jumia.is/unsafe/fit-in/500x500/filt...,Lumière ( lot de 3 lampes) LED réglable sous-m...,Notification sur la confidentialité et les co...,829,2950,72.0,3.3,https://www.jumia.com.dz/lumiere-lot-de-3-lamp...,Électro-ménager,Accessoires High Tech
450,https://dz.jumia.is/unsafe/fit-in/500x500/filt...,"Magcubic Mini Projecteur, HY300 Data show Port...",Notification sur la confidentialité et les co...,18400,22500,18.0,4.6,https://www.jumia.com.dz/magcubic-mini-project...,Électro-ménager,Accessoires High Tech
456,https://dz.jumia.is/unsafe/fit-in/500x500/filt...,Uniross 4 x Pile rechargeable Lithium AA USB-C...,Notification sur la confidentialité et les co...,4750,5200,9.0,4.5,https://www.jumia.com.dz/uniross-4-x-pile-rech...,Électro-ménager,Accessoires High Tech
457,https://dz.jumia.is/unsafe/fit-in/500x500/filt...,Yunteng Trépied Mini Poulpe (Pieuvre) pour Cam...,Notification sur la confidentialité et les co...,1999,2550,22.0,0.0,https://www.jumia.com.dz/yunteng-trepied-mini-...,Électro-ménager,Accessoires High Tech
459,https://dz.jumia.is/unsafe/fit-in/500x500/filt...,Uniross 4 x Pile rechargeable Lithium AAA USB-...,Notification sur la confidentialité et les co...,4750,5200,9.0,5.0,https://www.jumia.com.dz/uniross-4-x-pile-rech...,Électro-ménager,Accessoires High Tech
...,...,...,...,...,...,...,...,...,...,...
2740,https://dz.jumia.is/unsafe/fit-in/500x500/filt...,Compresseur Gonflleur Voitures 1 Piston 12V.,Notification sur la confidentialité et les co...,2490,3200,22.0,3.0,https://www.jumia.com.dz/generique-compresseur...,Auto & Moto,Pneus & Jantes
2743,https://dz.jumia.is/unsafe/fit-in/500x500/filt...,Michelin MANO GONFLEUR de 0.7 à 11 bars R-18...,Notification sur la confidentialité et les co...,15500,16000,3.0,0.0,https://www.jumia.com.dz/michelin-mano-gonfleu...,Auto & Moto,Pneus & Jantes
2744,https://dz.jumia.is/unsafe/fit-in/500x500/filt...,TOTAL Mini Compremsseur D'Air Auto 12V De Voit...,Notification sur la confidentialité et les co...,7500,9500,21.0,0.0,https://www.jumia.com.dz/total-mini-compremsse...,Auto & Moto,Pneus & Jantes
2765,https://dz.jumia.is/unsafe/fit-in/500x500/filt...,Kit De Réparation De Pare-Brise De Voiture Ave...,Notification sur la confidentialité et les co...,999,1480,33.0,2.0,https://www.jumia.com.dz/generique-kit-de-repa...,Auto & Moto,Voiture


In [195]:
def convert_to_float(price_str):
    if type(price_str) == float:
        pass
    else:
        price_str = price_str.replace(',', '')
        price = float(price_str)
        return price

In [196]:
df.product_price_before_discount = df.product_price_before_discount.apply(convert_to_float)
df.product_price_after_discount = df.product_price_after_discount.apply(convert_to_float)


In [197]:
print('product_price_before_discount type is ',df.product_price_before_discount.dtype)
print('product_price_after_discount type is ',df.product_price_after_discount.dtype)


product_price_before_discount type is  float64
product_price_after_discount type is  float64


In [198]:
df.isnull().sum()

product_image                       0
product_name                        0
product_brand                     427
product_price_after_discount      449
product_price_before_discount    1487
discount                         1487
rating                            449
product_url                         0
category_name                       0
sub_category_title                449
dtype: int64

we notice that some columns have a lot of null values, for further analysis, we use the following strategy to replace these missing values:
* **product_brand**: since there is no other way to get the brand from the websites, we chose to replace it with "No Data"
* **pruduct_price_before_discount** : if there is a discount than we calculate it if not then it's the same after the discount
* **pruduct_price_after_discount** : if there is a discount than we calculate it if not then it's the same before the discount
* **discount** : if there is the price before and after then we calculate it else 0
* **rating** : replace it by 0 since it means that no one voted
* **sub _category_title** : since there is no other way to get the brand from the websites, we chose to replace it with "No Data"

### **Visualize data**

### **export data**

In [199]:
df.to_csv('../data/processed/ecommerce_dz_data.csv')