# Traitement & Nettoyages des donnees provenant de webscrapper

In [382]:
import pandas as pd # type: ignore
import numpy as np # type: ignore

## Je lis mes datasets

In [383]:
chiens = pd.read_csv("./data/chiens.csv")
moutons = pd.read_csv("./data/moutons.csv")
volailles = pd.read_csv("./data/volailles.csv")
other_animals = pd.read_csv("./data/other-animals.csv")

# Tratement du datasets Motos

## Obsertations

In [384]:
chiens.head()

Unnamed: 0,web-scraper-order,web-scraper-start-url,links,links-href,name,price,adresse,image-link-src
0,1739070143-1,https://sn.coinafrique.com/categorie/chiens?pa...,Pitt Bull red nose femelle,https://sn.coinafrique.com/annonce/chiens/pitt...,Pitt Bull red nose femelle,Prix sur demande,"Dakar, Sénégal","background-image: url(""https://images.coinafri..."
1,1739070148-2,https://sn.coinafrique.com/categorie/chiens?pa...,Chiots Pitbull,https://sn.coinafrique.com/annonce/chiens/chio...,Chiots Pitbull,250 000 CFA,"Grand Yoff, Dakar, Sénégal","background-image: url(""https://images.coinafri..."
2,1739070153-3,https://sn.coinafrique.com/categorie/chiens?pa...,Caniche bichon,https://sn.coinafrique.com/annonce/chiens/cani...,Caniche bichon,200 000 CFA,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri..."
3,1739070157-4,https://sn.coinafrique.com/categorie/chiens?pa...,Chiot Boerboel,https://sn.coinafrique.com/annonce/chiens/chio...,Chiot Boerboel,450 000 CFA,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri..."
4,1739070162-5,https://sn.coinafrique.com/categorie/chiens?pa...,Pitbull Red Nose,https://sn.coinafrique.com/annonce/chiens/pitb...,Pitbull Red Nose,220 000 CFA,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri..."


In [385]:
chiens["links-href"].iloc[0] 
# C'est pour voir si cette feature correspond au lien d'access de chaque page d'article -> c'est le cas

'https://sn.coinafrique.com/annonce/chiens/pitt-bull-red-nose-femelle-742530'

In [386]:
chiens.columns

Index(['web-scraper-order', 'web-scraper-start-url', 'links', 'links-href',
       'name', 'price', 'adresse', 'image-link-src'],
      dtype='object')

In [387]:
chiens.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   web-scraper-order      801 non-null    object
 1   web-scraper-start-url  801 non-null    object
 2   links                  801 non-null    object
 3   links-href             801 non-null    object
 4   name                   801 non-null    object
 5   price                  801 non-null    object
 6   adresse                801 non-null    object
 7   image-link-src         801 non-null    object
dtypes: object(8)
memory usage: 50.2+ KB


In [388]:
chiens.describe().transpose()

Unnamed: 0,count,unique,top,freq
web-scraper-order,801,801,1739073284-801,1
web-scraper-start-url,801,10,https://sn.coinafrique.com/categorie/chiens?pa...,84
links,801,389,Chiot Berger Allemand,35
links-href,801,801,https://sn.coinafrique.com/annonce/chiens/chio...,1
name,801,389,Chiot Berger Allemand,35
price,801,78,Prix sur demande,155
adresse,801,52,"Dakar, Sénégal",173
image-link-src,801,801,"background-image: url(""https://images.coinafri...",1


### <font color="yellow"> Traitement gnrale

In [389]:
all_df = {"chiens": chiens, "moutons": moutons, "volailles": volailles, "other-animals": other_animals}

In [390]:
# Je redefinis mes col pour prendre que les cols concernees
new_columns = ['links-href', 'name', 'price', 'adresse', 'image-link-src']

for key, df in all_df.items():

    new_df = df[new_columns]

    # Je renomme la col image-link-src to image-link, et links-href to page-link
    new_df = new_df.rename({'image-link-src': "image-link","links-href": "page-link"}, axis=1)

    # je valide dans le all_df original
    all_df[key] = new_df

In [391]:
# un coup d'oeil
all_df["chiens"].head()

Unnamed: 0,page-link,name,price,adresse,image-link
0,https://sn.coinafrique.com/annonce/chiens/pitt...,Pitt Bull red nose femelle,Prix sur demande,"Dakar, Sénégal","background-image: url(""https://images.coinafri..."
1,https://sn.coinafrique.com/annonce/chiens/chio...,Chiots Pitbull,250 000 CFA,"Grand Yoff, Dakar, Sénégal","background-image: url(""https://images.coinafri..."
2,https://sn.coinafrique.com/annonce/chiens/cani...,Caniche bichon,200 000 CFA,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri..."
3,https://sn.coinafrique.com/annonce/chiens/chio...,Chiot Boerboel,450 000 CFA,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri..."
4,https://sn.coinafrique.com/annonce/chiens/pitb...,Pitbull Red Nose,220 000 CFA,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri..."


#### Nettoyage des textes

In [392]:
chiens = all_df["chiens"]

In [393]:
chiens.head()

Unnamed: 0,page-link,name,price,adresse,image-link
0,https://sn.coinafrique.com/annonce/chiens/pitt...,Pitt Bull red nose femelle,Prix sur demande,"Dakar, Sénégal","background-image: url(""https://images.coinafri..."
1,https://sn.coinafrique.com/annonce/chiens/chio...,Chiots Pitbull,250 000 CFA,"Grand Yoff, Dakar, Sénégal","background-image: url(""https://images.coinafri..."
2,https://sn.coinafrique.com/annonce/chiens/cani...,Caniche bichon,200 000 CFA,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri..."
3,https://sn.coinafrique.com/annonce/chiens/chio...,Chiot Boerboel,450 000 CFA,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri..."
4,https://sn.coinafrique.com/annonce/chiens/pitb...,Pitbull Red Nose,220 000 CFA,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri..."


In [394]:
def get_price_numeric(price:str):
    return "".join(price.split(" ")[:-1])

In [395]:
# Mettre en forme "marque"
# Separer le price en deux col: "price -> float" ete "devise" -> object
for key, df in all_df.items():

    try:
        price = df["price"].apply(get_price_numeric)
        devise = df["price"].str.split(" ").str[-1]
    except Exception as e:
        price = "Sur demande"
        devise = np.nan

    all_df[key]["price"] = price
    all_df[key]["devise"] = devise

    # Je profite pour juste recuperer le nom du quartier dans "adresse"
    quartier = df["adresse"].str.split(" ").str[0]
    ville = df["adresse"].str.split(" ").str[-2]
    pays = df["adresse"].str.split(" ").str[-1]

    all_df[key]["quartier"] = quartier
    all_df[key]["ville"] = ville
    all_df[key]["pays"] = pays

    all_df[key]["adresse"] = df["adresse"].str.strip()


In [396]:
chiens["devise"] = chiens["devise"].replace("demande", "Non défini")
chiens["price"] = chiens["price"].replace("Prixsur", "Sur demande")

In [397]:
chiens.head()

Unnamed: 0,page-link,name,price,adresse,image-link,devise,quartier,ville,pays
0,https://sn.coinafrique.com/annonce/chiens/pitt...,Pitt Bull red nose femelle,Sur demande,"Dakar, Sénégal","background-image: url(""https://images.coinafri...",Non défini,"Dakar,","Dakar,",Sénégal
1,https://sn.coinafrique.com/annonce/chiens/chio...,Chiots Pitbull,250000,"Grand Yoff, Dakar, Sénégal","background-image: url(""https://images.coinafri...",CFA,Grand,"Dakar,",Sénégal
2,https://sn.coinafrique.com/annonce/chiens/cani...,Caniche bichon,200000,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri...",CFA,Mermoz-Sacré,"Dakar,",Sénégal
3,https://sn.coinafrique.com/annonce/chiens/chio...,Chiot Boerboel,450000,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri...",CFA,Mermoz-Sacré,"Dakar,",Sénégal
4,https://sn.coinafrique.com/annonce/chiens/pitb...,Pitbull Red Nose,220000,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri...",CFA,Mermoz-Sacré,"Dakar,",Sénégal


In [398]:
chiens[["name", "price", "devise", "quartier", "ville", "pays", "adresse"]]

Unnamed: 0,name,price,devise,quartier,ville,pays,adresse
0,Pitt Bull red nose femelle,Sur demande,Non défini,"Dakar,","Dakar,",Sénégal,"Dakar, Sénégal"
1,Chiots Pitbull,250000,CFA,Grand,"Dakar,",Sénégal,"Grand Yoff, Dakar, Sénégal"
2,Caniche bichon,200000,CFA,Mermoz-Sacré,"Dakar,",Sénégal,"Mermoz-Sacré Coeur, Dakar, Sénégal"
3,Chiot Boerboel,450000,CFA,Mermoz-Sacré,"Dakar,",Sénégal,"Mermoz-Sacré Coeur, Dakar, Sénégal"
4,Pitbull Red Nose,220000,CFA,Mermoz-Sacré,"Dakar,",Sénégal,"Mermoz-Sacré Coeur, Dakar, Sénégal"
...,...,...,...,...,...,...,...
796,Chiot,500000,CFA,"Yoff,","Dakar,",Sénégal,"Yoff, Dakar, Sénégal"
797,Chiot Rottweiler,350000,CFA,"Sacré-Coeur,","Dakar,",Sénégal,"Sacré-Coeur, Dakar, Sénégal"
798,Chiot Husky,350000,CFA,"Sacré-Coeur,","Dakar,",Sénégal,"Sacré-Coeur, Dakar, Sénégal"
799,Chiot Chihuahua,150000,CFA,"Diamniadio,","Diamniadio,",Sénégal,"Diamniadio, Sénégal"


In [399]:
chiens.head()

Unnamed: 0,page-link,name,price,adresse,image-link,devise,quartier,ville,pays
0,https://sn.coinafrique.com/annonce/chiens/pitt...,Pitt Bull red nose femelle,Sur demande,"Dakar, Sénégal","background-image: url(""https://images.coinafri...",Non défini,"Dakar,","Dakar,",Sénégal
1,https://sn.coinafrique.com/annonce/chiens/chio...,Chiots Pitbull,250000,"Grand Yoff, Dakar, Sénégal","background-image: url(""https://images.coinafri...",CFA,Grand,"Dakar,",Sénégal
2,https://sn.coinafrique.com/annonce/chiens/cani...,Caniche bichon,200000,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri...",CFA,Mermoz-Sacré,"Dakar,",Sénégal
3,https://sn.coinafrique.com/annonce/chiens/chio...,Chiot Boerboel,450000,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri...",CFA,Mermoz-Sacré,"Dakar,",Sénégal
4,https://sn.coinafrique.com/annonce/chiens/pitb...,Pitbull Red Nose,220000,"Mermoz-Sacré Coeur, Dakar, Sénégal","background-image: url(""https://images.coinafri...",CFA,Mermoz-Sacré,"Dakar,",Sénégal


# Traitement des valeurs manquantes

In [400]:
for key, df in all_df.items():
    print(f"{key}\n", df.isnull().sum())

chiens
 page-link     0
name          0
price         0
adresse       0
image-link    0
devise        0
quartier      0
ville         0
pays          0
dtype: int64
moutons
 page-link     0
name          0
price         0
adresse       0
image-link    0
devise        0
quartier      0
ville         0
pays          0
dtype: int64
volailles
 page-link      0
name           0
price          0
adresse        0
image-link     0
devise         0
quartier       0
ville         10
pays           0
dtype: int64
other-animals
 page-link     0
name          0
price         0
adresse       0
image-link    0
devise        0
quartier      0
ville         5
pays          0
dtype: int64


In [405]:
chiens = pd.read_csv("./data/chiens.csv")
moutons = pd.read_csv("./data/moutons.csv")
volailles = pd.read_csv("./data/volailles.csv")
other_animals = pd.read_csv("./data/other-animals.csv")

# Sauvegarde des donnees nettoyees

In [406]:
for key, df in all_df.items():
    df.to_csv(f"./data-cleaned/{key}.csv", index=False)