In [13]:
import pandas as pd

# Load the raw sales export. The path is relative, so it is resolved against
# the kernel's current working directory.
data = pd.read_csv('../data/Chocolate Sales.csv')
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,Sales Person,Country,Product,Date,Amount,Boxes Shipped
0,Jehu Rudeforth,UK,Mint Chip Choco,04-Jan-22,"$5,320",180
1,Van Tuxwell,India,85% Dark Bars,01-Aug-22,"$7,896",94
2,Gigi Bohling,India,Peanut Butter Cubes,07-Jul-22,"$4,501",91
3,Jan Morforth,Australia,Peanut Butter Cubes,27-Apr-22,"$12,726",342
4,Jehu Rudeforth,UK,Peanut Butter Cubes,24-Feb-22,"$13,685",184


# Nettoyage et Préparation des Données

### Standardisation des noms de colonnes

In [14]:
def standardisation(nom_colonne: object) -> str:
    """Convert a column label to a lower-case, underscore-separated name.

    The label is cast to ``str`` (so non-string labels such as integers are
    accepted), stripped of leading/trailing whitespace, lower-cased, and every
    remaining space is replaced by an underscore.

    Args:
        nom_colonne: Original column label.

    Returns:
        The normalised label, e.g. ``'Boxes Shipped'`` -> ``'boxes_shipped'``.
    """
    # strip() first so padded CSV headers do not yield leading/trailing "_".
    return str(nom_colonne).strip().lower().replace(' ', '_')

# Rename every column in a single pass: DataFrame.rename accepts a callable
# that is applied to each column label. Rebinding `data` replaces the previous
# per-column loop of rename(..., inplace=True) calls.
data = data.rename(columns=standardisation)

In [15]:
# Preview the first rows to check the standardised column names.
data.head()

Unnamed: 0,sales_person,country,product,date,amount,boxes_shipped
0,Jehu Rudeforth,UK,Mint Chip Choco,04-Jan-22,"$5,320",180
1,Van Tuxwell,India,85% Dark Bars,01-Aug-22,"$7,896",94
2,Gigi Bohling,India,Peanut Butter Cubes,07-Jul-22,"$4,501",91
3,Jan Morforth,Australia,Peanut Butter Cubes,27-Apr-22,"$12,726",342
4,Jehu Rudeforth,UK,Peanut Butter Cubes,24-Feb-22,"$13,685",184


### Formatage de la colonne ```date```

In [16]:
# Parse strings such as "04-Jan-22" (day, abbreviated month name, two-digit year).
# errors='coerce' turns any value that does not match the format into NaT
# instead of raising.
data['date'] = pd.to_datetime(data['date'], format='%d-%b-%y', errors='coerce')

In [17]:
# Preview the first rows to check the parsed `date` column.
data.head()

Unnamed: 0,sales_person,country,product,date,amount,boxes_shipped
0,Jehu Rudeforth,UK,Mint Chip Choco,2022-01-04,"$5,320",180
1,Van Tuxwell,India,85% Dark Bars,2022-08-01,"$7,896",94
2,Gigi Bohling,India,Peanut Butter Cubes,2022-07-07,"$4,501",91
3,Jan Morforth,Australia,Peanut Butter Cubes,2022-04-27,"$12,726",342
4,Jehu Rudeforth,UK,Peanut Butter Cubes,2022-02-24,"$13,685",184


### Formatage de la colonne ```amount```

In [18]:
# Turn strings such as "$5,320" into numbers.
# - regex=False makes both replacements literal: "$" is the end-of-string
#   anchor in a regular expression, and the default value of `regex` changed
#   in pandas 2.0, so relying on the default is version-dependent.
# - astype(str) keeps the cell re-runnable: once the column is numeric the
#   .str accessor would otherwise raise AttributeError.
# - errors='coerce' turns anything that is still not numeric into NaN.
data['amount'] = pd.to_numeric(
    data['amount']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',
)

In [19]:
# Preview the first rows to check the numeric `amount` column.
data.head()

Unnamed: 0,sales_person,country,product,date,amount,boxes_shipped
0,Jehu Rudeforth,UK,Mint Chip Choco,2022-01-04,5320,180
1,Van Tuxwell,India,85% Dark Bars,2022-08-01,7896,94
2,Gigi Bohling,India,Peanut Butter Cubes,2022-07-07,4501,91
3,Jan Morforth,Australia,Peanut Butter Cubes,2022-04-27,12726,342
4,Jehu Rudeforth,UK,Peanut Butter Cubes,2022-02-24,13685,184


### Vérification des Outliers

In [20]:
# Summary statistics (count, mean, min, quartiles, max, std) for the datetime
# and numeric columns, used here as a first look at the value ranges.
data.describe()

Unnamed: 0,date,amount,boxes_shipped
count,1094,1094.0,1094.0
mean,2022-05-03 09:04:56.160877568,5652.308044,161.797989
min,2022-01-03 00:00:00,7.0,1.0
25%,2022-03-02 00:00:00,2390.5,70.0
50%,2022-05-11 00:00:00,4868.5,135.0
75%,2022-07-04 00:00:00,8027.25,228.75
max,2022-08-31 00:00:00,22050.0,709.0
std,,4102.442014,121.544145


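Aucun outlier manifeste détecté : les minimums et maximums restent plausibles. À noter toutefois que les maximums (`amount` = 22 050, `boxes_shipped` = 709) dépassent le seuil Q3 + 1,5 × IQR (≈ 16 482 et ≈ 467) ; un boxplot permettrait de confirmer qu'il s'agit de ventes exceptionnelles et non d'erreurs de saisie.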

### Vérification du type des colonnes

In [21]:
# List the dtype of every column to confirm the conversions above took effect.
data.dtypes

sales_person             object
country                  object
product                  object
date             datetime64[ns]
amount                    int64
boxes_shipped             int64
dtype: object

Aucun problème à ce niveau

### Vérification des doublons

In [22]:
# Display every row whose six field values all match another row.
# keep=False flags all members of a duplicate group, not only the repeats.
data.loc[
    data.duplicated(
        subset=[
            'sales_person',
            'country',
            'product',
            'date',
            'amount',
            'boxes_shipped',
        ],
        keep=False,
    )
]

Unnamed: 0,sales_person,country,product,date,amount,boxes_shipped


Aucun doublon détecté

### Vérification des valeurs manquantes

In [23]:
# Count missing values per column (this includes any NaT/NaN produced by the
# errors='coerce' conversions of `date` and `amount`).
data.isnull().sum()

sales_person     0
country          0
product          0
date             0
amount           0
boxes_shipped    0
dtype: int64

Aucune valeur manquante !

### Sauvegarde du dataset nettoyé

In [24]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
# NOTE(review): CSV stores `date` as text, so a later read_csv presumably needs
# parse_dates=['date'] to get datetimes back — confirm in the consuming code.
data.to_csv('../data/data_cleaned.csv', index=False)