In [5]:
# Importation des bibliothèque
import numpy as np 
import pandas as pd 
import datetime as dt

In [6]:
# Fais apparaitre l'ensemble des informations
pd.set_option("display.max_rows", 5000)
pd.set_option("display.max_column", 70)
pd.set_option("display.max_colwidth", 100)

In [7]:
# Lecture des données
data = pd.read_csv('../data/raw/dirty_cafe_sales.csv')
data = data.sort_values(by='Transaction Date')
data

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
7152,TXN_6566716,Coffee,ERROR,2.0,2.0,Credit Card,,2023-01-01
8885,TXN_1581562,Coffee,2,2.0,4.0,Cash,In-store,2023-01-01
1806,TXN_2192787,Sandwich,5,4.0,20.0,Cash,In-store,2023-01-01
2244,TXN_5358805,Coffee,5,2.0,10.0,Digital Wallet,ERROR,2023-01-01
7285,TXN_1604072,Coffee,2,2.0,4.0,,,2023-01-01
...,...,...,...,...,...,...,...,...
9769,TXN_9686177,Cake,3,3.0,9.0,,In-store,
9833,TXN_5536245,Smoothie,4,4.0,16.0,Cash,,
9885,TXN_4659954,,3,4.0,12.0,Credit Card,In-store,
9931,TXN_8344810,Smoothie,2,4.0,8.0,,UNKNOWN,


### Nettoyage de la dataset

In [8]:
# Remplacer les ERROR et UNKNOW par des valeur null
data = data.replace(['ERROR', 'UNKNOWN'], np.nan)
data

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
7152,TXN_6566716,Coffee,,2.0,2.0,Credit Card,,2023-01-01
8885,TXN_1581562,Coffee,2,2.0,4.0,Cash,In-store,2023-01-01
1806,TXN_2192787,Sandwich,5,4.0,20.0,Cash,In-store,2023-01-01
2244,TXN_5358805,Coffee,5,2.0,10.0,Digital Wallet,,2023-01-01
7285,TXN_1604072,Coffee,2,2.0,4.0,,,2023-01-01
...,...,...,...,...,...,...,...,...
9769,TXN_9686177,Cake,3,3.0,9.0,,In-store,
9833,TXN_5536245,Smoothie,4,4.0,16.0,Cash,,
9885,TXN_4659954,,3,4.0,12.0,Credit Card,In-store,
9931,TXN_8344810,Smoothie,2,4.0,8.0,,,


In [9]:
# Changer les str en float et les dates en datetime
data['Price Per Unit'] = data['Price Per Unit'].astype(float)
data['Total Spent'] = data['Total Spent'].astype(float)
data['Quantity'] = data['Quantity'].astype(float)
# format que date sans l'heure
data['Transaction Date'] = pd.to_datetime(data['Transaction Date']).dt.date
data

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
7152,TXN_6566716,Coffee,,2.0,2.0,Credit Card,,2023-01-01
8885,TXN_1581562,Coffee,2.0,2.0,4.0,Cash,In-store,2023-01-01
1806,TXN_2192787,Sandwich,5.0,4.0,20.0,Cash,In-store,2023-01-01
2244,TXN_5358805,Coffee,5.0,2.0,10.0,Digital Wallet,,2023-01-01
7285,TXN_1604072,Coffee,2.0,2.0,4.0,,,2023-01-01
...,...,...,...,...,...,...,...,...
9769,TXN_9686177,Cake,3.0,3.0,9.0,,In-store,NaT
9833,TXN_5536245,Smoothie,4.0,4.0,16.0,Cash,,NaT
9885,TXN_4659954,,3.0,4.0,12.0,Credit Card,In-store,NaT
9931,TXN_8344810,Smoothie,2.0,4.0,8.0,,,NaT


In [10]:
# Trouver le prix unitaire moyen par leur nom de 'Item'
item_price_dict = data.set_index('Item')['Price Per Unit'].to_dict()
data['Price Per Unit'] = data.apply(lambda row: item_price_dict[row['Item']] if pd.isna(row['Price Per Unit']) else row['Price Per Unit'], axis=1)
data

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
7152,TXN_6566716,Coffee,,2.0,2.0,Credit Card,,2023-01-01
8885,TXN_1581562,Coffee,2.0,2.0,4.0,Cash,In-store,2023-01-01
1806,TXN_2192787,Sandwich,5.0,4.0,20.0,Cash,In-store,2023-01-01
2244,TXN_5358805,Coffee,5.0,2.0,10.0,Digital Wallet,,2023-01-01
7285,TXN_1604072,Coffee,2.0,2.0,4.0,,,2023-01-01
...,...,...,...,...,...,...,...,...
9769,TXN_9686177,Cake,3.0,3.0,9.0,,In-store,NaT
9833,TXN_5536245,Smoothie,4.0,4.0,16.0,Cash,,NaT
9885,TXN_4659954,,3.0,4.0,12.0,Credit Card,In-store,NaT
9931,TXN_8344810,Smoothie,2.0,4.0,8.0,,,NaT


In [11]:
# Remplacer les valeur null par la quantité en faisant un Total spent / Price Per Unit pour trouver la quantité
data['Quantity'] = data['Total Spent'] / data['Price Per Unit']
data

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
7152,TXN_6566716,Coffee,1.0,2.0,2.0,Credit Card,,2023-01-01
8885,TXN_1581562,Coffee,2.0,2.0,4.0,Cash,In-store,2023-01-01
1806,TXN_2192787,Sandwich,5.0,4.0,20.0,Cash,In-store,2023-01-01
2244,TXN_5358805,Coffee,5.0,2.0,10.0,Digital Wallet,,2023-01-01
7285,TXN_1604072,Coffee,2.0,2.0,4.0,,,2023-01-01
...,...,...,...,...,...,...,...,...
9769,TXN_9686177,Cake,3.0,3.0,9.0,,In-store,NaT
9833,TXN_5536245,Smoothie,4.0,4.0,16.0,Cash,,NaT
9885,TXN_4659954,,3.0,4.0,12.0,Credit Card,In-store,NaT
9931,TXN_8344810,Smoothie,2.0,4.0,8.0,,,NaT


In [12]:
# Exportation des données nettoyés ! :)
data.to_csv('../data/processed/cleaned_cafe_sales.csv', index=False)

In [13]:
data.describe()

Unnamed: 0,Quantity,Price Per Unit,Total Spent
count,9498.0,10000.0,9498.0
mean,3.019267,2.95335,8.924352
std,1.421518,1.279517,6.009919
min,0.5,1.0,1.0
25%,2.0,2.0,4.0
50%,3.0,3.0,8.0
75%,4.0,4.0,12.0
max,6.25,5.0,25.0
