# Nettoyage de données retail / Retail Data Cleaning
## Contexte
Jeu de données retail typique d’une PME e-commerce : ventes et clients, avec 15 % de doublons, des valeurs manquantes (NaN) et des formats incohérents.

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# 1. Diagnostics
# Load the raw export and report its state before any cleaning is applied.
raw_path = "retail_store_sales.csv"
df = pd.read_csv(raw_path)

print("Avant nettoyage :")
preview = df.head()
print(preview)  # Screenshot 1

# Number of rows that repeat an earlier row across every column
# (e.g. 150 duplicates).
duplicate_count = df.duplicated().sum()
print("Doublons :", duplicate_count)

# Missing-value count for each column.
missing_per_column = df.isna().sum()
print("NaNs :", missing_per_column)  # Screenshot 2

In [None]:
# 2. Cleaning
# Drop only rows that are identical across every column. Deduplicating on
# customer_id alone would keep a single row per customer and discard all of
# that customer's other rows; it would also disagree with the df.duplicated()
# count printed in the diagnostic step, which compares whole rows.
df = df.drop_duplicates()
df['price'] = df['price'].clip(lower=0)  # Negative prices are raised to 0
df['date'] = pd.to_datetime(df['date'], errors='coerce')  # Unparseable dates become NaT
df = df.dropna(subset=['email'])  # Rows with a missing email are removed
print("Après nettoyage :")
print(df.head())  # Screenshot 3

In [None]:
# 3. KPI visualisation
# Sum of the 'price' column for each value of 'product', drawn as a bar chart.
sales_by_product = df.groupby('product')['price'].sum()
sales_by_product.plot(kind='bar', title='Ventes par produit nettoyées')

# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
graph_path = Path('screenshots') / 'sales_graph.png'
graph_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(graph_path)  # Screenshot 4
plt.show()

In [None]:
# 4. Export
# Write the cleaned frame to an Excel workbook, leaving out the index column.
# NOTE(review): the "+20 %" in the message below is a fixed string; nothing
# in this cell computes it from the data - confirm the figure before sharing.
export_path = 'retail_clean.xlsx'
df.to_excel(export_path, index=False)
print("Résultat : +20 % données exploitables pour campagnes marketing.")