## Data cleaning
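Keep only "Completed" receival records, remove rows with a missing or zero `net_weight`, drop the `batch_id` column, and save the result to `data_clean.csv`.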

In [1]:
import pandas as pd

In [2]:
# Load the receivals CSV into a DataFrame; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../data/kernel/receivals.csv')

#### 1. Filtra solo i record completati


In [3]:
# Report the raw row count, then keep only the rows whose status is 'Completed'.
# The .copy() detaches the result from `data` so later edits do not touch the raw frame.
print(f"Dataset originale: {len(data)} record")
data_clean = data.loc[data['receival_status'].eq('Completed')].copy()
print(f"Dataset filtrato (Completed): {len(data_clean)} record")

Dataset originale: 122590 record
Dataset filtrato (Completed): 122448 record


#### 2. Rimuovi record con net_weight mancante


In [4]:
# Discard every row that has no recorded net weight, then report how many rows remain.
data_clean = data_clean.loc[data_clean['net_weight'].notna()]
print(f"Dopo rimozione net_weight mancanti: {len(data_clean)} record")

Dopo rimozione net_weight mancanti: 122438 record


#### 3. Verifica e gestisci valori anomali (es. peso = 0)

In [5]:
# Report how many rows have a net weight of exactly zero.
print(f"\nRecord con peso = 0: {data_clean['net_weight'].eq(0).sum()}")
# Keep only strictly positive weights: this removes the zero-weight rows counted
# above, and would also remove any negative weights if present.
data_clean = data_clean.loc[data_clean['net_weight'].gt(0)]


Record con peso = 0: 125


#### 4. Elimina batch_id (troppi missing, 47%)


In [6]:
# Remove the batch_id column entirely (the notebook text cites ~47% missing values as the reason).
data_clean = data_clean.drop(columns=['batch_id'])

#### 5. Gestisci missing values rimanenti

In [7]:
# Print a heading followed by the per-column count of null cells still present in data_clean.
print("\n📊 Missing values dopo pulizia:", data_clean.isna().sum(), sep="\n")


📊 Missing values dopo pulizia:
rm_id                     0
product_id                0
purchase_order_id         0
purchase_order_item_no    0
receival_item_no          0
date_arrival              0
receival_status           0
net_weight                0
supplier_id               0
dtype: int64


#### Imputa i valori mancanti di rm_id, product_id, purchase_order_id e purchase_order_item_no con il valore speciale -1


In [10]:
# Replace any remaining nulls in the identifier columns with the sentinel value -1.
# NOTE(review): the saved null summary already shows 0 missing values for these
# columns, so this fill may be a no-op on the current data. The execution counts
# also jump from In [7] to In [10] — confirm with Restart Kernel -> Run All.
for id_column in ('rm_id', 'product_id', 'purchase_order_id', 'purchase_order_item_no'):
    data_clean[id_column] = data_clean[id_column].fillna(-1)

# Report the final row count and preview the first rows of the cleaned frame.
print(f"\nDataset finale pronto per feature engineering: {len(data_clean)} record")
data_clean.head()




Dataset finale pronto per feature engineering: 122313 record


Unnamed: 0,rm_id,product_id,purchase_order_id,purchase_order_item_no,receival_item_no,date_arrival,receival_status,net_weight,supplier_id
0,365.0,91900143.0,208545.0,10.0,1,2004-06-15 13:34:00 +02:00,Completed,11420.0,52062
1,365.0,91900143.0,208545.0,10.0,2,2004-06-15 13:34:00 +02:00,Completed,13760.0,52062
2,365.0,91900143.0,208490.0,10.0,1,2004-06-15 13:38:00 +02:00,Completed,11281.0,50468
3,365.0,91900143.0,208490.0,10.0,2,2004-06-15 13:38:00 +02:00,Completed,13083.0,50468
4,379.0,91900296.0,210435.0,20.0,1,2004-06-15 13:40:00 +02:00,Completed,23910.0,52577


In [11]:
# Write the cleaned frame to data_clean.csv in the current working directory, without the index column.
data_clean.to_csv(path_or_buf='data_clean.csv', index=False)