In [3]:
import pandas as pd
import numpy as np
import pickle

# Load data from previous notebook using pickle
with open('step1.pkl', 'rb') as f:
    df = pickle.load(f)
    
print("Data loaded from step1.pkl:", df.shape)

Data loaded from step1.pkl: (10000, 9)


# Data Cleaning
Handle missing values, remove duplicates, handle outliers (IQR, z-score), and validate/correct data types.

In [4]:
# Handle missing values
threshold = int(df.shape[1] * 0.5)
df = df.dropna(thresh=threshold)
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col] = df[col].fillna(df[col].mean())

In [5]:
# Remove duplicates
df = df.drop_duplicates()

In [6]:
# Handle outliers (IQR method)
import numpy as np
numeric_cols = df.select_dtypes(include=[np.number]).columns
outlier_index = set()
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outlier_index.update(df[(df[col] < lower_bound) | (df[col] > upper_bound)].index)
df = df.drop(outlier_index)

In [7]:
# Validate and correct data types
if 'event_time' in df.columns:
    df['event_time'] = pd.to_datetime(df['event_time'], errors='coerce')
if 'event_type' in df.columns:
    df['event_type'] = df['event_type'].astype(str)
if 'product_id' in df.columns:
    df['product_id'] = df['product_id'].astype(str)
if 'category_id' in df.columns:
    df['category_id'] = df['category_id'].astype('category')
if 'category_code' in df.columns:
    df['category_code'] = df['category_code'].astype(str)
if 'brand' in df.columns:
    df['brand'] = df['brand'].astype(str)
if 'price' in df.columns:
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
if 'user_id' in df.columns:
    df['user_id'] = df['user_id'].astype(str)
if 'user_session' in df.columns:
    df['user_session'] = df['user_session'].astype(str)

In [8]:
# Save cleaned data for next notebook using pickle
import pickle

with open('step2.pkl', 'wb') as f:
    pickle.dump(df, f)
    
print("Cleaned data saved as step2.pkl:", df.shape)

Cleaned data saved as step2.pkl: (8536, 9)
