# In [None]:
# Import necessary libraries
import pandas as pd

# Step 1: Load the data
print("Loading raw orders data...")
orders = pd.read_csv('data/orders.csv')
print(f"Data loaded successfully with {orders.shape[0]} rows and {orders.shape[1]} columns.")

# Preview the first few rows of the dataset.
# A bare `orders.head()` is only rendered when it is the last expression of a
# notebook cell (and never in a plain script), so print it explicitly.
print(orders.head())

# Step 2: Check for missing values
print("\nChecking for missing values in the dataset...")
missing_values = orders.isnull().sum()
# Report only the columns that actually contain missing entries.
print(missing_values[missing_values > 0])

# Fill every missing value with 0, regardless of column.
# NOTE(review): a blanket 0 is also written into identifier or categorical
# columns if they have gaps -- confirm this is intended for this dataset.
orders = orders.fillna(0)
print("Missing values filled with 0 (if any).")

# Step 3: Feature Engineering
print("\nEngineering new features...")

# Feature 1: total number of distinct orders per user, broadcast onto every
# row belonging to that user.
# `nunique` is used instead of `count`: the later aggregation groups by
# (user_id, product_id), which suggests an order can span several rows, and
# `count` would then tally rows rather than orders. When every order occupies
# exactly one row the two give the same number.
orders['total_orders'] = orders.groupby('user_id')['order_id'].transform('nunique')

# Feature 2: Create a 'time_of_day' feature based on 'order_hour_of_day'
def time_of_day(hour):
    """Map an hour of the day to a coarse day-part label.

    The half-open ranges are:
        [5, 12)  -> 'morning'
        [12, 17) -> 'afternoon'
        [17, 21) -> 'evening'
    Any value that falls in none of these ranges yields 'night'.
    """
    # (inclusive start, exclusive end, label) for each named day part.
    day_parts = (
        (5, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    # Nothing matched: treat the hour as night-time.
    return 'night'

# Apply the time_of_day function to create the new feature
orders['time_of_day'] = orders['order_hour_of_day'].apply(time_of_day)
print("Features 'total_orders' and 'time_of_day' created.")

# Preview the updated dataset with the new features.
# Printed explicitly: a bare `.head()` expression in the middle of a cell or
# script is evaluated and then silently discarded.
print(orders[['user_id', 'order_id', 'total_orders', 'order_hour_of_day', 'time_of_day']].head())

# Step 4: Data Aggregation
print("\nAggregating user behavior by product...")

# One output row per (user_id, product_id) pair. Named aggregation assigns the
# final column names directly, so no separate rename step is needed.
user_product_behavior = orders.groupby(['user_id', 'product_id']).agg(
    # Number of rows (purchases) of this product by this user
    num_purchases=('order_id', 'count'),
    # Mean hour of day at which this user ordered this product
    order_hour_of_day=('order_hour_of_day', 'mean'),
    # Per-user total, identical on every row of the user, so 'first' suffices
    total_orders=('total_orders', 'first'),
    # Day part taken from the first row of the pair in file order.
    # NOTE(review): this is the chronologically first order only if the
    # input rows are sorted that way -- confirm against the data source.
    time_of_day=('time_of_day', 'first'),
).reset_index()

# Preview the aggregated dataset (printed for the same reason as above).
print(user_product_behavior.head())

# Step 5: Save the cleaned and aggregated data
print("\nSaving the cleaned and aggregated data to 'cleaned_orders.csv'...")
user_product_behavior.to_csv('data/cleaned_orders.csv', index=False)
print("Data saved successfully.")

# Final summary of the preprocessing steps
print(f"\nData preprocessing completed. The cleaned dataset contains {user_product_behavior.shape[0]} rows and {user_product_behavior.shape[1]} columns.")
