# In [4]:
import pandas as pd

# Load the cleaned Online Retail transactions.
# NOTE(review): this is a machine-specific absolute path, while the outputs at
# the end of this script are written relative to the working directory
# ("../data/..."); confirm whether the input should be read the same way.
df = pd.read_csv("C:\\Users\\divya\\OneDrive\\Desktop\\dynamic_pricing_project\\data\\Online_Retail_Clean.csv")

# Parse the invoice timestamp first so the calendar features below can use .dt.
df = df.assign(InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']))

# Line-item revenue plus calendar features derived from the invoice timestamp.
# Keyword arguments are evaluated in order, so later ones may reference
# columns created by earlier ones (IsWeekend -> DayOfWeek, HolidaySeason -> Month).
df = df.assign(
    Revenue=lambda frame: frame['Quantity'] * frame['UnitPrice'],
    DayOfWeek=lambda frame: frame['InvoiceDate'].dt.dayofweek,  # Monday=0 ... Sunday=6
    IsWeekend=lambda frame: frame['DayOfWeek'].isin([5, 6]),  # Saturday or Sunday
    Month=lambda frame: frame['InvoiceDate'].dt.month,
    Hour=lambda frame: frame['InvoiceDate'].dt.hour,
    HolidaySeason=lambda frame: frame['Month'].isin([11, 12]),  # November or December
)
import numpy as np



# Top sellers: the 10 product descriptions with the highest summed Revenue.
top_sellers = (
    df['Revenue']
    .groupby(df['Description'])
    .sum()
    .nlargest(10)
    .index
)
# 1 when the row's product is one of those top sellers, 0 otherwise.
df['IsTopSeller'] = df['Description'].isin(top_sellers).astype(int)

# Rolling revenue by product over a true 7-day window.
# Rows are ordered by product and then by time; later per-product features
# rely on this ordering as well.
df = df.sort_values(['Description', 'InvoiceDate'])

# rolling(window=7) would average the last 7 *rows* (line items) of a product,
# which can span minutes or months. Indexing by InvoiceDate and using the
# offset window '7D' averages every line item of the product that falls in
# the 7 days up to and including the current row's timestamp.
# min_periods=1 keeps a value for the first sale of each product.
# The lambda returns a plain array so that duplicate timestamps inside a group
# never trigger label-based alignment; the outer to_numpy() assigns the result
# back positionally (set_index does not change the row order).
# NOTE: offset windows require every InvoiceDate to be a valid timestamp
# (no NaT) - assumed to hold for the cleaned input file.
df['RollingRevenue7D'] = (
    df.set_index('InvoiceDate')
    .groupby('Description')['Revenue']
    .transform(lambda x: x.rolling('7D', min_periods=1).mean().to_numpy())
    .to_numpy()
)

# Lag feature: the product's total revenue on the previous calendar day.
# A plain shift(1) within each product returns the previous *line item*
# (frequently from the same day, or even the same invoice), not the previous
# day. Revenue is therefore summed per product per day first, and each row then
# looks up the total for (its product, its day minus one day).
# Rows get NaN when the product has no recorded sales on the previous day.
sale_day = df['InvoiceDate'].dt.normalize()  # timestamp truncated to midnight
daily_revenue = df.groupby(['Description', sale_day])['Revenue'].sum()
previous_day_keys = pd.MultiIndex.from_arrays(
    [df['Description'], sale_day - pd.Timedelta(days=1)]
)
# daily_revenue has a unique (product, day) index, so reindex is a pure lookup;
# to_numpy() assigns the looked-up values back to the rows positionally.
df['PrevDayRevenue'] = daily_revenue.reindex(previous_day_keys).to_numpy()

# Country-wise Average Order Value.
# An order is one invoice, so line items are first summed per invoice and the
# invoice totals are then averaged per country. Averaging Revenue directly
# yields the mean *line-item* value, which is a different quantity whenever an
# invoice contains more than one line.
order_totals = df.groupby(['Country', 'InvoiceNo'])['Revenue'].sum()
country_order_val = (
    order_totals.groupby(level='Country').mean().rename("AvgOrderValueCountry")
)
# Left merge keeps every transaction row and attaches its country's value.
df = df.merge(country_order_val, on='Country', how='left')

# Cumulative revenue per product: running total of Revenue within each
# Description, accumulated in the frame's current row order.
df['CumulativeRevenue'] = df['Revenue'].groupby(df['Description']).cumsum()

# Daily invoice count: number of distinct invoices issued on each calendar day,
# attached to every row of that day.
df['InvoiceDay'] = df['InvoiceDate'].dt.date
daily_orders = (
    df['InvoiceNo']
    .groupby(df['InvoiceDay'])
    .nunique()
    .rename("InvoiceCountPerDay")
)
# Left merge so that every transaction row is kept.
df = pd.merge(df, daily_orders, on='InvoiceDay', how='left')

# Build the modeling frame: drop the high-cardinality or leakage columns, then
# one-hot encode Country (drop_first=True omits the first category's indicator).
df_model = pd.get_dummies(
    df.drop(columns=['InvoiceNo', 'InvoiceDate', 'InvoiceDay', 'Description', 'CustomerID']),
    columns=["Country"],
    drop_first=True,
)

# Cast every bool-dtype column to integer 0/1.
# Columns of NumPy bool dtype cannot contain NaN, so no fill is needed first.
bool_cols = df_model.select_dtypes(include='bool').columns
df_model[bool_cols] = df_model[bool_cols].astype(int)

# IQR-based outlier removal on Revenue.
# First and third quartiles, taken in a single quantile call.
Q1, Q3 = df_model['Revenue'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond each quartile.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only rows whose Revenue lies inside the fences (both ends inclusive).
df_model = df_model[df_model['Revenue'].between(lower_bound, upper_bound)]

# Write both tables to CSV without the index column: the ready-for-model
# dataset first, then the full transaction table with all engineered features.
for frame, destination in (
    (df_model, "../data/sales_model_ready.csv"),
    (df, "../data/sales_with_features.csv"),
):
    frame.to_csv(destination, index=False)
