In [7]:
import logging
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

data=pd.read_csv("feature_frames.csv")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
order_sizes = data.groupby('order_id').size()
orders_with_5_or_more_products = order_sizes[order_sizes >= 5].index
filtered_data = data[data['order_id'].isin(orders_with_5_or_more_products)]

df = pd.DataFrame(filtered_data)

df['created_at'] = pd.to_datetime(df['created_at'])
df['order_date'] = pd.to_datetime(df['order_date'])

def get_season(month):
    if month in [12, 1, 2]:
        return 1  # winter
    elif month in [3, 4, 5]:
        return 2  # spring
    elif month in [6, 7, 8]:
        return 3  # summer
    else:
        return 4  # Outumn


df['season'] = df['order_date'].dt.month.apply(get_season)

df['day_of_week'] = df['order_date'].dt.dayofweek
df['is_weekend'] = df['order_date'].dt.dayofweek >= 5
df['hour_of_day'] = df['order_date'].dt.hour

product_type_counts = df['product_type'].value_counts()
df['product_type_freq'] = df['product_type'].map(product_type_counts)
vendor_counts = df['vendor'].value_counts()
df['vendor_freq'] = df['vendor'].map(vendor_counts)

df_winter = df[df['season'] == 1]
df_spring = df[df['season'] == 2]
df_summer = df[df['season'] == 3]
df_autumn = df[df['season'] == 4]

X_winter = df_winter.drop(['variant_id', 'order_id', 'user_id', 'created_at', 'order_date', 'outcome'], axis=1)
X_winter = pd.concat([X_winter, df_winter[['day_of_week', 'is_weekend', 'hour_of_day']]], axis=1)
y_winter = df_winter['outcome']
logging.info("X for winter:\n%s", X_winter.head().to_string())
logging.info("Y for winter :\n%s", y_winter.head().to_string())

X_spring = df_spring.drop(['variant_id', 'order_id', 'user_id', 'created_at', 'order_date', 'outcome'], axis=1)
X_spring = pd.concat([X_spring, df_spring[['day_of_week', 'is_weekend', 'hour_of_day']]], axis=1)
y_spring = df_spring['outcome']
logging.info("X for winter:\n%s", X_winter.head().to_string())
logging.info("Y for spring:\n%s", y_spring.head().to_string())

X_summer = df_summer.drop(['variant_id', 'order_id', 'user_id', 'created_at', 'order_date', 'outcome'], axis=1)
X_summer = pd.concat([X_summer, df_summer[['day_of_week', 'is_weekend', 'hour_of_day']]], axis=1)
y_summer = df_summer['outcome']
logging.info("X for winter:\n%s", X_winter.head().to_string())
logging.info("Y for summer:\n%s", y_summer.head().to_string())

X_autumn = df_autumn.drop(['variant_id', 'order_id', 'user_id', 'created_at','product_type','vendor' ,'order_date', 'outcome'], axis=1)
X_autumn = pd.concat([X_autumn, df_autumn[['day_of_week', 'is_weekend', 'hour_of_day']]], axis=1)
y_autumn = df_autumn['outcome']
logging.info("X for winter:\n%s", X_winter.head().to_string())
logging.info("Y for autumn:\n%s", y_autumn.head().to_string())



2024-02-13 16:16:11,414 - INFO - X for winter:
        product_type  user_order_seq  ordered_before  abandoned_before  active_snoozed  set_as_regular  normalised_price  discount_pct       vendor  global_popularity  count_adults  count_children  count_babies  count_pets  people_ex_baby  days_since_purchase_variant_id  avg_days_to_buy_variant_id  std_days_to_buy_variant_id  days_since_purchase_product_type  avg_days_to_buy_product_type  std_days_to_buy_product_type  season  day_of_week  is_weekend  hour_of_day  product_type_freq  vendor_freq  day_of_week  is_weekend  hour_of_day
688  ricepastapulses               2             0.0               0.0             0.0             0.0          0.081052      0.053512  clearspring           0.014354           2.0             0.0           0.0         0.0             2.0                            33.0                        42.0                   31.134053                              30.0                          30.0                      24.2