In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import yaml
import os

In [3]:
os.chdir('d:\\VScode\\Datathon')

## Preprocessing

1. Load data

In [4]:
transaksi = pd.read_csv('data/Transaksi.CSV')
app_sessions = pd.read_csv('data/app_sessions.csv')
userprofile = pd.read_csv('data/Userprofile.csv')
promotion_history = pd.read_csv('data/promotion_history.csv')
merchant = pd.read_csv('data/merchant.csv')
item_catalog = pd.read_csv('data/item_catalog.csv')
weather_calendar = pd.read_csv('data/weather_calendar.csv')
real_time_events = pd.read_csv('data/real_time_events.csv')

2. Preprocessing
merah = kolom id 
hijau = kolom kategori
biru = kolom waktu
pink = kolom tempat


In [5]:
def clean_transaksi(df):
    df = df.copy()
    # 1) Timestamp & fitur waktu
    df['order_timestamp'] = pd.to_datetime(df['order_timestamp'])
    df['order_hour']      = df['order_timestamp'].dt.hour
    df['order_day']       = df['order_timestamp'].dt.day
    df['order_month']     = df['order_timestamp'].dt.month
    df['order_year']      = df['order_timestamp'].dt.year
    df['order_weekday']   = df['order_timestamp'].dt.weekday
    df['order_timestamp'] = df['order_timestamp'].dt.floor('H')

    # 2) Handle missing promo
    df['promo_id_main']     = df['promo_id_main'].fillna('no_promo')
    df['promo_id_delivery'] = df['promo_id_delivery'].fillna('no_promo')

    # 3) Encoding kategori
    le_service = LabelEncoder().fit(df['service_type'])
    df['service_enc'] = le_service.transform(df['service_type'])

    le_user    = LabelEncoder().fit(df['user_id'])
    df['user_enc']    = le_user.transform(df['user_id'])

    le_merc    = LabelEncoder().fit(df['merchant_id'])
    df['merchant_enc'] = le_merc.transform(df['merchant_id'])

    le_promo_main = LabelEncoder().fit(df['promo_id_main'])
    df['promo_main_enc'] = le_promo_main.transform(df['promo_id_main'])

    le_promo_delv = LabelEncoder().fit(df['promo_id_delivery'])
    df['promo_delv_enc']  = le_promo_delv.transform(df['promo_id_delivery'])

    # 4) Normalisasi persen
    df['voucher_discount_pct'] = df['voucher_discount_pct'] / 100
    df['voucher_cashback_pct'] = df['voucher_cashback_pct'] / 100

    return df

df_clean_transaksi = clean_transaksi(transaksi)

In [6]:

def clean_real_time_events(df):
    # 1) Copy supaya tidak mengubah original
    df = df.copy()

    # 2) Encode user_id dengan LabelEncoder terpisah
    df['user_id'] = df['user_id'].fillna('unknown_user')
    le_user = LabelEncoder().fit(df['user_id'])
    df['user_enc'] = le_user.transform(df['user_id'])

    # 3) Parse dan ekstrak fitur waktu
    df['event_timestamp'] = pd.to_datetime(df['event_timestamp'])
    df['event_hour']    = df['event_timestamp'].dt.hour
    df['event_day']     = df['event_timestamp'].dt.day
    df['event_month']   = df['event_timestamp'].dt.month
    df['event_year']    = df['event_timestamp'].dt.year
    df['event_weekday'] = df['event_timestamp'].dt.weekday
    # bulatkan ke jam terdekat (floor ke H)
    df['event_timestamp'] = df['event_timestamp'].dt.floor('H')

    # 4) Normalisasi battery_level (asumsi 0–100 → 0–1)
    df['battery_level'] = df['battery_level'].fillna(0) / 100

    # 5) Encode current_app_screen dengan encoder terpisah
    df['current_app_screen'] = df['current_app_screen'].fillna('unknown_screen')
    le_app = LabelEncoder().fit(df['current_app_screen'])
    df['curr_app_enc'] = le_app.transform(df['current_app_screen'])

    return df


df_clean_real_time_events = clean_real_time_events(real_time_events)


In [7]:
def clean_app_sessions(df):

    df = df.copy()

    # 1) Parse timestamp
    df['open_timestamp'] = pd.to_datetime(df['open_timestamp'])
    df['close_timestamp'] = pd.to_datetime(df['close_timestamp'])

    # 2) Fitur waktu untuk open
    df['open_hour']    = df['open_timestamp'].dt.hour
    df['open_day']     = df['open_timestamp'].dt.day
    df['open_month']   = df['open_timestamp'].dt.month
    df['open_year']    = df['open_timestamp'].dt.year
    df['open_weekday'] = df['open_timestamp'].dt.weekday
    df['open_timestamp'] = df['open_timestamp'].dt.floor('H')

    # 3) Fitur waktu untuk close
    df['close_hour']    = df['close_timestamp'].dt.hour
    df['close_day']     = df['close_timestamp'].dt.day
    df['close_month']   = df['close_timestamp'].dt.month
    df['close_year']    = df['close_timestamp'].dt.year
    df['close_weekday'] = df['close_timestamp'].dt.weekday
    df['close_timestamp'] = df['close_timestamp'].dt.floor('H')

    # 4) Durasi sesi dalam detik
    df['session_duration'] = (
        df['close_timestamp'] - df['open_timestamp']
    ).dt.total_seconds()

    # 5) Isi missing sebelum encoding
    df['user_id']      = df['user_id'].astype(str).fillna('unknown_user')
    df['session_id']   = df['session_id'].astype(str).fillna('unknown_session')
    df['entry_point']  = df['entry_point'].fillna('unknown_entry')

    # 6) Encode kategori dengan LabelEncoder terpisah
    le_user    = LabelEncoder().fit(df['user_id'])
    df['user_enc']      = le_user.transform(df['user_id'])

    le_sess    = LabelEncoder().fit(df['session_id'])
    df['session_enc']   = le_sess.transform(df['session_id'])

    le_entry   = LabelEncoder().fit(df['entry_point'])
    df['entry_enc']     = le_entry.transform(df['entry_point'])

    return df

df_clean_app_sessions = clean_app_sessions(app_sessions)

In [8]:
def clean_weather_calendar(df):

    df = df.copy()
    
    # 1) Parse timestamp
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    
    # 2) Ekstrak fitur waktu
    df['hour']     = df['timestamp'].dt.hour
    df['day']      = df['timestamp'].dt.day
    df['month']    = df['timestamp'].dt.month
    df['year']     = df['timestamp'].dt.year
    df['weekday']  = df['timestamp'].dt.weekday
    
    # 3) Bulatkan ke jam penuh (floor to hour)
    df['timestamp'] = df['timestamp'].dt.floor('H')
    
    # 4) Isi missing sebelum encode
    df['weather'] = df['weather'].fillna('unknown_weather')
    
    # 5) Encode weather dengan LabelEncoder terpisah
    le_weather = LabelEncoder().fit(df['weather'])
    df['weather_enc'] = le_weather.transform(df['weather'])
    
    return df

df_clean_weather_calendar = clean_weather_calendar(weather_calendar)

In [9]:
def clean_promotion_history(df):

    df = df.copy()
    
    # 1) Isi missing kategori sebelum encode
    df['promo_id']         = df['promo_id'].fillna('no_promo')
    df['promo_type']       = df['promo_type'].fillna('unknown_type')
    df['targeting_rule']   = df['targeting_rule'].fillna('unknown_rule')
    
    # 2) Encode kategori dengan LabelEncoder terpisah
    le_id    = LabelEncoder().fit(df['promo_id'])
    df['promo_enc'] = le_id.transform(df['promo_id'])
    
    le_type  = LabelEncoder().fit(df['promo_type'])
    df['type_enc'] = le_type.transform(df['promo_type'])
    
    le_target = LabelEncoder().fit(df['targeting_rule'])
    df['target_enc'] = le_target.transform(df['targeting_rule'])
    
    # 3) Parse start & end timestamps
    df['start_ts'] = pd.to_datetime(df['start_ts'])
    df['end_ts']   = pd.to_datetime(df['end_ts'])
    
    # 4) Ekstrak fitur waktu untuk start
    df['start_hour']    = df['start_ts'].dt.hour
    df['start_day']     = df['start_ts'].dt.day
    df['start_month']   = df['start_ts'].dt.month
    df['start_year']    = df['start_ts'].dt.year
    df['start_weekday'] = df['start_ts'].dt.weekday
    df['start_ts']      = df['start_ts'].dt.floor('H')
    
    # 5) Ekstrak fitur waktu untuk end
    df['end_hour']    = df['end_ts'].dt.hour
    df['end_day']     = df['end_ts'].dt.day
    df['end_month']   = df['end_ts'].dt.month
    df['end_year']    = df['end_ts'].dt.year
    df['end_weekday'] = df['end_ts'].dt.weekday
    df['end_ts']      = df['end_ts'].dt.floor('H')
    
    # 6) Hitung durasi promo dalam detik
    df['duration'] = (df['end_ts'] - df['start_ts']).dt.total_seconds().fillna(0)
    
    return df

df_clean_promotion_history = clean_promotion_history(promotion_history)

In [10]:
def clean_item_catalog(df):

    df = df.copy()

    # 1) Isi missing dan cast ke string
    df['item_id']     = df['item_id'].astype(str).fillna('unknown_item')
    df['category_id'] = df['category_id'].astype(str).fillna('unknown_category')

    # 2) Encode item_id
    le_item = LabelEncoder().fit(df['item_id'])
    df['item_enc'] = le_item.transform(df['item_id'])

    # 3) Encode category_id
    le_cat = LabelEncoder().fit(df['category_id'])
    df['category_enc'] = le_cat.transform(df['category_id'])

    return df
    
df_clean_item_catalog = clean_item_catalog(item_catalog)

In [11]:
def clean_merchant(df):

    df = df.copy()
    
    # 1) Isi missing dan pastikan tipe string
    df['merchant_id'] = df['merchant_id'].astype(str).fillna('unknown_merchant')
    
    # 2) Encode merchant_id dengan encoder terpisah
    le_merc = LabelEncoder().fit(df['merchant_id'])
    df['merchant_enc'] = le_merc.transform(df['merchant_id'])

    df['kecamatan'] = df['kecamatan'].fillna('unknown_kecamatan')
    le_kec = LabelEncoder().fit(df['kecamatan'])
    df['kecamatan_enc'] = le_kec.transform(df['kecamatan'])
    
    return df

df_clean_merchant = clean_merchant(merchant)

In [12]:
def clean_user_profile(df):

    df = df.copy()
    
    # 1) Pastikan user_id dan segment tidak kosong & bertipe string
    df['user_id'] = df['user_id'].astype(str).fillna('unknown_user')
    df['segment'] = df['segment'].astype(str).fillna('unknown_segment').str.lower()
    
    # 2) Encode user_id
    le_user = LabelEncoder().fit(df['user_id'])
    df['user_enc'] = le_user.transform(df['user_id'])
    
    # 3) Encode segment
    le_seg = LabelEncoder().fit(df['segment'])
    df['segment_enc'] = le_seg.transform(df['segment'])
    
    return df

df_clean_userprofile = clean_user_profile(userprofile)

3. Pembuatan fitur sess agg

In [13]:


def aggregate_sessions(sessions):
    """
    Agregasi data sessions per user:
      - total_sessions: jumlah unik session_id
      - avg_session_duration: rata-rata session_duration (detik)
      - last_session: waktu open_timestamp terakhir
    """
    df = sessions.copy()
    
    # Pastikan timestamps
    df['open_timestamp']  = pd.to_datetime(df['open_timestamp'])
    df['close_timestamp'] = pd.to_datetime(df['close_timestamp'])
    
    # Isi missing ID
    df['user_id']    = df['user_id'].astype(str).fillna('unknown_user')
    df['session_id'] = df['session_id'].astype(str).fillna('unknown_session')
    
    # Hitung durasi kalau belum ada
    if 'session_duration' not in df.columns:
        df['session_duration'] = (
            df['close_timestamp'] - df['open_timestamp']
        ).dt.total_seconds()
    df['session_duration'] = df['session_duration'].fillna(0)
    
    # Agregasi per user
    sess_agg = (
        df.groupby('user_id')
          .agg(
              total_sessions       = ('session_id', 'nunique'),
              avg_session_duration = ('session_duration', 'mean'),
              last_session         = ('open_timestamp', 'max'),
          )
          .reset_index()
    )
    return sess_agg

sess_agg = aggregate_sessions(df_clean_app_sessions)


4. Merge Data


In [14]:
def merge_data(
    trans: pd.DataFrame,
    sessions: pd.DataFrame,
    profiles: pd.DataFrame,
    promotions: pd.DataFrame,
    merchants: pd.DataFrame,
    items: pd.DataFrame,
    weather: pd.DataFrame
) -> pd.DataFrame:
    """
    1) RFM / order level aggregates
    2) Merge transaksi + profil
    3) Merge RFM, sessions, merchant, item, weather, promo
    4) Drop all raw columns not used by model
    """
    # --- 1) RFM aggregates per user ---
    # total orders, total spend, avg order value, recency
    order_agg = (
        trans.groupby('user_id')
             .agg(
                 total_orders    = ('order_id', 'nunique'),
                 total_spend     = ('total_spend', 'sum'),
                 avg_order_value = ('price_subtotal', 'mean'),
                 last_order      = ('order_timestamp', 'max')
             )
             .reset_index()
    )
    # compute recency in days
    max_ts = trans['order_timestamp'].max()
    order_agg['recency_days'] = (max_ts - order_agg['last_order']).dt.days

    # --- 2) Start merge transaksi + profil ---
    df = trans.merge(
        profiles[['user_id','segment_enc']],
        on='user_id', how='left'
    )

    # --- 3a) Add RFM features ---
    df = df.merge(order_agg, on='user_id', how='left')
    
    # 3b) Voucher‐usage & price‐habit context
    voucher_agg = (
        trans.groupby('user_id')
            .agg(
                voucher_count      = ('voucher_discount_amt', lambda x: (x>0).sum()),
                voucher_rate       = ('voucher_discount_amt', lambda x: (x>0).mean()),
                avg_voucher_pct    = ('voucher_discount_pct', 'mean'),
                total_cashback     = ('voucher_cashback_amt', 'sum'),
                user_avg_price     = ('price_subtotal', 'mean'),
                user_median_price  = ('price_subtotal', 'median')
            )
            .reset_index()
    )
    df = df.merge(voucher_agg, on='user_id', how='left')

    # --- 3b) Add session aggregates ---
    sess_agg = aggregate_sessions(sessions)
    df = df.merge(sess_agg, on='user_id', how='left')

    # --- 3c) Merchant info ---

    df = df.merge(
    merchants[['merchant_id','merchant_enc','kecamatan_enc']]
        .rename(columns={'merchant_enc':'merchant_enc','kecamatan_enc':'kecamatan_enc_merchant'}),
    on='merchant_id', how='left'
    )


    # --- 3d) First‐item features ---
    df['first_item_id'] = df['item_ids'].str.split(',').str[0]
    df = df.merge(
        items[['item_id','item_enc','category_enc','price']]
             .rename(columns={
                 'price':'first_item_price',
                 'item_enc':'first_item_enc',
                 'category_enc':'first_category_enc'
             }),
        left_on='first_item_id', right_on='item_id', how='left'
    )
    df.drop(columns=['first_item_id','item_id'], inplace=True)

    # --- 3e) Weather context (by order_timestamp) ---
    df = df.merge(
        weather[['timestamp','weather_enc','is_weekend','holiday_flag','temperature']]
               .rename(columns={'timestamp':'order_timestamp'}),
        on='order_timestamp', how='left'
    )

# 3f) Promotion context (use 'duration', not 'promo_duration')
    promo_cols = [
        'promo_id','promo_enc','type_enc','target_enc',
        'start_hour','start_weekday','duration'
    ]
    promo_main = promotions[promo_cols].rename(columns={
        'promo_id':'promo_id_main',
        'promo_enc':'promo_main_enc',
        'type_enc':'promo_type_main_enc',
        'target_enc':'promo_target_main_enc',
        'start_hour':'promo_start_hour_main',
        'start_weekday':'promo_start_weekday_main',
        'duration':'promo_duration_main'
    })
    promo_del = promotions[promo_cols].rename(columns={
        'promo_id':'promo_id_delivery',
        'promo_enc':'promo_delivery_enc',
        'type_enc':'promo_type_delivery_enc',
        'target_enc':'promo_target_delivery_enc',
        'start_hour':'promo_start_hour_delivery',
        'start_weekday':'promo_start_weekday_delivery',
        'duration':'promo_duration_delivery'
    })
    df = df.merge(promo_main, on='promo_id_main', how='left')\
           .merge(promo_del,  on='promo_id_delivery', how='left')
    # 1) Rename semua yang “_x” ke nama final:
    df = df.rename(columns={
        'merchant_enc_x':            'merchant_enc',
        'kecamatan_enc_x':           'kecamatan_enc_user',
        'kecamatan_enc_y':           'kecamatan_enc_merchant',
        'user_enc_x':                'user_enc',
        'total_spend_x':             'order_spend',
        'total_spend_y':             'user_total_spend',

        'promo_main_enc_x':          'promo_main_enc',
        'promo_type_main_enc_x':     'promo_type_main_enc',
        'promo_target_main_enc_x':   'promo_target_main_enc',
        'promo_start_hour_main_x':   'promo_start_hour_main',
        'promo_start_weekday_main_x':'promo_start_weekday_main',
        'promo_duration_main_x':     'promo_duration_main',

        'promo_delivery_enc_x':          'promo_delivery_enc',
        'promo_type_delivery_enc_x':     'promo_type_delivery_enc',
        'promo_target_delivery_enc_x':   'promo_target_delivery_enc',
        'promo_start_hour_delivery_x':   'promo_start_hour_delivery',
        'promo_start_weekday_delivery_x':'promo_start_weekday_delivery',
        'promo_duration_delivery_x':     'promo_duration_delivery',

        'service_enc_x':             'service_enc',
        'session_enc_x':             'session_enc',
        # …dan seterusnya untuk setiap “_x”
    })

    df['temperature_copy'] = df['temperature']  # jika mau amankan dulu

    # 2) Drop semua _y tersisa dan kolom mentah yang tidak dipakai:
    to_drop = [c for c in df.columns if c.endswith('_y')] + [
        'service_type','session_id','open_timestamp','close_timestamp',
        'entry_point','promo_id_main','promo_id_delivery','user_id',
        'merchant_id','item_ids','order_id','order_timestamp','last_order'
    ]
    df.drop(columns=[c for c in to_drop if c in df.columns], inplace=True)

    # enforce integer dtype
    for c in df.select_dtypes(include='int64').columns:
        df[c] = df[c].astype(int)

    return df

merged_df = merge_data(
    trans=df_clean_transaksi,
    sessions=df_clean_app_sessions,
    profiles=df_clean_userprofile,
    promotions=df_clean_promotion_history,
    merchants=df_clean_merchant,
    items=df_clean_item_catalog,
    weather=df_clean_weather_calendar
)

In [15]:
merged_df.head(5)

Unnamed: 0,price_subtotal,delivery_fee_before,delivery_fee_after,voucher_discount_pct,voucher_discount_amt,voucher_cashback_pct,voucher_cashback_amt,free_delivery_flag,order_spend,order_hour,...,promo_start_hour_main,promo_start_weekday_main,promo_duration_main,promo_delivery_enc,promo_type_delivery_enc,promo_target_delivery_enc,promo_start_hour_delivery,promo_start_weekday_delivery,promo_duration_delivery,temperature_copy
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,5000.0,6,...,,,,,,,,,,32.3
1,111952,10854.67,10854.67,0.0,0.0,0.0,0.0,0,122806.67,21,...,,,,,,,,,,30.3
2,22002,9202.03,0.0,0.0,0.0,0.1,2200.2,1,22002.0,21,...,0.0,4.0,3715200.0,12.0,2.0,2.0,0.0,3.0,3283200.0,28.6
3,0,0.0,0.0,0.0,0.0,0.15,0.0,1,5000.0,14,...,0.0,0.0,3369600.0,12.0,2.0,2.0,0.0,3.0,3283200.0,30.7
4,54969,8475.53,8475.53,0.0,0.0,0.0,0.0,0,63444.53,23,...,,,,,,,,,,34.0


## BUILD SEQUENCE

In [16]:
# --- Urutkan berdasarkan user_id dan waktu order ---
merged_df = merged_df.sort_values(['user_enc', 'order_hour']).copy()

# --- Helper: Binning harga ---
def price_bin(price):
    if pd.isnull(price): return 'unknown'
    try:
        price = float(price)
        if price < 25000: return 'low'
        elif price < 75000: return 'medium'
        else: return 'high'
    except: return 'unknown'

merged_df['price_bin'] = merged_df['first_item_price'].apply(price_bin)

# --- Helper: Waktu dalam kategori ---
def time_of_day(hour):
    if pd.isnull(hour): return 'unknown'
    hour = int(hour)
    if hour < 6: return 'night'
    elif hour < 12: return 'morning'
    elif hour < 18: return 'afternoon'
    else: return 'evening'

merged_df['time_of_day'] = merged_df['order_hour'].apply(time_of_day)

# --- Helper: Pola cuaca (rainy, normal, extreme) ---
def weather_level(temp):
    if pd.isnull(temp): return 'unknown'
    if temp < 25: return 'cool'
    elif temp < 30: return 'normal'
    else: return 'hot'

merged_df['weather_level'] = merged_df['temperature_copy'].apply(weather_level)

# --- Sequence Builder per User ---
sequence_df = (
    merged_df.groupby('user_enc')
    .agg(
        seq_item_ids           = ('first_item_enc', lambda x: list(x)),
        seq_item_cats          = ('first_category_enc', lambda x: list(x)),
        seq_merchant_ids       = ('merchant_enc', lambda x: list(x)),
        seq_service_types      = ('service_enc', lambda x: list(x)),
        seq_price_bin          = ('price_bin', lambda x: list(x)),
        seq_time_of_day        = ('time_of_day', lambda x: list(x)),
        seq_weekday            = ('order_weekday', lambda x: list(x)),
        seq_weather_enc        = ('weather_enc', lambda x: list(x)),
        seq_weather_lvl        = ('weather_level', lambda x: list(x)),
        seq_promo_main_enc     = ('promo_main_enc', lambda x: list(x)),
        seq_promo_delv_enc     = ('promo_delivery_enc', lambda x: list(x)),
        seq_disc_pct           = ('voucher_discount_pct', lambda x: list(np.nan_to_num(x))),
        seq_cashback_pct       = ('voucher_cashback_pct', lambda x: list(np.nan_to_num(x))),
        seq_subtotal           = ('price_subtotal', lambda x: list(np.nan_to_num(x))),
        seq_total_spend        = ('order_spend', lambda x: list(np.nan_to_num(x))),
        seq_is_weekend         = ('is_weekend', lambda x: list(x)),
        seq_is_holiday         = ('holiday_flag', lambda x: list(x)),

        # --- Static features (ambil sekali saja) ---
        segment_enc            = ('segment_enc', 'first'),
        kecamatan_enc_merchant = ('kecamatan_enc_merchant',lambda x: x.mode().iloc[0] if not x.mode().empty else -1),
        avg_session_duration   = ('avg_session_duration', 'first'),
        total_orders           = ('total_orders', 'first'),
        user_total_spend       = ('user_total_spend', 'first'),
        user_avg_price         = ('user_avg_price', 'first'),
        user_median_price      = ('user_median_price', 'first'),
        voucher_count          = ('voucher_count', 'first'),
        voucher_rate           = ('voucher_rate', 'first'),
        avg_voucher_pct        = ('avg_voucher_pct', 'first'),
        total_cashback         = ('total_cashback', 'first'),
        recency_days           = ('recency_days', 'first')
    )
    .reset_index()
)

# --- Tambahan fitur sequence ringkasan / agregat ---
sequence_df['seq_length'] = sequence_df['seq_item_ids'].apply(len)
sequence_df['unique_items'] = sequence_df['seq_item_ids'].apply(lambda x: len(set(x)))
sequence_df['unique_merchants'] = sequence_df['seq_merchant_ids'].apply(lambda x: len(set(x)))
sequence_df['voucher_usage_ratio'] = sequence_df['voucher_count'] / sequence_df['total_orders'].replace(0, 1)

def discount_class(pct):
    if pct == 0:
        return 0    # no_promo
    elif pct < 0.05:
        return 1    # low
    elif pct < 0.15:
        return 2    # medium
    else:
        return 3    # high

sequence_df['target_discount'] = sequence_df['seq_disc_pct'].apply(
    lambda seq: discount_class(seq[-1] if seq else 0)
)

# 8) Update SEQ_COLS for downstream training
SEQ_COLS = [
    'seq_item_ids', 'seq_time_of_day', 'seq_price_bin', 'seq_weather_lvl',
    'seq_disc_pct', 'seq_promo_main_enc'
]


# --- Simpan hasil ---
sequence_df.to_csv('data/processed_data/sequence_data_full.csv', index=False)
print("[✓] sequence_data_full.csv berhasil disimpan dengan fitur lengkap.")

# --- Preview hasil ---
sequence_df.head()


[✓] sequence_data_full.csv berhasil disimpan dengan fitur lengkap.


Unnamed: 0,user_enc,seq_item_ids,seq_item_cats,seq_merchant_ids,seq_service_types,seq_price_bin,seq_time_of_day,seq_weekday,seq_weather_enc,seq_weather_lvl,...,voucher_count,voucher_rate,avg_voucher_pct,total_cashback,recency_days,seq_length,unique_items,unique_merchants,voucher_usage_ratio,target_discount
0,0,"[nan, nan, nan, nan, nan, nan, nan, 693.0, 488...","[nan, nan, nan, nan, nan, nan, nan, 6.0, 7.0, ...","[320, 320, 320, 320, 320, 320, 320, 293, 15, 1...","[2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 2, 0]","[unknown, unknown, unknown, unknown, unknown, ...","[morning, morning, morning, morning, afternoon...","[1, 6, 4, 0, 6, 4, 0, 5, 6, 1, 6, 2, 3, 5]","[3, 2, 0, 1, 3, 3, 3, 3, 3, 3, 0, 3, 0, 0]","[hot, hot, hot, hot, hot, normal, hot, hot, no...",...,0,0.0,0.028571,8351.85,0,14,14,5,0.0,0
1,1,"[nan, nan, nan, 137.0, 38.0, nan, nan, nan, 36...","[nan, nan, nan, 1.0, 5.0, nan, nan, nan, 2.0, ...","[80, 179, 320, 189, 74, 320, 320, 291, 277, 32...","[1, 1, 2, 1, 1, 2, 0, 1, 1, 2, 1]","[unknown, unknown, unknown, medium, high, unkn...","[morning, morning, morning, morning, evening, ...","[2, 0, 6, 1, 5, 2, 0, 2, 1, 6, 6]","[0, 1, 3, 3, 0, 3, 3, 1, 3, 0, 3]","[cool, normal, normal, hot, hot, normal, norma...",...,4,0.363636,0.068182,0.0,0,11,11,8,0.363636,3
2,2,"[nan, nan, nan, nan, nan, nan, 662.0, 329.0, n...","[nan, nan, nan, nan, nan, nan, 7.0, 9.0, nan, ...","[315, 320, 320, 320, 213, 112, 307, 279, 320, ...","[1, 2, 0, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 0, ...","[unknown, unknown, unknown, unknown, unknown, ...","[night, night, morning, morning, afternoon, af...","[6, 1, 0, 2, 4, 1, 3, 4, 3, 3, 6, 0, 2, 4, 0, ...","[1, 3, 3, 1, 3, 1, 0, 3, 3, 1, 2, 0, 0, 0, 0, ...","[hot, normal, hot, normal, hot, normal, normal...",...,1,0.055556,0.047222,21745.6,3,18,18,11,0.055556,0
3,3,"[nan, nan, nan, 800.0, nan, nan, 847.0, 830.0,...","[nan, nan, nan, 1.0, nan, nan, 4.0, 3.0, nan, ...","[320, 320, 320, 40, 15, 68, 178, 76, 135, 223,...","[2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]","[unknown, unknown, unknown, high, unknown, unk...","[night, afternoon, afternoon, afternoon, after...","[1, 0, 1, 3, 1, 2, 4, 2, 3, 1, 6, 4, 6, 6]","[3, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3]","[cool, normal, cool, normal, normal, normal, c...",...,2,0.142857,0.039286,8950.05,1,14,14,11,0.142857,0
4,4,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[173, 320, 278, 320, 320, 272, 76, 69, 164, 32...","[1, 0, 1, 2, 2, 1, 1, 1, 1, 0, 1, 1, 2]","[unknown, unknown, unknown, unknown, unknown, ...","[night, night, morning, morning, morning, morn...","[6, 6, 2, 5, 0, 1, 1, 1, 1, 3, 6, 5, 1]","[1, 3, 3, 0, 3, 3, 2, 0, 1, 3, 1, 3, 0]","[hot, cool, hot, hot, hot, hot, normal, hot, n...",...,5,0.384615,0.088462,12469.9,1,13,13,9,0.384615,3


## Model

In [None]:
# File: train_and_infer_discount_fixed.py

import os
import ast
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import math

# === 1. Configuration ===
SEQ_CSV        = 'data/processed_data/sequence_data_full.csv'
MODEL_PATH     = 'models/transformer_discount.pt'
DEVICE         = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_SEQ_LENGTH = 30
BATCH_SIZE     = 64
EPOCHS         = 10
LR             = 1e-3

# === 2. Helper functions ===
def parse_list(s):
    if pd.isnull(s):
        return []
    if isinstance(s, list):
        return s
    try:
        return ast.literal_eval(s)
    except:
        return []

def pad_or_truncate(seq, max_len=MAX_SEQ_LENGTH, pad_val=0):
    if len(seq) > max_len:
        return seq[-max_len:]
    return seq + [pad_val] * (max_len - len(seq))

def discount_class(pct):
    if pct == 0:
        return 0
    elif pct < 0.05:
        return 1
    elif pct < 0.15:
        return 2
    else:
        return 3

def build_vocab(seqs, pad_zero=True):
    vals = sorted({str(v) for seq in seqs for v in seq})
    vocab = {'<PAD>': 0} if pad_zero else {}
    start = 1 if pad_zero else 0
    for i, v in enumerate(vals):
        vocab[v] = i + start
    return vocab

def map_seq(seq, vocab):
    return [vocab.get(str(x), 0) for x in seq]

# === 3. Load & preprocess ===
df = pd.read_csv(SEQ_CSV)
raw_cols = [
    'seq_item_ids',
    'seq_time_of_day',
    'seq_price_bin',
    'seq_weather_lvl',
    'seq_disc_pct',
    'seq_promo_main_enc'
]
for c in raw_cols:
    df[c] = df[c].apply(parse_list)

# build discount-class sequence from raw pct
df['seq_disc_class'] = df['seq_disc_pct'].apply(lambda seq: [discount_class(x) for x in seq])

# pad/truncate all sequences
seq_cols = [
    'seq_item_ids',
    'seq_time_of_day',
    'seq_price_bin',
    'seq_weather_lvl',
    'seq_disc_class',
    'seq_promo_main_enc'
]
for c in seq_cols:
    df[c] = df[c].apply(pad_or_truncate)

# create target label: class of last discount pct
df['target'] = df['seq_disc_class'].apply(lambda seq: seq[-1] if seq else 0)

# === 4. Build vocabularies and map to indices ===
item_vocab    = build_vocab(df['seq_item_ids'])
time_vocab    = build_vocab(df['seq_time_of_day'])
price_vocab   = build_vocab(df['seq_price_bin'])
weather_vocab = build_vocab(df['seq_weather_lvl'])
disc_vocab    = {0:0, 1:1, 2:2, 3:3}
promo_vocab   = build_vocab(df['seq_promo_main_enc'])

vocab_sizes = {
    'item':    len(item_vocab),
    'time':    len(time_vocab),
    'price':   len(price_vocab),
    'weather': len(weather_vocab),
    'disc':    len(disc_vocab),
    'promo':   len(promo_vocab)
}

df['seq_item_ids']       = df['seq_item_ids'].apply(lambda s: map_seq(s, item_vocab))
df['seq_time_of_day']    = df['seq_time_of_day'].apply(lambda s: map_seq(s, time_vocab))
df['seq_price_bin']      = df['seq_price_bin'].apply(lambda s: map_seq(s, price_vocab))
df['seq_weather_lvl']    = df['seq_weather_lvl'].apply(lambda s: map_seq(s, weather_vocab))
df['seq_disc_class']     = df['seq_disc_class'].apply(lambda s: [disc_vocab[x] for x in s])
df['seq_promo_main_enc'] = df['seq_promo_main_enc'].apply(lambda s: map_seq(s, promo_vocab))

# === 5. Train/Validation Split ===
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['target']
)
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")
dist = train_df['target'].value_counts(normalize=True)
print("Distribusi kelas (train):")
print(dist)

# === 6. Dataset & DataLoader ===
class DiscountDataset(Dataset):
    def __init__(self, df):
        self.X = {
            'item':    torch.tensor(df['seq_item_ids'].tolist(), dtype=torch.long),
            'time':    torch.tensor(df['seq_time_of_day'].tolist(), dtype=torch.long),
            'price':   torch.tensor(df['seq_price_bin'].tolist(), dtype=torch.long),
            'weather': torch.tensor(df['seq_weather_lvl'].tolist(), dtype=torch.long),
            'disc':    torch.tensor(df['seq_disc_class'].tolist(), dtype=torch.long),
            'promo':   torch.tensor(df['seq_promo_main_enc'].tolist(), dtype=torch.long),
        }
        self.y = torch.tensor(df['target'].tolist(), dtype=torch.long)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        sample = {k: v[idx] for k, v in self.X.items()}
        sample['target'] = self.y[idx]
        return sample

train_loader = DataLoader(
    DiscountDataset(train_df),
    batch_size=BATCH_SIZE,
    shuffle=True
)
val_loader = DataLoader(
    DiscountDataset(val_df),
    batch_size=BATCH_SIZE,
    shuffle=False
)

# === 7. Model Definition ===
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=MAX_SEQ_LENGTH):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        p  = torch.arange(0, max_len).unsqueeze(1).float()
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000) / d_model))
        pe[:, 0::2] = torch.sin(p * div)
        pe[:, 1::2] = torch.cos(p * div)
        self.pe = pe.unsqueeze(0)
    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class TransformerDiscount(nn.Module):
    def __init__(self, vs, d_model=128, nhead=4, num_layers=2):
        super().__init__()
        self.embs = nn.ModuleDict({
            'item':    nn.Embedding(vs['item'], d_model, padding_idx=0),
            'time':    nn.Embedding(vs['time'], d_model, padding_idx=0),
            'price':   nn.Embedding(vs['price'], d_model, padding_idx=0),
            'weather': nn.Embedding(vs['weather'], d_model, padding_idx=0),
            'disc':    nn.Embedding(vs['disc'], d_model, padding_idx=0),
            'promo':   nn.Embedding(vs['promo'], d_model, padding_idx=0),
        })
        self.pos_enc     = PositionalEncoding(d_model)
        enc_layer        = nn.TransformerEncoderLayer(d_model, nhead, d_model*4, dropout=0.1, batch_first=True)
        self.transformer = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.dropout     = nn.Dropout(0.1)
        self.out         = nn.Linear(d_model, vs['disc'])
    def forward(self, x):
        emb = sum(self.embs[k](x[k]) for k in ['item','time','price','weather','disc','promo'])
        emb = emb * math.sqrt(emb.size(-1))
        emb = self.pos_enc(emb)
        emb = self.dropout(emb)
        out = self.transformer(emb)  # no padding mask
        logits = self.out(out[:, -1, :])
        return logits

# === 8. Training Setup ===
# compute class weights on train set
num_classes = 4
counts = train_df['target'].value_counts().to_dict()
freqs = [counts.get(i, 0) or 1.0 for i in range(num_classes)]
total = sum(freqs)
class_weights = [ total/f for f in freqs ]
w_sum = sum(class_weights)
class_weights = [w/w_sum for w in class_weights]
class_weights = torch.tensor(class_weights, dtype=torch.float, device=DEVICE)
print("Class weights:", class_weights)

model     = TransformerDiscount(vocab_sizes).to(DEVICE)
optimizer = Adam(model.parameters(), lr=LR, weight_decay=1e-5)
scheduler = StepLR(optimizer, step_size=3, gamma=0.1)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# === 9. Training Loop ===
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        logits = model(batch)
        loss   = criterion(logits, batch['target'])
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} – Loss: {avg_loss:.4f}")
    scheduler.step()

# === 10. Save Model ===
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
torch.save({
    'state_dict':   model.state_dict(),
    'vocab_sizes':  vocab_sizes,
    'item_vocab':   item_vocab,
    'time_vocab':   time_vocab,
    'price_vocab':  price_vocab,
    'weather_vocab':weather_vocab,
    'disc_vocab':   disc_vocab,
    'promo_vocab':  promo_vocab,
    'max_seq_len':  MAX_SEQ_LENGTH
}, MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")

# === 11. Validation Evaluation ===
all_preds, all_truth = [], []
model.eval()
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        logits = model(batch)
        preds  = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_truth.extend(batch['target'].cpu().numpy())
print("Confusion Matrix:\n", confusion_matrix(all_truth, all_preds, labels=[0,1,2,3]))
print("Classification Report:\n", classification_report(all_truth, all_preds, labels=[0,1,2,3],
                                                         target_names=['no','low','medium','high']))

# === 12. Inference function ===
def predict_discount_batch(seq_batch):
    model.eval()
    col_to_vocab = {
        'seq_item_ids':       item_vocab,
        'seq_time_of_day':    time_vocab,
        'seq_price_bin':      price_vocab,
        'seq_weather_lvl':    weather_vocab,
        'seq_disc_class':     disc_vocab,
        'seq_promo_main_enc': promo_vocab
    }
    x = {}
    for col, vocab in col_to_vocab.items():
        seqs = [pad_or_truncate(s) for s in seq_batch[col]]
        idxs = [map_seq(s, vocab) for s in seqs]
        if col == 'seq_item_ids':        key = 'item'
        elif col == 'seq_time_of_day':   key = 'time'
        elif col == 'seq_price_bin':     key = 'price'
        elif col == 'seq_weather_lvl':   key = 'weather'
        elif col == 'seq_disc_class':    key = 'disc'
        elif col == 'seq_promo_main_enc':key = 'promo'
        x[key] = torch.tensor(idxs, dtype=torch.long, device=DEVICE)
    with torch.no_grad():
        logits = model(x)
        probs  = F.softmax(logits, dim=1)
        preds  = probs.argmax(dim=1).cpu().numpy()
    return preds, probs.cpu().numpy()

# === 13. Example Inference ===
test_batch = {c: val_df[c][:5].tolist() for c in seq_cols}
preds, probs = predict_discount_batch(test_batch)
print("Sample predictions:", preds)
print("Sample probabilities:", probs)


## Inference Batch

In [18]:
from pathlib import Path
import ast
import pandas as pd
import torch
import torch.nn.functional as F

# Config
SEQ_CSV    = 'data/processed_data/sequence_data_full.csv'
MODEL_PATH = 'models/transformer_discount.pt'
DEVICE     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 64

# load checkpoint
checkpoint = torch.load(MODEL_PATH, map_location=DEVICE)

# recover vocabs & params
vocab_sizes   = checkpoint['vocab_sizes']
item_vocab    = checkpoint['item_vocab']
time_vocab    = checkpoint['time_vocab']
price_vocab   = checkpoint['price_vocab']
weather_vocab = checkpoint['weather_vocab']
disc_vocab    = checkpoint['disc_vocab']
promo_vocab   = checkpoint['promo_vocab']
max_seq_len   = checkpoint['max_seq_len']

# instantiate & load model
model = TransformerDiscount(vocab_sizes).to(DEVICE)
model.load_state_dict(checkpoint['state_dict'])
model.eval()

def parse_list(s):
    if pd.isnull(s): return []
    if isinstance(s, list): return s
    try: return ast.literal_eval(s)
    except: return []

def pad_or_truncate(seq, max_len=max_seq_len, pad_val=0):
    if len(seq) > max_len: return seq[-max_len:]
    return seq + [pad_val]*(max_len - len(seq))

def map_seq(seq, vocab):
    return [vocab.get(str(x), 0) for x in seq]

def predict_discount_batch(seq_batch):
    col2vocab = {
        'seq_item_ids':       item_vocab,
        'seq_time_of_day':    time_vocab,
        'seq_price_bin':      price_vocab,
        'seq_weather_lvl':    weather_vocab,
        'seq_disc_class':     disc_vocab,
        'seq_promo_main_enc': promo_vocab
    }
    x = {}
    for col, vocab in col2vocab.items():
        seqs = [pad_or_truncate(s) for s in seq_batch[col]]
        idxs = [map_seq(s, vocab) for s in seqs]
        key = {
            'seq_item_ids':'item',
            'seq_time_of_day':'time',
            'seq_price_bin':'price',
            'seq_weather_lvl':'weather',
            'seq_disc_class':'disc',
            'seq_promo_main_enc':'promo'
        }[col]
        x[key] = torch.tensor(idxs, dtype=torch.long, device=DEVICE)
    with torch.no_grad():
        logits = model(x)
        probs  = F.softmax(logits, dim=1)
        preds  = probs.argmax(dim=1).cpu().numpy()
    return preds, probs.cpu().numpy()

def run_inference_batch(input_csv, output_csv, batch_size=BATCH_SIZE):
    df = pd.read_csv(input_csv)
    # parse raw sequences
    for col in ['seq_item_ids','seq_time_of_day','seq_price_bin',
                'seq_weather_lvl','seq_disc_pct','seq_promo_main_enc']:
        df[col] = df[col].apply(parse_list)
    # build discount-class
    df['seq_disc_class'] = df['seq_disc_pct'].apply(
        lambda seq: [0 if x==0 else (1 if x<0.05 else (2 if x<0.15 else 3)) 
                     for x in seq]
    )
    preds_all, _ = [], []
    # iterate per batch
    for start in range(0, len(df), batch_size):
        batch = {col: df[col].iloc[start:start+batch_size].tolist() 
                 for col in ['seq_item_ids','seq_time_of_day','seq_price_bin',
                             'seq_weather_lvl','seq_disc_class','seq_promo_main_enc']}
        preds, _ = predict_discount_batch(batch)
        preds_all.extend(preds)
    # save
    df['pred_discount_class'] = preds_all
    df[['pred_discount_class']].to_csv(output_csv, index=False)
    print(f"[✓] Saved predictions to {output_csv}")

run_inference_batch(
    input_csv  = SEQ_CSV,
    output_csv = 'data/processed_data/discount_preds.csv'
)


[✓] Saved predictions to data/processed_data/discount_preds.csv


## Realtime inference

In [19]:
# load hasil sequence builder
SEQ_DF = 'data/processed_data/sequence_data_full.csv'
seq_df = pd.read_csv(SEQ_DF)

# kolom raw sequence yang perlu di-parse jadi list
RAW_SEQ_COLS = [
    'seq_item_ids','seq_time_of_day','seq_price_bin',
    'seq_weather_lvl','seq_disc_pct','seq_promo_main_enc'
]

# helper parse_list (sama seperti di inference batch)
def parse_list(s):
    if pd.isnull(s): return []
    if isinstance(s, list): return s
    try: return ast.literal_eval(s)
    except: return []

# parse semua kolom sequence
for col in RAW_SEQ_COLS:
    seq_df[col] = seq_df[col].apply(parse_list)

print("[✓] Loaded and parsed sequence_data_full")

# Fungsi untuk realtime inference: ambil row sequence per user, jalankan model
def realtime_inference_user(user_enc):
    # cari sequence user
    row = seq_df.loc[seq_df['user_enc']==user_enc]
    if row.empty:
        raise ValueError(f"No sequence found for user_enc={user_enc}")
    row = row.iloc[0]
  
    # bangun batch kecil untuk model
    batch = {
        'seq_item_ids':       [row['seq_item_ids']],
        'seq_time_of_day':    [row['seq_time_of_day']],
        'seq_price_bin':      [row['seq_price_bin']],
        'seq_weather_lvl':    [row['seq_weather_lvl']],
        # ubah persen discount jadi kelas sama seperti di train
        'seq_disc_class':     [[0 if x==0 else (1 if x<0.05 else (2 if x<0.15 else 3))
                                 for x in row['seq_disc_pct']]],
        'seq_promo_main_enc': [row['seq_promo_main_enc']]
    }

    preds, probs = predict_discount_batch(batch)
    return int(preds[0]), probs[0]

print("[✓] realtime_inference_user ready")

# ambil contoh user pertama
sample_user = int(seq_df['user_enc'].iloc[0])

# jalankan realtime inference
pred_class, prob_dist = realtime_inference_user(sample_user)
print(f"Realtime prediksi untuk user {sample_user} → kelas discount = {pred_class}, probs = {prob_dist}")

# strategi mapping kelas → pct diskon
def discount_strategy_from_class(c):
    return {0:0.00, 1:0.03, 2:0.07, 3:0.15}.get(c, 0.00)

# stub fungsi notifikasi (ganti dengan push service-mu)
def send_notification(user_enc, pct):
    msg = f"Rekomendasi diskon {pct*100:.0f}% untuk Anda sekarang!"
    print(f"[NOTIF to user {user_enc}]: {msg}")

# contoh end-to-end
pct = discount_strategy_from_class(pred_class)
send_notification(sample_user, pct)


[✓] Loaded and parsed sequence_data_full
[✓] realtime_inference_user ready
Realtime prediksi untuk user 0 → kelas discount = 0, probs = [9.9971336e-01 1.1612193e-04 8.4611369e-05 8.5928557e-05]
[NOTIF to user 0]: Rekomendasi diskon 0% untuk Anda sekarang!


## Discount_strategy

In [20]:
# Base mapping dari prediksi kelas ke % diskon
CLASS2PCT = {
    0: 0.00,  # no promo
    1: 0.03,  # low
    2: 0.07,  # medium
    3: 0.15   # high
}

def discount_strategy_from_class(pred_class: int) -> float:
    """
    Dapatkan base discount percentage berdasarkan kelas output model.
    """
    return CLASS2PCT.get(pred_class, 0.00)

def apply_discount_strategy(
    user_enc: int,
    pred_class: int,
    prob_dist: list,
    seq_df=seq_df
) -> float:
    """
    Terapkan rule tambahan untuk menyesuaikan base discount:
    - Jika recency_days > 30 dan ada diskon, tambahkan +2%
    - Batas maksimal diskon 20%
    """
    row = seq_df.loc[seq_df['user_enc']==user_enc].iloc[0]
    recency = row['recency_days']
    
    base_pct = discount_strategy_from_class(pred_class)
    
    # jika customer sudah lama tidak order, tambahkan insentif kecil
    if recency > 30 and base_pct > 0:
        base_pct += 0.02
    
    # jangan lebih dari 20%
    final_pct = min(base_pct, 0.20)
    return final_pct

import tqdm

strategies = []
for u in tqdm.tqdm(seq_df['user_enc'].unique()):
    # realtime prediksi dan distribusi
    pred_cls, prob_dist = realtime_inference_user(u)
    pct = apply_discount_strategy(u, pred_cls, prob_dist)
    strategies.append({
        'user_enc': u,
        'pred_class': pred_cls,
        'discount_pct': pct
    })

strategies_df = pd.DataFrame(strategies)
os.makedirs('data/strategies', exist_ok=True)
strategies_df.to_csv('data/strategies/discount_strategy.csv', index=False)
print("[✓] discount_strategy.csv berhasil disimpan")

def send_discount_notifications(strat_df):
    """
    Iterasi tiap baris strategi, kirim notif jika pct > 0.
    """
    for _, r in strat_df.iterrows():
        u = int(r['user_enc'])
        pct = r['discount_pct']
        if pct > 0:
            send_notification(u, pct)

# contoh pemanggilan
send_discount_notifications(strategies_df)


100%|██████████| 800/800 [00:05<00:00, 144.94it/s]

[✓] discount_strategy.csv berhasil disimpan





## Realtime recommendation

In [21]:
# Load kembali item_catalog untuk lookup rekomendasi
item_catalog = pd.read_csv('data/item_catalog.csv')

from collections import Counter

def recommend_favorite_item(user_enc, seq_df, item_catalog):
    """
    Pilih item favorit user berdasarkan kategori paling sering di sequence:
    1. Ambil seq_item_cats untuk user
    2. Temukan kategori paling banyak muncul
    3. Filter item_catalog dan sample 1 item di kategori itu
    """
    row = seq_df.loc[seq_df['user_enc']==user_enc]
    if row.empty:
        return None
    cats = row.iloc[0]['seq_item_cats']
    if not cats:
        return None
    
    # kategori paling sering
    top_cat = Counter(cats).most_common(1)[0][0]
    
    # pilih satu item random dari kategori tersebut
    choices = item_catalog[item_catalog['category_id'].astype(str).map(lambda x: seq_df.iloc[0]['seq_item_cats']).astype(str) == top_cat] \
              if 'category_enc' not in item_catalog.columns else \
              item_catalog[item_catalog['category_enc']==top_cat]
    
    if choices.empty:
        return None
    return choices.sample(1)['item_id'].values[0]

def send_notification(user_enc, pct, message=None):
    """
    user_enc: int
    pct: float [0,1]
    message: str | None, jika None default pakai diskon saja
    """
    if message is None:
        message = f"Rekomendasi diskon {pct*100:.0f}% untuk Anda sekarang!"
    print(f"[NOTIF to user {user_enc}]: {message}")

import datetime

def realtime_recommendation(user_enc):
    # 1) Prediksi diskon
    pred_cls, prob_dist = realtime_inference_user(user_enc)
    pct = apply_discount_strategy(user_enc, pred_cls, prob_dist)
    
    # 2) Ambil konteks waktu
    now = datetime.datetime.now()
    hour = now.hour
    
    # 3) Rekomendasi based on time window
    if 11 <= hour < 14:
        # lunchtime
        fav = recommend_favorite_item(user_enc, seq_df, item_catalog)
        if fav:
            msg = f"Jam makan siang! Coba pesan '{fav}' favoritmu dengan diskon {pct*100:.0f}%."
        else:
            msg = f"Diskon {pct*100:.0f}% untuk menu favoritmu!"
    elif 18 <= hour < 21:
        # dinner time
        fav = recommend_favorite_item(user_enc, seq_df, item_catalog)
        if fav:
            msg = f"Waktu makan malam! '{fav}' lagi hits, dapatkan diskon {pct*100:.0f}%."
        else:
            msg = f"Dinner special: diskon {pct*100:.0f}% buat kamu."
    else:
        # default
        msg = None
    
    # 4) Kirim notifikasi
    send_notification(user_enc, pct, message=msg)

# Contoh pemanggilan:
sample_user = int(seq_df['user_enc'].iloc[0])
realtime_recommendation(sample_user)




[NOTIF to user 0]: Rekomendasi diskon 0% untuk Anda sekarang!
