### ***`Feature Engineering`***

In [98]:
# Library
import warnings
import os
import json

import numpy as np
import pandas as pd
import networkx as nx
from pathlib import Path
from functools import lru_cache
import matplotlib.pyplot as plt
from collections import Counter
warnings.filterwarnings('ignore')
from datetime import datetime, timezone

##### `Reading Cleaned Dataset`

In [99]:
# Show all dataframe columns
pd.set_option('display.max_columns', None)

# Used to read pre-processed dataset
def data_loader(file_name: str) -> pd.DataFrame:
    working_dir = Path('.').cwd()
    path = working_dir.parent / f'data/processed/{file_name}'
    df = pd.read_csv(path)
    return df

# Reading dataset from local directory
file_name  = 'clean_dataset_fnb.csv'
data = data_loader(file_name)

In [100]:
# Modeling user transition likelihood: from display to click or checkout
# Focus is on estimating chances based on time spent transitioning from cold-start to semi-active, leading to interaction

# Modeling assumptions:
# (1) Only consider users in 'semi-active' or 'active' mode when the interaction is 'display'
# (2) Ignore users in 'cold-start' mode who only display — likely no intent to engage
# (3) Users who click or checkout may come from any active mode (cold-start, semi-active, active)
# (4) Pay special attention to users who move from 'display + cold-start' to 'click/checkout' and later become active

# User behavior categories:
# - New users (cold-start) not interacting: likely no intent
# - Users displaying but not clicking: passive interest
# - Users who click/checkout: show intent and are candidates for recommendation

# Objective: recommend relevant products to users based on their transition patterns and current features

In [101]:
# # # Identify users in cold-start mode with only display interactions
# # cold_start_display = data[(data['active_mode'] == 'cold start') & (data['interaction'] == 'display')]

# # # Identify users who are either not in cold-start or have interactions beyond display
# # users_of_interest = data[(data['active_mode'] != 'cold start') | (data['interaction'] != 'display')]

# # # Filter users who were initially in cold-start but later showed intent
# # user_in_cold_start = users_of_interest[users_of_interest['user_id'].isin(cold_start_display['user_id'])]

# # print("Before removing display: ", len(users_of_interest))

# # # Combine cold-start users and their later interactions, remove duplicates
# # filtered_users_with_intent = (
# #     pd.concat([user_in_cold_start, cold_start_display], axis=0)
# #     .drop_duplicates()
# #     .sort_values(['user_id', 'int_date'], ascending=True)
# # )

# # Sort full data by user and interaction date, then set date as index
# df = data.sort_values(['user_id', 'int_date'])

# # Compute next active_mode to detect if user is moving toward engagement
# df['next_active_mode'] = df.groupby('user_id')['active_mode'].shift(1)
# df['will_be_active'] = (
#     df['next_active_mode'].isin(['semi active', 'active'])
# ).astype(int)

# # Compute next interaction to detect future interaction intent
# df['next_int'] = df.groupby('user_id')['interaction'].shift(1)
# df['int_will'] = (
#     df['next_int'].isin(['click', 'checkout'])
# ).astype(int)

# # Drop temporary transition columns
# df.drop(['next_active_mode', 'next_int'], axis=1, inplace=True)

# # Define engagement outcome based on future active mode and interaction
# conditions = [
#     (df['will_be_active'] == 1) & (df['int_will'] == 0),  # active but no interaction intent
#     (df['will_be_active'] == 0) & (df['int_will'] == 1),  # interaction intent without activation
#     (df['will_be_active'] == 1) & (df['int_will'] == 1),  # both activation and interaction intent
# ]
# df['will_engage'] = np.select(conditions, [0, 1, 1], default=np.nan)

# # Retain only rows with meaningful engagement prediction
# df = df[df['will_engage'].notna()].drop(['will_be_active', 'int_will'], axis=1)

# # View engagement distribution
# print(df.will_engage.value_counts(normalize=True))
# # Azure AI Foundry (used for scraping)

In [102]:
# # Project Objectives

# The goal is to model user interaction behavior with a product in a two-step process:

# 1. **Predict whether a user is likely to engage** with a product (e.g., any interaction).
# 2. **If engaged**, predict the specific type of action:  
#    - **Click**
#    - **Checkout**
# 3. **Recommend a product** that aligns with the predicted action using unsupervised learning 
#    (e.g., clustering, similarity-based methods).


In [103]:
import json
import pandas as pd
import numpy as np
from functools import lru_cache
from datetime import datetime, timezone

@lru_cache(maxsize=128)
def process_data_pipeline(data_values: json) -> pd.DataFrame:
    data = pd.DataFrame(json.loads(data_values))
    
    if data.empty:
        return pd.DataFrame()

    # Parse dates
    data['int_date'] = pd.to_datetime(data['int_date'], errors='coerce')

    # Ensure required columns exist
    required_columns = ['user_id', 'item_id', 'item_type', 'segment', 'beh_segment', 'active_mode', 'time_of_day']
    for col in required_columns:
        if col not in data.columns:
            data[col] = np.nan

    data.fillna({'item_id': 'no_item_id'}, inplace=True)

    # Interaction counts
    data['total_user_interaction'] = data.groupby('user_id')['item_id'].transform('count')
    data['total_user_interaction_per_item'] = data.groupby(['user_id', 'item_id'])['item_id'].transform('count')

    # Item popularity
    # data['item_popularity'] = data.groupby('item_id')['user_id'].transform('count')
    data['normalized_popularity'] = data.groupby('item_id')['user_id'].transform('mean')

    # Behavior diversity
    data['user_unique_items'] = data.groupby('user_id')['item_id'].transform('nunique')
    data['user_unique_item_type'] = data.groupby('user_id')['item_type'].transform('nunique')
    data['user_unique_segment'] = data.groupby('user_id')['segment'].transform('nunique')
    data['user_unique_behavior_segment'] = data.groupby('user_id')['beh_segment'].transform('nunique')

    # Interaction type rates
    for item_type in data['item_type'].dropna().unique():
        data[f'user_item_{item_type}_rate'] = (data['item_type'] == item_type).astype(int)

    for mode in data['active_mode'].dropna().unique():
        data[f'user_{mode}_rate'] = (data['active_mode'] == mode).astype(int)

    # Calendar features
    data['day_of_week'] = data['int_date'].dt.dayofweek + 1
    data['week_of_month'] = ((data['int_date'].dt.day - 1) // 7 + 1).astype('Int64')
    data['month'] = data['int_date'].dt.month

    # Weekly stats
    data['user_total_weekly_int'] = data.groupby(['user_id', 'week_of_month'])['total_user_interaction'].transform('sum')
    data['user_avg_weekly_int'] = data.groupby(['user_id', 'week_of_month'])['total_user_interaction'].transform('mean')

    # Boundary flags
    data['is_weekend'] = data['day_of_week'].isin([6, 7]).astype(int)
    data['is_month_start'] = data['int_date'].dt.is_month_start.astype(int)
    data['is_month_end'] = data['int_date'].dt.is_month_end.astype(int)
    data['is_week_start'] = (data['day_of_week'] == 1).astype(int)

    # Interaction rates by time
    data['month_start_interaction_rate'] = data.groupby('user_id')['is_month_start'].transform('mean')
    data['month_end_interaction_rate'] = data.groupby('user_id')['is_month_end'].transform('mean')
    data['weekend_interaction_rate'] = data.groupby('user_id')['is_weekend'].transform('mean')

    # Cyclical features
    data['day_sin'] = np.sin(2 * np.pi * data['day_of_week'] / 7)
    data['day_cos'] = np.cos(2 * np.pi * data['day_of_week'] / 7)
    data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
    data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)

    # Previous interaction features
    data = data.sort_values(by=['user_id', 'int_date'])
    data['prev_item_id'] = data.groupby('user_id')['item_id'].shift(1).fillna('no_prev_item')
    data['prev_user_int'] = data.groupby('user_id')['item_type'].shift(1).fillna('no_action')

    data['prev_item_freq'] = data['prev_item_id'].map(data['prev_item_id'].value_counts(normalize=True)).fillna(0)
    data['prev_user_int_freq'] = data['prev_user_int'].map(data['prev_user_int'].value_counts(normalize=True)).fillna(0)

    # Time delta features
    data['day_since_last_user_int'] = data.groupby(['user_id', 'item_id'])['int_date'].diff().dt.days.fillna(-1)

    # Total user interactions by time of day
    data['total_user_int_by_time_of_day'] = data.groupby('user_id')['time_of_day'].transform('count')

    # Encode categorical maps
    data['beh_segment'] = pd.factorize(data['beh_segment'])[0]
    data['segment'] = pd.factorize(data['segment'])[0]
    data['time_of_day'] = pd.factorize(data['time_of_day'])[0]

    # One-hot encode selected columns
    for col in ['item_type', 'active_mode', 'screen_page']:
        if col in data.columns:
            dummies = pd.get_dummies(data[col], prefix=col, dtype=float)
            data = pd.concat([data.drop(columns=col), dummies], axis=1)

    # Final cleanup
    data.rename(columns={'interaction': 'target'}, inplace=True)
    data.drop(columns=['prev_user_int', 'prev_item_id'], errors='ignore', inplace=True)

    return data.reset_index(drop=True)


# Example usage
def build_feature_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    json_data = df.to_json(orient='records')
    result_df = process_data_pipeline(json_data)
    return result_df


In [104]:
import random
from datetime import datetime, timedelta
import pandas as pd

def generate_synthetic_user_row(data: pd.DataFrame, time_period: int = 100) -> pd.DataFrame:
    required_columns = [
        'user_id', 'int_date', 'item_id', 'active_mode',
        'beh_segment', 'segment', 'item_type',
        'screen_page', 'time_of_day'
    ]
    
    for col in required_columns:
        if col not in data.columns:
            raise ValueError(f"Missing required column: '{col}' in input data.")
    
    max_date = pd.to_datetime(data['int_date'], errors='coerce').max()
    if pd.isnull(max_date):
        max_date = pd.Timestamp.today()

    future_dates = [
        (max_date + timedelta(days=i)).strftime('%Y-%m-%d') 
        for i in range(1, time_period + 1)
    ]

    custom_row = {}
    for col in required_columns:
        if col == 'int_date':
            continue
        valid_values = data[col].dropna().unique()
        custom_row[col] = random.choice(valid_values) if len(valid_values) > 0 else 'unknown'
    
    custom_row['int_date'] = random.choice(future_dates)

    return pd.DataFrame([custom_row])

In [105]:
# Assuming you already have a `data` DataFrame with the necessary columns
synthetic_row = generate_synthetic_user_row(data)
user_id = synthetic_row['user_id']
custom_user_data = data[data['user_id'].isin(user_id)]

data_new = pd.concat([custom_user_data, synthetic_row], axis=0, ignore_index=True)

new_user_record = process_data_pipeline(data_new.to_json()).tail(1)
new_user_record


Unnamed: 0,user_id,target,int_date,item_id,time_of_day,segment,beh_segment,total_user_interaction,total_user_interaction_per_item,normalized_popularity,user_unique_items,user_unique_item_type,user_unique_segment,user_unique_behavior_segment,user_item_all_rate,user_item_connect_rate,user_active_rate,user_semi active_rate,day_of_week,week_of_month,month,user_total_weekly_int,user_avg_weekly_int,is_weekend,is_month_start,is_month_end,is_week_start,month_start_interaction_rate,month_end_interaction_rate,weekend_interaction_rate,day_sin,day_cos,month_sin,month_cos,prev_item_freq,prev_user_int_freq,day_since_last_user_int,total_user_int_by_time_of_day,item_type_all,item_type_connect,active_mode_active,active_mode_semi active,screen_page_screen1
5,285279203,,2023-05-27,gafc,0,1,1,6,1,285279203.0,3,2,2,2,0,1,0,1,6,4,5,18,6.0,1,0,0,0,0.0,0.0,0.166667,-0.781831,0.62349,0.5,-0.866025,0.666667,0.666667,-1.0,6,0.0,1.0,0.0,1.0,1.0


In [106]:
# Final data pipeline processing
data_model = process_data_pipeline(data.to_json())

target_columns = ['user_id', 'target', 'item_id']
full_columns = target_columns + list(set(data_model.columns.values) - set(target_columns))

data_model = data_model[full_columns]
data_model = data_model[data_model['item_id'] != 'no_item_id']
print(len(data))
print(len(data_model))

369683
196072


In [107]:
data_model.head()

Unnamed: 0,user_id,target,item_id,user_item_lifestyle_rate,day_sin,month_sin,week_of_month,is_month_start,item_type_lifestyle,item_type_all,total_user_interaction,total_user_interaction_per_item,day_of_week,user_avg_weekly_int,user_item_transact_rate,item_type_invest,month_cos,item_type_connect,is_weekend,day_since_last_user_int,normalized_popularity,month,month_end_interaction_rate,active_mode_cold start,user_unique_item_type,month_start_interaction_rate,is_month_end,user_cold start_rate,user_item_connect_rate,prev_item_freq,item_type_insure,user_semi active_rate,user_item_insure_rate,active_mode_semi active,beh_segment,user_item_lend_rate,time_of_day,screen_page_screen2,item_type_lend,user_unique_behavior_segment,user_item_all_rate,is_week_start,day_cos,user_total_weekly_int,int_date,user_item_invest_rate,total_user_int_by_time_of_day,user_active_rate,active_mode_active,user_unique_segment,weekend_interaction_rate,segment,prev_user_int_freq,item_type_transact,screen_page_screen1,user_unique_items
2,4521,click,ibab,0,-2.449294e-16,0.866025,1,0,0.0,0.0,5,2,7,5.0,0,0.0,0.5,0.0,1,-1.0,4591820000.0,2,0.0,0.0,2,0.0,0,0,0,0.307323,1.0,1,1,1.0,1,0,0,0.0,0.0,1,0,0,1.0,10,2023-02-05,0,5,0,0.0,1,0.6,1,0.307323,0.0,1.0,2
3,4521,checkout,ibab,0,-2.449294e-16,0.866025,1,0,0.0,0.0,5,2,7,5.0,0,0.0,0.5,0.0,1,0.0,4591820000.0,2,0.0,0.0,2,0.0,0,0,0,0.016206,1.0,1,1,1.0,1,0,0,0.0,0.0,1,0,0,1.0,10,2023-02-05,0,5,0,0.0,1,0.6,1,0.13042,0.0,1.0,2
15,14454,click,cafm,0,0.4338837,0.866025,2,0,0.0,0.0,3,2,3,3.0,1,0.0,0.5,0.0,0,-1.0,4225440000.0,2,0.0,0.0,2,0.0,0,0,0,0.228236,0.0,0,0,0.0,0,0,1,1.0,0.0,1,0,0,-0.900969,9,2023-02-08,0,3,1,1.0,1,0.0,2,0.228236,1.0,0.0,2
16,14454,checkout,cafm,0,0.4338837,0.866025,2,0,0.0,0.0,3,2,3,3.0,1,0.0,0.5,0.0,0,0.0,4225440000.0,2,0.0,0.0,2,0.0,0,0,0,0.005034,0.0,0,0,0.0,0,0,1,1.0,0.0,1,0,0,-0.900969,9,2023-02-08,0,3,1,1.0,1,0.0,2,0.033729,1.0,0.0,2
19,15000,click,carf,0,0.9749279,0.5,5,0,0.0,0.0,3,2,2,3.0,0,0.0,0.866025,0.0,0,-1.0,896401400.0,1,0.666667,1.0,2,0.0,1,1,0,0.307323,0.0,0,0,0.0,0,1,1,1.0,1.0,1,0,0,-0.222521,6,2023-01-31,0,3,0,0.0,1,0.0,0,0.307323,0.0,0.0,2


In [108]:
df = data.sort_values(['user_id', 'int_date'])

# Compute next active_mode to detect if user is moving toward engagement
df['next_active_mode'] = df.groupby('user_id')['active_mode'].shift(1)
df['will_be_active'] = (
    df['next_active_mode'].isin(['semi active', 'active'])
).astype(int)

# Compute next interaction to detect future interaction intent
df['next_int'] = df.groupby('user_id')['interaction'].shift(1)
df['int_will'] = (
    df['next_int'].isin(['click', 'checkout'])
).astype(int)

# Drop temporary transition columns
df.drop(['next_active_mode', 'next_int'], axis=1, inplace=True)

# Define engagement outcome based on future active mode and interaction
conditions = [
    (df['will_be_active'] == 1) & (df['int_will'] == 0),  # active but no interaction intent
    (df['will_be_active'] == 0) & (df['int_will'] == 1),  # interaction intent without activation
    (df['will_be_active'] == 1) & (df['int_will'] == 1),  # both activation and interaction intent
]
df['will_engage'] = np.select(conditions, [0, 1, 1], default=np.nan)

# Retain only rows with meaningful engagement prediction
df = df[df['will_engage'].notna()].drop(['will_be_active', 'int_will'], axis=1)

# View engagement distribution
print(df.will_engage.value_counts(normalize=True))

will_engage
1.0    0.692747
0.0    0.307253
Name: proportion, dtype: float64


***`Loading Processed ModelPoints`***

In [109]:
# Function to load processed model points from a local directory
def uploading_modelpoint_to_parquent(file_name: str, df: pd.DataFrame) -> None:
    """Save a DataFrame to parquent in the processed data directory."""
    path = (Path('.').cwd().parent / 'data/processed')
    path.mkdir(parents=True, exist_ok=True)
    file_path = path / file_name
    df['int_date'] = pd.to_datetime(df['int_date'])
    df.drop('int_date', axis=1, inplace=True)
    df.to_parquet(file_path, index=False)
    print(f'Processed data saved to {file_path.as_posix()}')

# Reads model points data from local directory
def loading_modelpoint_data(file_name: str) -> pd.DataFrame:
    path = (Path('.').cwd().parent / 'data/processed') / file_name
    df = pd.read_parquet(path)
    return df

In [110]:
# Separate data into training and evaluation model points
data_eval = data_model.iloc[-300:]
modelpoint_training = (
    data_model[~data_model.index.isin(data_eval.index)]
    .reset_index(drop=True)
)

modelpoint_eval = data_eval.reset_index(drop=True)

# Loading the data into local data directory
uploading_modelpoint_to_parquent('modelpoint_train.parquet', modelpoint_training)
uploading_modelpoint_to_parquent('modelpoint_eval.parquet', modelpoint_eval)

# modelpoint_training

Processed data saved to /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/data/processed/modelpoint_train.parquet
Processed data saved to /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/data/processed/modelpoint_eval.parquet


In [111]:
modelpoint_training.isna().sum()

user_id                            0
target                             0
item_id                            0
user_item_lifestyle_rate           0
day_sin                            0
month_sin                          0
week_of_month                      0
is_month_start                     0
item_type_lifestyle                0
item_type_all                      0
total_user_interaction             0
total_user_interaction_per_item    0
day_of_week                        0
user_avg_weekly_int                0
user_item_transact_rate            0
item_type_invest                   0
month_cos                          0
item_type_connect                  0
is_weekend                         0
day_since_last_user_int            0
normalized_popularity              0
month                              0
month_end_interaction_rate         0
active_mode_cold start             0
user_unique_item_type              0
month_start_interaction_rate       0
is_month_end                       0
u