In [None]:
import pandas as pd

# Paths are relative to the notebook's working directory.
INPUT_FILE = "../data/transactions_features.csv"    # read by this notebook
OUTPUT_FILE = "../data/transactions_patterns.csv"   # written by this notebook

# Load the input table; the "datetime" column is parsed into timestamps.
df = pd.read_csv(
    filepath_or_buffer=INPUT_FILE,
    parse_dates=["datetime"],
)



In [None]:
# Give every transaction empty "flags" and "reasons" label columns.
for label_column in ("flags", "reasons"):
    df[label_column] = ""

In [None]:
def add_flag_reason(row, flag, reason):
    """Record one more flag and its explanation on ``row``.

    ``row`` is modified in place and also returned. It only needs to support
    item access for the ``"flags"`` and ``"reasons"`` keys.

    When ``row["flags"]`` is empty, the flag and reason are stored as-is.
    Otherwise the flag is appended after a ``"|"`` separator and the reason
    after a ``" | "`` separator.
    """
    current_flags = row["flags"]

    # First label for this row: nothing to join with.
    if not current_flags:
        row["flags"] = flag
        row["reasons"] = reason
        return row

    # Subsequent labels are chained onto the existing text.
    row["flags"] = current_flags + f"|{flag}"
    row["reasons"] = row["reasons"] + f" | {reason}"
    return row

def detect_spending_patterns(transactions, zscore_threshold=2.0, rare_merchant_max_txns=2):
    """Label every transaction with the spending-pattern rules it triggers.

    Each rule is evaluated as a whole-column boolean mask rather than by
    iterating with ``iterrows`` and writing full rows back into the frame
    being iterated. The labels are rebuilt from scratch on every call, so
    re-running the cell never appends the same flag twice.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain the columns ``amount_zscore``, ``time_bucket``,
        ``is_weekend`` and ``merchant_txn_count``.
    zscore_threshold : float, default 2.0
        A row is HIGH_AMOUNT when ``amount_zscore >= zscore_threshold`` and
        LOW_AMOUNT when ``amount_zscore <= -zscore_threshold``.
    rare_merchant_max_txns : int, default 2
        A row is RARE_MERCHANT when ``merchant_txn_count`` is at most this.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``transactions`` except that ``flags`` holds the
        triggered flag codes joined by ``"|"`` and ``reasons`` holds the
        matching explanations joined by ``" | "``, both in rule order. Rows
        that trigger no rule get empty strings in both columns.
    """
    # (mask of rows the rule applies to, flag code, explanation) in the order
    # the labels must appear within a row.
    rules = [
        (
            transactions["amount_zscore"] >= zscore_threshold,
            "HIGH_AMOUNT",
            "Transaction amount is significantly higher than your usual category spending",
        ),
        (
            transactions["amount_zscore"] <= -zscore_threshold,
            "LOW_AMOUNT",
            "Transaction amount is significantly lower than your usual category spending",
        ),
        (
            # astype(bool) mirrors Python truthiness (`not row["is_weekend"]`).
            (transactions["time_bucket"] == "LATE_NIGHT")
            & ~transactions["is_weekend"].astype(bool),
            "LATE_NIGHT_WEEKDAY",
            "Transaction occurred late at night on a weekday",
        ),
        (
            transactions["merchant_txn_count"] <= rare_merchant_max_txns,
            "RARE_MERCHANT",
            "This merchant is new or rarely used in your transaction history",
        ),
    ]

    flags = pd.Series("", index=transactions.index)
    reasons = pd.Series("", index=transactions.index)

    for applies, flag, reason in rules:
        # Rows that already carry a label need a separator before the new one.
        has_prior = flags != ""
        joined_flags = flags.where(~has_prior, flags + "|") + flag
        joined_reasons = reasons.where(~has_prior, reasons + " | ") + reason

        # Only rows matching the rule receive the extended label text.
        flags = flags.where(~applies, joined_flags)
        reasons = reasons.where(~applies, joined_reasons)

    return transactions.assign(flags=flags, reasons=reasons)


df = detect_spending_patterns(df)

df.head()

Unnamed: 0,transaction_id,user_id,datetime,amount,merchant,category,channel,payment_type,location,hour,...,is_weekend,time_bucket,category_avg_amount,category_std_amount,category_txn_count,merchant_avg_amount,merchant_txn_count,amount_zscore,flags,reasons
0,TXN000003,U001,2025-01-01 12:20:00,372.27,LAZADA,Shopping,E-WALLET,DEBIT,Kuala Lumpur,12,...,False,AFTERNOON,287.1,137.94,177,299.9,51,0.62,,
1,TXN000001,U001,2025-01-01 14:11:00,357.83,UNIQLO,Shopping,E-WALLET,DEBIT,Petaling Jaya,14,...,False,AFTERNOON,287.1,137.94,177,296.04,35,0.51,,
2,TXN000002,U001,2025-01-01 15:41:00,83.12,SETEL,Fuel,DEBIT CARD,DEBIT,Petaling Jaya,15,...,False,AFTERNOON,83.21,22.89,142,85.28,42,-0.0,,
3,TXN000004,U001,2025-01-01 18:03:00,423.52,SHOPEE,Shopping,DEBIT CARD,DEBIT,Petaling Jaya,18,...,False,EVENING,287.1,137.94,177,252.19,45,0.99,,
4,TXN000008,U001,2025-01-02 12:49:00,185.36,LOTUS,Grocery,E-WALLET,DEBIT,Shah Alam,12,...,False,AFTERNOON,141.85,61.78,177,149.89,47,0.7,,


In [8]:
# Keep only the transactions whose amount z-score is at least 2.
is_high_zscore = df["amount_zscore"].ge(2)
df_filter_flags = df.loc[is_high_zscore]
df_filter_flags.head()

Unnamed: 0,transaction_id,user_id,datetime,amount,merchant,category,channel,payment_type,location,hour,...,is_weekend,time_bucket,category_avg_amount,category_std_amount,category_txn_count,merchant_avg_amount,merchant_txn_count,amount_zscore,flags,reasons
143,TXN000145,U001,2025-03-02 10:39:00,146.98,MCDONALDS,Food,E-WALLET,DEBIT,Petaling Jaya,10,...,True,MORNING,71.84,34.34,263,65.96,57,2.19,HIGH_AMOUNT,Transaction amount is significantly higher tha...
177,TXN000180,U001,2025-03-15 10:08:00,154.55,LOCAL CAFE,Food,DEBIT CARD,DEBIT,Shah Alam,10,...,True,MORNING,71.84,34.34,263,75.05,62,2.41,HIGH_AMOUNT,Transaction amount is significantly higher tha...
238,TXN000239,U001,2025-04-06 15:25:00,141.64,GRAB*FOOD,Food,DEBIT CARD,DEBIT,Petaling Jaya,15,...,True,AFTERNOON,71.84,34.34,263,71.35,45,2.03,HIGH_AMOUNT,Transaction amount is significantly higher tha...
257,TXN000259,U001,2025-04-12 16:16:00,563.12,LAZADA,Shopping,DEBIT CARD,DEBIT,Petaling Jaya,16,...,True,AFTERNOON,287.1,137.94,177,299.9,51,2.0,HIGH_AMOUNT,Transaction amount is significantly higher tha...
339,TXN000338,U001,2025-05-18 19:08:00,144.39,LOCAL CAFE,Food,DEBIT CARD,DEBIT,Shah Alam,19,...,True,EVENING,71.84,34.34,263,75.05,62,2.11,HIGH_AMOUNT,Transaction amount is significantly higher tha...


In [9]:
# Rows whose label columns are still empty strings get a default text.
default_labels = {
    "flags": "NORMAL",
    "reasons": "No unusual spending behavior detected",
}
for label_column, default_text in default_labels.items():
    df[label_column] = df[label_column].replace("", default_text)
df.head()


Unnamed: 0,transaction_id,user_id,datetime,amount,merchant,category,channel,payment_type,location,hour,...,is_weekend,time_bucket,category_avg_amount,category_std_amount,category_txn_count,merchant_avg_amount,merchant_txn_count,amount_zscore,flags,reasons
0,TXN000003,U001,2025-01-01 12:20:00,372.27,LAZADA,Shopping,E-WALLET,DEBIT,Kuala Lumpur,12,...,False,AFTERNOON,287.1,137.94,177,299.9,51,0.62,NORMAL,No unusual spending behavior detected
1,TXN000001,U001,2025-01-01 14:11:00,357.83,UNIQLO,Shopping,E-WALLET,DEBIT,Petaling Jaya,14,...,False,AFTERNOON,287.1,137.94,177,296.04,35,0.51,NORMAL,No unusual spending behavior detected
2,TXN000002,U001,2025-01-01 15:41:00,83.12,SETEL,Fuel,DEBIT CARD,DEBIT,Petaling Jaya,15,...,False,AFTERNOON,83.21,22.89,142,85.28,42,-0.0,NORMAL,No unusual spending behavior detected
3,TXN000004,U001,2025-01-01 18:03:00,423.52,SHOPEE,Shopping,DEBIT CARD,DEBIT,Petaling Jaya,18,...,False,EVENING,287.1,137.94,177,252.19,45,0.99,NORMAL,No unusual spending behavior detected
4,TXN000008,U001,2025-01-02 12:49:00,185.36,LOTUS,Grocery,E-WALLET,DEBIT,Shah Alam,12,...,False,AFTERNOON,141.85,61.78,177,149.89,47,0.7,NORMAL,No unusual spending behavior detected


In [10]:
# Order the rows by their "datetime" value, then write them out without the
# index column.
df = df.sort_values(by="datetime")
df.to_csv(path_or_buf=OUTPUT_FILE, index=False)

completion_message = f"Pattern detection completed → {OUTPUT_FILE}"
print(completion_message)

Pattern detection completed → ../data/transactions_patterns.csv
