In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_transaction = pd.read_csv(   
    'C:/Users/Administrator/Desktop/fraud-detection/data/raw/train_transaction.csv',    
    nrows=100000)

print(f"Shape: {df_transaction.shape}")


Shape: (100000, 394)


### Missing Data Features

In [3]:
df_transaction['D7']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
99995   NaN
99996   NaN
99997   NaN
99998   NaN
99999   NaN
Name: D7, Length: 100000, dtype: float64

In [4]:
len(df_transaction['D7'])

100000

In [5]:
df_transaction['D7'].isnull().sum()

np.int64(93928)

In [6]:
df_transaction['D7_is_missing'] = df_transaction['D7'].isna().astype(int)

In [7]:
df_transaction['D7_is_missing']

0        1
1        1
2        1
3        1
4        1
        ..
99995    1
99996    1
99997    1
99998    1
99999    1
Name: D7_is_missing, Length: 100000, dtype: int64

In [8]:
d_columns = []  # start with an empty list

for col in df_transaction.columns:  # go through each column name
    if col.startswith('D') and col[1:].isdigit():  # check two conditions
        d_columns.append(col)  # if both are True, add column to the list


In [9]:
# Get all D columns
d_columns = [col for col in df_transaction.columns if col.startswith('D') and col[1:].isdigit()]

# Check their missing rates
d_missing_rates = df_transaction[d_columns].isna().mean() * 100
d_missing_rates = d_missing_rates.sort_values(ascending=False)

In [10]:
d_columns


['D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15']

In [11]:

d_missing_rates 

D7     93.928
D13    90.479
D12    89.495
D14    88.847
D6     87.259
D8     78.289
D9     78.289
D11    71.208
D5     65.327
D2     57.633
D3     56.598
D4     46.175
D15    34.047
D10    30.926
D1      0.000
dtype: float64

In [12]:
d_missing =  df_transaction.columns[
    df_transaction.columns.str.startswith('D') & df_transaction.columns.str[1:].str.isdigit()
]

In [13]:
d_missing.to_list()

['D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15']

In [14]:
df_transaction[d_missing].isna().mean().sort_values(ascending= False) * 100

D7     93.928
D13    90.479
D12    89.495
D14    88.847
D6     87.259
D8     78.289
D9     78.289
D11    71.208
D5     65.327
D2     57.633
D3     56.598
D4     46.175
D15    34.047
D10    30.926
D1      0.000
dtype: float64

In [15]:
for col in d_missing:
    d_missing_cols = col + '_is_missing'
    df_transaction[d_missing_cols] = df_transaction[col].isna().astype(int)

In [16]:
print(df_transaction[[col + '_is_missing' for col in d_missing]].head())

   D1_is_missing  D2_is_missing  D3_is_missing  D4_is_missing  D5_is_missing  \
0              0              1              0              1              1   
1              0              1              1              0              1   
2              0              1              1              0              1   
3              0              0              0              0              0   
4              0              1              1              1              1   

   D6_is_missing  D7_is_missing  D8_is_missing  D9_is_missing  D10_is_missing  \
0              1              1              1              1               0   
1              1              1              1              1               0   
2              1              1              1              1               0   
3              1              1              1              1               0   
4              1              1              1              1               1   

   D11_is_missing  D12_is_missin

In [17]:
D7_fraud_rate = df_transaction.groupby('D7_is_missing')['isFraud'].mean().sort_values(ascending=False) * 100

In [18]:
D7_fraud_rate

D7_is_missing
0    10.770751
1     2.030279
Name: isFraud, dtype: float64

In [19]:
d_missing_cols = [col + '_is_missing' for col in d_missing]

# Create a DataFrame of fraud rates for all D missing columns
fraud_summary = pd.DataFrame({
    col: df_transaction.groupby(col)['isFraud'].mean() * 100
    for col in d_missing_cols
})

# Transpose for easier reading (rows = D columns, columns = 0/1)
fraud_summary = fraud_summary.T

# Show the full summary
print(fraud_summary)


                        0         1
D1_is_missing    2.561000       NaN
D2_is_missing    2.301319  2.751896
D3_is_missing    2.656560  2.487720
D4_is_missing    3.002322  2.046562
D5_is_missing    3.458022  2.084896
D6_is_missing    7.024566  1.909259
D7_is_missing   10.770751  2.030279
D8_is_missing    4.458569  2.034769
D9_is_missing    4.458569  2.034769
D10_is_missing   2.665258  2.328138
D11_is_missing   1.663657  2.923829
D12_is_missing   8.576868  1.854852
D13_is_missing   8.045373  1.983886
D14_is_missing   7.397113  1.953921
D15_is_missing   2.794414  2.108850


In [20]:
# Drop the non-predictive missing flags
non_predictive = ['D1_is_missing', 'D2_is_missing', 'D3_is_missing', 
                  'D10_is_missing', 'D11_is_missing', 'D15_is_missing']

df_transaction.drop(columns=non_predictive, inplace=True)

## Transaction Amount-Based Features

In [21]:
# Create amount bins based on your EDA findings
def categorize_amount(amount):
    if amount < 50:
        return 'Low_value'
    elif amount < 100:
        return 'Medium-value'
    elif amount < 500:
        return 'Medium_high_value'  # The risky sweet spot!
    elif amount < 1000:
        return 'High-value'
    else:
        return 'Ultra-High-value'

df_transaction['amount_category'] = df_transaction['TransactionAmt'].apply(categorize_amount)


In [22]:
df_transaction['amount_category']

0             Medium-value
1                Low_value
2             Medium-value
3             Medium-value
4             Medium-value
               ...        
99995         Medium-value
99996    Medium_high_value
99997         Medium-value
99998    Medium_high_value
99999    Medium_high_value
Name: amount_category, Length: 100000, dtype: object

In [23]:
# Also create numeric amount risk score (fraud rate by category)
amount_fraud_rates = df_transaction.groupby('amount_category')['isFraud'].mean() * 100

In [24]:
amount_fraud_rates

amount_category
High-value           2.956020
Low_value            2.857737
Medium-value         2.347229
Medium_high_value    2.503508
Ultra-High-value     1.188300
Name: isFraud, dtype: float64

In [25]:
df_transaction.groupby('amount_category')['isFraud'].value_counts(normalize=True) * 100

amount_category    isFraud
High-value         0          97.043980
                   1           2.956020
Low_value          0          97.142263
                   1           2.857737
Medium-value       0          97.652771
                   1           2.347229
Medium_high_value  0          97.496492
                   1           2.503508
Ultra-High-value   0          98.811700
                   1           1.188300
Name: proportion, dtype: float64

In [26]:
amount_risk_map = amount_fraud_rates.to_dict()
df_transaction['amount_risk_score'] = df_transaction['amount_category'].map(amount_risk_map)

In [27]:
df_transaction['amount_risk_score'] 

0        2.347229
1        2.857737
2        2.347229
3        2.347229
4        2.347229
           ...   
99995    2.347229
99996    2.503508
99997    2.347229
99998    2.503508
99999    2.503508
Name: amount_risk_score, Length: 100000, dtype: float64

In [28]:
# Verify
print("Amount categories distribution:")
print(df_transaction['amount_category'].value_counts())
print("\nAmount risk scores:")
print(df_transaction.groupby('amount_category')['amount_risk_score'].first().sort_values(ascending=False))

Amount categories distribution:
amount_category
Medium_high_value    39904
Low_value            28834
Medium-value         27394
High-value            2774
Ultra-High-value      1094
Name: count, dtype: int64

Amount risk scores:
amount_category
High-value           2.956020
Low_value            2.857737
Medium_high_value    2.503508
Medium-value         2.347229
Ultra-High-value     1.188300
Name: amount_risk_score, dtype: float64


In [29]:
# Check a few sample rows
print(df_transaction[['TransactionAmt', 'amount_category', 'amount_risk_score', 'isFraud']].head(10))

   TransactionAmt    amount_category  amount_risk_score  isFraud
0            68.5       Medium-value           2.347229        0
1            29.0          Low_value           2.857737        0
2            59.0       Medium-value           2.347229        0
3            50.0       Medium-value           2.347229        0
4            50.0       Medium-value           2.347229        0
5            49.0          Low_value           2.857737        0
6           159.0  Medium_high_value           2.503508        0
7           422.5  Medium_high_value           2.503508        0
8            15.0          Low_value           2.857737        0
9           117.0  Medium_high_value           2.503508        0


##  Email Features

In [30]:
# 1. Email domain risk scores (using your EDA findings)
email_risk_map = {
    'outlook.com': 6.91,
    'hotmail.com': 4.63,
    'icloud.com': 3.58,
    'gmail.com': 3.52,
    'msn.com': 1.64,
    'aol.com': 1.42,
    'yahoo.com': 1.35,
    'anonymous.com': 1.05,
    'att.net': 0.86,
    'comcast.net': 0.86
}

df_transaction['P_email_risk'] = df_transaction['P_emaildomain'].map(email_risk_map)

# Fill missing with baseline
df_transaction['P_email_risk'].fillna(2.56, inplace=True)

# 2. Email match feature (P = R)
df_transaction['email_match'] = (df_transaction['P_emaildomain'] == df_transaction['R_emaildomain']).astype(int)

# 3. Check if email is missing
df_transaction['P_email_missing'] = df_transaction['P_emaildomain'].isna().astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_transaction['P_email_risk'].fillna(2.56, inplace=True)


In [31]:
df_transaction['email_match']

0        0
1        0
2        0
3        0
4        0
        ..
99995    1
99996    0
99997    0
99998    1
99999    0
Name: email_match, Length: 100000, dtype: int64

In [32]:
print("Email features created:")
print(df_transaction[['P_emaildomain', 'P_email_risk', 'email_match', 'P_email_missing']].head(10))

Email features created:
   P_emaildomain  P_email_risk  email_match  P_email_missing
0            NaN          2.56            0                1
1      gmail.com          3.52            0                0
2    outlook.com          6.91            0                0
3      yahoo.com          1.35            0                0
4      gmail.com          3.52            0                0
5      gmail.com          3.52            0                0
6      yahoo.com          1.35            0                0
7       mail.com          2.56            0                0
8  anonymous.com          1.05            0                0
9      yahoo.com          1.35            0                0


## Product features:

In [33]:
df_transaction.groupby('ProductCD')['isFraud'].mean() * 100

ProductCD
C    8.783367
H    1.977965
R    1.078403
S    2.335709
W    1.846056
Name: isFraud, dtype: float64

In [34]:
# Product risk scores from your EDA
product_risk_map = {
    'C': 8.78,
    'S': 2.34,
    'H': 1.98,
    'W': 1.85,
    'R': 1.08
}

df_transaction['product_risk_score'] = df_transaction['ProductCD'].map(product_risk_map)

# Binary flag for high-risk Product C
df_transaction['is_product_c'] = (df_transaction['ProductCD'] == 'C').astype(int)

In [35]:
print("Product features:")
print(df_transaction[['ProductCD', 'product_risk_score', 'is_product_c']].value_counts().head(10))

Product features:
ProductCD  product_risk_score  is_product_c
W          1.85                0               56878
H          1.98                0               15521
R          1.08                0               13724
C          8.78                1               11351
S          2.34                0                2526
Name: count, dtype: int64


## Card Feature:

In [36]:
# Card type risk scores
card_type_risk = {
    'credit': 3.60,
    'debit': 2.00,
    'debit or credit': 0.00,  # Too few samples
    'charge card': 0.00
}

df_transaction['card_type_risk'] = df_transaction['card6'].map(card_type_risk)
df_transaction['card_type_risk'].fillna(2.56, inplace=True)  # Fill missing with baseline

# Card brand risk scores
card_brand_risk = {
    'mastercard': 2.88,
    'discover': 2.53,
    'visa': 2.49,
    'american express': 1.14
}

df_transaction['card_brand_risk'] = df_transaction['card4'].map(card_brand_risk)
df_transaction['card_brand_risk'].fillna(2.56, inplace=True)

# Binary flag for credit card
df_transaction['is_credit_card'] = (df_transaction['card6'] == 'credit').astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_transaction['card_type_risk'].fillna(2.56, inplace=True)  # Fill missing with baseline
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_transaction['card_brand_risk'].fillna(2.56, inplace=True)


In [37]:
print("Card features sample:")
print(df_transaction[['card4', 'card6', 'card_brand_risk', 'card_type_risk', 'is_credit_card']].head(10))

Card features sample:
        card4   card6  card_brand_risk  card_type_risk  is_credit_card
0    discover  credit             2.53             3.6               1
1  mastercard  credit             2.88             3.6               1
2        visa   debit             2.49             2.0               0
3  mastercard   debit             2.88             2.0               0
4  mastercard  credit             2.88             3.6               1
5        visa   debit             2.49             2.0               0
6        visa   debit             2.49             2.0               0
7        visa   debit             2.49             2.0               0
8        visa   debit             2.49             2.0               0
9  mastercard   debit             2.88             2.0               0


### TIME CONVERSION 


In [38]:
df_transaction["TransactionDT"].dtype

dtype('int64')

In [39]:
start_date = pd.Timestamp('2017-12-01')

df_transaction["TransactionDT"] = start_date + pd.to_timedelta(df_transaction["TransactionDT"], unit='s')

In [40]:
df_transaction["datetime"] = df_transaction["TransactionDT"]

In [41]:
df_transaction["datetime"].dtype

dtype('<M8[ns]')

In [42]:
df_transaction["hour"] = df_transaction["datetime"].dt.hour

In [43]:
df_transaction["day_of_week"] = df_transaction["datetime"].dt.dayofweek


In [44]:
df_transaction["is_weekend"] = df_transaction["day_of_week"].isin([5, 6])


In [45]:
fraud_by_hour = df_transaction.groupby('hour')['isFraud'].mean() * 100

In [46]:
fraud_transaction_day = df_transaction.groupby('day_of_week')['isFraud'].mean() * 100

In [47]:
# Map hour to fraud rate (your actual data)
hour_risk_map = fraud_by_hour.to_dict()

# Map day to fraud rate (your actual data)
day_risk_map = fraud_transaction_day.to_dict()

# Create the numeric features
df_transaction['hour_risk_score'] = df_transaction['hour'].map(hour_risk_map)
df_transaction['day_risk_score'] = df_transaction['day_of_week'].map(day_risk_map)

# Combined temporal risk score (average)
df_transaction['temporal_risk_score'] = (df_transaction['hour_risk_score'] + 
                                          df_transaction['day_risk_score']) / 2

In [48]:
# Create categorical risk levels
def categorize_temporal_risk(score):
    if score > 4.0:
        return 'very_high'
    elif score > 3.0:
        return 'high'
    elif score > 2.3:
        return 'medium'
    elif score > 2.0:
        return 'low'
    else:
        return 'very_low'

df_transaction['temporal_risk_category'] = df_transaction['temporal_risk_score'].apply(categorize_temporal_risk)

In [49]:
print(df_transaction.columns.tolist())

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V

## Interaction Features

### Product C × Hour (Product C fraud peaks at 2pm)

In [50]:
df_transaction['productC_risky_hours'] = (
    (df_transaction['ProductCD'] == 'C') & 
    (df_transaction['hour'].between(10, 14))
).astype(int)

In [51]:
df_transaction['productC_risky_hours'] 

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: productC_risky_hours, Length: 100000, dtype: int64

### Amount × Hour (small amounts at risky hours)

In [52]:
df_transaction['small_amount_early_morning'] = (
    (df_transaction['TransactionAmt'] < 100) & 
    (df_transaction['hour'].between(4, 9))
).astype(int)

### High-risk email + credit card combo

In [53]:
df_transaction['risky_email_credit'] = (
    (df_transaction['P_email_risk'] > 4.0) & 
    (df_transaction['is_credit_card'] == 1)
).astype(int)

### Product C × Amount (low-value Product C purchases

In [54]:
df_transaction['risky_product_C_amount'] = (
     (df_transaction['ProductCD'] == 'C') & 
    (df_transaction['TransactionAmt'] < 50)
).astype(int)

### Combined risk score (temporal + product + amount + email + card)

In [55]:
df_transaction['combined_risk_score'] = (
    df_transaction['temporal_risk_score'] +
    df_transaction['product_risk_score'] +
    df_transaction['amount_risk_score'] +
    df_transaction['P_email_risk'] +
    df_transaction['card_type_risk']
) / 5  # Average of all risk scores

In [56]:
# Alternative using mean() as mean calcuates across colums and mean(axis=1) can be used to specify row-wise operation
df_transaction['combined_risk_score'] = df_transaction[[
    'temporal_risk_score',
    'product_risk_score',
    'amount_risk_score',
    'P_email_risk',
    'card_type_risk'
]].mean(axis=1)

In [57]:
print("Interaction features sample:")
print(df_transaction[['ProductCD', 'hour', 'TransactionAmt', 'productC_risky_hours', 
                      'small_amount_early_morning', 'combined_risk_score', 'isFraud']].head(10))

Interaction features sample:
  ProductCD  hour  TransactionAmt  productC_risky_hours  \
0         W     0            68.5                     0   
1         W     0            29.0                     0   
2         W     0            59.0                     0   
3         W     0            50.0                     0   
4         H     0            50.0                     0   
5         W     0            49.0                     0   
6         W     0           159.0                     0   
7         W     0           422.5                     0   
8         H     0            15.0                     0   
9         W     0           117.0                     0   

   small_amount_early_morning  combined_risk_score  isFraud  
0                           0             2.550568        0  
1                           0             2.844670        0  
2                           0             3.100568        0  
3                           0             1.988568        0  
4          

In [58]:
print(df_transaction.columns.tolist())

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V

In [59]:
print(df_transaction.select_dtypes(include='object').head(5))


  ProductCD       card4   card6 P_emaildomain R_emaildomain   M1   M2   M3  \
0         W    discover  credit           NaN           NaN    T    T    T   
1         W  mastercard  credit     gmail.com           NaN  NaN  NaN  NaN   
2         W        visa   debit   outlook.com           NaN    T    T    T   
3         W  mastercard   debit     yahoo.com           NaN  NaN  NaN  NaN   
4         H  mastercard  credit     gmail.com           NaN  NaN  NaN  NaN   

    M4   M5   M6   M7   M8   M9 amount_category temporal_risk_category  
0   M2    F    T  NaN  NaN  NaN    Medium-value                 medium  
1   M0    T    T  NaN  NaN  NaN       Low_value                 medium  
2   M0    F    F    F    F    F    Medium-value                 medium  
3   M0    T    F  NaN  NaN  NaN    Medium-value                 medium  
4  NaN  NaN  NaN  NaN  NaN  NaN    Medium-value                 medium  


In [60]:
df_transaction.columns[df_transaction.dtypes == 'object']

Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'amount_category',
       'temporal_risk_category'],
      dtype='object')

##  Categorical Encoding

In [61]:
df_transaction = pd.get_dummies(df_transaction, columns=['ProductCD', 'card4', 'card6'], 
                                prefix=['Product', 'CardBrand', 'CardType'], drop_first=True)

In [62]:
print("All encoded columns:")
encoded_cols = [col for col in df_transaction.columns if col.startswith(('Product_', 'CardType_', 'CardBrand_'))]
print(f"Total encoded columns: {len(encoded_cols)}")
print(sorted(encoded_cols))

All encoded columns:
Total encoded columns: 10
['CardBrand_discover', 'CardBrand_mastercard', 'CardBrand_visa', 'CardType_credit', 'CardType_debit', 'CardType_debit or credit', 'Product_H', 'Product_R', 'Product_S', 'Product_W']


In [63]:
# Generate complete feature summary
print("="*70)
print("FEATURE ENGINEERING SUMMARY")
print("="*70)

# Count features by category
temporal_features = [col for col in df_transaction.columns if col in ['hour', 'day_of_week', 'is_weekend', 'hour_risk_score', 'day_risk_score', 'temporal_risk_score', 'temporal_risk_category']]
missing_features = [col for col in df_transaction.columns if col.endswith('_is_missing')]
amount_features = [col for col in df_transaction.columns if 'amount' in col.lower()]
email_features = [col for col in df_transaction.columns if 'email' in col.lower()]
product_features = [col for col in df_transaction.columns if col.startswith('Product_') or 'product' in col.lower()]
card_features = [col for col in df_transaction.columns if col.startswith(('Card', 'card')) or 'card' in col.lower()]
interaction_features = [col for col in df_transaction.columns if any(x in col for x in ['productC_risky', 'small_amount_early', 'risky_email', 'combined_risk'])]

print(f"\n✅ Temporal Features ({len(temporal_features)}):")
print(temporal_features)

print(f"\n✅ Missing Data Features ({len(missing_features)}):")
print(missing_features)

print(f"\n✅ Amount Features ({len(amount_features)}):")
print(amount_features)

print(f"\n✅ Email Features ({len(email_features)}):")
print(email_features)

print(f"\n✅ Product Features ({len(product_features)}):")
print(product_features)

print(f"\n✅ Card Features ({len(card_features)}):")
print(card_features)

print(f"\n✅ Interaction Features ({len(interaction_features)}):")
print(interaction_features)

print(f"\n{'='*70}")
print(f"TOTAL ENGINEERED FEATURES: {len(temporal_features) + len(missing_features) + len(amount_features) + len(email_features) + len(product_features) + len(card_features) + len(interaction_features)}")
print(f"ORIGINAL DATASET COLUMNS: 394")
print(f"CURRENT DATASET COLUMNS: {len(df_transaction.columns)}")
print(f"{'='*70}")

FEATURE ENGINEERING SUMMARY

✅ Temporal Features (7):
['hour', 'day_of_week', 'is_weekend', 'hour_risk_score', 'day_risk_score', 'temporal_risk_score', 'temporal_risk_category']

✅ Missing Data Features (9):
['D7_is_missing', 'D4_is_missing', 'D5_is_missing', 'D6_is_missing', 'D8_is_missing', 'D9_is_missing', 'D12_is_missing', 'D13_is_missing', 'D14_is_missing']

✅ Amount Features (4):
['amount_category', 'amount_risk_score', 'small_amount_early_morning', 'risky_product_C_amount']

✅ Email Features (6):
['P_emaildomain', 'R_emaildomain', 'P_email_risk', 'email_match', 'P_email_missing', 'risky_email_credit']

✅ Product Features (8):
['product_risk_score', 'is_product_c', 'productC_risky_hours', 'risky_product_C_amount', 'Product_H', 'Product_R', 'Product_S', 'Product_W']

✅ Card Features (13):
['card1', 'card2', 'card3', 'card5', 'card_type_risk', 'card_brand_risk', 'is_credit_card', 'CardBrand_discover', 'CardBrand_mastercard', 'CardBrand_visa', 'CardType_credit', 'CardType_debit', 'C