In [2]:
import pandas as pd

train_trans = pd.read_csv("../DATA/train_transaction.csv")
train_id = pd.read_csv("../DATA/train_identity.csv")

df = train_trans.merge(train_id, on="TransactionID", how="left")


In [3]:
missing = df.isnull().mean()
drop_cols = missing[missing > 0.9].index

df_reduced = df.drop(columns=drop_cols)


In [4]:
num_cols = df_reduced.select_dtypes(include='number').columns
cat_cols = df_reduced.select_dtypes(include='object').columns

print("Numerical:", len(num_cols))
print("Categorical:", len(cat_cols))


Numerical: 393
Categorical: 29


In [5]:
id_cols = ['TransactionID']
df_reduced = df_reduced.drop(columns=id_cols)


In [6]:
leakage_cols = ['TransactionDT']  # raw value not useful directly
df_reduced = df_reduced.drop(columns=leakage_cols)


In [7]:
core_features = [
    'TransactionAmt',
    'ProductCD',
    'card1','card2','card3','card4','card5','card6',
    'addr1','addr2',
    'DeviceType'
]


In [8]:
v_cols = [c for c in df_reduced.columns if c.startswith('V')]

corr = df_reduced[v_cols + ['isFraud']].corr()['isFraud']
top_v = corr.abs().sort_values(ascending=False).head(20).index.tolist()

print(top_v)


['isFraud', 'V257', 'V246', 'V244', 'V242', 'V201', 'V200', 'V189', 'V188', 'V258', 'V45', 'V158', 'V156', 'V149', 'V228', 'V44', 'V86', 'V87', 'V170', 'V147']


In [9]:
import numpy as np

df_reduced['TransactionAmt_log'] = np.log1p(df_reduced['TransactionAmt'])


In [10]:
def frequency_encoding(df, col):
    freq = df[col].value_counts(normalize=True)
    return df[col].map(freq)


In [11]:
for col in ['ProductCD', 'card4', 'card6', 'DeviceType']:
    df_reduced[col + '_freq'] = frequency_encoding(df_reduced, col)
