# Feature engineering


In [1]:

from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from debugpy.adapter.components import missing
from networkx.algorithms.bipartite.basic import color
from pyarrow.compute import top_k_unstable, scalar

# from ydata_profiling import ProfileReport

from mastercard import dist_visualisation, dist_categorical_visualisation

PROCESSED_DATA_DIR = Path("../data/processed")
INTERIM_DATA_DIR = Path("../data/interim")

FIGURES_DIR = Path("../reports/figures")
df = pd.read_parquet(INTERIM_DATA_DIR / "merge_data.parquet")


[32m2025-06-10 16:13:56.218[0m | [1mINFO    [0m | [36mmastercard.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/marcin/workspace/data_sience/mastercard[0m


In [2]:
df.drop(columns=["currency", 'user_id', 'merchant_id', 'location'], inplace=True)

In [None]:
df.info()


In [4]:
df["education"] = (
    df['education'].fillna('Missing'))

In [5]:
df.drop(columns=["transaction_id"], inplace=True)

In [6]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 25 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   timestamp                       500000 non-null  datetime64[ns]
 1   amount                          500000 non-null  float64       
 2   channel                         500000 non-null  object        
 3   device                          500000 non-null  object        
 4   payment_method                  500000 non-null  object        
 5   is_international                500000 non-null  int64         
 6   session_length_seconds          500000 non-null  int64         
 7   is_first_time_merchant          500000 non-null  int64         
 8   is_fraud                        500000 non-null  int64         
 9   age                             500000 non-null  int64         
 10  sex                             500000 non-null  object 

In [7]:
categorical_cols = ["education", "channel", "device", "payment_method", "sex", "primary_source_of_income", "category", "country_users ", "country_merchant"]
df_encoded = pd.get_dummies(df, columns=categorical_cols, dtype=int, drop_first=True)

In [9]:
df_encoded.columns


Index(['timestamp', 'amount', 'is_international', 'session_length_seconds',
       'is_first_time_merchant', 'is_fraud', 'age',
       'sum_of_monthly_installments', 'sum_of_monthly_expenses',
       'country_users', 'signup_date', 'risk_score', 'country_merchant',
       'trust_score', 'number_of_alerts_last_6_months',
       'avg_transaction_amount', 'account_age_months', 'has_fraud_history',
       'education_High School', 'education_Master', 'education_Missing',
       'education_PhD', 'channel_mobile', 'channel_online', 'device_Web',
       'device_iOS', 'payment_method_credit_card', 'payment_method_debit_card',
       'payment_method_mobile_payment', 'sex_Male', 'sex_Other',
       'primary_source_of_income_Employment',
       'primary_source_of_income_Retirement',
       'primary_source_of_income_Savings',
       'primary_source_of_income_Student Aid',
       'primary_source_of_income_Unemployment', 'category_education',
       'category_electronics', 'category_gaming', 'categor

In [10]:
# df_encoded.info()

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
df.columns


Index(['timestamp', 'amount', 'channel', 'device', 'payment_method',
       'is_international', 'session_length_seconds', 'is_first_time_merchant',
       'is_fraud', 'age', 'sex', 'education', 'primary_source_of_income',
       'sum_of_monthly_installments', 'sum_of_monthly_expenses',
       'country_users', 'signup_date', 'risk_score', 'category',
       'country_merchant', 'trust_score', 'number_of_alerts_last_6_months',
       'avg_transaction_amount', 'account_age_months', 'has_fraud_history'],
      dtype='object')

In [13]:
numerical_cols = ["amount", "session_length_seconds", "age", "sum_of_monthly_installments", "sum_of_monthly_expenses", "risk_score", "number_of_alerts_last_6_months", "avg_transaction_amount", "account_age_months"]

scaler = StandardScaler()

In [14]:
df_encoded[numerical_cols] =  scaler.fit_transform(df_encoded[numerical_cols])

In [15]:
df_encoded.columns

Index(['timestamp', 'amount', 'is_international', 'session_length_seconds',
       'is_first_time_merchant', 'is_fraud', 'age',
       'sum_of_monthly_installments', 'sum_of_monthly_expenses',
       'country_users', 'signup_date', 'risk_score', 'country_merchant',
       'trust_score', 'number_of_alerts_last_6_months',
       'avg_transaction_amount', 'account_age_months', 'has_fraud_history',
       'education_High School', 'education_Master', 'education_Missing',
       'education_PhD', 'channel_mobile', 'channel_online', 'device_Web',
       'device_iOS', 'payment_method_credit_card', 'payment_method_debit_card',
       'payment_method_mobile_payment', 'sex_Male', 'sex_Other',
       'primary_source_of_income_Employment',
       'primary_source_of_income_Retirement',
       'primary_source_of_income_Savings',
       'primary_source_of_income_Student Aid',
       'primary_source_of_income_Unemployment', 'category_education',
       'category_electronics', 'category_gaming', 'categor

In [16]:
df_encoded.to_parquet(INTERIM_DATA_DIR / "df_encoded1.parquet")


Index([], dtype='object')
