In [61]:
import awswrangler as wr
from dotenv import load_dotenv
import os
import boto3
import pandas as pd

In [62]:
load_dotenv()

True

### Configuration

In [63]:
%%capture
!aws sso login --profile $AWS_PROFILE

In [64]:
my_session = boto3.setup_default_session(profile_name=os.environ.get("AWS_PROFILE"))

In [65]:
S3_BUCKET = os.environ.get("S3_BUCKET")

### Read Data

Need to turn off Block public access (bucket settings)

In [66]:
def read_data(table):
  return wr.s3.read_parquet(
    path=f"s3://{S3_BUCKET}/financial-export-18042025/financial/financial.{table}/1/", 
    dataset=True,
    boto3_session=my_session
  )

In [142]:
account_df = read_data('account')
card_df = read_data('card')
client_df = read_data('client')
disp_df = read_data('disp')
district_df = read_data('district')
loan_df = read_data('loan')
order_df = read_data('order')
trans_df = read_data('trans')

In [143]:
# Filter finished loans
loan_df = loan_df.query('status == "A" or status == "B"')

### Create Features

Credits to https://medium.com/data-science/loan-default-prediction-an-end-to-end-ml-project-with-real-bank-data-part-1-1405f7aecb9e#ed0c

Join loan and account

In [144]:
loan_df.rename(
  columns={
    'date': 'loan_date',
    'amount': 'loan_amount',
    'duration': 'loan_duration',
    'payments': 'loan_payments',
    'status': 'loan_status'
  },
  inplace=True
)

account_df.rename(
  columns={
    'date': 'account_date',
    'frequency': 'account_frequency'
  },
  inplace=True
)

df = pd.merge(
  left=loan_df, 
  right=account_df, 
  how='left',
  on='account_id'
)

In [145]:
df['loan_date'] = pd.to_datetime(df['loan_date'])
df['account_date'] = pd.to_datetime(df['account_date'])
df['days_between'] = df['loan_date'] - df['account_date']

Join loan and disp

In [146]:
disp_df.rename(
  columns={
    'type': 'disp_type'
  },
  inplace=True
)

df = pd.merge(
  left=df, 
  right=disp_df, 
  how='left',
  on='account_id'
)

Join loan with card

In [147]:
card_df.rename(
  columns={
    'type': 'card_type',
    'issued': 'card_issued'
  },
  inplace=True
)

df = pd.merge(
  left=df, 
  right=card_df, 
  how='left',
  on='disp_id'
)

Join loan with client

In [148]:
df = pd.merge(
  left=df, 
  right=client_df[['client_id', 'gender', 'birth_date']], 
  how='left',
  on='client_id'
)

In [149]:
# Calculate age
for col in ['loan_date', 'account_date', 'birth_date']:
  df[col] = pd.to_datetime(df[col]) 

df['age'] = (df['loan_date'].dt.year - df['birth_date'].dt.year).astype(int)

Join loan with district

In [150]:
district_df.rename(
  columns={
    'A2': 'district_name',
    'A3': 'region_name',
    'A11': 'avg_salary'
  },
  inplace=True
)

df = pd.merge(
  left=df, 
  right=district_df[['district_id', 'district_name', 'region_name', 'avg_salary']], 
  how='left',
  on='district_id'
)

Join loan with transactions

In [151]:
trans_df.rename(
  columns={
    'date': 'trans_date',
    'type': 'trans_type',
    'operation': 'trans_operation',
    'amount': 'trans_amount',
    'balance': 'trans_balance',
    'k_symbol': 'trans_k_symbol',
    'bank': 'trans_bank'
  },
  inplace=True
)

df_copy = pd.merge(
  left=df,
  right=trans_df,
  on='account_id'
)

In [152]:
df_copy['trans_date'] = pd.to_datetime(df_copy['trans_date'])
df_copy = df_copy.query('loan_date >= trans_date')

In [153]:
def agg_trans(df):
  return (
    df
    .groupby(['loan_id', 'account_id'])
    .agg({
      'trans_date': 'size',
      'trans_amount': 'mean',
      'trans_balance': 'mean'
    })
    .rename(columns={
      'trans_date': f'n_trans',
      'trans_amount': f'avg_trans_amount',
      'trans_balance': f'avg_trans_balance'
    })
)

In [154]:
trans_stats = agg_trans(df_copy)

In [155]:
df = pd.merge(
  left=df,
  right=trans_stats,
  on=['loan_id', 'account_id']
)

Join loan with orders

In [156]:
order_stats = (order_df
    .groupby(['account_id'])
    .agg({'amount': 'mean',})
    .rename(columns={'amount': 'avg_order_amount'})
)

In [157]:
df = pd.merge(
  left=df,
  right=order_stats,
  on=['account_id']
)

In [158]:
df.columns

Index(['loan_id', 'account_id', 'loan_date', 'loan_amount', 'loan_duration',
       'loan_payments', 'loan_status', 'district_id', 'account_frequency',
       'account_date', 'days_between', 'disp_id', 'client_id', 'disp_type',
       'card_id', 'card_type', 'card_issued', 'gender', 'birth_date', 'age',
       'district_name', 'region_name', 'avg_salary', 'n_trans',
       'avg_trans_amount', 'avg_trans_balance', 'avg_order_amount'],
      dtype='object')

In [159]:
df = df[[
  'loan_duration',
  'loan_payments',
  'days_between',
  'account_frequency',
  'avg_order_amount',
  'avg_trans_amount',
  'avg_trans_balance',
  'n_trans',
  'card_type',
  'avg_salary',
  'gender',
  'age',
  'loan_status'
]]

In [160]:
df.head()

Unnamed: 0,loan_duration,loan_payments,days_between,account_frequency,avg_order_amount,avg_trans_amount,avg_trans_balance,n_trans,card_type,avg_salary,gender,age,loan_status
0,24,3373.0,313 days,POPLATEK MESICNE,5319.35,7941.818182,32372.290909,110,,12541,M,49,A
1,24,3373.0,313 days,POPLATEK MESICNE,5319.35,7941.818182,32372.290909,110,,12541,F,54,A
2,12,2523.0,388 days,POPLATEK MESICNE,2523.2,5856.35,25197.1375,80,,9104,F,57,B
3,12,2523.0,498 days,POPLATEK MESICNE,2653.55,12041.557576,62800.29697,165,,9893,M,35,A
4,24,6915.0,561 days,POPLATEK MESICNE,4584.333333,12822.4,52523.336,125,,8427,F,52,A


In [161]:
df.to_csv('../datasets/loan_with_features.csv', index=False)