In [27]:
import awswrangler as wr
from dotenv import load_dotenv
import os
import boto3
import pandas as pd

In [2]:
# Load key/value pairs from a .env file into os.environ; later cells look up
# AWS_PROFILE and S3_BUCKET in the environment.
load_dotenv()

True

### Configuration

In [3]:
# Interactive AWS SSO login for the profile named by the AWS_PROFILE environment variable.
# NOTE(review): the captured output of this cell includes the SSO start URL and a
# device code — consider clearing it before sharing or committing the notebook.
!aws sso login --profile $AWS_PROFILE

Attempting to automatically open the SSO authorization page in your default browser.
If the browser does not open or you wish to use a different device to authorize this request, open the following URL:

https://d-9067d8c794.awsapps.com/start/#/device

Then enter the code:

XHWN-WMDK
Successfully logged into Start URL: https://d-9067d8c794.awsapps.com/start


In [11]:
# boto3.setup_default_session() only configures boto3's module-level default
# session and returns None, so its result must not be kept as the session object.
# Keep that side effect for code relying on the default session, and build an
# explicit Session for the calls that receive `boto3_session=my_session`.
boto3.setup_default_session(profile_name=os.environ.get("AWS_PROFILE"))
my_session = boto3.Session(profile_name=os.environ.get("AWS_PROFILE"))

In [5]:
# Bucket that holds the exported tables; None when the variable is not set.
S3_BUCKET = os.getenv("S3_BUCKET")

### Read Data

Prerequisite: **Block public access** must be turned off in the S3 bucket settings before the data can be read.

In [15]:
def read_data(table):
  """Read one exported table from S3 as a parquet dataset.

  Args:
    table: Table name; it is inserted into the export prefix as
      ``financial.<table>``.

  Returns:
    Whatever ``wr.s3.read_parquet`` returns for that prefix with
    ``dataset=True`` (used as a pandas DataFrame by the rest of the notebook).
  """
  table_prefix = f"s3://{S3_BUCKET}/financial-export-18042025/financial/financial.{table}/1/"
  return wr.s3.read_parquet(path=table_prefix, dataset=True, boto3_session=my_session)

In [16]:
# Load every exported table, in the same order as the target names on the left;
# the two sequences are matched positionally.
(
  account_df,
  card_df,
  client_df,
  disp_df,
  district_df,
  loan_df,
  order_df,
  trans_df,
) = (
  read_data(table_name)
  for table_name in (
    'account',
    'card',
    'client',
    'disp',
    'district',
    'loan',
    'order',
    'trans',
  )
)

### Create Features

Join loan and account

In [None]:
# Both tables carry a generic `date` column; give each a table-specific name so
# the two remain distinguishable after the join.
loan_df = loan_df.rename(columns={'date': 'loan_date'})
account_df = account_df.rename(columns={'date': 'account_date'})

# Left join: every loan row is kept and the matching account columns are attached.
df = loan_df.merge(account_df, how='left', on='account_id')

Join loan and disp

In [None]:
# Rename the generic `type` column so its origin stays clear after the join.
disp_df = disp_df.rename(columns={'type': 'disp_type'})

# NOTE(review): if an account has more than one row in disp_df, this left join
# repeats the loan row once per disposition — confirm that is intended.
# `df` is overwritten with the join result, so re-running only this cell joins
# onto an already-joined frame; re-run from the loan/account join instead.
df = df.merge(disp_df, how='left', on='account_id')

Join loan with card

In [None]:
# Rename the generic card columns so their origin stays clear after the join.
card_df = card_df.rename(
  columns={
    'type': 'card_type',
    'issued': 'card_issued'
  }
)

# Left join keeps every row of `df`; rows without a card get missing card columns.
# validate='many_to_one' makes pandas raise a MergeError if `disp_id` is not
# unique in card_df, instead of silently duplicating loan rows. This is the same
# property the notebook checks by hand with `card_agg_df`.
df = pd.merge(
  left=df,
  right=card_df,
  how='left',
  on='disp_id',
  validate='many_to_one'
)

Join loan with client

In [None]:
# Only these client columns are carried into the feature frame.
client_cols = ['client_id', 'gender', 'birth_date']

# Left join keeps every row of `df` and attaches the selected client columns.
df = df.merge(client_df[client_cols], how='left', on='client_id')

In [None]:
# Calculate age
# Parse the date columns so the `.dt` accessors below are available.
for col in ['loan_date', 'account_date', 'birth_date']:
  df[col] = pd.to_datetime(df[col])

# Age in completed years on the loan date. A plain difference of calendar years
# overstates the age by one whenever the birthday falls later in the year than
# the loan date, so subtract one in that case. Month and day are folded into a
# single MMDD number to compare "position within the year".
year_diff = df['loan_date'].dt.year - df['birth_date'].dt.year
birthday_pending = (
  (df['loan_date'].dt.month * 100 + df['loan_date'].dt.day)
  < (df['birth_date'].dt.month * 100 + df['birth_date'].dt.day)
)
# As before, the final int cast raises if either date is missing for any row.
df['age'] = (year_diff - birthday_pending.astype(int)).astype(int)

Join loan with district

In [95]:
# Give the coded district columns used below readable names.
district_df = district_df.rename(
  columns={'A2': 'district_name', 'A3': 'region_name', 'A11': 'avg_salary'}
)

# Only these district columns are carried into the feature frame.
district_cols = ['district_id', 'district_name', 'region_name', 'avg_salary']

# Left join keeps every row of `df` and attaches the selected district columns.
df = df.merge(district_df[district_cols], how='left', on='district_id')

Join loan with transactions

In [107]:
# Source column -> name it takes in the feature frame. The dict order is also
# the column order of `trans_df_latest` (after `account_id`).
latest_trans_names = {
  'date': 'trans_date_latest',
  'type': 'trans_type_latest',
  'operation': 'trans_operation_latest',
  'amount': 'trans_amount_latest',
  'balance': 'trans_balance_latest',
  'k_symbol': 'trans_k_symbol_latest',
  'bank': 'trans_bank_latest',
}

# Transactions reduced to the join key plus the renamed columns, with a parsed
# date and sorted by it (merge_asof requires both sides sorted on their key).
trans_df_latest = (
  trans_df[['account_id', *latest_trans_names]]
  .rename(columns=latest_trans_names)
  .assign(trans_date_latest=lambda t: pd.to_datetime(t['trans_date_latest']))
  .sort_values(by='trans_date_latest')
)

# With merge_asof's default backward search, each loan row receives, within the
# same account_id, the last transaction dated on or before loan_date.
# NOTE(review): when an account has several transactions on that same day, the
# one picked depends on how the (non-stable by default) sort ordered the ties —
# consider a secondary sort key if that matters.
df = pd.merge_asof(
  left=df.assign(loan_date=pd.to_datetime(df['loan_date'])).sort_values(by='loan_date'),
  right=trans_df_latest,
  left_on='loan_date',
  right_on='trans_date_latest',
  by='account_id',
)

In [109]:
# Preview the first five rows of the joined feature frame.
df.head(n=5)

Unnamed: 0,loan_id,account_id,loan_date,amount,duration,payments,status,district_id,frequency,account_date,...,district_name,region_name,avg_salary,trans_date_latest,trans_type_latest,trans_operation_latest,trans_amount_latest,trans_balance_latest,trans_k_symbol_latest,trans_bank_latest
0,5314,1787,1993-07-05,96396,12,8033.0,B,30,POPLATEK TYDNE,1993-03-22,...,Sokolov,west Bohemia,9650,1993-06-20,PRIJEM,VKLAD,3300,20100,,
1,5316,1801,1993-07-11,165960,36,4610.0,A,46,POPLATEK MESICNE,1993-02-13,...,Nachod,east Bohemia,8369,1993-07-09,VYDAJ,PREVOD NA UCET,3419,52209,,YZ
2,6863,9188,1993-07-28,127080,60,2118.0,A,45,POPLATEK MESICNE,1993-02-08,...,Jicin,east Bohemia,8390,1993-07-21,VYDAJ,VYBER,12000,20273,,
3,5325,1843,1993-08-03,105804,36,2939.0,A,12,POPLATEK MESICNE,1993-01-30,...,Pribram,central Bohemia,8754,1993-07-31,PRIJEM,,178,34307,UROK,
4,7240,11013,1993-09-06,274740,60,4579.0,A,1,POPLATEK TYDNE,1993-02-14,...,Hl.m. Praha,Prague,12541,1993-08-31,PRIJEM,,183,41143,UROK,


In [66]:
# Number of card rows per disp_id; the result is indexed by disp_id and has a
# single column, also named disp_id, holding the count.
cards_by_disp = card_df.groupby('disp_id')
card_agg_df = cards_by_disp.agg({'disp_id': 'count'})

In [67]:
# disp_ids that appear on more than one card row; an empty result means
# disp_id is unique in card_df.
card_agg_df[card_agg_df['disp_id'].gt(1)]

Unnamed: 0_level_0,disp_id
disp_id,Unnamed: 1_level_1
