In [61]:
import awswrangler as wr
from dotenv import load_dotenv
import os
import boto3
import pandas as pd

In [62]:
load_dotenv()

True

### Configuration

In [63]:
%%capture
!aws sso login --profile $AWS_PROFILE

In [64]:
my_session = boto3.setup_default_session(profile_name=os.environ.get("AWS_PROFILE"))

In [65]:
S3_BUCKET = os.environ.get("S3_BUCKET")

### Read Data

Need to turn off Block public access (bucket settings)

In [66]:
def read_data(table):
  return wr.s3.read_parquet(
    path=f"s3://{S3_BUCKET}/financial-export-18042025/financial/financial.{table}/1/", 
    dataset=True,
    boto3_session=my_session
  )

In [67]:
account_df = read_data('account')
card_df = read_data('card')
client_df = read_data('client')
disp_df = read_data('disp')
district_df = read_data('district')
loan_df = read_data('loan')
order_df = read_data('order')
trans_df = read_data('trans')

In [68]:
# Filter finished loans
loan_df = loan_df.query('status == "A" or status == "B"')

### Create Features

Join loan and account

In [69]:
loan_df.rename(
  columns={
    'date': 'loan_date',
    'amount': 'loan_amount',
    'duration': 'loan_duration',
    'payments': 'loan_payments',
    'status': 'loan_status'
  },
  inplace=True
)

account_df.rename(
  columns={
    'date': 'account_date',
    'frequency': 'account_frequency'
  },
  inplace=True
)

df = pd.merge(
  left=loan_df, 
  right=account_df, 
  how='left',
  on='account_id'
)

Join loan and disp

In [70]:
disp_df.rename(
  columns={
    'type': 'disp_type'
  },
  inplace=True
)

df = pd.merge(
  left=df, 
  right=disp_df, 
  how='left',
  on='account_id'
)

Join loan with card

In [71]:
card_df.rename(
  columns={
    'type': 'card_type',
    'issued': 'card_issued'
  },
  inplace=True
)

df = pd.merge(
  left=df, 
  right=card_df, 
  how='left',
  on='disp_id'
)

Join loan with client

In [72]:
df = pd.merge(
  left=df, 
  right=client_df[['client_id', 'gender', 'birth_date']], 
  how='left',
  on='client_id'
)

In [73]:
# Calculate age
for col in ['loan_date', 'account_date', 'birth_date']:
  df[col] = pd.to_datetime(df[col]) 

df['age'] = (df['loan_date'].dt.year - df['birth_date'].dt.year).astype(int)

Join loan with district

In [74]:
district_df.rename(
  columns={
    'A2': 'district_name',
    'A3': 'region_name',
    'A11': 'avg_salary'
  },
  inplace=True
)

df = pd.merge(
  left=df, 
  right=district_df[['district_id', 'district_name', 'region_name', 'avg_salary']], 
  how='left',
  on='district_id'
)

Join loan with transactions

In [None]:
trans_df_latest = trans_df.rename(
  columns={
    'date': 'trans_date_latest',
    'type': 'trans_type_latest',
    'operation': 'trans_operation_latest',
    'amount': 'trans_amount_latest',
    'balance': 'trans_balance_latest',
    'k_symbol': 'trans_k_symbol_latest',
    'bank': 'trans_bank_latest'
  },
  inplace=False
)
trans_df_latest = trans_df_latest[[
  'account_id',
  'trans_date_latest',
  'trans_type_latest',
  'trans_operation_latest',
  'trans_amount_latest',
  'trans_balance_latest',
  'trans_k_symbol_latest',
  'trans_bank_latest'
]]

df['loan_date'] = pd.to_datetime(df['loan_date'])
trans_df_latest['trans_date_latest'] = pd.to_datetime(trans_df_latest['trans_date_latest'])

df.sort_values(by='loan_date', inplace=True)
trans_df_latest.sort_values(by='trans_date_latest', inplace=True)

df = pd.merge_asof(
  left=df,
  right=trans_df_latest,
  left_on='loan_date',
  right_on='trans_date_latest',
  by='account_id',
  direction='backward'
)

df['days_since_last_transaction'] = df['trans_date_latest'] - df['loan_date']

KeyError: "['trans_date_latest'] not found in axis"

In [77]:
df.drop('trans_date_latest', axis=1, inplace=True)

### Create Transactions Features

In [20]:
# Create transaction features

In [79]:
trans_df.rename(
  columns={
    'date': 'trans_date',
    'type': 'trans_type',
    'operation': 'trans_operation',
    'amount': 'trans_amount',
    'balance': 'trans_balance',
    'k_symbol': 'trans_k_symbol',
    'bank': 'trans_bank'
  },
  inplace=True
)

df_copy = pd.merge(
  left=df,
  right=trans_df,
  on='account_id'
)

In [80]:
df_copy['loan_date'] = pd.to_datetime(df_copy['loan_date'])
df_copy['trans_date'] = pd.to_datetime(df_copy['trans_date'])
df_copy = df_copy.query('loan_date >= trans_date')
df_copy['months_between_loan_and_trans_date'] = df_copy['loan_date'].dt.month - df_copy['trans_date'].dt.month

In [81]:
def agg_trans(df, window_months):
  return (
    df
    .query(f'months_between_loan_and_trans_date <= {window_months}')
    .groupby(['loan_id', 'account_id'])
    .agg({
      'trans_date': 'size',
      'trans_amount': 'mean',
      'trans_balance': 'mean'
    })
    .rename(columns={
      'trans_date': f'trans_count_p{window_months}m',
      'trans_amount': f'avg_amount_p{window_months}m',
      'trans_balance': f'avg_balance_p{window_months}m'
    })
)

In [82]:
trans_stats_p3m = agg_trans(df_copy, 3)
trans_stats_p6m = agg_trans(df_copy, 6)
trans_stats_p12m = agg_trans(df_copy, 12)

In [83]:
df_copy['trans_date_yyyymmdd'] = df_copy['trans_date'].dt.strftime('%Y%m%d')
df_copy['trans_date_yyyyww'] = df_copy['trans_date'].dt.strftime('%Y%V')
df_copy['trans_date_yyyymm'] = df_copy['trans_date'].dt.strftime('%Y%m')


In [84]:
def agg_trans_by_level(df, level, level_col, window_months):
  return (df
  .query(f'months_between_loan_and_trans_date <= {window_months}')
  .groupby(['loan_id', 'account_id', level_col])
  .agg({
    'trans_date': 'size',
    'trans_amount': 'sum',
    'trans_balance': 'sum'
  })
  .groupby(['loan_id', 'account_id'])
  .agg({
    'trans_date': 'median',
    'trans_amount': 'mean',
    'trans_balance': 'mean'
  })
  .rename(columns={
    'trans_date': f'median_{level}_trans_count_p{window_months}m',
    'trans_amount': f'avg_{level}_amount_p{window_months}m',
    'trans_balance': f'avg_{level}_balance_p{window_months}m'
  }))

In [85]:
trans_daily_stats_p3m = agg_trans_by_level(
  df=df_copy, 
  level='daily', 
  level_col='trans_date_yyyymmdd',
  window_months=3
)
trans_weekly_stats_p3m = agg_trans_by_level(
  df=df_copy, 
  level='weekly', 
  level_col='trans_date_yyyyww',
  window_months=3
)
trans_monthly_stats_p3m = agg_trans_by_level(
  df=df_copy, 
  level='monthly', 
  level_col='trans_date_yyyymm',
  window_months=3
)

trans_daily_stats_p6m = agg_trans_by_level(
  df=df_copy, 
  level='daily', 
  level_col='trans_date_yyyymmdd',
  window_months=6
)
trans_weekly_stats_p6m = agg_trans_by_level(
  df=df_copy, 
  level='weekly', 
  level_col='trans_date_yyyyww',
  window_months=6
)
trans_monthly_stats_p6m = agg_trans_by_level(
  df=df_copy, 
  level='monthly', 
  level_col='trans_date_yyyymm',
  window_months=6
)

trans_daily_stats_p12m = agg_trans_by_level(
  df=df_copy, 
  level='daily', 
  level_col='trans_date_yyyymmdd',
  window_months=12
)
trans_weekly_stats_p12m = agg_trans_by_level(
  df=df_copy, 
  level='weekly', 
  level_col='trans_date_yyyyww',
  window_months=12
)
trans_monthly_stats_p12m = agg_trans_by_level(
  df=df_copy, 
  level='monthly', 
  level_col='trans_date_yyyymm',
  window_months=12
)


In [86]:
trans_type_stats = (
  df_copy
  .groupby(['loan_id', 'account_id', 'trans_type'])
  .agg({
    'trans_date': 'size'
  })
  .reset_index()
  .pivot(
    index=['loan_id', 'account_id'],
    columns='trans_type',
    values='trans_date'
  )
  .rename(columns={
    'PRIJEM': 'trans_type_prijem_count',
    'VYBER': 'trans_type_vyber_count',
    'VYDAJ': 'trans_type_vydaj_count',
  })
)

In [87]:
trans_operation_stats = (
  df_copy
  .groupby(['loan_id', 'account_id', 'trans_operation'])
  .agg({
    'trans_date': 'size'
  })
  .reset_index()
  .pivot(
    index=['loan_id', 'account_id'],
    columns='trans_operation',
    values='trans_date'
  )
  .rename(columns={
    'PREVOD NA UCET': 'trans_operation_prevod_na_ucet_count',
    'PREVOD Z UCTU': 'trans_operation_prevod_z_uctu_count',
    'VKLAD': 'trans_operation_vklad_count',
    'VYDAJ': 'trans_operation_vydaj_count',
    'VYBER KARTOU': 'trans_operation_vyber_kartou',
    'VYBER': 'trans_operation_vyber_count'
  })
)

In [88]:
trans_stats_dfs = [
  trans_stats_p3m,
  trans_stats_p6m,
  trans_stats_p12m,
  trans_daily_stats_p3m,
  trans_daily_stats_p6m,
  trans_daily_stats_p12m,
  trans_weekly_stats_p3m,
  trans_weekly_stats_p6m,
  trans_weekly_stats_p12m,
  trans_monthly_stats_p3m,
  trans_monthly_stats_p6m,
  trans_monthly_stats_p12m,
  trans_type_stats,
  trans_operation_stats
]

for d in trans_stats_dfs:
  df = pd.merge(
    left=df,
    right=d,
    on=['loan_id', 'account_id']
  )

In [89]:
df.drop(columns=[
  'loan_id',
  'account_id',
  'loan_date',
  'district_id',
  'account_date',
  'disp_id',
  'client_id',
  'card_id',
  'card_issued',
  'birth_date'
],
inplace=True)

In [90]:
df.columns

Index(['loan_amount', 'loan_duration', 'loan_payments', 'loan_status',
       'account_frequency', 'disp_type', 'card_type', 'gender', 'age',
       'district_name', 'region_name', 'avg_salary', 'trans_type_latest',
       'trans_operation_latest', 'trans_amount_latest', 'trans_balance_latest',
       'trans_k_symbol_latest', 'trans_bank_latest',
       'days_since_last_transaction', 'trans_count_p3m', 'avg_amount_p3m',
       'avg_balance_p3m', 'trans_count_p6m', 'avg_amount_p6m',
       'avg_balance_p6m', 'trans_count_p12m', 'avg_amount_p12m',
       'avg_balance_p12m', 'median_daily_trans_count_p3m',
       'avg_daily_amount_p3m', 'avg_daily_balance_p3m',
       'median_daily_trans_count_p6m', 'avg_daily_amount_p6m',
       'avg_daily_balance_p6m', 'median_daily_trans_count_p12m',
       'avg_daily_amount_p12m', 'avg_daily_balance_p12m',
       'median_weekly_trans_count_p3m', 'avg_weekly_amount_p3m',
       'avg_weekly_balance_p3m', 'median_weekly_trans_count_p6m',
       'avg_week

In [91]:
df.head()

Unnamed: 0,loan_amount,loan_duration,loan_payments,loan_status,account_frequency,disp_type,card_type,gender,age,district_name,...,avg_monthly_amount_p12m,avg_monthly_balance_p12m,trans_type_prijem_count,trans_type_vyber_count,trans_type_vydaj_count,trans_operation_prevod_na_ucet_count,trans_operation_prevod_z_uctu_count,trans_operation_vklad_count,trans_operation_vyber_count,trans_operation_vyber_kartou
0,96396,12,8033.0,B,POPLATEK TYDNE,OWNER,,F,46,Sokolov,...,5025.0,12250.0,4.0,,,,,4.0,,
1,165960,36,4610.0,A,POPLATEK MESICNE,OWNER,,M,25,Nachod,...,67930.0,321184.166667,17.0,2.0,18.0,8.0,,12.0,12.0,
2,127080,60,2118.0,A,POPLATEK MESICNE,OWNER,,M,57,Jicin,...,21669.833333,120244.166667,15.0,,9.0,3.0,5.0,2.0,6.0,
3,105804,36,2939.0,A,POPLATEK MESICNE,OWNER,classic,F,53,Pribram,...,29475.428571,147491.571429,13.0,,12.0,2.0,6.0,2.0,10.0,
4,274740,60,4579.0,A,POPLATEK TYDNE,OWNER,junior,M,15,Hl.m. Praha,...,73077.285714,220583.0,13.0,,14.0,3.0,6.0,1.0,11.0,


In [92]:
df.to_csv('../datasets/loan_with_features.csv', index=False)