##### **Setup**
##### Download report from BCC
1. Go to the web bank client
2. Under each currency account choose "Выписка"
3. After choosing a period -> "Отправить на почту" 


In [1]:
#imports
import pandas as pd
import re

pd.set_option('display.max_colwidth', None)

In [2]:
#parse euro_acc table
df_euro = pd.read_html('euro_acc.html')[2]
#parse tenge_acc table
df_tenge = pd.read_html('tenge_acc.html')[2]

##### Data rules:
1. All income is positive
2. All spendings are negative



##### DataFrames
1. Day-to-day transactions
2. Account transfers (incl. salary, forex, between banks)

### Preparing EURO table

In [3]:
#renaming headers
df_euro = df_euro.rename(
    columns={
        '№ п/п': 'id',
        'Дата': 'record_date',
        'Дебет': 'sum',
        'Кредит': 'sum_temp',
        'Назначение': 'details', }
)

#changing types
df_euro['id'] = df_euro['id'].astype('int')

df_euro['sum'] = df_euro['sum'].str.replace(r'\s+', '', regex=True)
df_euro['sum'] = pd.to_numeric(df_euro['sum'])

df_euro['sum_temp'] = df_euro['sum_temp'].str.replace(r'\s+', '', regex=True)
df_euro['sum_temp'] = pd.to_numeric(df_euro['sum_temp'])

df_euro['record_date'] = pd.to_datetime(df_euro['record_date'], format='%d.%m.%Y')

In [4]:
#moving all sums to one column
def add_cashback(row):
    if pd.isna(row['sum']):
        row['sum'] = row['sum_temp']
    if pd.isna(row['sum_temp']):
        row['sum'] = -row['sum']
    return row

df_euro = df_euro.apply(add_cashback, axis=1)
df_euro = df_euro.drop('sum_temp', axis=1)

In [32]:
#parsing details
first_word = r'(^\w+)'
forex = r'^Покупка иностранной валюты'
atm = r'^Снятие наличных АТМ'
transfer = r'^Перевод (списание)'
retail_regex = r'^(?:[^,]*,){4}([^,]*)'
retail_loc_regex = r'^[^,]*,[^,]*,\s*([^,]*,[^,]*)'
retail_datetime_regex = r'^[^,]*,\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2})'

def check_retail(row):
    if re.match(first_word, row['details']).group(0) == 'Retail':
        row['pos_loc'] = re.search(retail_loc_regex, row['details']).group(1)
        row['pos'] = re.search(retail_regex, row['details']).group(1)
        row['transaction_dt'] = re.search(retail_datetime_regex, row['details']).group(1)
        row['category_1'] = 'retail'
    if re.match(forex, row['details']):
        row['category_1'] = 'forex'
    if re.match(atm, row['details']):
        row['category_1'] = 'atm'
    if re.match(transfer, row['details']):
        row['category_1'] = 'transfer'
    return row

df_euro = df_euro.apply(check_retail, axis=1)

In [None]:
# Regular expressions to extract date and retail sum directly in the loop
cashback_date_regex = r'Дата (\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2})'
retail_sum_regex = r'сумма ([\d.]+)'

# Step 2: Track indices of cashback rows that have matching retail transactions
matched_cashback_indices = []

# Step 3: Process cashback rows, find matching Retail rows, and add cashback sum
for index, row in df_euro.iterrows():
    if "Учет вознаграждений по CashBack" in row['details']:
        # Extract the cashback date and retail transaction sum directly
        cashback_date_match = re.search(cashback_date_regex, row['details'])
        retail_sum_match = re.search(retail_sum_regex, row['details'])
        
        if cashback_date_match and retail_sum_match:
            cashback_date = cashback_date_match.group(1)
            retail_amount = float(retail_sum_match.group(1))
            cashback_amount = row['sum']
            
            # Find matching Retail row by 'transaction_dt' and 'sum'
            matching_retail_index = df_euro[(df_euro['transaction_dt'] == cashback_date) & 
                                            (df_euro['sum'] == -retail_amount)].index
            
            # If match found, update cashback_sum in the Retail row and mark cashback row for deletion
            if not matching_retail_index.empty:
                df_euro.loc[matching_retail_index, 'cashback_sum'] = cashback_amount
                matched_cashback_indices.append(index)

# Step 4: Remove matched cashback rows
df_euro.drop(matched_cashback_indices, inplace=True)

# Reset index after dropping rows (optional)
df_euro.reset_index(drop=True, inplace=True)

# Display the updated DataFrame
df_euro.head(10)

In [33]:
#display(len(df_euro[df_euro['category_1'] != 'retail']))

display(df_euro[df_euro['category_1'].isna()])
#display(df_euro[df_euro['category_1'] == 'forex'])

Unnamed: 0,category_1,details,id,pos,pos_loc,record_date,sum,transaction_dt,cashback_sum
41,,"Reverse.Учет вознаграждений по CashBack. Дата 03.01.2024 00:00:00, сумма 354.95 EUR",81,,,2024-01-07,-7.1,,
42,,"Прочие зачисления на карту (credit). 03.01.2024 00:00:00, PRT, FRIELAS/LOURE, IKEA PORTUGAL MOVEIS E DE, Карта: 462818******4620 Валюты:EUR-USD| IPS: 1.1253| BCC: 1; IPS: 1.0914",82,,,2024-01-07,354.95,,
149,,"Прочие зачисления на карту (credit). 21.01.2024 00:00:00, NLD, AMSTERDAM, UBER EATS, Карта: 462818******4620 Валюты:EUR/EUR| IPS: | BCC: 1; IPS: 1.0864",287,,,2024-01-24,2.0,,
170,,"Прочие зачисления на карту (credit). 24.01.2024 00:00:00, NLD, AMSTERDAM, UBER EATS, Карта: 462818******4620 Валюты:EUR/EUR| IPS: | BCC: 1; IPS: 1.084",327,,,2024-01-27,5.37,,
212,,"Учет вознаграждений по CashBack. Дата 31.01.2024 00:00:00, сумма 38.83 USD",409,,,2024-02-03,0.73,,
407,,"Учет вознаграждений по CashBack. Дата 02.03.2024 00:00:00, сумма 242.29 USD",788,,,2024-03-07,4.53,,
426,,"Перевод (списание). 07.03.2024 21:22:15, KAZ, ALMATY, Perevod BCC.KZ, Карта: 462818******4620, Счет получателя: Влад. карт. счета карты-получателя",825,,,2024-03-08,-300.0,,
467,,"Перевод (списание). 12.03.2024 18:17:27, KAZ, ALMATY, Perevod BCC.KZ, Карта: 462818******4620, Счет получателя: Влад. карт. счета карты-получателя",906,,,2024-03-13,-2000.0,,
621,,"Перевод (списание). 04.04.2024 15:00:35, KAZ, ALMATY, Perevod BCC.KZ, Карта: 462818******4620, Счет получателя: Влад. карт. счета карты-получателя",1212,,,2024-04-05,-2000.0,,
712,,"Прочие зачисления на карту (credit). 13.04.2024 00:00:00, LTU, Vilnius, Vinted, Карта: 462818******4620 Валюты:EUR-USD| IPS: 1.0896| BCC: 1; IPS: 1.0621",1393,,,2024-04-16,87.4,,
