##### **Setup**
##### Download report from BCC
1. Go to the web bank client
2. Under each currency account choose "Выписка"
3. After choosing a period -> "Отправить на почту" 


#### to-do
- add flag for not using in analysis
- check categories per month
- add analitics for cashback (and when is is added to the account)
- top-10 biggest spendings in every category
- check subscriptions
- separate preparation of data with analytics

In [1]:
#imports
import pandas as pd
import re

pd.set_option('display.max_colwidth', None)

In [2]:
#parse euro_acc table
df_euro = pd.read_html('euro_acc.html')[2]
#parse tenge_acc table
df_tenge = pd.read_html('tenge_acc.html')[2]

##### Data rules:
1. All income is positive
2. All spendings are negative



##### DataFrames
1. Day-to-day transactions
2. Account transfers (incl. salary, forex, between banks)

### Preparing EURO table

In [3]:
#renaming headers
df_euro = df_euro.rename(
    columns={
        '№ п/п': 'id',
        'Дата': 'record_dt',
        'Дебет': 'sum',
        'Кредит': 'sum_temp',
        'Назначение': 'details', }
)

#changing types
df_euro['id'] = df_euro['id'].astype('int')

df_euro['sum'] = df_euro['sum'].str.replace(r'\s+', '', regex=True)
df_euro['sum'] = pd.to_numeric(df_euro['sum'])

df_euro['sum_temp'] = df_euro['sum_temp'].str.replace(r'\s+', '', regex=True)
df_euro['sum_temp'] = pd.to_numeric(df_euro['sum_temp'])

df_euro['record_dt'] = pd.to_datetime(df_euro['record_dt'], format='%d.%m.%Y')

In [4]:
#moving all sums to one column
def add_cashback(row):
    if pd.isna(row['sum']):
        row['sum'] = row['sum_temp']
    if pd.isna(row['sum_temp']):
        row['sum'] = -row['sum']
    return row

df_euro = df_euro.apply(add_cashback, axis=1)
df_euro = df_euro.drop('sum_temp', axis=1)

In [5]:
#parsing details
first_word = r'(^\w+)'
forex = r'^Покупка иностранной валюты'
atm = r'^Снятие наличных АТМ'
transfer = r'^Перевод \(списание\)'
returns = r'^Прочие зачисления на карту \(credit\)'
retail_regex = r'^(?:[^,]*,){4}([^,]*)'
retail_loc_regex = r'^[^,]*,[^,]*,\s*([^,]*,[^,]*)'
retail_datetime_regex = r'^[^,]*,\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2})'

def check_retail(row):
    if re.match(first_word, row['details']).group(0) == 'Retail':
        row['pos_loc'] = re.search(retail_loc_regex, row['details']).group(1)
        row['pos'] = re.search(retail_regex, row['details']).group(1)
        row['transaction_dt'] = re.search(retail_datetime_regex, row['details']).group(1)
        row['category_1'] = 'retail'
    elif re.match(forex, row['details']):
        row['category_1'] = 'forex'
    elif re.match(atm, row['details']):
        row['category_1'] = 'atm'
    elif re.match(transfer, row['details']):
        row['category_1'] = 'transfer'
    elif re.match(returns, row['details']):
        row['category_1'] = 'returns'
    else:
        row['category_1'] = 'unallocated'
    return row

df_euro = df_euro.apply(check_retail, axis=1)

In [6]:
# Regular expressions to extract date and retail sum directly in the loop
cashback_date_regex = r'Дата (\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2})'
retail_sum_regex = r'сумма ([\d.]+)'

# Step 2: Track indices of cashback rows that have matching retail transactions
matched_cashback_indices = []

# Step 3: Process cashback rows, find matching Retail rows, and add cashback sum
for index, row in df_euro.iterrows():
    if "Учет вознаграждений по CashBack" in row['details']:
        # Extract the cashback date and retail transaction sum directly
        cashback_date_match = re.search(cashback_date_regex, row['details'])
        retail_sum_match = re.search(retail_sum_regex, row['details'])
        
        if cashback_date_match and retail_sum_match:
            cashback_date = cashback_date_match.group(1)
            retail_amount = float(retail_sum_match.group(1))
            cashback_amount = row['sum']
            
            # Find matching Retail row by 'transaction_dt' and 'sum'
            matching_retail_index = df_euro[(df_euro['transaction_dt'] == cashback_date) & 
                                            (df_euro['sum'] == -retail_amount)].index
            
            # If match found, update cashback_sum in the Retail row and mark cashback row for deletion
            if not matching_retail_index.empty:
                df_euro.loc[matching_retail_index, 'cashback_sum'] = cashback_amount
                matched_cashback_indices.append(index)

# Step 4: Remove matched cashback rows
df_euro.drop(matched_cashback_indices, inplace=True)

# Reset index after dropping rows (optional)
df_euro.reset_index(drop=True, inplace=True)

In [9]:
# Define categories with lists of keywords for each
category_2_keywords = {
    'clothes': [
        ' COS Av.Liberdade', ' HM Grandella', ' PAYPAL *LICK.EVA', ' Hermes Lisbonne', 
        ' HAVAIANAS LISBOA', ' MASSIMO DUTTI', ' HM Colombo', 
        ' Uniqlo Europe Ltd Sucursa', ' STIVALI', ' ZARA PORTUGAL', ' INTIMISSIMI R CARMO', 
        ' SEPHORA', ' Vestiaire', 'Vinted', 'LOEWE',
    ],
    'eat_out': [
        ' CERES BOULANGERIE', ' ELE E ELA', ' COSMIKGABARITO', ' UNICO GELATO CAFFE', 
        ' BUNA SABORES', ' THE FOLKS UNIP LDA', ' BARU', ' HELLO KRISTOF', 
        ' Dallas 02 - Sao Bento', ' Janis', ' NEIGHBOURHOOD COFFEE L', 
        ' CROQUETERIA MELHORES', ' Acento coffee', ' CAFE MONKA', ' REST POMME EATERY', 
        ' BUNA', ' PIZZARIA VIAVAI', ' COMIDA INDEPENDENTE', ' Vesuviano', ' No Convento', 
        ' DEAR BREAKFAST', ' COPENHAGEN COFFEE', ' LUPITA PIZZARIA', ' CAFE LAYERS', 
        ' MARQUISE PADARIA I', ' A MANTEIGARIA', ' Fora Artisan Pastry', ' COMOBA LISBOA', 
        ' UNI', ' IMANOL PRINCIPE REAL', ' LIBERTY CAFE', ' NUMA CAFE', ' TIMEOUT', 
        ' REST TOMORROW AT 9', ' STARBUCKS DOUBLE', ' HONEST GREENS CSODRE', 
        ' REST CABO DA ROCA', ' Black Trumpet', ' Dallas 02 Sao Bento', 
        ' NEIGHBOURHOOD COFFEE', ' MARQUISE PADARIA II', ' MERCADAO', ' LANDEAU LDA', 
        ' CALMO CAFE', ' PALACIO DO GRILO LDA', ' ARCA', ' PARRA', ' CAFE SAO', 
        ' Pao do Beco', ' ACID CAFE MADRID', ' EAST CREMA COFFEE HERMOSI', 
        ' CAFETERIAS MUSEO THYSSEN', ' RESTAURANTE QUINTIN', ' BUCOLICO', ' HOT NOW', 
        ' COFI', ' STARBUCKS COFFEE ALCALA', ' Casa Neutrale', ' KIOSKO PRENSA TERESA SANC', 
        ' BOUTIQUE LINDT GOYA', ' AEROPUERTO MADRID BARAJAS', ' SQ *OSOM COFFEE', 
        ' STARBUCKS COFFEE AR', ' BAR LUCE', ' SIGNOR LIEVITO', ' Bar Transiti Imb Malpens', 
        ' Chantilly Geladaria', ' CAFE THE LAYERS', ' RESTAURANTE HOY', ' MC DONALDS OEIRAS', 
        ' YELLOW LEMON', ' DOBECO', ' BAKE BROS', ' Birkenstock Digital Gm', 
        ' ESPRESSO LAB', ' HONEST GREENS', ' PARRA WINE BISTRO', ' LA BOULANGERIE', 
        ' Isakaya by Koji', 'FABRICA', 'выдача наличных', 'OAK BERRY', 'Starbucks',
        'KIRILL IVANOV', 'RHODO BAGELS',
    ],
    'food_order': ['BOLT.EU','UBER * EATS'],
    'grocery': [
        ' FOOD MERCEARIA BIO', ' CONTINENTE BOM DIA', ' AMANHECER MINI MERC', 
        ' COMPANHIA PORTUGUEZA', ' COMP PORTUGUEZA CHA', ' LANDEAU CHIADO', 
        ' DIA FRUTA', ' MERCEARIA LUIS', ' SUPERMERCADO ESTRELA', ' ALDI PRINCIPE REAL',
        'MINIPRECO', 'Glovo', 'PINGO DOCE', 'MERCADO DE SANTOS', 'SUPER MERCADO SANTOS',
        ' GLEBA NOSSA', ' AUCHAN',
    ],
    'home': [
        ' A LINHA DA VIZINHA', ' TIGER CAMPO OURIQUE', ' SP KINFILL CAREX', 
        ' AREAS PORTUGAL SA', ' MUJI CHIADO', ' Saudade Flores', ' TIGER CHIADO', 
        ' TIGER COLOMBO', ' POPPIES DESIGN STORE', ' Aesop Cosmetics Spain', 
        ' BANEMA STUDIO LISBOA', ' ZaraHome.com', ' Flexispot GmbH',
        ' ARTURAS LUIS LDA', 'IKEA', 'FinnishDesignShopCOM', 'ZARA HOME',
    ],
    'misc': [
        ' SUMUP *SALTED BOOKS LISB', ' PCDIGA', ' EL CORTE INGLES', ' F CALOUSTE GULBENKIA', 
        ' FARMACIA CONDE BARAO', ' TICKETLINE SA', ' Farmacia Central', 
        ' CARLOS LOPES PEREIRA', ' FCALOUSTE GULBENKIAN', ' FUND CALOUSTE GULBEN', 
        ' Livraria Snob', ' FARMACIA ACOREANA', ' MAAT MUSEU ARTE', ' TERMAS DO ESTORIL', 
        ' WWW TICKETONE IT', ' CARMENCITA FILM LAB', ' CAIXAFORUM MADRID', 
        ' FUND.COLECC.THYSSEN BORNE', ' FUNDACION COLECCION THYSS', 
        ' MONDADORI BOOKSTORE GALL', ' SP ASTROPAD', ' DECATHLON LISBOA', 
        ' Liberty Books', ' LEGO CHIADO', ' FNAC LOJA DO CHIADO', 
        ' OPTICA CENTRAL CALHA', ' GAGA', ' PAPELARIA PLANETA', 'SP KEYGEM'
    ],
    'pet': [' PATINHAS MIMADAS', ' H VET SAO BENTO LDA', ' ZOOPLUS'],
    'amazon': ['Amazon', 'AMAZON', 'AMZN'],
    'subscriptions': [' BB-SAMSUNG', 'MIDJOURNEY', 'WOO', 'Netflix.com', ' ADOBE *ADOBE',
                      ' SUNSETFIT', ' ADOBE *INDESIGN',
    ],
    'transport': [' BIGLIETTERIA MIDATICKET', ' CLESS TICKET ATM MILANO','UBER', ],
    'travel': [
        ' EASYJET AIR K78SMJH', ' EASYJET AIR K78SGKL', ' FlyTAP', ' EASYJET AIR K7DMXW3', 
        ' AIRBNB * HMJM4T2ZX9', ' EASYJET AIR K7RM5JR', ' EASYJET 000K7RM5JR', ' rentalcars.com',
    ]
}


def assign_category(row):
    for category, keywords in category_2_keywords.items():
        if any(keyword in row['details'] for keyword in keywords):
            return category
    return 'unallocated'

# Apply the function to create the 'category' column
df_euro.loc[df_euro['category_1'] == 'retail', 'category_2'] = df_euro[df_euro['category_1'] == 'retail'].apply(assign_category, axis=1)


In [10]:
#check unallocated pos
df_euro[df_euro['category_2'] == 'unallocated']['pos'].unique()
#df_euro[df_euro['pos'].isna()]['details'].unique()

array([' MUSAS ARISTOCRATAS', ' BCM BRICOLAGE SA', ' ASSOCIACAO PROMOCAO',
       ' COTIDIANO COMERCIO', ' TINTURARIA SILSOL', ' MBD',
       ' Revolut**4128*', ' DICE.FM', ' BATCHI LDA', ' CASTRO - GARRETT',
       ' SUMUP *ATRAVES DO FIRMAM', ' CAMPO OURIQUE', ' IDASFEST',
       ' SISTEMA J', ' TORRE IGREJA CASTELO', ' XAFARIX', ' MEX FACTORY',
       ' RELAY VIRGIN LISBOA', ' PAUL', ' ITUGUERRA S.L.',
       ' HELADOS MAISON GLACE ESPA', ' LA ESQUINA DE RECOLETO',
       ' LA ALQUIMIA', ' GALIPPO', ' MAGPIE', " SQ *PINK'S", ' Velazquez',
       ' LEITARIA NITA', ' FNM*TRENORD TVM 2039-', ' Coop-2448 Viganell',
       ' Stazione Piccadilly SA', ' ART COMPUTER', ' IUTA BISTROT',
       ' BITRENTA SRL', ' HB SERVIZI SRL', ' FOOD TRUCK DISTRICT',
       ' BEIT EVENTS SRL', ' SUMUP *GELATERIE MILANES',
       ' T3M SNC DI ANDREA SADERI', ' GIANNASI CASSA MOBILE',
       ' SUMUP *PIAZZALE EGEO S.R', ' BLUE LION FOOD SPA',
       ' OFFICINA PROFUMO FARMACE', ' CIUMBIA SRL', ' PINACOTECA B

In [18]:
grouped_df = df_euro.groupby('category_2')['sum'].sum().reset_index().sort_values(by='sum')
grouped_df

#filtered_df = df_euro[df_euro['category_2'] == 'unallocated']
#grouped_df = filtered_df.groupby('pos')['sum'].sum().reset_index().sort_values(by='sum')
#grouped_df.head(30)

Unnamed: 0,category_2,sum
11,unallocated,-10211.94
4,grocery,-8179.99
1,clothes,-7402.48
3,food_order,-6800.85
5,home,-6107.16
6,misc,-6035.75
0,amazon,-4907.96
2,eat_out,-4495.24
10,travel,-1830.57
7,pet,-1091.05


In [30]:
#df_euro[df_euro['category_2'] == 'clothes'].sort_values(by='sum').head(10)

monthly_sum = df_euro.groupby([df_euro['record_dt'].dt.to_period('M'), 'category_2'])['sum'].sum().reset_index()

# Pivot the DataFrame to have months as columns
pivot_table = monthly_sum.pivot(index='category_2', columns='record_dt', values='sum').fillna(0)

# Convert the PeriodIndex to a string for clarity
pivot_table.columns = pivot_table.columns.astype(str)

# Display the pivot table
pivot_table

record_dt,2024-01,2024-02,2024-03,2024-04,2024-05,2024-06,2024-07,2024-08,2024-09,2024-10,2024-11,2024-12
category_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
amazon,-1316.18,-590.74,-196.99,-581.5,-4.99,-139.41,-620.91,-738.91,-4.99,-157.82,-148.04,-407.48
clothes,-167.9,-783.99,-983.64,-1261.63,-588.35,-634.08,-566.06,-200.59,-669.03,-232.8,-330.29,-984.12
eat_out,-453.5,-512.92,-523.69,-598.12,-518.39,-313.1,-363.1,-468.75,-206.2,-140.25,-362.42,-34.8
food_order,-696.91,-480.97,-641.93,-560.42,-725.76,-592.76,-410.32,-392.44,-860.56,-682.87,-311.59,-444.32
grocery,-506.22,-798.68,-741.63,-868.12,-662.45,-764.29,-913.45,-665.66,-497.68,-702.5,-490.53,-568.78
home,-2747.38,-805.5,-90.23,-500.6,-57.0,-42.1,-86.08,-501.21,-391.46,-518.92,-283.61,-83.07
misc,-790.23,-827.09,-358.84,-365.88,-1806.98,-480.81,-240.09,-427.46,-433.37,-103.52,-38.0,-163.48
pet,-440.66,0.0,0.0,-185.32,0.0,0.0,0.0,-113.77,-240.7,-110.6,0.0,0.0
subscriptions,-35.0,-50.0,-72.13,-72.13,-83.63,-72.13,-57.13,-56.14,-56.14,-145.5,-60.04,-131.66
transport,-14.11,-4.99,-56.67,-18.98,-109.64,-62.26,-78.79,-129.76,-179.57,-56.98,-113.02,-67.88


### Preparing TENGE table

In [19]:
#renaming headers
df_tenge.columns = [' '.join(col).strip() for col in df_tenge.columns.values]

df_tenge = df_tenge.rename(
    columns={
        df_tenge.columns[0]: 'record_dt',
        df_tenge.columns[1]: 'transaction_dt',
        df_tenge.columns[2]: 'details',
        df_tenge.columns[3]: 'sum_in_currency',
        df_tenge.columns[4]: 'currency',
        df_tenge.columns[5]: 'fee',
        df_tenge.columns[6]: 'total_sum',
        df_tenge.columns[7]: 'cashback',
    }
    
)
df_tenge.head(10)
#changing types
#df_euro['id'] = df_euro['id'].astype('int')

df_tenge['sum_in_currency'] = df_tenge['sum_in_currency'].str.replace(r'\s+', '', regex=True)
df_tenge['sum_in_currency'] = pd.to_numeric(df_tenge['sum_in_currency'])

df_tenge['total_sum'] = df_tenge['total_sum'].str.replace(r'\s+', '', regex=True)
df_tenge['total_sum'] = pd.to_numeric(df_tenge['total_sum'])

df_tenge['fee'] = df_tenge['fee'].str.replace(r'\s+', '', regex=True)
df_tenge['fee'] = pd.to_numeric(df_tenge['fee'])

df_tenge['cashback'] = df_tenge['cashback'].str.replace(r'\s+', '', regex=True)
df_tenge['cashback'] = pd.to_numeric(df_tenge['cashback'])

df_tenge['record_dt'] = pd.to_datetime(df_tenge['record_dt'], format='%d.%m.%Y')
df_tenge['transaction_dt'] = pd.to_datetime(df_tenge['transaction_dt'], format='%d.%m.%Y')

In [20]:
df_tenge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   record_dt        241 non-null    datetime64[ns]
 1   transaction_dt   241 non-null    datetime64[ns]
 2   details          241 non-null    object        
 3   sum_in_currency  241 non-null    float64       
 4   currency         241 non-null    object        
 5   fee              241 non-null    float64       
 6   total_sum        241 non-null    float64       
 7   cashback         117 non-null    float64       
dtypes: datetime64[ns](2), float64(4), object(2)
memory usage: 15.2+ KB


In [21]:
df_tenge.head(3)

Unnamed: 0,record_dt,transaction_dt,details,sum_in_currency,currency,fee,total_sum,cashback
0,2024-01-01,2024-01-01,Перевод с карты 446375******1579 на карту 462818******4620 через систему BCC.KZ. ИИН получателя - 920404050799. Получатель - ГЕРМАН ВЛАДИМИРОВИЧ КОРЕНБЛЮМ. КНП 119 - Прочие безвозмездные переводы. Безналичный перевод. Плательщик: Самойленко Ксения Владимировна,513400.0,KZT,0.0,513400.0,
1,2024-01-01,2024-01-01,Перевод с карты 446375******8122 на карту 462818******4620 через систему BCC.KZ. ИИН получателя - 920404050799. Получатель - ГЕРМАН ВЛАДИМИРОВИЧ КОРЕНБЛЮМ. КНП 119 - Прочие безвозмездные переводы. Безналичный перевод. Плательщик: Соколов Александр Сергеевич,308000.0,KZT,0.0,308000.0,
2,2024-01-02,2024-01-02,Перевод с карты 446375******2183 на карту 462818******4620 через систему BCC.KZ. ИИН получателя - 920404050799. Получатель - ГЕРМАН ВЛАДИМИРОВИЧ КОРЕНБЛЮМ. КНП 119 - Прочие безвозмездные переводы. Безналичный перевод. Плательщик: Кудреватых Александр Валерьевич,225682.0,KZT,0.0,225682.0,
