In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tools import clean_data, set_new_names, set_category
import os
import glob


In [77]:
# Folder that holds the exported transaction lists, relative to the notebook.
# The path is assembled with os.path.join instead of a literal backslash:
# '..\data' and '**\*.csv' contain the invalid escape sequences '\d' and '\*',
# and a hard-coded backslash is only a path separator on Windows.
folder_path = os.path.join('..', 'data')

# Set to True to search sub-folders as well
include_subfolders = True

# Glob pattern for the .csv files; '**' only descends into sub-folders when
# glob is called with recursive=True
if include_subfolders:
    pattern = os.path.join('**', '*.csv')
else:
    pattern = '*.csv'

# Prefix the pattern with the folder path
search_pattern = os.path.join(folder_path, pattern)

In [78]:
# Show the glob pattern that will be searched
search_pattern

'..\\data\\**\\*.csv'

In [79]:
# Collect every matching export. glob returns paths in arbitrary order, so the
# list is sorted: the files are concatenated in this order and the later
# keep-first de-duplication depends on which copy of a row comes first.
csv_files = sorted(glob.glob(search_pattern, recursive=include_subfolders))

In [80]:
# List the files that matched the pattern
csv_files

['..\\data\\Lista_transakcji_nr_0165421859_200723.csv',
 '..\\data\\Lista_transakcji_nr_0178969007_300124.csv',
 '..\\data\\Lista_transakcji_nr_0178969233_300124.csv',
 '..\\data\\Lista_transakcji_nr_0178969327_300124.csv',
 '..\\data\\Lista_transakcji_nr_0178969442_300124.csv']

In [81]:
# Read every export (semicolon-separated, decimal comma, windows-1250) and
# stack the rows into a single DataFrame with a fresh 0..n-1 index.
# NOTE(review): later output shows 'ďż˝' inside some contractor names, which
# suggests at least one export is not really windows-1250 — confirm per file.
df = pd.concat(
    [
        pd.read_csv(csv_path, sep=';', decimal=',', encoding='windows-1250')
        for csv_path in csv_files
    ],
    ignore_index=True,
)

In [82]:
# Column names, non-null counts and dtypes of the combined raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16816 entries, 0 to 16815
Data columns (total 28 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Data transakcji                     16816 non-null  object 
 1   Data księgowania                    667 non-null    object 
 2   Dane kontrahenta                    16816 non-null  object 
 3   Tytuł                               686 non-null    object 
 4   Nr rachunku                         15001 non-null  object 
 5   Nazwa banku                         2558 non-null   object 
 6   Szczegóły                           667 non-null    object 
 7   Nr transakcji                       16415 non-null  object 
 8   Kwota transakcji (waluta rachunku)  16422 non-null  float64
 9   Waluta                              16422 non-null  object 
 10  Kwota blokady/zwolnienie blokady    394 non-null    float64
 11  Waluta.1                            394 n

In [83]:
# Keep only the columns that contain at least one non-missing value
df = df.dropna(axis='columns', how='all')

In [84]:
# Narrow the frame to the seven source columns that are kept for the analysis
df = df.loc[:, [
    'Data transakcji',
    'Dane kontrahenta',
    'Tytuł',
    'Nr rachunku',
    'Szczegóły',
    'Nr transakcji',
    'Kwota transakcji (waluta rachunku)',
]]

In [85]:
# English column names, assigned positionally: the order must match the
# column selection made just before this cell.
df.columns = [
    'date',                # Data transakcji
    'contractor',          # Dane kontrahenta
    'title',               # Tytuł
    'account_number',      # Nr rachunku
    'details',             # Szczegóły
    'transaction_number',  # Nr transakcji
    'amount',              # Kwota transakcji (waluta rachunku)
]

In [86]:
def clean_data(df):
    """Return a de-duplicated, trimmed copy of the renamed transaction frame.

    Note: this definition replaces the ``clean_data`` imported from ``tools``
    at the top of the notebook; from here on the name refers to this function.

    Steps, in order:
      1. drop rows that are exact duplicates across all columns;
      2. drop rows missing 'contractor', 'date', 'transaction_number' or
         'amount';
      3. drop the 'details', 'account_number' and 'title' columns;
      4. strip surrounding whitespace from 'contractor' and
         'transaction_number', and remove apostrophes from
         'transaction_number';
      5. drop the duplicates that steps 3-4 may have produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'date', 'contractor', 'title',
        'account_number', 'details', 'transaction_number' and 'amount'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns 'date', 'contractor',
        'transaction_number' and 'amount' (original row labels preserved).

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    required = ['contractor', 'date', 'transaction_number', 'amount']
    cleaned = (
        df.drop_duplicates()
        .dropna(subset=required)
        # 'title' is dropped without being stripped first: the value is thrown
        # away anyway, and .str fails on an all-NaN (float) column.
        .drop(columns=['details', 'account_number', 'title'])
    )
    cleaned = cleaned.assign(
        contractor=cleaned['contractor'].str.strip(),
        # Literal (non-regex) removal of apostrophes from the transaction id
        transaction_number=(
            cleaned['transaction_number']
            .str.strip()
            .str.replace("'", "", regex=False)
        ),
    )
    # Rows that differed only in the dropped or untrimmed fields are now equal
    return cleaned.drop_duplicates()

# Uses the clean_data defined just above (it shadows the one imported from
# tools); a copy is passed so the renamed frame `df` stays untouched.
df_clean = clean_data(df.copy())

In [87]:
# Preview the first rows of the cleaned frame
df_clean.head()

Unnamed: 0,date,contractor,transaction_number,amount
0,2023-07-20,Dawid Ludwa,202320197209545076,-50.0
3,2023-07-20,"ZUS Centrum Obsługi Świadczeń dla R, odzin, ul...",202320197203279992,500.0
4,2023-07-19,"LUDWA DAWID JAN, i LUDWA ANNA, EDMUNDA CIEĆKIE...",202320097202882525,1000.0
11,2023-07-18,LIDL LUKASINSKIEGO Nowy Sacz POL,202320197304224746,-56.2
12,2023-07-18,PIEKARNIA ORACZ A. ORACZ NOWY SACZ,202320197301037340,-40.57


In [88]:
# Remove every row whose transaction_number occurs more than once.
# keep=False discards ALL copies of a repeated number, not just the extras.
# NOTE(review): confirm that losing both copies is intended; keep='first'
# would retain one row per transaction number.
df_clean = df_clean.drop_duplicates(subset=['transaction_number'], keep=False)

In [89]:
# Continue on an independent copy; `df` is rebound from the raw frame to the cleaned one
df = df_clean.copy()

In [90]:
# Check which columns and dtypes remain after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8123 entries, 0 to 16811
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   date                8123 non-null   object 
 1   contractor          8123 non-null   object 
 2   transaction_number  8123 non-null   object 
 3   amount              8123 non-null   float64
dtypes: float64(1), object(3)
memory usage: 317.3+ KB


In [91]:
# Convert the 'date' column from object (strings) to datetime64[ns]
df['date'] = df['date'].astype('datetime64[ns]')

In [92]:
# transaction_number has served its purpose for de-duplication; keep the rest
df = df.loc[:, ['date', 'contractor', 'amount']]

In [93]:
# Only 'contractor' changes its name (to 'details'); 'date' and 'amount' stay as they are
df = df.rename(columns={'contractor': 'details'})

In [94]:
# Verify the datetime conversion and the renamed 'details' column
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8123 entries, 0 to 16811
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   date     8123 non-null   datetime64[ns]
 1   details  8123 non-null   object        
 2   amount   8123 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 253.8+ KB


In [95]:
# Enrich the frame using the helpers imported from `tools`.
# Presumably set_new_names adds 'details_2' and set_category adds 'category'
# (inferred from the columns used below) — TODO confirm against tools.py.
df = set_new_names(df.copy())
# Assign the filled column back instead of df[col].fillna(..., inplace=True):
# that chained in-place call operates on a temporary object, raises a
# FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['details_2'] = df['details_2'].fillna("not set")
df = set_category(df.copy())

# Missing categories become 'other'; labels are lower-cased before the column
# is converted to the categorical dtype.
df['category'] = df['category'].fillna('other').str.lower().astype('category')
df['details'] = df['details'].str.lower()

# Calendar helpers derived from the transaction date
df['y_m'] = df['date'].dt.to_period('M')  # monthly period
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

In [96]:
# Preview the frame with the derived columns
df.head()

Unnamed: 0,date,details,amount,details_2,category,y_m,year,month
0,2023-07-20,dawid ludwa,-50.0,not set,other,2023-07,2023,7
11,2023-07-18,lidl lukasinskiego nowy sacz pol,-56.2,lidl,grocery store,2023-07,2023,7
12,2023-07-18,piekarnia oracz a. oracz nowy sacz,-40.57,oracz,grocery store,2023-07,2023,7
13,2023-07-18,jmp s.a. biedronka 4028 nowy sacz,-9.87,biedronka,grocery store,2023-07,2023,7
14,2023-07-18,jmp s.a. biedronka 4028 nowy sacz,-57.81,biedronka,grocery store,2023-07,2023,7


In [97]:
# Drop rows whose (lower-cased) description contains any of these fragments.
# The pattern is a regex alternation matched as a substring, so a short
# fragment such as 'anna' also hits longer words that contain it.
excluded = df['details'].str.contains('ludwa|anna|dawid|inter-bud|currency one|unikorona|katarzyna predka|ext ltd')
df = df[~excluded]

In [98]:
# Expenses are the rows with a negative amount; copy so later edits do not touch df
df_expenses = df.loc[df['amount'] < 0].copy()

In [99]:
# Range of the expense amounts as (smallest, largest).
# Series.min()/.max() are vectorised and skip NaN; the built-in min()/max()
# iterate element by element and give order-dependent results with NaN.
df_expenses['amount'].min(), df_expenses['amount'].max()

(-6560.0, -0.02)

In [100]:
# The 50 largest expenses: amounts are negative, so ascending order puts the biggest spend first
df_expenses.sort_values('amount').head(50)

Unnamed: 0,date,details,amount,details_2,category,y_m,year,month
1061,2023-10-30,kancelaria notarialna s krakow po,-6560.0,not set,other,2023-10,2023,10
4518,2021-08-04,revolut 5883 internet lt,-3000.0,revolut,other,2021-08,2021,8
221,2023-05-31,ryanair k67e6r0 00000k67e6 irl,-2973.4,ryanair,transport vacation,2023-05,2023,5
4711,2021-07-01,jysk sp z oo pp440 nowy sacz pl,-2915.0,jysk,retail,2021-07,2021,7
7502,2020-08-28,amso krakow czarnowiejska krakow 30,-2459.02,not set,other,2020-08,2020,8
2870,2022-09-06,ryanair dublin irl,-2456.71,ryanair,transport vacation,2022-09,2022,9
2324,2023-01-10,decathlon sp. z o.o. warszawa pol,-2449.0,decathlon,other,2023-01,2023,1
985,2023-11-18,"urzďż˝d miasta krakowa wydziaďż˝ skarbu, miasta",-2438.0,not set,other,2023-11,2023,11
4486,2021-08-11,agata nowy sacz nowy sacz pl,-2425.11,not set,other,2021-08,2021,8
7370,2020-10-01,"zobacz ďż˝ycie - krystian koc, ul. staromiejsk...",-2400.0,not set,other,2020-10,2020,10
