# Muestreo de transacciones bancarias

- Python 3.10.11

- Requirements:
    - numpy==1.26.2
    - pandas==2.1.3
    - python-dateutil==2.8.2
    - pytz==2023.3.post1
    - six==1.16.0
    - tzdata==2023.3
    - plotly (imported below as `plotly.express`; version not recorded here)


In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import glob
import datetime

# Let long text cells display up to 3000 characters before being truncated.
pd.options.display.max_colwidth = 3000

In [2]:
# Load every semicolon-separated CSV in the working directory into one frame.
# The paths are sorted because glob returns them in arbitrary, OS-dependent
# order, which would otherwise make the row order differ between runs.
# NOTE(review): concat keeps each file's own row labels, so index values repeat
# across files; pass ignore_index=True if unique labels are ever needed.
df = pd.concat(
    [pd.read_csv(csv_path, sep=";") for csv_path in sorted(glob.glob("*.csv"))]
)

In [3]:
# Print the frame's summary: index range, column names, non-null counts, dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   amount     20000 non-null  float64
 1   date       20000 non-null  object 
 2   title      20000 non-null  object 
 3   Categoría  20000 non-null  object 
dtypes: float64(1), object(3)
memory usage: 781.2+ KB


In [4]:
# Check that every value in the 'date' column is a Python str.
# The Series is iterated directly (one pass over its values) instead of going
# through `.iloc`, which performs a separate positional lookup for each row.
# A generator lets all() stop at the first non-string value.
all_strings = all(isinstance(obj, str) for obj in df["date"])
print(f"Column 'date': {all_strings=}")

Column 'date': all_strings=True


In [5]:
# Check that every value in the 'title' column is a Python str.
# The Series is iterated directly (one pass over its values) instead of going
# through `.iloc`, which performs a separate positional lookup for each row.
# A generator lets all() stop at the first non-string value.
all_strings = all(isinstance(obj, str) for obj in df["title"])
print(f"Column 'title': {all_strings=}")

Column 'title': all_strings=True


In [6]:
# Rename the 'Categoría' column to 'category', then check that every value in
# it is a Python str. The frame is rebound rather than mutated with
# inplace=True, and the Series is iterated directly instead of through
# `.iloc`, which performs a separate positional lookup for each row.
df = df.rename(columns={"Categoría": "category"})
all_strings = all(isinstance(obj, str) for obj in df["category"])
print(f"Column 'category': {all_strings=}")

Column 'category': all_strings=True


### Date

In [7]:
# Parse the textual 'date' column into timestamps.
df["datetime"] = pd.to_datetime(df["date"])

# Expose each calendar/clock component of the timestamp as its own column.
for component in ("year", "month", "day", "hour", "minute", "second"):
    df[component] = getattr(df["datetime"].dt, component)

# UTC offset of each timestamp, expressed in seconds.
# NOTE(review): utcoffset() is None for timezone-naive timestamps, so this
# line presumes the parsed dates carry timezone information.
df["utc_offset"] = df["datetime"].apply(
    lambda stamp: stamp.utcoffset().total_seconds()
)

In [8]:
# The raw text column is no longer needed once 'datetime' has been derived.
df = df.drop(columns=["date"])

In [21]:
# List the numeric columns through the public dtype selector rather than the
# private `_get_numeric_data` helper, which is not part of the supported API.
# NOTE(review): the two may treat boolean columns differently; this frame is
# not expected to contain any.
df.select_dtypes(include="number").columns

Index(['amount', 'year', 'month', 'day', 'hour', 'minute', 'second',
       'utc_offset'],
      dtype='object')

In [24]:
# Build one histogram of summed 'amount' per time component, coloured by year,
# with one bin per distinct value of that component.
# The components are named explicitly: the previous positional slice [2:-1]
# over a private numeric-column helper selected these same five columns, but
# would silently pick others if a numeric column were added or reordered.
datetime_figs = [
    px.histogram(
        df[[column, "amount", "year"]],
        x=column,
        y="amount",
        nbins=len(df[column].unique()),
        color="year",
    )
    for column in ("month", "day", "hour", "minute", "second")
]

In [27]:
# Amount aggregated per month (plotly sums y within each bin), one colour per year.
sum_amount_per_months_fig = px.histogram(
    data_frame=df.loc[:, ["year", "month", "amount"]],
    x="month",
    y="amount",
    color="year",
)
sum_amount_per_months_fig

In [30]:
# Number of rows per month, one colour per year. No y is passed (its default
# is None), so the histogram counts rows instead of aggregating a column.
count_amount_per_months_fig = px.histogram(
    data_frame=df.loc[:, ["year", "month", "amount"]],
    x="month",
    color="year",
)
count_amount_per_months_fig

In [31]:
# Amount aggregated per day of month (plotly sums y within each bin), one
# colour per month.
sum_amount_per_days_fig = px.histogram(
    data_frame=df.loc[:, ["day", "month", "amount"]],
    x="day",
    y="amount",
    color="month",
)
sum_amount_per_days_fig

In [32]:
# Number of rows per day of month, one colour per month. No y is passed (its
# default is None), so the histogram counts rows.
count_amount_per_days_fig = px.histogram(
    data_frame=df.loc[:, ["day", "month", "amount"]],
    x="day",
    color="month",
)
count_amount_per_days_fig

In [34]:
# Amount aggregated per hour of day (plotly sums y within each bin), one
# colour per day of month.
sum_amount_per_hour_fig = px.histogram(
    data_frame=df.loc[:, ["day", "amount", "hour"]],
    x="hour",
    y="amount",
    color="day",
)
sum_amount_per_hour_fig

In [35]:
# Amount aggregated per hour of day (plotly sums y within each bin), one
# colour per month.
# NOTE(review): this rebinds sum_amount_per_hour_fig, a name that the per-day
# variant in an earlier cell also assigns.
sum_amount_per_hour_fig = px.histogram(
    data_frame=df.loc[:, ["month", "amount", "hour"]],
    x="hour",
    y="amount",
    color="month",
)
sum_amount_per_hour_fig

In [36]:
# Number of rows per hour of day, one colour per day of month. No y is passed
# (its default is None), so the histogram counts rows.
count_amount_per_hour_fig = px.histogram(
    data_frame=df.loc[:, ["day", "amount", "hour"]],
    x="hour",
    color="day",
)
count_amount_per_hour_fig

In [38]:
# Number of rows per hour of day, one colour per month. No y is passed (its
# default is None), so the histogram counts rows.
# NOTE(review): this rebinds count_amount_per_hour_fig, a name that the
# per-day variant in an earlier cell also assigns.
count_amount_per_hour_fig = px.histogram(
    data_frame=df.loc[:, ["month", "amount", "hour"]],
    x="hour",
    color="month",
)
count_amount_per_hour_fig