# Análisis exploratorio

In [1]:
import pandas as pd
import plotly.express as px
import glob

pd.set_option("display.max_colwidth", 3000)

In [2]:
df = pd.concat([pd.read_csv(csv, sep=";") for csv in glob.glob("*.csv")])
# df = [pd.read_csv(csv, sep=";") for csv in glob.glob("*.csv")][1]

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   amount     20000 non-null  float64
 1   date       20000 non-null  object 
 2   title      20000 non-null  object 
 3   Categoría  20000 non-null  object 
dtypes: float64(1), object(3)
memory usage: 781.2+ KB


In [4]:
all_strings = all([isinstance(obj, str) for obj in df.date.iloc])
print(f"Column 'date': {all_strings=}")

Column 'date': all_strings=True


In [5]:
all_strings = all([isinstance(obj, str) for obj in df.title.iloc])
print(f"Column 'title': {all_strings=}")

Column 'title': all_strings=True


In [6]:
df.rename(columns={"Categoría": "category"}, inplace=True)
all_strings = all([isinstance(obj, str) for obj in df.category.iloc])
print(f"Column 'category': {all_strings=}")

Column 'category': all_strings=True


#### Columna 'category'

In [23]:
px.histogram(df.category)

#### Columna 'amount'

In [8]:
px.histogram(df[df.amount.between(-5_000, 5_000)].amount, nbins=10000)

#### Columna 'date'

In [9]:
df['datetime'] = pd.to_datetime(df.date)

df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute
df['second'] = df['datetime'].dt.second
df['utc_offset'] = df['datetime'].apply(lambda x: x.utcoffset().total_seconds())

In [10]:
seasons_map = {
    1 : 'winter',
    2 : 'winter',
    3 : 'spring',
    4 : 'spring',
    5 : 'spring',
    6 : 'summer',
    7 : 'summer',
    8 : 'summer',
    9 : 'fall',
    10 : 'fall',
    11 : 'fall',
    12 : 'winter',
}
df['season'] = df.month.replace(seasons_map)

In [11]:
df = df.drop(["date"], axis=1)

In [12]:
px.histogram(df[['season', 'month', 'amount']], x='month', y='amount', color='season')

In [13]:
px.histogram(df[['datetime', 'amount']].groupby('datetime', as_index=False).median(),x='datetime', y='amount', nbins = 365)

In [14]:
px.histogram(df[['day', 'season', 'amount']], x='day', y='amount', color='season')

In [15]:
px.histogram(df[['day', 'season', 'amount']], x='day', y=None, color='season')

In [16]:
px.histogram(df[['day', 'amount', 'hour']], x='hour', y='amount')

In [17]:
px.histogram(df[['day', 'amount', 'hour']], x='hour', y=None)

#### Columna 'title'

In [18]:
months_title_dict = dict()
for month in df['month'].unique():
    titles_for_month = set(df[df.month == month].title)
    months_title_dict[month] = titles_for_month

In [19]:
common_titles = set.intersection(*months_title_dict.values())

print("Titles common across all months:")
print(common_titles)

Titles common across all months:
set()


In [20]:
for title in df['title'].iloc[:20]:
    print(title)

COMPRA TARJ. 5402XXXXXXXX1026 562-GADIS XINZO DE LIMIA-XINZO DE LIMI
COMPRA APPLE PAY EN CONDIS BUSTAMAN. MADRID. TARJ. :*401403
767001220289 SUPER.FROIZ URUGUAY \VIGO\ES1710242023
TRANSF. - RODRIGUEZ MONTALVO MARIA NIEVES
S/ORD.TRANSF. - EOI
Mandarina ho
TRANSF. - Begona Vicente
REINTEGRO. ATM:00492211. NARON. TARJ. :*320546
Traspaso recibido Cuenta Nómina
ALQUILER MES DE DICIEMBRE
TELEFONOS TELEFONICA DE ESPANA. S.A.U. FIJO965451558.ENE
COMPRA EN RESTAURANTE BOC. A CORU¥A. TARJ. :*446134
COMPRA
COMPRAS CON TARJETA IBERCAJA-M.CHOCO ZIELO MADR
Pago en FARMACIA LDA.SUSANA RABELVERA ES
RECIBO STEP UP SCHOOL S.L. Nº RECIBO 0049 4370 755 BBBYMWC REF. MANDATO 010000000610 490
Mytaxi.com*1a6ezd
IBERIA EXPRESS
Transferencia emitida a natalia muñoz olza Cata Vinos Andres Zancada
767001220289 ESTANCO CANIDO RU.CANIDO. 227
