In [28]:
import pandas as pd
import numpy as np
from scipy import stats

import plotly.express as px
import plotly

import glob
import datetime
from pprint import pprint

pd.set_option("display.max_colwidth", 3000)

In [29]:
df = pd.concat([pd.read_csv(csv, sep=";") for csv in glob.glob("*.csv")])

df['datetime'] = pd.to_datetime(df.date)

# df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month.apply(lambda x: str(x))
df['weekday'] = df['datetime'].dt.weekday.apply(lambda x: str(x))
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
# df['minute'] = df['datetime'].dt.minute
# df['second'] = df['datetime'].dt.second

seasons_map = {
    1 : 'winter',
    2 : 'winter',
    3 : 'spring',
    4 : 'spring',
    5 : 'spring',
    6 : 'summer',
    7 : 'summer',
    8 : 'summer',
    9 : 'fall',
    10 : 'fall',
    11 : 'fall',
    12 : 'winter',
}
df['season'] = df.month.replace(seasons_map)

df = df.drop(["date"], axis=1)

df = df.rename(columns={"Categoría": "category"})

df = df.drop(['title', 'datetime'], axis=1)

df.head()

Unnamed: 0,amount,category,month,weekday,day,hour,season
0,-34.13,compras,12,2,27,20,12
1,-8.37,compras,3,6,11,0,3
2,-27.77,compras,11,5,25,5,11
3,2624.84,transferencias,10,0,30,17,10
4,-31.15,transferencias,12,6,31,9,12


## Criterios
- La distribución temporal de la muestra debe asemejarse a la de la población.
- Estadísticos descriptivos de `amount` en la muestra se asemejan a los de la población.
- Los clusterings aplicados a la muestra y a la población se asemejan.
- La distribución de tokens de `title` de la muestra debe ser similar a la de la población.
- [OPCIONAL] La distribución de `category` de la muestra debe ser similar a la de la población.

In [30]:
DROP_CATEGORY = False

if DROP_CATEGORY:
    df = df.drop('category', axis=1)

In [23]:
from evaluation.core import Evaluation
from evaluation.evaluations import DistributionComparisson
from evaluation.sampling import RandomSampling

sampling_strategies = [
    RandomSampling(),
]

evaluation_strategy = DistributionComparisson()

evaluation = Evaluation(df = df, sampling_strategies = sampling_strategies, evaluation_strategy = evaluation_strategy)

In [24]:
e = evaluation.run(random_state=42)

In [40]:
e

Unnamed: 0,amount,category,month,weekday,day,hour,minute,second,season,SampleSize,SamplingStrategy
0,"(0.5527, 0.8946000000000001)","(6.262840359477124, 0.9594253309769336)","(7.65133410467128, 0.7441543590868026)","(5.798726359619306, 0.44611055588459114)","(0.80165, 0.39670000000000005)","(0.68665, 0.6267)","(0.85085, 0.2983)","(0.85165, 0.29669999999999996)","(7.65133410467128, 0.7441543590868026)",1,RandomSampling
1,"(0.16760098039215687, 0.10125725619608605)","(12.138213904928344, 0.5952040705964994)","(14.350332577462373, 0.21421062647898423)","(7.464496282873659, 0.2800160460030563)","(0.07850392156862745, 0.8873409779569602)","(0.11076176470588234, 0.5228685464007132)","(0.1332441176470589, 0.29843050095867085)","(0.09904999999999997, 0.6622278463454316)","(14.350332577462373, 0.21421062647898423)",51,RandomSampling
2,"(0.08904257425742573, 0.38355127366763875)","(14.211156131059631, 0.4341011829348257)","(7.723589640540087, 0.7378294377377022)","(11.261972963881401, 0.08060952436670397)","(0.05866584158415844, 0.8612303013031003)","(0.061434158415841544, 0.8221910368126253)","(0.12895049504950495, 0.06562581866160333)","(0.04887821782178217, 0.9612137963535062)","(7.723589640540087, 0.7378294377377022)",101,RandomSampling
3,"(0.059289735099337756, 0.6452682659001107)","(18.380694158107193, 0.18998799251413134)","(5.284547670221443, 0.9166067250116605)","(14.736469161849048, 0.0224084592273876)","(0.03975596026490058, 0.9641663074665314)","(0.04766456953642384, 0.8686511653814791)","(0.10744370860927155, 0.058092248813601643)","(0.07714337748344369, 0.31748069193250117)","(5.284547670221443, 0.9166067250116605)",151,RandomSampling
4,"(0.07011243781094528, 0.2691559840669008)","(12.46127689975362, 0.5693105514889628)","(4.998523941103338, 0.9312380736819904)","(11.079282000590487, 0.08595681533599443)","(0.03343308457711447, 0.9739103470255455)","(0.04316517412935328, 0.8364127221349353)","(0.08225497512437807, 0.12797879507792287)","(0.05796517412935326, 0.4973954209370086)","(4.998523941103338, 0.9312380736819904)",201,RandomSampling
...,...,...,...,...,...,...,...,...,...,...,...
395,"(0.00045453141613077674, 1.0)","(0.0817777169741483, 0.9999999999999634)","(0.08459989027060903, 0.9999999999066506)","(0.007373709888122401, 0.9999999916705538)","(0.00040913371474871063, 1.0)","(0.0004680193407928246, 1.0)","(0.0008467292795301118, 1.0)","(0.00036243481342718153, 1.0)","(0.08459989027060903, 0.9999999999066506)",19751,RandomSampling
396,"(0.0004406317862734044, 1.0)","(0.08662070188908473, 0.9999999999999454)","(0.05791856767365221, 0.9999999999882517)","(0.008583861802741073, 0.9999999868656346)","(0.00021715822433210352, 1.0)","(0.0003065047219837913, 1.0)","(0.0006315994141710135, 1.0)","(0.00027553911418615185, 1.0)","(0.05791856767365221, 0.9999999999882517)",19801,RandomSampling
397,"(0.00038530804493475124, 1.0)","(0.06453233512211533, 0.999999999999993)","(0.06654612466504368, 0.9999999999748772)","(0.004566773728416896, 0.9999999980191867)","(0.00028056269205578577, 1.0)","(0.0003790287642939649, 1.0)","(0.0003990831696136232, 1.0)","(0.00026811747519017715, 1.0)","(0.06654612466504368, 0.9999999999748772)",19851,RandomSampling
398,"(0.00040004271142152215, 1.0)","(0.038097843897259405, 0.9999999999999998)","(0.06048102833268955, 0.9999999999851095)","(0.004902330358164919, 0.9999999975499884)","(0.0002735415305763489, 1.0)","(0.00035201246168531375, 1.0)","(0.0004590874830410485, 1.0)","(0.00016184613838499118, 1.0)","(0.06048102833268955, 0.9999999999851095)",19901,RandomSampling
