In [1]:
import pandas as pd
import numpy as np
from scipy import stats

import plotly.express as px
import plotly

import glob
import datetime
from pprint import pprint

# Let pandas show up to 3000 characters per column value before truncating,
# so long text fields remain readable when frames are displayed.
pd.set_option("display.max_colwidth", 3000)

In [2]:
# Load every semicolon-separated CSV in the working directory into one frame.
# sorted() pins the file order (glob returns paths in arbitrary, OS-dependent
# order) so that row order -- and therefore any seeded sampling done later --
# is reproducible. ignore_index=True gives the combined frame a unique
# 0..n-1 index instead of repeating each file's own row labels.
csv_paths = sorted(glob.glob("*.csv"))
raw_df = pd.concat(
    [pd.read_csv(csv_path, sep=";") for csv_path in csv_paths],
    ignore_index=True,
)

# The parsed timestamps are only needed to derive the calendar features below,
# so they live in a local Series rather than in a temporary column.
timestamps = pd.to_datetime(raw_df["date"])

# Month number -> season name (December-February counted as winter).
seasons_map = {
    12: "winter", 1: "winter", 2: "winter",
    3: "spring", 4: "spring", 5: "spring",
    6: "summer", 7: "summer", 8: "summer",
    9: "fall", 10: "fall", 11: "fall",
}

# Build the analysis frame in one chain, leaving raw_df untouched so this cell
# can be re-run safely. Only the features that are actually kept are derived;
# the raw `date` and `title` columns are removed and the Spanish category
# header is renamed to `category`.
df = (
    raw_df
    .assign(
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        hour=timestamps.dt.hour,
        minute=timestamps.dt.minute,
        second=timestamps.dt.second,
        # .map is a plain key lookup: every month 1-12 has an entry in seasons_map.
        season=lambda frame: frame["month"].map(seasons_map),
    )
    .drop(columns=["date", "title"])
    .rename(columns={"Categoría": "category"})
)

df.head()

Unnamed: 0,amount,category,month,day,hour,minute,second,season
0,-34.13,compras,12,27,20,30,12,winter
1,-8.37,compras,3,11,0,0,0,spring
2,-27.77,compras,11,25,5,21,29,fall
3,2624.84,transferencias,10,30,17,33,3,fall
4,-31.15,transferencias,12,31,9,7,15,winter


## Criterios
- La distribución temporal de la muestra debe asemejarse a la de la población.
- Los estadísticos descriptivos de `amount` en la muestra deben asemejarse a los de la población.
- Los clusterings obtenidos sobre la muestra y sobre la población deben asemejarse.
- La distribución de tokens de `title` de la muestra debe ser similar a la de la población.
- [OPCIONAL] La distribución de `category` de la muestra debe ser similar a la de la población.

In [3]:
# Flag for excluding the `category` column.
# NOTE(review): not referenced in any visible cell; presumably tied to the
# optional `category` criterion listed above -- confirm where it is consumed.
DROP_CATEGORY = False

In [4]:
from evaluation.core import Evaluation
from evaluation.evaluations import DistributionComparisson
from evaluation.sampling import RandomSampling

# Sampling strategies to be evaluated; only RandomSampling is configured.
sampling_strategies = [RandomSampling()]

# Strategy used to score each sample.
# NOTE(review): presumably compares sample vs. full-data distributions per
# column -- confirm in evaluation.evaluations.
evaluation_strategy = DistributionComparisson()

# Bundle the prepared frame with the strategies above.
evaluation = Evaluation(
    df=df,
    sampling_strategies=sampling_strategies,
    evaluation_strategy=evaluation_strategy,
)

In [5]:
# Run the evaluation, passing random_state=42.
# NOTE(review): presumably this seeds the sampling so results are repeatable --
# confirm how Evaluation.run uses random_state.
e = evaluation.run(random_state=42)

In [7]:
# Display the evaluation results: the rendered table has one row per
# SampleSize / SamplingStrategy and one value per data column (the meaning of
# the values is defined by the evaluation strategy).
e

Unnamed: 0,amount,category,month,day,hour,minute,second,season,SampleSize,SamplingStrategy
0,0.552700,0.364658,0.119116,0.984250,0.024235,0.952461,0.574763,0.377758,1,RandomSampling
1,0.167601,0.996660,0.212378,0.869502,0.814772,0.034088,0.583774,0.263113,51,RandomSampling
2,0.089043,0.557067,0.366725,0.885725,0.825784,0.255862,0.680471,0.840638,101,RandomSampling
3,0.059290,0.299194,0.546406,0.097270,0.265558,0.257525,0.930415,0.625815,151,RandomSampling
4,0.070112,0.286828,0.679442,0.366734,0.996499,0.846827,0.100304,0.391466,201,RandomSampling
...,...,...,...,...,...,...,...,...,...,...
395,0.000455,0.037991,0.259182,0.789832,0.142869,0.506008,0.018635,0.548678,19751,RandomSampling
396,0.000441,0.030363,0.211027,0.992997,0.669211,0.507676,0.702650,0.285742,19801,RandomSampling
397,0.000385,0.085220,0.868236,0.375360,0.989678,0.693401,0.704656,0.712004,19851,RandomSampling
398,0.000400,0.567983,0.292403,0.400274,0.365287,0.304066,0.570604,0.424487,19901,RandomSampling


In [6]:
# Plot the evaluation results (rendering is implemented by Evaluation.plot,
# which is not visible in this notebook).
evaluation.plot()