In [67]:
import pandas as pd
import numpy as np
from scipy import stats

import plotly.express as px
import plotly

import glob
import datetime
from pprint import pprint

# Raise pandas' per-column display width to 3000 characters so long string
# values are not cut off with "..." when frames are rendered.
pd.set_option("display.max_colwidth", 3000)

In [68]:
# --- Load --------------------------------------------------------------------
# Sort the paths: glob returns files in arbitrary, OS-dependent order, and the
# row order of the combined frame determines which rows a seeded `.sample()`
# picks, so an unstable order makes seeded runs differ between machines.
csv_paths = sorted(glob.glob("*.csv"))
if not csv_paths:
    raise FileNotFoundError("No CSV files matching '*.csv' were found in the working directory.")

# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# each file restarts at 0 and label-based lookups match several rows at once.
raw_df = pd.concat(
    [pd.read_csv(path, sep=";") for path in csv_paths],
    ignore_index=True,
)

# --- Derive time features ------------------------------------------------------
# Parsed timestamps are kept in a local Series rather than as frame columns:
# only the calendar/time components below are part of the final frame.
timestamps = pd.to_datetime(raw_df["date"])

# Meteorological seasons (northern hemisphere) keyed by month number.
seasons_map = {
    1 : 'winter',
    2 : 'winter',
    3 : 'spring',
    4 : 'spring',
    5 : 'spring',
    6 : 'summer',
    7 : 'summer',
    8 : 'summer',
    9 : 'fall',
    10 : 'fall',
    11 : 'fall',
    12 : 'winter',
}

# Build the analysis frame in one chain from the raw frame so the cell can be
# re-run safely. `.map` is the lookup idiom for translating month numbers into
# labels (`.replace` is meant for substituting values in place).
df = (
    raw_df
    .drop(columns=["date", "title"])
    .rename(columns={"Categoría": "category"})
    .assign(
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        hour=timestamps.dt.hour,
        minute=timestamps.dt.minute,
        second=timestamps.dt.second,
        season=timestamps.dt.month.map(seasons_map),
    )
)

df.head()

Unnamed: 0,amount,category,month,day,hour,minute,second,season
0,-34.13,compras,12,27,20,30,12,winter
1,-8.37,compras,3,11,0,0,0,spring
2,-27.77,compras,11,25,5,21,29,fall
3,2624.84,transferencias,10,30,17,33,3,fall
4,-31.15,transferencias,12,31,9,7,15,winter


## Criterios
- La distribución temporal de la muestra debe asemejarse a la de la población.
- Los estadísticos descriptivos de `amount` en la muestra deben asemejarse a los de la población.
- Los clusterings aplicados a la muestra y a la población deben asemejarse.
- La distribución de tokens de `title` de la muestra debe ser similar a la de la población.
- [OPCIONAL] La distribución de `category` de la muestra debe ser similar a la de la población.

In [69]:
# Toggle related to the optional `category` criterion.
# NOTE(review): presumably, when True, `category` is excluded from the
# comparison — it is not read in the visible cells; confirm against later cells.
DROP_CATEGORY = False

### Amount

In [70]:
def compare_numeric_series(
        population_series: pd.Series,
        sample_series: pd.Series,
    ) -> tuple[float, float]:
    """
    Compares the distributions of two pandas Series with a two-sample Kolmogorov-Smirnov test.

    Parameters
    ---
    - ``population_series`` is a pandas Series object that represents the population.
    - ``sample_series`` is a pandas Series object that represents a sample.

    Return
    ---
    - Returns ``(statistic, p_value)`` as plain floats: the ks statistic (largest distance
      between the two empirical distribution functions) and the p-value of the test whose
      null hypothesis is that both series come from the same distribution.

    Raises
    ---
    - ``ValueError`` (from ``scipy.stats.ks_2samp``) if either series is empty.
    """

    result = stats.ks_2samp(data1=sample_series, data2=population_series)

    # Read the named fields instead of relying on positional unpacking, and hand back
    # built-in floats so the values match the annotated return type exactly.
    return float(result.statistic), float(result.pvalue)

In [71]:
from typing import Callable

def eval_numeric_series(
        series: pd.Series,
        sampling: Callable[[pd.Series, int], pd.Series],
        threshold: float = 0.05,
        alpha: float = 0.05,
        step: int = 50,
        random_state: int | float | None = None,
        compute_best: bool = False
    ) -> tuple[int | None, dict, plotly.graph_objs.Figure] | tuple[dict, plotly.graph_objs.Figure]:
    """
    Samples a pandas Series object at increasing sample sizes and computes the ks statistic of each sample against it.

    Parameters
    ---
    - ``series`` is the pandas Series object to be evaluated.
    - ``sampling`` is the function meant to sample the series. NOTE(review): it is accepted
      but not called — samples are always drawn with ``series.sample(size, random_state=random_state)``.
      Wiring it in would make ``random_state`` ineffective for existing callers, so confirm
      the intended contract before changing this.
    - ``threshold`` is the tolerance for the ks statistic, relevant when computing the best sample size with ``compute_best = True``.
    - ``alpha`` is the p-value threshold, also relevant when computing the best sample size.
    - ``step`` defines the difference between two consecutive sample sizes. Increasing this parameter trades coverage density for algorithmic performance.
    - ``random_state`` is the rng seed used in pandas objects' methods. It helps with evaluation consistency.
    - ``compute_best`` computes the best sample size when set to ``True``. It's set at ``False`` by default.

    Return
    ---
    - ``(evaluations, fig)`` when ``compute_best`` is ``False``.
    - ``(best, evaluations, fig)`` when ``compute_best`` is ``True``. ``best`` is the smallest
      evaluated sample size whose p-value is above ``alpha`` and whose ks statistic is below
      ``threshold``, or ``None`` when no evaluated size meets both conditions.
    - ``evaluations`` maps ``'statistic'`` and ``'significance'`` to lists in which every result
      is repeated ``step`` times; ``fig`` is the plotly line chart of those lists.

    Raises
    ---
    - ``ValueError`` if ``step`` is smaller than 1.
    """

    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step!r}")

    evaluations = {
        'statistic' : [],
        'significance' : [],
    }

    # Sizes run 1, 1 + step, 1 + 2 * step, ... and stop before len(series).
    for sample_size in range(1, len(series), step):
        statistic, p_value = compare_numeric_series(
            population_series=series,
            sample_series=series.sample(sample_size, random_state=random_state),
        )

        # Repeat each result `step` times so that list position k lines up with
        # sample size k + 1 at the start of every step-wide run; this makes the
        # x axis of the chart read as sample size.
        evaluations['statistic'].extend([statistic] * step)
        evaluations['significance'].extend([p_value] * step)

    fig = px.line(evaluations)

    if compute_best:
        # `default=None` covers the case where no size qualifies; a bare min() on
        # an empty sequence would raise and discard the evaluations and the figure.
        best = min(
            (
                idx + 1
                for idx, (stat, p) in enumerate(zip(evaluations['statistic'], evaluations['significance']))
                if p > alpha and stat < threshold
            ),
            default=None,
        )

        return best, evaluations, fig

    return evaluations, fig