In [187]:
import numpy as np
import pandas as pd
import statsmodels.stats.api as sms
from scipy import stats
from scipy.stats import ttest_ind_from_stats, kstest, norm, skew, kurtosis
from statsmodels.stats.weightstats import ztest

In [260]:
class Describe:
    """Summarise basic distributional properties of a one-dimensional sample."""

    def __init__(self, data):
        # Raw observations; converted to a ``pd.Series`` on demand if needed.
        self.data = data

    def base_properties(self, alpha: float = 0.05):
        """Return normality, location, spread and shape measures of the sample.

        Args:
            alpha: Significance level of the normality test. The sample is
                reported as normal when the test's p-value exceeds ``alpha``.

        Returns:
            dict with keys ``is_normal_distribution`` (bool), ``mean``,
            ``std`` (pandas default, i.e. sample standard deviation),
            ``skew`` and ``kurtosis`` (scipy defaults).
        """
        pd_data = self.data
        if not isinstance(pd_data, pd.Series):
            pd_data = pd.Series(pd_data)

        mean = pd_data.mean()
        std = pd_data.std()

        # Normality test. ``kstest(x, 'norm')`` alone compares against the
        # *standard* normal N(0, 1), which rejects any sample that is not
        # already centred and scaled. Compare against a normal with the
        # sample's own mean and standard deviation instead.
        # Caveat: estimating the parameters from the same sample makes the
        # Kolmogorov-Smirnov p-value conservative (Lilliefors' correction is
        # not applied here).
        if np.isfinite(std) and std > 0:
            _, p_value = kstest(pd_data, 'norm', args=(mean, std))
            is_normal = bool(p_value > alpha)
        else:
            # A zero or undefined spread gives no reference distribution to
            # test against, so the sample is reported as non-normal.
            is_normal = False

        # Shape measures
        sk = skew(pd_data)
        kur = kurtosis(pd_data)

        return {
            "is_normal_distribution": is_normal,
            "mean": mean,
            "std": std,
            "skew": sk,
            "kurtosis": kur
        }


class ABTest:
    """Base class for A/B tests: stores the group labels and shared helpers."""

    def __init__(
        self,
        control_name: str = "Control",
        variation_name: str = "Variation"
    ):
        self.control_name = control_name
        self.variation_name = variation_name

    def get_cohen_d(self, control, variation) -> float:
        """Compute Cohen's d (variation minus control) from summary statistics.

        Args:
            control: Mapping with the keys ``'mean'``, ``'std'`` and
                ``'sample_size'`` for the control group.
            variation: Mapping with the same keys for the variation group.

        Returns:
            The difference in means divided by the pooled standard deviation.
        """
        n_control = control['sample_size']
        n_variation = variation['sample_size']

        # Pooled standard deviation: each group's variance weighted by its
        # degrees of freedom.
        pooled_std = np.sqrt(
            ((n_control - 1) * control['std']**2 + (n_variation - 1) * variation['std']**2)
            / (n_control + n_variation - 2)
        )

        return (variation['mean'] - control['mean']) / pooled_std

    @staticmethod
    def _result_to_json(
        test_name: str, metric_name: str, metric_control: float, metric_variation: float,
        sample_size_control: int, sample_size_variation: int, cohen_d,
        additional_metrics: dict = None, variation_winner: str = "",
        probability_winner: float = np.nan, test_stats: float = np.nan,
        p_value: float = np.nan, ci: tuple = (np.nan, np.nan)
    ) -> dict:
        """Pack the outcome of a test into a flat result dictionary.

        Args:
            test_name: Name of the statistical test that was run.
            metric_name: Name of the metric that was compared.
            metric_control: Metric value for the control group.
            metric_variation: Metric value for the variation group.
            sample_size_control: Number of observations in the control group.
            sample_size_variation: Number of observations in the variation group.
            cohen_d: Effect size to report.
            additional_metrics: Extra metrics to attach; a new empty dict is
                used when omitted.
            variation_winner: Label of the winning variation, if any.
            probability_winner: Probability associated with the winner.
            test_stats: Test statistic.
            p_value: p-value of the test.
            ci: ``(lower, upper)`` confidence-interval limits.

        Returns:
            dict with the fields above plus ``"uplift"``, computed as
            ``metric_variation - metric_control``.
        """
        # A ``{}`` default would be created once and shared by every call;
        # because the dict is handed back to the caller, mutating one result
        # would leak into all later results. Build a fresh dict per call.
        if additional_metrics is None:
            additional_metrics = {}

        return {
            "test_name": test_name,
            "tested_metric": metric_name,
            "result_metric_control": metric_control,
            "result_metric_variation": metric_variation,
            "additional_metrics": additional_metrics,
            "test_statistics": test_stats,
            "p_value": p_value,
            "cohen_d": cohen_d,
            "CI_lower_limit": ci[0],
            "CI_upper_limit": ci[1],
            "uplift": metric_variation - metric_control,
            "sample_size_control": sample_size_control,
            "sample_size_variation": sample_size_variation,
            "variation_winner": variation_winner,
            "probability_winner": probability_winner,
        }

    
class Perform(ABTest):
    """Runs two-sample t-tests and reports them via ``ABTest._result_to_json``.

    Sign conventions: the test statistic comes from scipy with the control
    group passed first, while ``uplift`` and Cohen's d are computed as
    variation minus control.
    """

    def t_test_from_status(self, control, variation, equal_var=True):
        """Perform a t-test from per-group summary statistics.

        Args:
            control: Non-empty DataFrame with the columns ``'mean'``,
                ``'std'`` and ``'sample_size'``; only its first row is read.
            variation: Same structure for the variation group.
            equal_var: Forwarded to ``scipy.stats.ttest_ind_from_stats``;
                ``True`` runs the pooled-variance test, ``False`` Welch's test.

        Returns:
            Result dict produced by ``ABTest._result_to_json``.

        Raises:
            ValueError: If a DataFrame is empty or lacks a required column.
        """
        # Extract metrics from control and variation data
        mean_control, std_control, len_control = self._extract_metrics(control)
        mean_variation, std_variation, len_variation = self._extract_metrics(variation)

        test_stats, p_value = ttest_ind_from_stats(
            mean_control, std_control, len_control,
            mean_variation, std_variation, len_variation,
            equal_var=equal_var,
        )

        return self._build_result(
            test_stats=test_stats,
            p_value=p_value,
            control_metrics={
                'mean': mean_control,
                'std': std_control,
                'sample_size': len_control
            },
            variation_metrics={
                'mean': mean_variation,
                'std': std_variation,
                'sample_size': len_variation
            },
            # Report sample sizes as plain ints; a NaN size is reported as 0.
            sample_size_control=int(len_control) if not np.isnan(len_control) else 0,
            sample_size_variation=int(len_variation) if not np.isnan(len_variation) else 0,
        )

    def t_test(self, control, variation, equal_var = True):
        """Perform a t-test on the raw observations of both groups.

        Args:
            control: Array-like with the control values.
            variation: Array-like with the variation values.
            equal_var: Forwarded to ``scipy.stats.ttest_ind``.

        Returns:
            Result dict produced by ``ABTest._result_to_json``.
        """
        test_stats, p_value = stats.ttest_ind(control, variation, equal_var=equal_var)

        len_control = len(control)
        len_variation = len(variation)

        return self._build_result(
            test_stats=test_stats,
            p_value=p_value,
            control_metrics={
                'mean': np.mean(control),
                # ddof=1: sample standard deviation
                'std': np.std(control, ddof=1),
                'sample_size': len_control
            },
            variation_metrics={
                'mean': np.mean(variation),
                'std': np.std(variation, ddof=1),
                'sample_size': len_variation
            },
            sample_size_control=len_control,
            sample_size_variation=len_variation,
        )

    def _build_result(
        self, test_stats, p_value, control_metrics, variation_metrics,
        sample_size_control, sample_size_variation
    ):
        """Assemble the result dict shared by both t-test entry points."""
        return ABTest._result_to_json(
            test_name="t_test",
            metric_name="mean",
            metric_control=control_metrics['mean'],
            metric_variation=variation_metrics['mean'],
            sample_size_control=sample_size_control,
            sample_size_variation=sample_size_variation,
            additional_metrics={},
            test_stats=test_stats,
            p_value=p_value,
            ci=(np.nan, np.nan),  # Placeholder: confidence interval is not computed
            cohen_d=self.get_cohen_d(control_metrics, variation_metrics)
        )

    def _extract_metrics(self, data):
        """Extract mean, standard deviation and sample size from a DataFrame.

        Args:
            data: DataFrame with the columns ``'mean'``, ``'std'`` and
                ``'sample_size'``.

        Returns:
            Tuple ``(mean, std, sample_size)`` taken from the first row.

        Raises:
            ValueError: If ``data`` is empty or a required column is missing.
        """
        required_columns = ['mean', 'std', 'sample_size']

        if data.empty or not all(col in data.columns for col in required_columns):
            raise ValueError("El DataFrame está vacío o no contiene las columnas necesarias: 'mean', 'std', 'sample_size'.")

        # Only the first row is used, even if the frame holds more rows.
        mean, std, sample_size = (data[col].values[0] for col in required_columns)

        return mean, std, sample_size

In [262]:
abtest = ABTest()
# Perform inherits ABTest's constructor, whose first positional argument is
# ``control_name``. Passing the ``abtest`` instance there would store an
# ABTest object as the control label, so rely on the default labels instead.
perform = Perform()

### Experimentos

In [172]:
# Seed the legacy global generator so the simulated clicks are reproducible.
np.random.seed(42)

# Number of simulated sessions per group.
n_control = 10000
n_variation = 10000

# Simulated clicks per session. Control is drawn before variation so the
# random stream is consumed in a fixed order.
clicks_control = np.random.poisson(lam=5, size=n_control)
clicks_variation = np.random.poisson(lam=6, size=n_variation)

# Sample mean and sample standard deviation (ddof=1) of each group.
mean_control, std_control = clicks_control.mean(), clicks_control.std(ddof=1)
mean_variation, std_variation = clicks_variation.mean(), clicks_variation.std(ddof=1)

# One row of descriptive statistics per group.
data_summary = pd.DataFrame([
    {
        'mean': mean_control,
        'std': std_control,
        'sample_size': n_control,
        'Clicks': clicks_control.sum(),
        'sessions': n_control,
        'Group': 'Control',
    },
    {
        'mean': mean_variation,
        'std': std_variation,
        'sample_size': n_variation,
        'Clicks': clicks_variation.sum(),
        'sessions': n_variation,
        'Group': 'Variation',
    },
])

In [273]:
# Display the per-group summary table (one row per experiment group).
data_summary

Unnamed: 0,mean,std,sample_size,Clicks,sessions,Group
0,5.0005,2.236918,10000,50005,10000,Control
1,5.9816,2.432006,10000,59816,10000,Variation


In [270]:
# Pull each experiment group out of the summary table as its own DataFrame,
# the input format expected by ``t_test_from_status``.
control_data = data_summary.loc[data_summary['Group'].eq('Control')]
variation_data = data_summary.loc[data_summary['Group'].eq('Variation')]
# Summary-statistics t-test (currently disabled):
# test_result = perform.t_test_from_status(control_data, variation_data)
# test_result

In [268]:
# test_result = perform.t_test(clicks_control, clicks_variation)
# test_result

In [269]:
# print(Describe(clicks_control).base_properties())