In [57]:
# === NECESSARY LIBRARIES ===
!pip install scikit-posthocs
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scikit_posthocs as sp



In [65]:
# === OUTLIER REMOVAL FUNCTION ===
def remove_outliers_iqr(values):
    """Drop observations lying outside the 1.5 * IQR fences.

    Parameters
    ----------
    values : array-like
        Numeric observations. NumPy arrays and pandas Series are used as-is;
        any other sequence (list, tuple, ...) is converted with ``np.asarray``
        so that the element-wise comparisons below work.

    Returns
    -------
    numpy.ndarray or pandas.Series
        The observations ``v`` with ``q1 - 1.5*iqr <= v <= q3 + 1.5*iqr``,
        in their original order. An empty input is returned unchanged.
    """
    if not isinstance(values, (np.ndarray, pd.Series)):
        # Plain Python sequences do not support `values >= lower`.
        values = np.asarray(values)

    # Quartiles of an empty sample are undefined, so there is nothing to filter.
    if np.size(values) == 0:
        return values

    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    return values[(values >= lower) & (values <= upper)]

In [66]:
# === NORMALITY AND VARIANCE TESTING ===
def test_normality_and_variance(datasets):
    print("== Shapiro-Wilk Normality Test ==")
    for i, data in enumerate(datasets):
        if len(data) >= 3:
            stat, p = stats.shapiro(data)
            print(f"Group {i + 1}: stat = {stat:.4f}, p = {p:.4f}")
        else:
            print(f"Group {i + 1}: Not enough data for Shapiro-Wilk test.")
    print("\n== Levene's Test for Equal Variance ==")
    stat, p = stats.levene(*datasets)
    print(f"Levene stat = {stat:.4f}, p = {p:.4f}")
    return stat, p

In [67]:
# === REMOVE OUTLIERS USING IQR METHOD ===
def remove_outliers_iqr(values):
    """Drop observations lying outside the 1.5 * IQR fences.

    Parameters
    ----------
    values : array-like
        Numeric observations. NumPy arrays and pandas Series are used as-is;
        any other sequence (list, tuple, ...) is converted with ``np.asarray``
        so that the element-wise comparisons below work.

    Returns
    -------
    numpy.ndarray or pandas.Series
        The observations ``v`` with ``q1 - 1.5*iqr <= v <= q3 + 1.5*iqr``,
        in their original order. An empty input is returned unchanged.
    """
    # NOTE(review): this function is also defined in an earlier cell of the
    # notebook; this later definition is the one in effect after Run All.
    # Consider deleting one of the two cells.
    if not isinstance(values, (np.ndarray, pd.Series)):
        # Plain Python sequences do not support `values >= lower`.
        values = np.asarray(values)

    # Quartiles of an empty sample are undefined, so there is nothing to filter.
    if np.size(values) == 0:
        return values

    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    return values[(values >= lower) & (values <= upper)]

In [68]:
# === STATISTICAL TESTING FUNCTION ===
def run_tests_on_variable(dfs, col_name, groups, alpha=0.05):
    """Compare one variable across groups and print the test results.

    For every DataFrame the column is taken without missing values and
    filtered with ``remove_outliers_iqr``. If every group has at least 3
    observations, passes Shapiro-Wilk, and Levene's test does not reject equal
    variances, a one-way ANOVA is run (followed by Tukey HSD when
    significant). Otherwise Kruskal-Wallis is run (followed by Dunn's test
    with Bonferroni correction when significant).

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        One DataFrame per group; each must contain ``col_name``.
    col_name : str
        Column to compare between groups.
    groups : sequence
        Group labels, indexed in the same order as ``dfs``.
    alpha : float, default 0.05
        Significance level used for the assumption checks, the omnibus test
        and the Tukey HSD post-hoc test.

    Returns
    -------
    None
        Results are printed.
    """
    print(f"\n===== {col_name} =====")
    datasets = []
    for df in dfs:
        vals = df[col_name].dropna().values
        # Quartiles of an empty sample are undefined; leave an empty group
        # alone so the emptiness check below reports it instead of crashing.
        if len(vals) > 0:
            vals = remove_outliers_iqr(vals)
        datasets.append(vals)

    if any(len(d) == 0 for d in datasets):
        print("❌ Some group has no data after outlier removal.")
        return

    # Levene, ANOVA and Kruskal-Wallis all compare two or more samples.
    if len(datasets) < 2:
        print("❌ At least two groups are required for a between-group comparison.")
        return

    # === NORMALITY AND VARIANCE ===
    _, levene_p = test_normality_and_variance(datasets)

    # Long-format frame (one row per observation) used by both post-hoc tests.
    df_combined = pd.DataFrame({
        'value': np.concatenate(datasets),
        'group': np.concatenate([[groups[i]] * len(datasets[i]) for i in range(len(datasets))])
    })

    enough_data = all(len(d) >= 3 for d in datasets)
    # Shapiro-Wilk is only evaluated when every group is large enough for it.
    all_normal = enough_data and all(stats.shapiro(d)[1] > alpha for d in datasets)
    equal_variance = levene_p > alpha

    if all_normal and equal_variance:
        print("Running One-Way ANOVA...")
        stat, p = stats.f_oneway(*datasets)
        print(f"ANOVA result: F = {stat:.4f}, p = {p:.4f}")
        if p < alpha:
            tukey = pairwise_tukeyhsd(endog=df_combined['value'], groups=df_combined['group'], alpha=alpha)
            print(tukey)
    else:
        print("Running Kruskal-Wallis Test...")
        stat, p = stats.kruskal(*datasets)
        print(f"Kruskal-Wallis result: H = {stat:.4f}, p = {p:.4f}")
        if p < alpha:
            # Pass the labelled long-format frame so the result matrix is
            # indexed by group name (like the Tukey table), not by position.
            dunn = sp.posthoc_dunn(df_combined, val_col='value', group_col='group', p_adjust='bonferroni')
            print("Posthoc Dunn's Test (Bonferroni corrected):")
            print(dunn)

In [69]:
# === MAIN ANALYSIS WRAPPER ===
def analyze_dataframe_grouped_by_column(df, group_col, test_cols):
    """Split one DataFrame by a grouping column and compare each test column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``group_col`` and every column in ``test_cols``.
    group_col : str
        Column whose distinct non-missing values define the groups.
    test_cols : iterable of str
        Columns passed one at a time to ``run_tests_on_variable``.

    Returns
    -------
    None
        Results are printed by ``run_tests_on_variable``.
    """
    # Drop missing keys first: `df[group_col] == NaN` never matches, so a NaN
    # key would yield an empty group and make every comparison bail out.
    groups = df[group_col].dropna().unique()
    dfs = [df[df[group_col] == group] for group in groups]

    for col in test_cols:
        run_tests_on_variable(dfs, col, groups)

In [70]:
# === PAIRED TESTS BETWEEN INITIAL AND FINAL VARIABLES ===
def _iqr_inlier_mask(values):
    """Return a boolean mask that is True where a value lies within the 1.5 * IQR fences."""
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    return (values >= q1 - 1.5 * iqr) & (values <= q3 + 1.5 * iqr)


def run_paired_tests(dfs, var_pairs, alpha=0.05):
    """Run a paired test between two columns within each group and print the result.

    For each ``(initial, final)`` column pair and each DataFrame, rows with a
    missing value in either column are dropped, then rows where either value
    is an IQR outlier are dropped. The differences are checked with
    Shapiro-Wilk: a paired t-test is used when they look normal, a Wilcoxon
    signed-rank test otherwise.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        One DataFrame per group; reported as G1, G2, ... in order.
    var_pairs : iterable of (str, str)
        ``(initial_column, final_column)`` pairs to compare.
    alpha : float, default 0.05
        Significance level for the normality check and the paired test.

    Returns
    -------
    None
        Results are printed.
    """
    print("\n\n===== PAIRED TESTS BETWEEN INITIAL AND FINAL VARIABLES =====")
    for var_inicial, var_final in var_pairs:
        print(f"\n🔍 {var_inicial} vs {var_final}")
        for i, df in enumerate(dfs):
            data = df[[var_inicial, var_final]].dropna()
            if len(data) < 3:
                print(f"Group G{i+1}: Insufficient data")
                continue

            initial_all = data[var_inicial].values
            final_all = data[var_final].values

            # Remove outliers row-wise: a row is kept only when BOTH of its
            # values are inliers. Filtering each column separately and then
            # truncating to a common length would shift the arrays against
            # each other and pair measurements from different rows.
            keep = _iqr_inlier_mask(initial_all) & _iqr_inlier_mask(final_all)
            v1 = initial_all[keep]
            v2 = final_all[keep]

            if len(v1) < 3:
                print(f"Group G{i+1}: Insufficient data after outlier removal")
                continue

            # The paired t-test assumes normality of the differences.
            diff = v1 - v2
            _, p_shapiro = stats.shapiro(diff)
            normal = p_shapiro > alpha

            if normal:
                stat, p = stats.ttest_rel(v1, v2)
                test_name = "paired t-test"
            else:
                stat, p = stats.wilcoxon(v1, v2)
                test_name = "Wilcoxon"

            result = "✅ significant difference" if p < alpha else "❌ no significant difference"
            print(f"Group G{i+1} ({test_name}): p = {p:.4f} — {result}")

In [71]:
# === LOAD ONE DATAFRAME PER GROUP ===
csv_paths = ["ex1.csv", "ex2.csv", "ex3.csv", "ex4.csv", "ex5.csv"]
df1, df2, df3, df4, df5 = (pd.read_csv(path) for path in csv_paths)

dfs = [df1, df2, df3, df4, df5]
groups = ['G1', 'G2', 'G3', 'G4', 'G5']

# === (INITIAL, FINAL) COLUMN PAIRS FOR WITHIN-GROUP PAIRED COMPARISONS ===
paired_variables = [
    ('Pressão Inicial (mmHg)', 'Pressão Final (mmHg)'),
    ('Temperatura Inicial (°C)', 'Temperatura Final (°C)'),
    ('Glicose Inicial (mg/dL)', 'Glicose Final (mg/dL)'),
    ('Frequência Cardíaca Inicial (bpm)', 'Frequência Cardíaca Final (bpm)'),
    ('Nível de Energia Inicial', 'Nível de Energia Final')
]

# === VARIABLES TO COMPARE BETWEEN GROUPS ===
# Every initial column first, then every final column, in pair order.
variables = (
    [initial_col for initial_col, _final_col in paired_variables]
    + [final_col for _initial_col, final_col in paired_variables]
)

# === RUN GROUP COMPARISONS ===
for var in variables:
    run_tests_on_variable(dfs, var, groups)

# === RUN PAIRED COMPARISONS WITHIN EACH GROUP ===
run_paired_tests(dfs, paired_variables)


===== Pressão Inicial (mmHg) =====
== Shapiro-Wilk Normality Test ==
Group 1: stat = 0.9621, p = 0.3509
Group 2: stat = 0.9738, p = 0.6466
Group 3: stat = 0.9547, p = 0.2248
Group 4: stat = 0.9799, p = 0.8236
Group 5: stat = 0.9721, p = 0.5987

== Levene's Test for Equal Variance ==
Levene stat = 0.9235, p = 0.4521
Running One-Way ANOVA...
ANOVA result: F = 0.6122, p = 0.6545

===== Temperatura Inicial (°C) =====
== Shapiro-Wilk Normality Test ==
Group 1: stat = 0.9875, p = 0.9717
Group 2: stat = 0.9640, p = 0.4104
Group 3: stat = 0.9809, p = 0.8490
Group 4: stat = 0.9662, p = 0.4420
Group 5: stat = 0.9777, p = 0.7622

== Levene's Test for Equal Variance ==
Levene stat = 1.5031, p = 0.2044
Running One-Way ANOVA...
ANOVA result: F = 0.2395, p = 0.9156

===== Glicose Inicial (mg/dL) =====
== Shapiro-Wilk Normality Test ==
Group 1: stat = 0.9775, p = 0.7572
Group 2: stat = 0.9772, p = 0.7484
Group 3: stat = 0.9785, p = 0.7977
Group 4: stat = 0.9886, p = 0.9824
Group 5: stat = 0.9705, p =