In [None]:
# Standard Data Science Toolkit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import category_encoders as ce

# Inferential Statistical Tests
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.proportion import proportions_ztest

# The `stats` namespace is used by later cells (stats.t, stats.chi2, stats.ttest_ind, ...)
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, ttest_rel, f_oneway, pearsonr
from scipy.stats import zscore, norm, binom, poisson
from scipy.stats import kruskal
from scikit_posthocs import posthoc_dunn

In [None]:
# Calculate the required sample size
def determine_sample_size(p0, p1, alpha=0.05, power=0.8):
    """Compute the per-group sample size needed to compare two proportions.

    Parameters:
        p0 (float): Current (baseline) rate.
        p1 (float): Desired rate (current rate +/- the change to detect).
        alpha (float): Significance level (Type I error rate); note this is
            NOT the confidence level, which would be 1 - alpha.
        power (float): Desired statistical power (1 - Type II error rate).

    Returns:
        int: Required number of observations in each group, rounded up to a
        whole observation. The same figure is also printed.
    """
    # Effect size for the two proportions (statsmodels' arcsine-based measure, Cohen's h).
    effect_size = sm.stats.proportion_effectsize(p0, p1)

    # ratio=1 requests equally sized groups; the solver returns a fractional n.
    raw_size = sm.stats.NormalIndPower().solve_power(
        effect_size, alpha=alpha, power=power, ratio=1
    )

    # Round up rather than to nearest: rounding down would leave the design
    # slightly short of the requested power.
    sample_size = int(np.ceil(raw_size))

    print(f'Required sample size per group: {sample_size}')
    return sample_size

In [None]:
# Standardize every value in normal_dist into a z-score: subtract the mean and
# divide by the standard deviation (np.std defaults to the population form, ddof=0).
# The mean and standard deviation are computed once up front instead of being
# recomputed for every element inside the comprehension.
dist_mean = np.mean(normal_dist)
dist_std = np.std(normal_dist)
z_dist = [(x - dist_mean) / dist_std for x in normal_dist]

In [None]:
# Calculate the Confidence Interval of a Normally distributed sample
# NOTE(review): 30 is a placeholder so the cell parses; set it to the number of
# rows to draw (it must not exceed len(df) when sampling without replacement).
sample_size = 30
sample = df.sample(n=sample_size)
sample_mean = sample.mean()

# Standard error of the mean. pandas' .std() uses ddof=1 (sample standard
# deviation), which pairs with the t-distribution's n - 1 degrees of freedom.
standard_error = sample.std() / np.sqrt(sample_size)

ci = stats.t.interval(
    confidence=0.95,         # Confidence level
    df=sample_size - 1,      # Degrees of freedom
    loc=sample_mean,         # Centre of the interval: the sample mean
    scale=standard_error,    # Spread of the interval: the standard error
)

print("Confidence Interval:", ci)

In [None]:
# One-sample t-test of the drawn sample against the hypothesized mean h0_mean.
# ttest_1samp needs the raw observations (it computes the mean and standard
# error itself); passing the pre-computed sample_mean would test the wrong data.
stats.ttest_1samp(sample, h0_mean)

In [None]:
# One Tailed T-Test
#
# For reference, the default of ttest_ind is a two-tailed test:
#     result = stats.ttest_ind(own, rent)
#     print(result)
#     pvalue = result.pvalue
#     print(pvalue)
#     pvalue < alpha

# Two-tailed result; the one-tailed p-value is derived from it below.
t_stat, p_value = stats.ttest_ind(own, rent)

# Alternative hypothesis here is "less than" (H1 < H0), so the evidence lies in
# the lower tail: when t_stat is negative the one-tailed p-value is half the
# two-tailed one, otherwise it is the complement of that half.
# For a "greater than" alternative, flip the condition to `t_stat > 0`.
one_tailed_p_value = p_value / 2 if t_stat < 0 else 1 - p_value / 2
print(one_tailed_p_value)
one_tailed_p_value < alpha

In [None]:
# ANOVA
def perform_anova(df, group_col, value_col, alpha=0.05):
    """
    Performs a one-way ANOVA test to compare means across groups.

    Parameters:
        df (pd.DataFrame) - The DataFrame holding the values and their group labels
        group_col (str) - The column name representing the groups
        value_col (str) - The column name representing the values to compare
        alpha (float) - Significance level the p-value is compared against
            (defaults to 0.05)

    Returns:
        dict: A dictionary containing the F-Statistic, P-Value, and interpretation
        (keys "F-Statistic", "P-Value" and "Significance").
    """
    # One array of observations per distinct value of group_col
    samples = [values.values for _, values in df.groupby(group_col)[value_col]]

    # Perform ANOVA test
    f_stat, p_value = f_oneway(*samples)

    # Interpretation: compare against the caller's alpha, not a fixed 0.05.
    # The message text is kept exactly as before in case callers match on it.
    if p_value < alpha:
        significance = "Reject the null in favor of the alterantive hypothesis."
    else:
        significance = "Fail to reject the null hypothesis"

    return {
        "F-Statistic": f_stat,
        "P-Value": p_value,
        "Significance": significance,
    }

In [None]:
# Fit an OLS model from a formula, then summarise it as a Type 2 ANOVA table.
# C(...) marks E and M as categorical terms; X enters the model as written.
formula = "S ~ C(E) + C(M) + X"
lm = ols(formula=formula, data=df).fit()
table = sm.stats.anova_lm(lm, typ=2)
print(table)

In [None]:
# Goodness of Fit
# Chi-square test of the observed counts against the expected ones.
observed_values = counts.values
result = stats.chisquare(f_obs=observed_values, f_exp=expected_values)
result

In [None]:
# Visualizing the Result

def plot_chi_squared_distribution(df, critical_percentile=0.95, x_range=(0, 30), num_points=500):
    """
    Draws the PDF of a Chi-square distribution and marks its critical value.

    Parameters:
    - df: Degrees of freedom for the Chi-square distribution.
    - critical_percentile: Cumulative probability at which the critical value
      is taken (default is 0.95).
    - x_range: (start, stop) of the x-axis grid (default is from 0 to 30).
    - num_points: Number of evenly spaced grid points (default is 500).

    The figure is displayed with plt.show(); nothing is returned.
    """
    # Critical value: the quantile of the distribution at critical_percentile
    cutoff = stats.chi2.ppf(critical_percentile, df)

    # Evenly spaced Chi-square statistics and the density at each of them
    grid = np.linspace(x_range[0], x_range[1], num_points)
    density = stats.chi2.pdf(grid, df)

    # Density curve plus a dashed vertical marker at the critical value
    fig, ax = plt.subplots()
    ax.plot(grid, density, color='darkblue', label=r"$\chi^2$ distribution PDF")
    ax.axvline(cutoff, color='green', linestyle="--", label=r"critical $\chi^2$")

    # Axis labels and legend
    ax.set(xlabel=r"$\chi^2$ statistic", ylabel="Probability Density")
    ax.legend()

    plt.show()

# Example usage: 10 degrees of freedom, every other parameter left at its default
plot_chi_squared_distribution(df=10)


In [None]:
# Test of Independence
# Cross-tabulate the two variables, then run the Chi-square test on the
# resulting contingency table.
independence_table = pd.crosstab(index=col_1, columns=col_2)
chi2, pvalue, dof, ex = stats.chi2_contingency(observed=independence_table)

print("Chi-square statistic:", chi2)
print("p-value:", pvalue)

In [None]:
# Test of Homogeneity

# Compares the distributions of two population samples to judge whether the
# underlying populations follow the same distribution.
homogeneity_table = pd.crosstab(index=col_1, columns=col_2)

chi2, p, dof, ex = stats.chi2_contingency(observed=homogeneity_table)

print("Chi-square statistic:", chi2)
print("p-value:", p)