# Chi-Squared test

- Tests whether two categorical variables (for example, a feature and a target) are associated or independent
- Non-parametric (does not assume the data follow a normal distribution)
- Good for feature selection in ML, testing independence, and assessing model fit


In [1]:
# Example: Are people's subscription status and income level independent?
# Null hypothesis (H0): no association; subscription status and income level are independent
# Alternative hypothesis (H1): significant association; the two variables are not independent

In [10]:
# Build the observed contingency table: one row per income level,
# one column per subscription status, cells hold observed counts.
import pandas as pd

observed_counts_by_income = {
    "Low Income": [20, 30],
    "Medium Income": [40, 25],
    "High Income": [10, 15],
}
contingency = pd.DataFrame.from_dict(
    observed_counts_by_income,
    orient="index",
    columns=["Subscribed", "Not Subscribed"],
)
contingency

Unnamed: 0,Subscribed,Not Subscribed
Low Income,20,30
Medium Income,40,25
High Income,10,15


In [26]:
# Expected frequency of each cell under independence:
#   (row total * column total) / grand total
import pandas as pd

row_totals = contingency.sum(axis=1)
column_totals = contingency.sum()
total_count = column_totals.sum()

# Broadcast a column vector of row totals against a row vector of column
# totals to obtain every (row total * column total) product in one step.
expected_grid = (
    row_totals.to_numpy()[:, None] * column_totals.to_numpy()[None, :] / total_count
)

expected = pd.DataFrame(
    expected_grid,
    index=contingency.index,
    columns=contingency.columns,
)
expected

Unnamed: 0,Subscribed,Not Subscribed
Low Income,25.0,25.0
Medium Income,32.5,32.5
High Income,12.5,12.5


In [None]:
# Chi-squared statistic: sum over all cells of (observed - expected)^2 / expected.
# Both tables are flattened row by row so that cells pair up position by position.
observed_values = list(contingency.to_numpy().ravel())
expected_values = expected.to_numpy().ravel()

chi2 = sum(
    (observed_value - expected_value) ** 2 / expected_value
    for observed_value, expected_value in zip(observed_values, expected_values)
)
chi2

np.float64(6.461538461538462)

In [None]:
# Degrees of freedom for a contingency table:
#   (number of rows - 1) * (number of columns - 1)
rows = len(expected.index)
cols = len(expected.columns)
df = (rows - 1) * (cols - 1)
df

2

In [None]:
import scipy.stats as stats

# Significance level of the test.
alpha = 0.05
confidence_level = 1 - alpha

# Chi-squared quantile at the confidence level for the given degrees of freedom;
# a statistic above this value falls in the rejection region.
critical_value = stats.chi2.ppf(confidence_level, df=df)
critical_value

np.float64(5.991464547107979)

In [66]:
# Displays True when the statistic is below the critical value (fail to reject H0).
# A False result means the statistic is at or above the critical value, so:
#   - reject the null hypothesis
#   - conclude there is a significant association
critical_value > chi2

np.False_