In [6]:
import numpy as np
from scipy.stats import chi2

# Step 1: Create the observed data
observed = np.array([[75, 8],
                     [13, 52]])

print("Observed Contingency Table:")
print(observed)

# Step 2: Calculate row and column totals
row_totals = observed.sum(axis=1)
col_totals = observed.sum(axis=0)
total = observed.sum()

print("\nRow totals:", row_totals)
print("Column totals:", col_totals)
print("Grand total:", total)

# Step 3: Calculate expected frequencies
expected = np.outer(row_totals, col_totals) / total

print("\nExpected Frequencies:")
print(expected)

# Step 4: Calculate chi-square statistic
chi_square = np.sum((observed - expected)**2 / expected)

# Step 5: Calculate degrees of freedom
df = (observed.shape[0] - 1) * (observed.shape[1] - 1)

# Step 6: Calculate p-value
p_value = 1 - chi2.cdf(chi_square, df)

print(f"\nChi-square statistic: {chi_square:.8f}")
print(f"Degrees of freedom: {df}")
print(f"p-value: {p_value:.20f}")

# Step 7: Calculate residuals
residuals = (observed - expected) / np.sqrt(expected)

print("\nResiduals:")
print(residuals)

# Step 8: Calculate standardized residuals
std_residuals = residuals / np.sqrt((1 - row_totals[:, np.newaxis]/total) * (1 - col_totals/total))

print("\nStandardized Residuals:")
print(std_residuals)

Observed Contingency Table:
[[75  8]
 [13 52]]

Row totals: [83 65]
Column totals: [88 60]
Grand total: 148

Expected Frequencies:
[[49.35135135 33.64864865]
 [38.64864865 26.35135135]]

Chi-square statistic: 74.86671048
Degrees of freedom: 1
p-value: 0.00000000000000000000

Residuals:
[[ 3.65102632 -4.42161245]
 [-4.12569702  4.99646722]]

Standardized Residuals:
[[ 8.65255514 -8.65255514]
 [-8.65255514  8.65255514]]


In [7]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Create the dataset
data = pd.DataFrame({
    'smokes': ['yes'] * 83 + ['no'] * 65,
    'lung_cancer': ['yes'] * 75 + ['no'] * 8 + ['yes'] * 13 + ['no'] * 52
})

# Create the contingency table
contingency_table = pd.crosstab(data['smokes'], data['lung_cancer'])
print("Contingency Table:")
print(contingency_table)

# Perform chi-square test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-square statistic: {chi2:.8f}")
print(f"p-value: {p_value:.20f}")
print(f"Degrees of freedom: {dof}")

print("\nExpected Frequencies:")
print(pd.DataFrame(expected, index=contingency_table.index, columns=contingency_table.columns))

# Calculate residuals
observed = contingency_table.values
residuals = (observed - expected) / np.sqrt(expected)
print("\nResiduals:")
print(pd.DataFrame(residuals, index=contingency_table.index, columns=contingency_table.columns))

# Calculate standardized residuals
n = np.sum(observed)
row_totals = np.sum(observed, axis=1)
col_totals = np.sum(observed, axis=0)
std_residuals = residuals / np.sqrt((1 - row_totals[:, np.newaxis]/n) * (1 - col_totals/n))
print("\nStandardized Residuals:")
print(pd.DataFrame(std_residuals, index=contingency_table.index, columns=contingency_table.columns))

# Interpret the results
alpha = 0.05
print(f"\nInterpretation:")
if p_value <= alpha:
    print(f"The p-value ({p_value:.20f}) is less than the significance level ({alpha}).")
    print("We reject the null hypothesis.")
    print("There is a significant association between smoking and lung cancer.")
else:
    print(f"The p-value ({p_value:.20f}) is greater than the significance level ({alpha}).")
    print("We fail to reject the null hypothesis.")
    print("There is not enough evidence to conclude a significant association between smoking and lung cancer.")

Contingency Table:
lung_cancer  no  yes
smokes              
no           52   13
yes           8   75

Chi-square statistic: 71.97622771
p-value: 0.00000000000000002178
Degrees of freedom: 1

Expected Frequencies:
lung_cancer         no        yes
smokes                           
no           26.351351  38.648649
yes          33.648649  49.351351

Residuals:
lung_cancer        no       yes
smokes                         
no           4.996467 -4.125697
yes         -4.421612  3.651026

Standardized Residuals:
lung_cancer        no       yes
smokes                         
no           8.652555 -8.652555
yes         -8.652555  8.652555

Interpretation:
The p-value (0.00000000000000002178) is less than the significance level (0.05).
We reject the null hypothesis.
There is a significant association between smoking and lung cancer.


In [8]:
data

Unnamed: 0,smokes,lung_cancer
0,yes,yes
1,yes,yes
2,yes,yes
3,yes,yes
4,yes,yes
...,...,...
143,no,no
144,no,no
145,no,no
146,no,no


In [9]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Create the exact contingency table as provided
contingency_table = pd.DataFrame({
    'yes': [75, 13],
    'no': [8, 52]
}, index=['smokes_yes', 'smokes_no'])

print("Contingency Table:")
print(contingency_table)

# Perform chi-square test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-square statistic: {chi2:.8f}")
print(f"p-value: {p_value:.20f}")
print(f"Degrees of freedom: {dof}")

print("\nExpected Frequencies:")
print(pd.DataFrame(expected, index=contingency_table.index, columns=contingency_table.columns))

# Calculate residuals
observed = contingency_table.values
residuals = (observed - expected) / np.sqrt(expected)
print("\nResiduals:")
print(pd.DataFrame(residuals, index=contingency_table.index, columns=contingency_table.columns))

# Calculate standardized residuals
n = np.sum(observed)
row_totals = np.sum(observed, axis=1)
col_totals = np.sum(observed, axis=0)
std_residuals = residuals / np.sqrt((1 - row_totals[:, np.newaxis]/n) * (1 - col_totals/n))
print("\nStandardized Residuals:")
print(pd.DataFrame(std_residuals, index=contingency_table.index, columns=contingency_table.columns))

# Interpret the results
alpha = 0.05
print(f"\nInterpretation:")
if p_value <= alpha:
    print(f"The p-value ({p_value:.20f}) is less than the significance level ({alpha}).")
    print("We reject the null hypothesis.")
    print("There is a significant association between smoking and lung cancer.")
else:
    print(f"The p-value ({p_value:.20f}) is greater than the significance level ({alpha}).")
    print("We fail to reject the null hypothesis.")
    print("There is not enough evidence to conclude a significant association between smoking and lung cancer.")

Contingency Table:
            yes  no
smokes_yes   75   8
smokes_no    13  52

Chi-square statistic: 71.97622771
p-value: 0.00000000000000002178
Degrees of freedom: 1

Expected Frequencies:
                  yes         no
smokes_yes  49.351351  33.648649
smokes_no   38.648649  26.351351

Residuals:
                 yes        no
smokes_yes  3.651026 -4.421612
smokes_no  -4.125697  4.996467

Standardized Residuals:
                 yes        no
smokes_yes  8.652555 -8.652555
smokes_no  -8.652555  8.652555

Interpretation:
The p-value (0.00000000000000002178) is less than the significance level (0.05).
We reject the null hypothesis.
There is a significant association between smoking and lung cancer.


In [10]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Create the exact contingency table as provided
contingency_table = pd.DataFrame({
    'yes': [75, 13],
    'no': [8, 52]
}, index=['smokes_yes', 'smokes_no'])

print("Contingency Table:")
print(contingency_table)

# Perform chi-square test with Yates' correction (default in scipy)
chi2_yates, p_value_yates, dof, expected = chi2_contingency(contingency_table)

# Perform chi-square test without Yates' correction (Pearson's chi-square)
chi2_pearson, p_value_pearson, dof, expected = chi2_contingency(contingency_table, correction=False)

print(f"\nYates' corrected Chi-square statistic: {chi2_yates:.8f}")
print(f"Yates' corrected p-value: {p_value_yates:.20f}")

print(f"\nPearson's Chi-square statistic: {chi2_pearson:.8f}")
print(f"Pearson's p-value: {p_value_pearson:.20f}")

print(f"\nDegrees of freedom: {dof}")

print("\nExpected Frequencies:")
print(pd.DataFrame(expected, index=contingency_table.index, columns=contingency_table.columns))

# Calculate Pearson's chi-square manually to verify
observed = contingency_table.values
chi2_manual = np.sum((observed - expected)**2 / expected)
print(f"\nManually calculated Pearson's Chi-square: {chi2_manual:.8f}")

Contingency Table:
            yes  no
smokes_yes   75   8
smokes_no    13  52

Yates' corrected Chi-square statistic: 71.97622771
Yates' corrected p-value: 0.00000000000000002178

Pearson's Chi-square statistic: 74.86671048
Pearson's p-value: 0.00000000000000000504

Degrees of freedom: 1

Expected Frequencies:
                  yes         no
smokes_yes  49.351351  33.648649
smokes_no   38.648649  26.351351

Manually calculated Pearson's Chi-square: 74.86671048
