### Imports

In [2]:
import scipy.stats as stats
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
# Load the 'tips' example dataset through seaborn and preview its first rows.
df = sns.load_dataset(name='tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Chi Square Test

Applies when testing two categorical variables from a single population. 

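Used to determine whether there is a statistically significant association between two categorical variables. The null hypothesis (H0) is that the two variables are independent (no association); the alternative hypothesis (H1) is that they are associated.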

In [4]:
# Cross-table (contingency table) of sex vs. smoker counts.
# This cell rebinds `df` from the raw tips frame to the contingency table, which the
# following cells rely on. Keep a reference to the raw frame the first time through so
# that re-running this cell does not fail with KeyError: 'sex'.
if 'sex' in df.columns:
    tips_raw = df
df = pd.crosstab(tips_raw['sex'], tips_raw['smoker'])
print(df)

smoker  Yes  No
sex            
Male     60  97
Female   33  54


In [5]:
# Observed counts: the contingency table as a plain NumPy array (rows x columns).
observed_values = df.to_numpy()
print('Observed values: \n', observed_values)

Observed values: 
 [[60 97]
 [33 54]]


In [6]:
# Chi-square test of independence on the contingency table.
# correction=True is SciPy's default: when the table has 1 degree of freedom it applies
# Yates' continuity correction, so the statistic and p-value reported here differ from
# the uncorrected Pearson statistic computed by hand further down. The expected
# frequencies are not affected by the correction.
val = stats.chi2_contingency(observed=df, correction=True)

In [7]:
# Display the full test result: (statistic, p-value, degrees of freedom, expected frequencies).
val

(0.0,
 1.0,
 1,
 array([[59.84016393, 97.15983607],
        [33.15983607, 53.84016393]]))

In [8]:
# Expected frequencies under independence: the fourth (last) element of the test result.
expected_values = val[-1]

In [9]:
# Pearson chi-square statistic: sum over every cell of (observed - expected)^2 / expected.
# The import is kept in this cell because the cells below use the bare `chi2` name.
from scipy.stats import chi2

# Per-column contributions (one entry per column of the contingency table).
chi_square = ((observed_values - expected_values) ** 2. / expected_values).sum(axis=0)
# Total over all columns, so this works for any table shape rather than exactly two columns.
chi_square_statistics = chi_square.sum()

print('Chi-Square statistic: ', chi_square_statistics)

Chi-Square statistic:  0.001934818536627623


In [12]:
# Degrees of freedom of an r x c contingency table: (r - 1) * (c - 1).
# Read r and c from the table's shape so this holds for any table size, not only 2 x 2.
no_of_rows, no_of_columns = df.shape
ddof = (no_of_rows-1)*(no_of_columns-1)
print('Degree of Freedom: ', ddof)
# Significance level used for the critical value and for the p-value decision.
alpha = 0.05

Degree of Freedom:  1


In [13]:
# Critical value: the (1 - alpha) quantile of the chi-square distribution
# with `ddof` degrees of freedom.
chi2_under_h0 = chi2(df=ddof)
critical_value = chi2_under_h0.ppf(1-alpha)
print('Critical value: ', critical_value)

Critical value:  3.841458820694124


In [14]:
# p-value: upper-tail probability of the chi-square distribution at the observed statistic.
# The survival function (sf = 1 - cdf) evaluates the tail directly, which avoids the
# floating-point cancellation of `1 - cdf(...)` when the p-value is very small.
p_value = chi2.sf(x=chi_square_statistics, df=ddof)
print('p-value: ', p_value)
print('Significance level: ', alpha)
print('Degree of Freedom: ', ddof)

p-value:  0.964915107315732
Significance level:  0.05
Degree of Freedom:  1


In [17]:
# Hypothesis test: report the decision under both equivalent rules, one line each.

# Rule 1 - compare the test statistic against the critical value.
print(
    'Reject H0. There is a relationship between 2 categorical variables.'
    if chi_square_statistics >= critical_value
    else 'Retain H0. There is NO relationship between 2 categorical variables. '
)

# Rule 2 - compare the p-value against the significance level.
print(
    'Reject H0. There is a relationship between 2 categorical variables.'
    if p_value <= alpha
    else 'Retain H0. There is NO relationship between 2 categorical variables.'
)

Retain H0. There is NO relationship between 2 categorical variables. 
Retain H0. There is NO relationship between 2 categorical variables.
