# Chi square test
Used when we have 2 categorical features from the same population.
Used to determine whether there is a significant association between the two.

In [1]:
import scipy.stats as stats
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load seaborn's example "tips" dataset into a DataFrame.
dataset = sns.load_dataset('tips')

In [3]:
# Preview the first five rows to see which columns are available.
dataset.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


We want to see if there is any association between "sex" and "smoker"

In [6]:
# Build the contingency table for the two features under study:
# rows are the levels of "sex", columns are the levels of "smoker",
# and each cell counts the observations with that combination.
dataset_table = pd.crosstab(index=dataset['sex'], columns=dataset['smoker'])
dataset_table

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [27]:
# Observed counts: the raw cell values of the contingency table,
# without the row/column labels.
observed_vals = dataset_table.values

In [28]:
# Show the observed counts (rows follow "sex", columns follow "smoker").
print("Observed values: \n", observed_vals)

Observed values: 
 [[60 97]
 [33 54]]


Now we apply chi square

In [29]:
# Run SciPy's chi-square test of independence on the contingency table.
# correction=True is SciPy's default and is only spelled out here: when the
# table has one degree of freedom (2x2), Yates' continuity correction is
# applied, so the reported statistic/pvalue can differ from the uncorrected
# Pearson formula that is evaluated by hand further down.
vals = stats.chi2_contingency(dataset_table, correction=True)
vals

Chi2ContingencyResult(statistic=0.0, pvalue=1.0, dof=1, expected_freq=array([[59.84016393, 97.15983607],
       [33.15983607, 53.84016393]]))

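We notice there is only a small difference between the observed values and the expected frequencies (`expected_freq`) returned by the chi-square test. Note that `statistic=0.0` and `pvalue=1.0` are reported because SciPy applies Yates' continuity correction to 2x2 tables by default; the manual calculation below uses the uncorrected formula, so its statistic and p-value differ slightly.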

In [30]:
# Expected cell counts under the null hypothesis of independence.
# Read them by name from the result object rather than by the positional
# index 3, so the intent is visible at a glance.
expected_vals = vals.expected_freq

In [31]:
# Take the table dimensions from its shape instead of hard-coded 0:2 slices,
# so the counts stay correct for contingency tables larger than 2x2.
nb_rows, nb_cols = dataset_table.shape

In [32]:
# Degrees of freedom of a contingency table: (rows - 1) * (columns - 1).
ddof = (nb_rows - 1) * (nb_cols - 1)
print("Degree of fredom is: ", ddof)

# Significance level used by the decision rule at the end of the notebook.
alpha = 0.05

Degree of fredom is:  1


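## The chi square formula is:
\begin{equation}
\chi^2 = \sum \frac{(o - e)^2}{e}
\end{equation}
where $o$ is an observed count and $e$ is the corresponding expected count, summed over every cell of the table.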

In [35]:
from scipy.stats import chi2

# Pearson chi-square: sum of (o - e)^2 / e over every cell of the table.
# `chi_square` holds the per-column contributions (summed down the rows);
# adding all of them gives the statistic for a table with any number of
# columns, not just the first two.
chi_square = ((observed_vals - expected_vals) ** 2.0 / expected_vals).sum(axis=0)
chi_square_statistic = chi_square.sum()
print("chi-square statistic:-",chi_square_statistic)

chi-square statistic:- 0.001934818536627623


In [38]:
# Critical value: the point of the chi-square distribution with `ddof`
# degrees of freedom whose cumulative probability equals 1 - alpha,
# obtained from the percent point function (ppf).
critical_val = chi2.ppf(1 - alpha, ddof)
critical_val

3.841458820694124

In [51]:
# Another way is to compute the p-value directly: the probability of seeing
# a statistic at least this large under H0. (ppf is the inverse of cdf.)
# sf is the survival function, i.e. 1 - cdf; calling it directly avoids the
# loss of precision that the subtraction `1 - cdf(x)` suffers when the
# p-value is very small.
p_value = chi2.sf(chi_square_statistic, df=ddof)

In [52]:
# Report the quantities that feed the decision rule.
summary_template = " p_value: {} \n Significance level: {} \n Degree of freedom {}"
print(summary_template.format(p_value, alpha, ddof))

 p_value: 0.964915107315732 
 Significance level: 0.05 
 Degree of freedom 1


In [53]:
# The two possible verdicts. Note that the second one really means
# "fail to reject H0": the data give no evidence of an association.
reject_h0_msg = "Reject H0,There is a relationship between 2 categorical variables"
retain_h0_msg = "H0 true,There is no relationship between 2 categorical variables"

# Decision 1: compare the statistic against the critical value.
print(reject_h0_msg if chi_square_statistic >= critical_val else retain_h0_msg)

# Decision 2: compare the p-value against the significance level.
print(reject_h0_msg if p_value <= alpha else retain_h0_msg)

H0 true,There is no relationship between 2 categorical variables
H0 true,There is no relationship between 2 categorical variables


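In this way, the chi-square test helps with feature selection by showing whether two categorical features are associated. Here the p-value is far above the significance level, so we fail to reject H0: the data show no evidence of an association between "sex" and "smoker".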