In [13]:
import numpy as np
import pandas as pd

from scipy import stats
from sklearn.feature_selection import SelectKBest, chi2


In [14]:
df = pd.read_csv("Iris.csv")
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [20]:
sepallength_variety_crosstab=pd.crosstab(df['sepal.length'],df['variety'],
                                        margins=True)
sepallength_variety_crosstab
                                        

variety,Setosa,Versicolor,Virginica,All
sepal.length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4.3,1,0,0,1
4.4,3,0,0,3
4.5,1,0,0,1
4.6,4,0,0,4
4.7,2,0,0,2
4.8,5,0,0,5
4.9,4,1,1,6
5.0,8,2,0,10
5.1,8,1,0,9
5.2,3,1,0,4


In [21]:
sepalwidth_variety_crosstab=pd.crosstab(df['sepal.width'],df['variety'],
                                        margins=True)
sepalwidth_variety_crosstab
                               

variety,Setosa,Versicolor,Virginica,All
sepal.width,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2.0,0,1,0,1
2.2,0,2,1,3
2.3,1,3,0,4
2.4,0,3,0,3
2.5,0,4,4,8
2.6,0,3,2,5
2.7,0,5,4,9
2.8,0,6,8,14
2.9,1,7,2,10
3.0,6,8,12,26


In [22]:
petallength_variety_crosstab=pd.crosstab(df['petal.length'],df['variety'],
                                        margins=True)
petallength_variety_crosstab

variety,Setosa,Versicolor,Virginica,All
petal.length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,1,0,0,1
1.1,1,0,0,1
1.2,2,0,0,2
1.3,7,0,0,7
1.4,13,0,0,13
1.5,13,0,0,13
1.6,7,0,0,7
1.7,4,0,0,4
1.9,2,0,0,2
3.0,0,1,0,1


In [23]:
petalwidth_variety_crosstab=pd.crosstab(df['petal.width'],df['variety'],
                                        margins=True)
petalwidth_variety_crosstab

variety,Setosa,Versicolor,Virginica,All
petal.width,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.1,5,0,0,5
0.2,29,0,0,29
0.3,7,0,0,7
0.4,7,0,0,7
0.5,1,0,0,1
0.6,1,0,0,1
1.0,0,7,0,7
1.1,0,3,0,3
1.2,0,5,0,5
1.3,0,13,0,13


In [37]:
def check_categorical_dependency(crosstab_table, confidence_interval):
    stat, p, dof, expected = stats.chi2_contingency(crosstab_table)
    print ("Chi-Square Statistic value = {}".format(stat))
    print ("P - Value = {}".format(p))
    alpha = 1.0 - confidence_interval
    if p <= alpha:
        print('Dependent (reject)')
    else:
            print('Independent (fail to reject)')
    return expected

In [42]:
exp_table_1 = check_categorical_dependency(sepallength_variety_crosstab, 0.95)

Chi-Square Statistic value = 156.26666666666668
P - Value = 0.0008753326828092068
Dependent (reject)


In [43]:
exp_table_2 = check_categorical_dependency(sepalwidth_variety_crosstab, 0.95)

Chi-Square Statistic value = 89.54628704628703
P - Value = 0.048864220708299266
Dependent (reject)


In [44]:
exp_table_3 = check_categorical_dependency(petallength_variety_crosstab, 0.95)

Chi-Square Statistic value = 271.8
P - Value = 3.2198352572375757e-12
Dependent (reject)


In [47]:
exp_table_4 = check_categorical_dependency(petalwidth_variety_crosstab, 0.95)

Chi-Square Statistic value = 271.75
P - Value = 8.828863453832789e-27
Dependent (reject)


In [51]:
pd.DataFrame(exp_table_4)

Unnamed: 0,0,1,2,3
0,1.666667,1.666667,1.666667,5.0
1,9.666667,9.666667,9.666667,29.0
2,2.333333,2.333333,2.333333,7.0
3,2.333333,2.333333,2.333333,7.0
4,0.333333,0.333333,0.333333,1.0
5,0.333333,0.333333,0.333333,1.0
6,2.333333,2.333333,2.333333,7.0
7,1.0,1.0,1.0,3.0
8,1.666667,1.666667,1.666667,5.0
9,4.333333,4.333333,4.333333,13.0


<h1>Feature Selection using Chi-Square<h1>

In [62]:
X = df[["sepal.length", "sepal.width", "petal.length", "petal.width"]]

In [63]:
Y = df[["variety"]]

In [64]:
X_new = SelectKBest(chi2, k=1).fit_transform(X, y)

In [65]:
X_new.shape

(150, 1)

In [66]:
pd.crosstab(np.squeeze(X_new), np.squeeze(y))

variety,Setosa,Versicolor,Virginica
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,1,0,0
1.1,1,0,0
1.2,2,0,0
1.3,7,0,0
1.4,13,0,0
1.5,13,0,0
1.6,7,0,0
1.7,4,0,0
1.9,2,0,0
3.0,0,1,0
