# CHI-SQUARE

## LIBS

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [3]:
# Load the Titanic passenger data from the working directory
# and display its dimensions as (rows, columns).
df = pd.read_csv(filepath_or_buffer='titanic.csv')
df.shape

(1306, 9)

In [4]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,1,1,female,29.0,0,0,211.3375,B5,S
1,1,1,male,0.9167,1,2,151.55,C22,S
2,1,0,female,2.0,1,2,151.55,C22,S
3,1,0,male,30.0,1,2,151.55,C22,S
4,1,0,female,25.0,1,2,151.55,C22,S


In [5]:
# Encode categorical variables as numbers.

# gender: 1 for 'male', 0 for any other value.
# Guard on the dtype so this cell is safe to re-run: once the column already
# holds 0/1, comparing it against 'male' again would silently turn every
# row into 0.
if not pd.api.types.is_numeric_dtype(df['sex']):
    df['sex'] = np.where(df['sex'] == 'male', 1, 0)

df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,1,1,0,29.0,0,0,211.3375,B5,S
1,1,1,1,0.9167,1,2,151.55,C22,S
2,1,0,0,2.0,1,2,151.55,C22,S
3,1,0,1,30.0,1,2,151.55,C22,S
4,1,0,0,25.0,1,2,151.55,C22,S


In [6]:
# embarked
# Give each distinct 'embarked' value an integer code (0, 1, 2, ...) in
# order of first appearance, then replace the column with those codes.
ports = df['embarked'].unique()
ordinal_label = dict(zip(ports, range(len(ports))))
df['embarked'] = df['embarked'].map(ordinal_label)

df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,1,1,0,29.0,0,0,211.3375,B5,0
1,1,1,1,0.9167,1,2,151.55,C22,0
2,1,0,0,2.0,1,2,151.55,C22,0
3,1,0,1,30.0,1,2,151.55,C22,0
4,1,0,0,25.0,1,2,151.55,C22,0


In [9]:
# Split into train and test sets: 30% of the rows are held out for testing,
# and the fixed random_state makes the split reproducible.
# Only the three categorical predictors are used as features.
X_train, X_test, y_train, y_test = train_test_split(
    df[['pclass', 'sex', 'embarked']],
    df['survived'],
    test_size=0.3,
    random_state=0,
)

# shapes of the train and test feature sets
X_train.shape, X_test.shape

((914, 3), (392, 3))

In [10]:
# Calculate the chi-square statistic and p-value between each of the
# variables and the target.

# chi2 returns a tuple of 2 arrays: the first holds the chi-square test
# statistics (not F-scores, despite the variable name used below), the
# second holds the corresponding p-values.
# X_train is passed as-is, exactly as in the SelectKBest cell further down;
# imputing missing values with 0 would count them as the category coded 0.
f_score = chi2(X_train, y_train)

# 2 arrays of values
f_score

(array([32.49351818, 81.97868872,  5.66498574]),
 array([1.19590735e-08, 1.37561991e-19, 1.73068577e-02]))

In [11]:
# Collect the p-values (the second array returned by chi2, i.e. index 1)
# in a pandas Series labelled with the feature names, then display them
# sorted from the smallest p-value to the largest.
pvalues = pd.Series(f_score[1], index=X_train.columns)
pvalues.sort_values()

sex         1.375620e-19
pclass      1.195907e-08
embarked    1.730686e-02
dtype: float64

In contrast to mutual information (MI), where higher values indicate more informative features, with chi2 the smaller the p-value, the stronger the evidence that the feature is associated with the target.

Thus, from the result above, Sex is the most important feature, as it has the smallest p-value.

In this demo, we used chi2 to determine the predictive value of only 3 categorical variables. If the dataset contained many more categorical variables, we could combine this procedure with SelectKBest or SelectPercentile, as we did in the previous notebook, to select the top k features, or the features in the top n percentile. Both selectors rank features by their chi2 score; here a higher score corresponds to a smaller p-value, so the ranking agrees with the p-values above.

Let's select the top 1 feature for the demo:

In [12]:
# Build a selector that keeps the single feature with the highest
# chi2 score, then fit it on the training data.
sel_ = SelectKBest(score_func=chi2, k=1)
sel_.fit(X_train, y_train)

# display the name(s) of the selected feature(s)
selected_mask = sel_.get_support()
X_train.columns[selected_mask]

Index(['sex'], dtype='object')

In [13]:
# Keep only the selected feature in both splits; the reduced data returned
# by transform replaces the original X_train / X_test.
# NOTE(review): because the inputs are overwritten, re-running this cell will
# presumably fail (sel_ was fitted on three columns) — confirm before re-running.
X_train, X_test = sel_.transform(X_train), sel_.transform(X_test)