In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

import scipy as sp
import scipy.stats as scs

## Cramer's V

In [79]:
# Fetch seaborn's "diamonds" example dataset into a DataFrame
df = sns.load_dataset(name='diamonds')

In [261]:
# Bucket every price relative to the mean price of the dataset:
#   (0, 0.8*mean]       -> 'cheaper'
#   (0.8*mean, mean]    -> 'on_average'
#   (mean, 1.2*mean]    -> 'high_price'
#   (1.2*mean, inf]     -> 'expensive'
mean_price = df['price'].mean()
bin_edges = [0, mean_price * 0.8, mean_price, mean_price * 1.2, np.inf]
bin_labels = ['cheaper', 'on_average', 'high_price', 'expensive']

df['price_bins'] = pd.cut(df['price'], bins=bin_edges, labels=bin_labels)

In [273]:
# Preview 10 random rows; the fixed seed keeps the displayed sample
# identical across kernel restarts and re-runs
df.sample(10, random_state=42)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_bins
41286,0.42,Ideal,D,VVS2,61.7,57.0,1210,4.82,4.8,2.97,cheaper
2298,0.9,Good,I,VS2,62.4,65.0,3162,6.09,6.12,3.81,on_average
38307,0.37,Good,G,SI2,63.3,55.0,487,4.58,4.61,2.91,cheaper
51261,0.56,Ideal,G,VVS2,62.1,56.0,2357,5.31,5.29,3.29,cheaper
5922,1.0,Good,G,SI2,64.0,55.0,3941,6.3,6.26,4.02,high_price
51834,0.76,Very Good,G,SI2,62.0,57.0,2422,5.82,5.86,3.62,cheaper
41157,0.52,Good,E,SI2,60.5,62.0,1199,5.15,5.2,3.13,cheaper
6365,0.32,Premium,H,SI1,61.9,59.0,576,4.41,4.38,2.72,cheaper
5830,0.77,Very Good,F,VVS2,60.6,60.0,3917,5.89,5.92,3.58,on_average
50467,0.61,Very Good,D,VS2,61.5,57.0,2266,5.45,5.48,3.36,cheaper


### Using SelectKBest from Scikit-Learn

In [96]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [274]:
# Features: discard the numeric columns and the binned target so that only
# the categorical descriptors remain
dropped_columns = ['price', 'x', 'y', 'z', 'depth',
                   'table', 'carat', 'price_bins']
X = df.drop(columns=dropped_columns)

# Target: the binned price
y = df['price_bins']

In [229]:
# Select the categorical feature columns.
# Look them up on X rather than df: df also holds the categorical
# 'price_bins' column, which was dropped from X, so indexing X with df's
# categorical columns raises a KeyError on a top-to-bottom run.
categorical_vars = X.select_dtypes(include='category').columns.to_list()

# Encode each categorical variable as its integer category code.
# Rebuilding X with assign() keeps the cell re-runnable: once the columns are
# encoded no categorical columns remain and the assignment is a no-op.
# NOTE(review): sklearn's chi2 expects non-negative features such as counts
# or booleans; scores computed on integer codes depend on the code order --
# consider one-hot encoding the categories instead.
X = X.assign(**{col: X[col].cat.codes for col in categorical_vars})

In [232]:
# Chi-square based univariate selector that keeps the 3 best-scoring features
fsel = SelectKBest(chi2, k=3)

# Score every feature in X against the target y
fsel.fit(X, y)

SelectKBest(k=3, score_func=<function chi2 at 0x7f4dd003b790>)

In [275]:
# Tabulate the chi-square statistic and p-value obtained for each feature,
# ordered from the largest p-value to the smallest
chi2_scores = pd.DataFrame({
    'variable': X.columns,
    'chi2_stat': fsel.scores_,
    'p_value': fsel.pvalues_,
})
chi2_scores.sort_values('p_value', ascending=False)

Unnamed: 0,variable,chi2_stat,p_value
0,cut,382.290236,3.9477059999999995e-85
1,color,1914.338101,0.0
2,clarity,1632.632736,0.0


### Cramer's V

In [267]:
# Cross-tabulate cut (rows) against the price bins (columns)
cont_table = pd.crosstab(df['cut'], df['price_bins'])

# Run the chi-square test of independence on the table and keep the
# test statistic, which is the first element of the result
X2 = scs.chi2_contingency(cont_table)
chi_stat = X2[0]

In [277]:
# What the test returns:
# (1) chi-square statistic; (2) p-value; (3) degrees of freedom; and (4) expected frequencies.
X2

(1603.5199669055353,
 0.0,
 12,
 array([[12378.84006303,  1318.4705228 ,  1509.44898035,  6344.24043382],
        [ 7921.51562848,   843.72080089,   965.93248053,  4059.8310901 ],
        [ 6939.87033741,   739.16573971,   846.23277716,  3556.73114572],
        [ 2817.9940304 ,   300.14460512,   343.6200964 ,  1444.24126808],
        [  924.77994067,    98.49833148,   112.76566555,   473.95606229]]))

In [284]:
# Contingency table
# NOTE(review): the saved output below is indexed by clarity, while the cells
# above build cont_table from df.cut -- it looks like this cell was last run
# after the clarity crosstab was created; re-run top to bottom to refresh it.
cont_table

price_bins,cheaper,on_average,high_price,expensive
clarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IF,1427,61,26,276
VVS1,2958,140,72,485
VVS2,3667,189,91,1119
VS1,5040,470,229,2432
VS2,7347,546,456,3909
SI1,6934,678,1215,4238
SI2,3268,1078,1633,3215
I1,342,138,56,205


In [280]:
# Performing Cramer's V calculation:
#   V = sqrt((chi2 / N) / min(rows - 1, columns - 1))

# Size of the sample: count the observations actually tabulated.
# The crosstab skips rows with a missing value in either variable (e.g. a
# price outside the bins yields a NaN price_bins), so len(df) could
# overstate N.
N = cont_table.to_numpy().sum()
# Minimum dimension:
# the smaller of (number of row categories - 1) and (number of column categories - 1)
minimum_dimension = min(cont_table.shape) - 1

# Calculate Cramer's V
result = np.sqrt((chi_stat / N) / minimum_dimension)

# Print the result
print(result)

0.09954537514956


In [283]:
# Cramer's V between clarity and the price bins:
#   V = sqrt((chi2 / N) / min(rows - 1, columns - 1))

# Creating a contingency table: clarity (rows) x price bins (columns)
cont_table = pd.crosstab(index=df['clarity'],
                         columns=df['price_bins'])

# Chi-square value: first element of the test result
X2 = scs.chi2_contingency(cont_table)
chi_stat = X2[0]

# Size of the sample: count the observations actually tabulated.
# The crosstab skips rows with a missing value in either variable (e.g. a
# price outside the bins yields a NaN price_bins), so len(df) could
# overstate N.
N = cont_table.to_numpy().sum()
# Minimum dimension:
# the smaller of (number of row categories - 1) and (number of column categories - 1)
minimum_dimension = min(cont_table.shape) - 1

# Calculate Cramer's V
result = np.sqrt((chi_stat / N) / minimum_dimension)

# Print the result
print(result)

0.18476912508901078


In [282]:
# Cramer's V between color and the price bins:
#   V = sqrt((chi2 / N) / min(rows - 1, columns - 1))

# Creating a contingency table: color (rows) x price bins (columns)
cont_table = pd.crosstab(index=df['color'],
                         columns=df['price_bins'])

# Chi-square value: first element of the test result
X2 = scs.chi2_contingency(cont_table)
chi_stat = X2[0]

# Size of the sample: count the observations actually tabulated.
# The crosstab skips rows with a missing value in either variable (e.g. a
# price outside the bins yields a NaN price_bins), so len(df) could
# overstate N.
N = cont_table.to_numpy().sum()
# Minimum dimension:
# the smaller of (number of row categories - 1) and (number of column categories - 1)
minimum_dimension = min(cont_table.shape) - 1

# Calculate Cramer's V
result = np.sqrt((chi_stat / N) / minimum_dimension)

# Print the result
print(result)

0.11533210649576157
