In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# removed the old names, so prefer the new name and fall back on older releases.
_whitegrid_style = 'seaborn-v0_8-whitegrid'
mpl.style.use(_whitegrid_style if _whitegrid_style in mpl.style.available else 'seaborn-whitegrid')
%matplotlib inline

In [2]:
# Read only the columns this notebook analyses, then discard exact duplicate
# rows, rows with any missing value, and listings that report zero bedrooms.
df = (
    pd.read_csv(
        '../../data/clean/house_data.csv',
        usecols=['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'condition', 'grade', 'sqft_above', 'sqft_basement'],
    )
    .drop_duplicates()
    .dropna(axis=0)
    .loc[lambda frame: frame['bedrooms'] != 0]
)
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,grade,sqft_above,sqft_basement
0,221900.0,3,1.0,1180,5650,1.0,3,7,1180,0
1,538000.0,3,2.25,2570,7242,2.0,3,7,2170,400
2,180000.0,2,1.0,770,10000,1.0,3,6,770,0
3,604000.0,4,3.0,1960,5000,1.0,5,7,1050,910
4,510000.0,3,2.0,1680,8080,1.0,3,8,1680,0


---
<h1 style="text-align:center">CORRELATION</h1>
<img src="../../images/corr.JPG" style="background:white; display: block; margin-left: auto;margin-right: auto; width:60%"/>
<ul style="font-size:13px">
    <li><strong>WHAT</strong><ul>
            <li>Correlation is a statistical measure that explains how one or more variables are related to each other
            <li>These variables can be input data features which have been used to forecast our target variable
            <li><strong>Correlation does not imply causation</strong> (e.g., ice cream sales and sunglasses sales rise together because both are driven by hot weather, not because one causes the other)
            <li>Correlations can be divided into three common types:<ul>
                    <li><strong>Strong Positive Correlation</strong> — Two features can be positively correlated with each other which means that when the value of one variable increases then the value of the other variable(s) also increases
                    <li><strong>Strong Negative Correlation</strong> — Two features can be negatively correlated with each other that occurs when the value of one variable increases and the value of other variable(s) decreases
                    <li><strong>No Correlation</strong> — Two features might not have any relationship with each other which happens when the value of a variable is changed then the value of the other variable is not impacted
                </ul>
        </ul>
    <li><strong>WHY</strong><ul>
            <li>Features with high correlation are more linearly dependent and hence have almost the same effect on the dependent variable so when two features have high correlation, we can drop one of the two features, this is called <i><strong>"Correlation-Based Feature Selection"</strong></i>
            <li>Correlation helps us in determining the degree of relationship between variables and enables us to make our decision for the future course of actions
            <li>Essentially, correlation analysis is used for spotting patterns within datasets
            <li>Through the correlation analysis, we evaluate correlation coefficient that tells us how much one variable changes when the other one does
        </ul>
    <li><strong>HOW</strong><ul>
            <li>Calculate based on level of measurement<ul>
                    <li>Linear correlation (both are continuous/ordinal variables)<ul>
                            <li><strong>Pearson's r</strong>
                            <li><strong>Spearman's Rho</strong>
                            <li><strong>Kendall's Tau</strong>
                        </ul>
                    <li>Categorical correlation (both are categorical — nominal variables)<ul>
                            <li><strong>Cramer's V</strong>
                            <li><strong>Theil's U</strong>
                        </ul>
                    <li>Different types of data correlation (continuous and categorical/binary variables)<ul>
                            <li><strong>Point-Biserial</strong>
                            <li><strong>Correlation Ratio</strong>
                        </ul>
                </ul>
            <li>Result of calculation<ul>
                    <li><strong>Correlation coefficient (<i>r</i>)</strong> — for Linear Correlation<ul>
                                <li><img src="../../images/corrvalue.jpeg" style="background:white; width:25%"/>
                                <li>The main result of a correlation is called the correlation coefficient
                                <li>The degree of association is measured by correlation coefficient
                                <li>A correlation coefficient is a way to put a value to the relationship
                                <li>Correlation coefficients have a value of between -1 and 1
                                <li>A 0 means there is no relationship between the variables at all, while -1 or 1 means that there is a perfect negative or positive correlation
                                <li>The four possibilities<ul>
                                                <li><strong>r-value(low) & p-value(low)</strong> — Model doesn’t explain much about variation, but is significant (Better than nothing)
                                                <li><strong>r-value(low) & p-value(high)</strong> — Model doesn’t explain much about variation and not significant (Worst model)
                                                <li><strong>r-value(high) & p-value(low)</strong> — Model tells much about variation and significant (Best model)
                                                <li><strong>r-value(high) & p-value(high)</strong> — Model explains well about variation but not significant (Worthless)
                                        </ul>
                        </ul>
                </ul>
        </ul>
</ul>

---
---
<h2>1. a) Pearson's r</h2>
<ul style="font-size:13px">
    <li>It measures the linear relationship between the variables and assumes that the variables are <strong>normally distributed</strong>
    <li>Pearson's r correlation is the most widely used correlation statistic to measure the degree of the relationship between linearly related variables
    <li>In general, when the data is normally distributed we use Pearson correlation
    <li> Pearson’s correlation coefficient is naturally sensitive to <strong>skewed distributions</strong> and <strong>outliers</strong>
    <li>Features with high correlation are more linearly dependent and hence have almost the same effect on the dependent variable so when two features have high correlation, we can drop one of the two features
    <li>The formula<ul>
            <li>covariance(X, Y) = (sum (x - mean(X)) * (y - mean(Y)) ) * 1/(n-1)
            <li>Pearson's correlation coefficient = covariance(X, Y) / (stdv(X) * stdv(Y))
        </ul>
</ul>

In [3]:
from scipy.stats import pearsonr

# Between 2 features
# The result table is allocated as float64 up front: writing float coefficients
# into an int-initialised frame relies on silent upcasting, which pandas >= 2.1
# deprecates.
df_feats = df[['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']].copy()
corr_feats = pd.DataFrame(np.nan, index=df_feats.columns, columns=df_feats.columns, dtype=float)

# pearsonr returns (coefficient, p-value); only the coefficient is tabulated.
for i in df_feats.columns:
    for j in df_feats.columns:
        r, pval = pearsonr(df_feats[i], df_feats[j])
        corr_feats.loc[i, j] = r

display(corr_feats)
print("as we can see, between sqft_living-sqft_above has strong relationship and sqft_living-sqft_basement has moderate relationship,\nso we can drop one of them from features but which one? \n\n")

# Between feature and dependent variable
# Single-row table: Pearson's r of every column (price included) against price.
df_dep = df[['price', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']].copy()
corr_dep = pd.DataFrame(np.nan, index=['price'], columns=df_dep.columns, dtype=float)

for i in df_dep.columns:
    r, pval = pearsonr(df_dep['price'], df_dep[i])
    corr_dep.loc['price', i] = r

display(corr_dep)
print("Based on the table above we can draw a conclusion that sqft_living has stronger relationship rather than sqft_above and sqft_basement\nso we can exclude sqft_above and sqft_basement from feature selection since both of them have relationship with sqft_living!")

Unnamed: 0,sqft_living,sqft_lot,sqft_above,sqft_basement
sqft_living,1.0,0.173319,0.876483,0.43522
sqft_lot,0.173319,1.0,0.184037,0.015354
sqft_above,0.876483,0.184037,1.0,-0.051982
sqft_basement,0.43522,0.015354,-0.051982,1.0


as we can see, between sqft_living-sqft_above has strong relationship and sqft_living-sqft_basement has moderate relationship,
so we can drop one of them from features but which one? 




Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement
price,1.0,0.701933,0.089824,0.605427,0.323832


Based on the table above we can draw a conclusion that sqft_living has stronger relationship rather than sqft_above and sqft_basement
so we can exclude sqft_above and sqft_basement from feature selection since both of them have relationship with sqft_living!


---
<h2>1. b) Spearman's Rho</h2>
<ul style="font-size:13px">
    <li>The Spearman rank correlation coefficient, often denoted as <i><strong>ρ</strong></i> (<i>rho</i>) or r<sub>s</sub>, is the non-parametric version of the Pearson correlation coefficient
    <li>The Spearman correlation is less sensitive to strong outliers than the Pearson correlation as Spearman’s ρ limits the outlier to the value of its rank
    <li>The variables are not required to have normal distribution
    <li>It can deal with ordinal, interval or ratio variables for monotonic relationships (strictly increase or decrease, not both)
    <li>Spearman rank correlation could be interpreted similarly as the Pearson correlation coefficient as their value falls between -1 to 1
    <li>The formula<ul>
            <li>Spearman's correlation coefficient = covariance(rank(X), rank(Y)) / (stdv(rank(X)) * stdv(rank(Y)))
        </ul>
</ul>

In [4]:
from scipy.stats import spearmanr

# Between 2 features
# The result table is allocated as float64 up front: writing float coefficients
# into an int-initialised frame relies on silent upcasting, which pandas >= 2.1
# deprecates.
df_feats = df[['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'condition', 'grade']].copy()
corr_feats = pd.DataFrame(np.nan, index=df_feats.columns, columns=df_feats.columns, dtype=float)

# spearmanr returns (rho, p-value); only rho is tabulated.
for i in df_feats.columns:
    for j in df_feats.columns:
        rho, pval = spearmanr(df_feats[i], df_feats[j], axis=0)
        corr_feats.loc[i, j] = rho

display(corr_feats)
print("The correlation result between features seems tend to be same with Pearson correlation coefficient, then we can see the relationship with the dependent variable \n\n")

# Between feature and dependent variable
# Single-row table: Spearman's rho of every column (price included) against price.
df_dep = df[['price', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'condition', 'grade']].copy()
corr_dep = pd.DataFrame(np.nan, index=['price'], columns=df_dep.columns, dtype=float)

for i in df_dep.columns:
    rho, pval = spearmanr(df_dep['price'], df_dep[i], axis=0)
    corr_dep.loc['price', i] = rho

display(corr_dep)
print("sqft_living (continuous variable) and grade (ordinal variable) features are the strongest that we can use as independent variables rather than others!")

Unnamed: 0,sqft_living,sqft_lot,sqft_above,sqft_basement,condition,grade
sqft_living,1.0,0.304279,0.843414,0.328018,-0.063062,0.716646
sqft_lot,0.304279,1.0,0.272747,0.036512,0.114492,0.152752
sqft_above,0.843414,0.272747,1.0,-0.165651,-0.158464,0.712054
sqft_basement,0.328018,0.036512,-0.165651,1.0,0.161327,0.092867
condition,-0.063062,0.114492,-0.158464,0.161327,1.0,-0.167699
grade,0.716646,0.152752,0.712054,0.092867,-0.167699,1.0


The correlation result between features seems tend to be same with Pearson correlation coefficient, then we can see the relationship with the dependent variable 




Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,condition,grade
price,1.0,0.64406,0.07502,0.541619,0.251641,0.018194,0.658298


sqft_living (continuous variable) and grade (ordinal variable) features are the strongest that we can use as independent variables rather than others!


---
<h2>1. c) Kendall's Tau</h2>
<ul style="font-size:13px">
    <li>Kendall's Tau is a non-parametric measure that does not require any assumptions regarding the joint probability distributions of variables
    <li>Generally, when the sample size is small and has many tied ranks, Kendall’s correlation, often denoted as <i><strong>τ</strong></i> (<i>Tau</i>), is the best alternative to Spearman’s correlation
    <li>Kendall’s correlation requires the same data assumptions as Spearman’s correlation, namely 1) ordinal, interval or ratio variables and 2) a monotonic relationship between the two variables
    <li>However, unlike Spearman’s coefficient, Kendall Tau only measures directional agreement, not the rank differences
    <li>Therefore, this coefficient is more appropriate for discrete data
    <li>The formula<ul>
            <li>Kendall's Correlation Coefficient = (n⁺ − n⁻) / √((n⁺ + n⁻ + nˣ)(n⁺ + n⁻ + nʸ))
        </ul>
</ul>

In [5]:
from scipy.stats import kendalltau

# Between 2 features
# The result table is allocated as float64 up front: writing float coefficients
# into an int-initialised frame relies on silent upcasting, which pandas >= 2.1
# deprecates.
df_feats = df[['bedrooms', 'bathrooms', 'floors', 'condition', 'grade']].copy()
corr_feats = pd.DataFrame(np.nan, index=df_feats.columns, columns=df_feats.columns, dtype=float)

# kendalltau returns (tau, p-value); only tau is tabulated.
for i in df_feats.columns:
    for j in df_feats.columns:
        tau, pval = kendalltau(df_feats[i], df_feats[j], method='auto')
        corr_feats.loc[i, j] = tau

display(corr_feats)
print("Between bedrooms-bathrooms, floors-bathrooms, and grade-floors have moderate correlations between them but grade-bathrooms has high correlation so we need to decide which features we want to drop off based on the table below\n\n")

# Between feature and dependent variable
# Single-row table: Kendall's tau of every column (price included) against price.
df_dep = df[['price', 'bedrooms', 'bathrooms', 'floors', 'condition', 'grade']].copy()
corr_dep = pd.DataFrame(np.nan, index=['price'], columns=df_dep.columns, dtype=float)

for i in df_dep.columns:
    tau, pval = kendalltau(df_dep['price'], df_dep[i], method='auto')
    corr_dep.loc['price', i] = tau

display(corr_dep)
print("In fact, only grade feature that has high correlation with dependent variable so we can use it as feature selected and drop the others!")

Unnamed: 0,bedrooms,bathrooms,floors,condition,grade
bedrooms,1.0,0.439277,0.202964,0.010533,0.331088
bathrooms,0.439277,1.0,0.451756,-0.13799,0.556846
floors,0.202964,0.451756,1.0,-0.265224,0.435467
condition,0.010533,-0.13799,-0.265224,1.0,-0.14908
grade,0.331088,0.556846,0.435467,-0.14908,1.0


Between bedrooms-bathrooms, floors-bathrooms, and grade-floors have moderate correlations between them but grade-bathrooms has high correlation so we need to decide which features we want to drop off based on the table below




Unnamed: 0,price,bedrooms,bathrooms,floors,condition,grade
price,1.0,0.264938,0.369347,0.251784,0.014245,0.527893


In fact, only grade feature that has high correlation with dependent variable so we can use it as feature selected and drop the others!


---
---
<h2>2. a) Cramer's V</h2>
<ul style="font-size:13px">
    <li>Cramer’s V or sometimes referred to as Cramér's <i><strong>φc</strong></i> (<i>phi</i>) is a measure of association between two discrete or nominal variables and indicates how strongly two categorical variables are associated
    <li>The measurement is based on the Pearson chi-square statistic and has an output range between 0 to 1<ul>
            <li>The closer the value to 0 means less association between the two variables
            <li>1 means strong association between the two variables
        </ul>
    <li>There is no negative (-) value as an output because there is no such thing as a negative association
    <li>Like correlation, Cramer’s V is symmetrical — it is insensitive to swapping x and y
</ul>

In [6]:
from scipy.stats import chi2_contingency

def cramers_v(x, y):
    """Bias-corrected Cramér's V for categorical-categorical association.

    Adapted from a widely shared community implementation. Uses the bias
    correction from Wicher Bergsma, "A bias-correction for Cramér's V and
    Tschuprow's T", Journal of the Korean Statistical Society 42 (2013): 323-328.

    Parameters
    ----------
    x, y : array-like
        Two equally long sequences of category labels, cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    float
        A value in [0, 1]. ``0.0`` is returned for degenerate inputs where the
        statistic is undefined (fewer than two observations, a variable with a
        single category, or a corrected dimension of zero) instead of dividing
        by zero.

    Notes
    -----
    ``chi2_contingency`` is called with its defaults, so SciPy's continuity
    correction is applied to tables with one degree of freedom (2x2).
    """
    contingency = pd.crosstab(x, y)
    n = contingency.sum().sum()
    r, k = contingency.shape
    # With < 2 observations or a constant variable there is nothing to associate,
    # and the (n-1) / (dim-1) terms below would divide by zero.
    if n < 2 or min(r, k) < 2:
        return 0.0
    chi2 = chi2_contingency(contingency)[0]
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    denominator = min((kcorr-1), (rcorr-1))
    # The corrected dimension collapses to 0 when every observation is its own category.
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

# we don't have any nominal type variable in our current dataset so we'll use ordinal variables instead of nominal
df_feats = df[['bedrooms', 'bathrooms', 'floors', 'condition', 'grade']].copy()

# Allocate the result table as float64 up front: writing float coefficients into
# an int-initialised frame relies on silent upcasting, which pandas >= 2.1 deprecates.
corr_feats = pd.DataFrame(np.nan, index=df_feats.columns, columns=df_feats.columns, dtype=float)

for i in df_feats.columns:
    for j in df_feats.columns:
        phi = cramers_v(df_feats[i], df_feats[j])
        corr_feats.loc[i, j] = phi

display(corr_feats)
print("Since all of features that we used are ordinal types, seems the result didn't show any strong or moderate correlation between each feature")

Unnamed: 0,bedrooms,bathrooms,floors,condition,grade
bedrooms,1.0,0.2591,0.146051,0.048906,0.163679
bathrooms,0.2591,1.0,0.299194,0.146831,0.39555
floors,0.146051,0.299194,1.0,0.178469,0.24456
condition,0.048906,0.146831,0.178469,1.0,0.129388
grade,0.163679,0.39555,0.24456,0.129388,1.0


Since all of features that we used are ordinal types, seems the result didn't show any strong or moderate correlation between each feature


---
<h2>2. b) Theil's U</h2>
<ul style="font-size:13px">
    <li>Theil’s U, also referred to as the Uncertainty Coefficient, is based on the conditional entropy between x and y — or in human language, given the value of x, how many possible states does y have, and how often do they occur
    <li>Unlike Cramer’s V, it is asymmetric, meaning U(x,y)≠U(y,x) while V(x,y)=V(y,x), where V is Cramer’s V
    <li>The measurement has an output range between 0 to 1<ul>
            <li>The closer the value to 0 means less association between the two variables
            <li>1 means strong association between the two variables
        </ul>
</ul>

In [7]:
# Theil's U already defined as Python functions by Shaked Zychlinski and codes can be found at https://github.com/shakedzy/dython
import math
from collections import Counter
from scipy.stats import entropy

# Missing-value strategy name: substitute a fixed value for missing entries.
_REPLACE = 'replace'
# Value substituted for missing entries when the replace strategy is selected.
_DEFAULT_REPLACE_VALUE = 0.0
# Missing-value strategy name: discard samples that contain a missing entry.
_DROP = 'drop'

def remove_incomplete_samples(x, y):
    """Drop every (x, y) pair in which either value is None or NaN.

    Parameters
    ----------
    x, y : sequence of numbers (None allowed)
        Paired samples of equal length. Values are stacked into one NumPy array
        and tested with ``np.isnan``, so they are expected to be numeric.

    Returns
    -------
    tuple
        The filtered ``(x, y)``. Two lists are returned when ``x`` was passed
        as a list, otherwise two NumPy arrays.
    """
    # Record the caller's container type now: x is rebound to a list just below,
    # so testing it afterwards would always report "list".
    return_lists = isinstance(x, list)
    x = [v if v is not None else np.nan for v in x]
    y = [v if v is not None else np.nan for v in y]
    # One row per sample so that a missing value in either column removes the pair.
    arr = np.array([x, y]).transpose()
    arr = arr[~np.isnan(arr).any(axis=1)].transpose()
    if return_lists:
        return arr[0].tolist(), arr[1].tolist()
    return arr[0], arr[1]

def replace_nan_with_value(x, y, value):
    """Return x and y as NumPy arrays with None/NaN entries swapped for ``value``.

    An entry counts as missing when it is None or does not compare equal to
    itself (the NaN test); every other entry is kept unchanged.
    """
    def _fill(sequence):
        filled = []
        for item in sequence:
            is_missing = item is None or not (item == item)
            filled.append(value if is_missing else item)
        return np.array(filled)

    return _fill(x), _fill(y)

def conditional_entropy(x, y, nan_strategy=_REPLACE, nan_replace_value=_DEFAULT_REPLACE_VALUE, log_base: float = math.e):
    """Conditional entropy H(x | y) of two paired categorical sequences.

    Parameters
    ----------
    x, y : sequence
        Paired observations of equal length.
    nan_strategy : str
        ``'replace'`` swaps missing values for ``nan_replace_value``; ``'drop'``
        removes incomplete pairs; any other value leaves the inputs untouched.
    nan_replace_value
        Substitute used by the replace strategy.
    log_base : float
        Base of the logarithm (natural log by default).

    Returns
    -------
    float
        Sum over observed (x, y) pairs of p(x, y) * log(p(y) / p(x, y)).
    """
    if nan_strategy == _REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == _DROP:
        x, y = remove_incomplete_samples(x, y)

    marginal_counts = Counter(y)
    joint_counts = Counter(zip(x, y))
    n_obs = sum(marginal_counts.values())

    # Accumulate in an explicit loop so the floating-point summation order is fixed.
    result = 0.0
    for (_, y_value), joint_n in joint_counts.items():
        p_joint = joint_n / n_obs
        p_marginal = marginal_counts[y_value] / n_obs
        result += p_joint * math.log(p_marginal / p_joint, log_base)
    return result

def theils_u(x, y):
    """Theil's U (uncertainty coefficient) of x given y.

    Computed as (H(x) - H(x | y)) / H(x) with natural-log entropies. The
    measure is asymmetric: ``theils_u(x, y)`` generally differs from
    ``theils_u(y, x)``.

    Parameters
    ----------
    x, y : sequence
        Paired categorical observations of equal length. None/NaN entries are
        replaced by 0.0 before either entropy is computed.

    Returns
    -------
    float
        A value in [0, 1]; ``1.0`` when x has zero entropy (a single category).
    """
    # Clean the inputs once so that H(x) and H(x | y) are computed on the same
    # values; previously only the conditional term saw the NaN-replaced data.
    x, y = replace_nan_with_value(x, y, 0.0)
    s_xy = conditional_entropy(x, y, nan_strategy='replace', nan_replace_value=0.0)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = [count / total_occurrences for count in x_counter.values()]
    s_x = entropy(p_x)
    if s_x == 0:
        # x is constant: there is no uncertainty left to explain.
        return 1.0
    return (s_x - s_xy) / s_x

In [8]:
# we don't have any nominal type variable in our current dataset so we'll use ordinal variables instead of nominal
df_feats = df[['bedrooms', 'bathrooms', 'floors', 'condition', 'grade']].copy()

# Allocate the result table as float64 up front: writing float coefficients into
# an int-initialised frame relies on silent upcasting, which pandas >= 2.1 deprecates.
corr_feats = pd.DataFrame(np.nan, index=df_feats.columns, columns=df_feats.columns, dtype=float)

# Theil's U is asymmetric, so both (i, j) and (j, i) are computed.
for i in df_feats.columns:
    for j in df_feats.columns:
        u = theils_u(df_feats[i], df_feats[j])
        corr_feats.loc[i, j] = u

display(corr_feats)
print("Since all of features that we used are ordinal types, seems the result didn't show any strong or moderate correlation between each feature")

Unnamed: 0,bedrooms,bathrooms,floors,condition,grade
bedrooms,1.0,0.146666,0.037666,0.003022,0.076706
bathrooms,0.086184,1.0,0.114478,0.020905,0.150357
floors,0.045653,0.236123,1.0,0.064546,0.151672
condition,0.004458,0.052477,0.078555,1.0,0.030151
grade,0.065649,0.218992,0.107101,0.017494,1.0


Since all of features that we used are ordinal types, seems the result didn't show any strong or moderate correlation between each feature


---
---
<h2>3. a) Point-Biserial</h2>
<ul style="font-size:13px">
    <li>The Point-Biserial correlation coefficient (<i>rpb</i>) is a correlation coefficient used when we need to correlate a continuous variable with another dichotomous variable (<strong>nominal type/binary variable that only has 2 values, ie. : Male/Female, Yes/No, True/False, etc</strong>)
    <li>The point biserial correlation coefficient lies in the range [-1, 1] with 0 implying no correlation and its interpretation is very similar to Pearson’s Product Moment Correlation Coefficient
</ul>

In [9]:
# we don't have any binary variable in our current dataset so we'll use ordinal variables instead of binary
from scipy.stats import pointbiserialr

# Between 2 features
df_feats = df.drop(columns=['price']).copy()
cont_cols = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']
cat_cols = ['bedrooms', 'bathrooms', 'floors', 'condition', 'grade']
# Rows are the discrete columns, columns are the continuous ones. The table is
# allocated as float64 up front: writing float coefficients into an
# int-initialised frame relies on silent upcasting, which pandas >= 2.1 deprecates.
corr_feats = pd.DataFrame(np.nan, index=cat_cols, columns=cont_cols, dtype=float)

# pointbiserialr returns (coefficient, p-value); only the coefficient is tabulated.
for i in cat_cols:
    for j in cont_cols:
        rpb, pval = pointbiserialr(df_feats[i], df_feats[j])
        corr_feats.loc[i, j] = rpb

display(corr_feats)

# Between feature and dependent variable
# Single-row table: coefficient of every column (price included) against price.
df_dep = df[['price', 'bedrooms', 'bathrooms', 'floors', 'condition', 'grade']].copy()
corr_dep = pd.DataFrame(np.nan, index=['price'], columns=df_dep.columns, dtype=float)

for i in df_dep.columns:
    rpb, pval = pointbiserialr(df_dep[i], df_dep['price'])
    corr_dep.loc['price', i] = rpb

display(corr_dep)

Unnamed: 0,sqft_living,sqft_lot,sqft_above,sqft_basement
bedrooms,0.578406,0.032301,0.479557,0.302985
bathrooms,0.755912,0.088225,0.686822,0.283586
floors,0.354475,-0.004675,0.524496,-0.245561
condition,-0.059378,-0.00893,-0.158815,0.17383
grade,0.762957,0.114576,0.756211,0.168436


Unnamed: 0,price,bedrooms,bathrooms,floors,condition,grade
price,1.0,0.308947,0.525908,0.257024,0.03611,0.66789


---
<h2>3. b) Correlation Ratio</h2>
<ul style="font-size:13px">
    <li>The correlation ratio is a measure of the relationship between the statistical dispersion within individual categories and the dispersion across the whole population or sample
    <li>We can use the Correlation Ratio to measure the correlation between a continuous feature and a categorical feature
    <li>Mathematically, it is defined as the weighted variance of the mean of each category divided by the variance of all samples
    <li>In plain language, the Correlation Ratio answers the question <i>"given a continuous number, how well can we know which category it belongs to?"</i>
    <li>The measurement has an output range between 0 to 1<ul>
            <li>The closer the value to 0 means less association between the two variables
            <li>1 means strong association between the two variables
        </ul>
</ul>

In [10]:
def correlation_ratio(categories, measurements):
    """Correlation ratio (eta) between a categorical and a continuous variable.

    eta = sqrt(between-category sum of squares / total sum of squares), i.e. the
    share of the measurements' dispersion that is explained by category means.

    Parameters
    ----------
    categories : array-like
        Category label of each observation (Series, array or list).
    measurements : array-like
        Numeric value of each observation, same length as ``categories``.

    Returns
    -------
    float
        A value in [0, 1]; ``0.0`` when the category means do not differ or
        when no observation has a known category.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    # factorize marks missing labels with code -1.
    codes, _ = pd.factorize(pd.Series(categories))
    values = np.asarray(measurements, dtype=float)
    if len(values) != len(codes):
        raise ValueError("categories and measurements must have the same length")

    # Rows without a category cannot contribute to any group mean, so they are
    # excluded from the total dispersion as well to keep both terms consistent.
    known = codes >= 0
    codes = codes[known]
    values = values[known]
    if values.size == 0:
        return 0.0

    cat_num = np.max(codes) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(cat_num):
        cat_measures = values[codes == i]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)

    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(values, y_total_avg), 2))
    # A zero numerator also covers the zero-variance case (denominator == 0).
    if numerator == 0:
        return 0.0
    return np.sqrt(numerator / denominator)

In [11]:
# Between 2 features
df_feats = df.drop(columns=['price']).copy()
cont_cols = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']
cat_cols = ['bedrooms', 'bathrooms', 'floors', 'condition', 'grade']
# Rows are the discrete columns, columns are the continuous ones. The table is
# allocated as float64 up front: writing float coefficients into an
# int-initialised frame relies on silent upcasting, which pandas >= 2.1 deprecates.
corr_feats = pd.DataFrame(np.nan, index=cat_cols, columns=cont_cols, dtype=float)

for i in cat_cols:
    for j in cont_cols:
        eta = correlation_ratio(df_feats[i], df_feats[j])
        corr_feats.loc[i, j] = eta

display(corr_feats)

# Between feature and dependent variable
# Single-row table: eta of price grouped by every column (price included).
df_dep = df[['price', 'bedrooms', 'bathrooms', 'floors', 'condition', 'grade']].copy()
corr_dep = pd.DataFrame(np.nan, index=['price'], columns=df_dep.columns, dtype=float)

for i in df_dep.columns:
    eta = correlation_ratio(df_dep[i], df_dep['price'])
    corr_dep.loc['price', i] = eta

display(corr_dep)

Unnamed: 0,sqft_living,sqft_lot,sqft_above,sqft_basement
bedrooms,0.602777,0.039613,0.511411,0.324039
bathrooms,0.769482,0.122311,0.707841,0.377634
floors,0.44303,0.049153,0.609873,0.255572
condition,0.120097,0.042785,0.198974,0.174174
grade,0.774582,0.145166,0.770775,0.197681


Unnamed: 0,price,bedrooms,bathrooms,floors,condition,grade
price,1.0,0.326477,0.597962,0.291021,0.082506,0.72115
