# Pearson/Linear Correlation Coefficient

In [1]:
import math
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the comma-separated leaf dataset into a NumPy array (np.loadtxt
# defaults to float). The path is relative to the notebook's working directory.
leaf_data = np.loadtxt(fname='../Data_USL/leaf.csv', delimiter=',')

In [3]:
# Display the array dimensions as a (rows, columns) tuple.
np.shape(leaf_data)

(340, 16)

In [4]:
def linear_correlation_coeff(x, y):
    """Return the Pearson (linear) correlation coefficient of two 1-D samples.

    The coefficient is computed as::

        r = sum((x - mean(x)) * (y - mean(y)))
            / sqrt(sum((x - mean(x))**2) * sum((y - mean(y))**2))

    Parameters
    ----------
    x, y : array_like
        One-dimensional sequences of numbers with the same length.

    Returns
    -------
    numpy.float64
        The coefficient, clipped to ``[-1, 1]``. ``nan`` is returned when the
        coefficient is undefined: fewer than two observations, or at least one
        input with zero variance.

    Raises
    ------
    ValueError
        If either input is not one-dimensional or the lengths differ.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # A 2-D input would turn np.dot into a matrix product and silently
    # produce a meaningless result, so reject it up front.
    if x.ndim != 1 or y.ndim != 1:
        raise ValueError("x and y must be one-dimensional")
    if x.shape != y.shape:
        raise ValueError("x and y must have the same length")

    # With fewer than two observations the coefficient is undefined.
    if x.size < 2:
        return np.float64(np.nan)

    x_centered = x - np.mean(x)
    y_centered = y - np.mean(y)

    # Sums of squared deviations (NOT standard deviations): the 1/n factors
    # of covariance and variances cancel out in the ratio.
    sum_sq_x = np.sum(x_centered**2)
    sum_sq_y = np.sum(y_centered**2)

    denominator = np.sqrt(sum_sq_x * sum_sq_y)
    # A constant input has zero variance; return nan explicitly instead of
    # triggering a 0/0 division and its RuntimeWarning.
    if denominator == 0:
        return np.float64(np.nan)

    r = np.dot(x_centered, y_centered) / denominator
    # Floating-point round-off can push |r| marginally beyond 1.
    return np.clip(r, -1.0, 1.0)

In [5]:
# Correlate column 0 of the dataset with every other column using our own
# implementation; r_lcc[k] holds the coefficient for column k + 1.
loop_end = leaf_data.shape[1]
r_lcc = []
for i in range(1, loop_end):
    r = linear_correlation_coeff(leaf_data[:, 0], leaf_data[:, i])
    r_lcc.append(r)
    # Label each coefficient with the index of the column it belongs to.
    print('{}: '.format(i), r)

i:  -0.015142016547326376
i:  0.09141460302089634
i:  0.2752101178490783
i:  0.14127524729577043
i:  0.11184290470599254
i:  0.04667831920371106
i:  -0.04976661317047229
i:  -0.04002558235513152
i:  -0.0170480505941396
i:  0.10245321896478979
i:  0.07624645270769874
i:  0.0948852053530928
i:  0.05852012269141438
i:  0.18771684999019722
i:  0.01769003965398185


In [6]:
from scipy.stats import pearsonr

# Reference computation: correlate column 0 with every other column using
# SciPy; r_p[k] holds the coefficient for column k + 1.
loop_end = leaf_data.shape[1]
r_p = []
for i in range(1, loop_end):
    # pearsonr also returns a p-value, which is not used further here.
    r, pval = pearsonr(leaf_data[:, 0], leaf_data[:, i])
    r_p.append(r)
    # Label each coefficient with the index of the column it belongs to.
    print('{}: '.format(i), r)

i:  -0.01514201654732638
i:  0.09141460302089632
i:  0.27521011784907823
i:  0.1412752472957704
i:  0.11184290470599252
i:  0.046678319203711065
i:  -0.04976661317047228
i:  -0.04002558235513153
i:  -0.017048050594139594
i:  0.10245321896478976
i:  0.07624645270769873
i:  0.09488520535309278
i:  0.05852012269141436
i:  0.18771684999019722
i:  0.017690039653981846


In [7]:
# Count the positions at which SciPy's coefficient and ours agree within an
# absolute tolerance of 1e-6 (math.isclose's default relative tolerance
# applies as well).
count = sum(
    math.isclose(r_p[k], r_lcc[k], abs_tol=0.000001)
    for k in range(loop_end - 1)
)

# The two implementations agree only if every one of the loop_end - 1
# compared pairs matched.
if count != loop_end - 1:
    print("The correlation coefficients vs Scipy are NOT EQUAL.")
else:
    print("The correlation coefficients calculated by us and by Scipy are EQUAL.")

The correlation coefficients calculated by us and by Scipy are EQUAL.


In [8]:
# NOTE: np.set_printoptions changes NumPy's global print settings, so it also
# affects arrays printed by cells executed after this one.
np.set_printoptions(precision=3, suppress=True, edgeitems=16)

# WHY TRANSPOSE?
# From the docs: https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html
# numpy.corrcoef(x, y=None,...)
# Parameters: x: array_like
# A 1-D or 2-D array containing multiple variables and observations.
# Each row of x represents a variable.
# Each column is a single observation of all those variables.
# The variables correlated in this notebook are the COLUMNS of leaf_data,
# so the array is transposed to put one variable on each row.
corr_coeff_leaf_data = np.corrcoef(leaf_data.transpose())

# Reuse the matrix computed above instead of calling np.corrcoef a second
# time just to read its shape.
print("For sake of completeness, we can also compute the correlation"
      " matrix with Numpy as follows.\nThe matrix shape is: ",
      corr_coeff_leaf_data.shape)

print("\n\nThe matrix is:\n", corr_coeff_leaf_data)

For sake of completeness, we can also compute the correlation matrix with Numpy as follows.
The matrix shape is:  (16, 16)


The matrix is:
 [[ 1.    -0.015  0.091  0.275  0.141  0.112  0.047 -0.05  -0.04  -0.017
   0.102  0.076  0.095  0.059  0.188  0.018]
 [-0.015  1.    -0.077 -0.025 -0.028 -0.072 -0.025  0.004  0.065  0.062
  -0.01  -0.013  0.005  0.011 -0.04  -0.034]
 [ 0.091 -0.077  1.     0.551  0.554  0.374  0.386 -0.036 -0.274 -0.214
  -0.226 -0.195 -0.191 -0.15  -0.25  -0.24 ]
 [ 0.275 -0.025  0.551  1.     0.678  0.005  0.107 -0.471  0.092  0.122
  -0.282 -0.298 -0.263 -0.234 -0.229 -0.313]
 [ 0.141 -0.028  0.554  0.678  1.    -0.411 -0.379 -0.793  0.437  0.408
  -0.205 -0.189 -0.179 -0.151 -0.251 -0.211]
 [ 0.112 -0.072  0.374  0.005 -0.411  1.     0.863  0.755 -0.886 -0.824
   0.085  0.083  0.078  0.059  0.126  0.056]
 [ 0.047 -0.025  0.386  0.107 -0.379  0.863  1.     0.656 -0.769 -0.7
   0.063  0.058  0.051  0.037  0.107  0.053]
 [-0.05   0.004 -0.036 -0.471 -0.793  0.75

In [9]:
# Coefficients in descending order. sorted() builds a new list, so r_lcc
# itself keeps its original (column) order.
r_ord_lcc = sorted(r_lcc, reverse=True)
r_ord_lcc

[0.2752101178490783,
 0.18771684999019722,
 0.14127524729577043,
 0.11184290470599254,
 0.10245321896478979,
 0.0948852053530928,
 0.09141460302089634,
 0.07624645270769874,
 0.05852012269141438,
 0.04667831920371106,
 0.01769003965398185,
 -0.015142016547326376,
 -0.0170480505941396,
 -0.04002558235513152,
 -0.04976661317047229]

In [10]:
# np.argsort ranks in ascending order; reversing the result lists the
# positions from the largest coefficient down to the smallest.
max_indices = np.argsort(r_lcc)[::-1]
# r_lcc[k] was computed for column k + 1, hence the shift by one.
# NOTE(review): the ranking uses the signed coefficient, not its absolute
# value - confirm this is the intended notion of "relevance".
print("The features of interest are, in descending order of relevance, COLUMNS:",
      max_indices + 1)

The features of interest are, in descending order of relevance, COLUMNS: [ 3 14  4  5 10 12  2 11 13  6 15  1  9  8  7]
