# Variable Ranking by Pearson/Linear Correlation Coefficient - LEAF Dataset

In [15]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the LEAF dataset from a comma-separated file into a NumPy array.
# The path is relative, so it is resolved against the current working directory.
leaf_data = np.loadtxt('../Data_USL/leaf.csv',delimiter=',')

In [3]:
# Check the array dimensions as (rows, columns).
leaf_data.shape

(340, 16)

In [4]:
def linear_correlation_coeff(x,y):
    """Return the Pearson (linear) correlation coefficient between x and y.

    The coefficient is the dot product of the mean-centred samples divided
    by the square root of the product of their sums of squared deviations.

    Parameters
    ----------
    x, y : array_like
        One-dimensional samples of equal length.

    Returns
    -------
    float
        The correlation coefficient r.
    """
    # Centre both samples on their respective means.
    x_centred = x - np.mean(x)
    y_centred = y - np.mean(y)
    # Sums of squared deviations (not standard deviations: no 1/n factor,
    # since it cancels between numerator and denominator).
    x_sum_sq = np.sum(x_centred**2)
    y_sum_sq = np.sum(y_centred**2)
    return np.dot(x_centred, y_centred)/np.sqrt((x_sum_sq*y_sum_sq))

In [5]:
# Correlate column 0 with each of the remaining columns, in column order,
# using the hand-written coefficient.
# NOTE(review): the 'i: ' label is a literal string, so the printed lines do
# not show which column each coefficient belongs to.
loop_end = leaf_data.shape[1]
r_lcc = []
for col in range(1, loop_end):
    coeff = linear_correlation_coeff(leaf_data[:, 0], leaf_data[:, col])
    r_lcc.append(coeff)
    print('i: ', coeff)

i:  -0.015142016547326376
i:  0.09141460302089634
i:  0.2752101178490783
i:  0.14127524729577043
i:  0.11184290470599254
i:  0.04667831920371106
i:  -0.04976661317047229
i:  -0.04002558235513152
i:  -0.0170480505941396
i:  0.10245321896478979
i:  0.07624645270769874
i:  0.0948852053530928
i:  0.05852012269141438
i:  0.18771684999019722
i:  0.01769003965398185


In [6]:
from scipy.stats import pearsonr
# Same column-0-versus-rest correlations as the hand-written version, this
# time computed with SciPy so the two results can be compared.
loop_end = leaf_data.shape[1]
r_p = []
for col in range(1, loop_end):
    # pearsonr returns the coefficient together with a p-value; only the
    # coefficient is kept.
    coeff, _p_value = pearsonr(leaf_data[:, 0], leaf_data[:, col])
    r_p.append(coeff)
    print('i: ', coeff)

i:  -0.01514201654732638
i:  0.09141460302089632
i:  0.27521011784907823
i:  0.1412752472957704
i:  0.11184290470599252
i:  0.046678319203711065
i:  -0.04976661317047228
i:  -0.04002558235513153
i:  -0.017048050594139594
i:  0.10245321896478976
i:  0.07624645270769873
i:  0.09488520535309278
i:  0.05852012269141436
i:  0.18771684999019722
i:  0.017690039653981846


In [7]:
# Count how many of our coefficients agree with SciPy's to within 1e-6.
n_compared = loop_end - 1
count = sum(
    math.isclose(r_p[k], r_lcc[k], abs_tol=0.000001)
    for k in range(n_compared)
)

# Every pair must agree for the two implementations to be declared equal.
if count != n_compared:
    print("The correlation coefficients vs Scipy are NOT EQUAL.")
else:
    print("The correlation coefficients calculated by us and by Scipy are EQUAL.")

The correlation coefficients calculated by us and by Scipy are EQUAL.


In [12]:
# Print settings for the matrix below. Note that set_printoptions changes
# NumPy's global print options, so it also applies to arrays printed later.
np.set_printoptions(precision=17,suppress=True,edgeitems=16)

# WHY TRANSPOSE?
# np.corrcoef treats each ROW of its input as a variable and each COLUMN as a
# single observation of all those variables. leaf_data is laid out the other
# way round (one observation per row, one variable per column), so it is
# transposed first. Passing rowvar=False would be an equivalent alternative.
# Docs: https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html
corr_coeff_leaf_data = np.corrcoef(leaf_data.transpose())

# Reuse the matrix computed above for its shape instead of calling
# np.corrcoef a second time.
print("For sake of completeness, we can also compute the correlation"
      " matrix with Numpy as follows.\nThe matrix shape is: ",
      corr_coeff_leaf_data.shape)

print("\n\nThe matrix is:\n",corr_coeff_leaf_data)

For sake of completeness, we can also compute the correlation matrix with Numpy as follows.
The matrix shape is:  (16, 16)


The matrix is:
 [[ 1.                  -0.01514201654732637  0.09141460302089631
   0.27521011784907823  0.14127524729577037  0.11184290470599252
   0.04667831920371095 -0.04976661317047221 -0.04002558235513153
  -0.01704805059413954  0.1024532189647896   0.0762464527076986
   0.0948852053530928   0.05852012269141438  0.18771684999019725
   0.0176900396539819 ]
 [-0.01514201654732637  0.9999999999999999  -0.07677164528704074
  -0.02549034264831938 -0.02772203832821884 -0.07162331500178314
  -0.02521871872505347  0.00412931063626591  0.06501620767914641
   0.06232432212049924 -0.00972467242531314 -0.01266410681958376
   0.00462822148520764  0.01063207126162024 -0.04040705316444876
  -0.03354806139936364]
 [ 0.0914146030208963  -0.07677164528704074  1.
   0.5510688069767617   0.553561341598504    0.3735355974537756
   0.3863318587834139  -0.03608618694427429 -0.274

In [9]:
# sorted() builds a new list, leaving r_lcc in its original column order;
# reverse=True puts the largest coefficient first.
r_ord_lcc = sorted(r_lcc, reverse=True)
r_ord_lcc

[0.2752101178490783,
 0.18771684999019722,
 0.14127524729577043,
 0.11184290470599254,
 0.10245321896478979,
 0.0948852053530928,
 0.09141460302089634,
 0.07624645270769874,
 0.05852012269141438,
 0.04667831920371106,
 0.01769003965398185,
 -0.015142016547326376,
 -0.0170480505941396,
 -0.04002558235513152,
 -0.04976661317047229]

In [10]:
# np.argsort orders indices by ascending value, so the result is reversed to
# put the index of the largest coefficient first.
# NOTE(review): the ranking uses the signed coefficients, so negatively
# correlated columns always come last regardless of magnitude. If relevance
# is meant as strength of linear association, rank np.abs(r_lcc) instead.
max_indices = np.argsort(r_lcc)[::-1]
# r_lcc[k] belongs to data column k+1 (column 0 is the reference), hence the +1.
print("The features of interest are, in descending order of relevance, COLUMNS:",
      max_indices+1)

The features of interest are, in descending order of relevance, COLUMNS: [ 3 14  4  5 10 12  2 11 13  6 15  1  9  8  7]


# Variable Ranking by Mutual Information - Congressional Voting Records Dataset

In [27]:
# Read the Congressional Voting Records file. header=None stops pandas from
# using the first line as column names, so the columns get integer labels.
cv_df = pd.read_csv('../Data_USL/house-votes-84.data',header=None)

In [102]:
# Show the raw frame: column 0 holds the party label and columns 1-16 hold
# the votes, recorded as 'y', 'n' or '?'.
cv_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,republican,n,n,y,y,y,y,n,n,y,y,n,y,y,y,n,y
431,democrat,n,n,y,n,n,n,y,y,y,y,n,n,n,n,n,y
432,republican,n,?,n,y,y,y,n,n,n,n,y,y,y,y,n,y
433,republican,n,n,n,y,y,y,?,?,?,?,n,y,y,y,n,y


In [103]:
# Entries of column 16 that hold the '?' placeholder, selected in a single
# .loc call (row mask, column label).
cv_df.loc[cv_df[16] == '?', 16]

1      ?
9      ?
11     ?
12     ?
13     ?
      ..
389    ?
390    ?
393    ?
400    ?
425    ?
Name: 16, Length: 104, dtype: object

In [99]:
# Tabulate, for every column, how many entries are the '?' placeholder.
print("Number of missing values per column:")
print("Column \t\t Number of Missing Values")
for col in range(cv_df.shape[1]):
    # Summing the boolean mask counts the rows where the column equals '?'.
    n_missing = int((cv_df[col] == '?').sum())
    print(col,"\t\t\t", n_missing)

Number of missing values per column:
Column 		 Number of Missing Values
0 			 0
1 			 12
2 			 48
3 			 11
4 			 11
5 			 15
6 			 11
7 			 14
8 			 15
9 			 22
10 			 7
11 			 21
12 			 31
13 			 25
14 			 17
15 			 28
16 			 104


In [41]:
# Turn the '?' placeholder into NaN so that pandas' missing-data tools
# (e.g. dropna) recognise it. replace returns a new frame; cv_df is unchanged.
cv_df_mod = cv_df.replace('?',np.nan)

In [61]:
# Take column 2 and discard its missing (NaN) entries; the surviving rows
# keep their original index labels.
tmp = cv_df_mod[2].dropna()
tmp

0      y
1      y
2      y
3      y
4      y
      ..
429    n
430    n
431    n
433    n
434    y
Name: 2, Length: 387, dtype: object

In [64]:
# Count each vote value directly. Deriving y_cnt as `tmp.size - n_cnt` would
# make `total == orig_size` true by construction, so the comparison could
# never reveal an unexpected value in the column.
orig_size = tmp.size             # number of non-missing entries
n_cnt = tmp[tmp == 'n'].size     # 'n' votes
y_cnt = tmp[tmp == 'y'].size     # 'y' votes
total = y_cnt + n_cnt            # equals orig_size only if every entry is 'y' or 'n'

In [65]:
# Show the 'n' count, the 'y' count, their sum, and the number of
# non-missing entries in the column.
print(n_cnt,y_cnt,total,orig_size)

192 195 387 387
