# Gabriella Colletti
# Correlation Matrix From Scratch

We calculate the correlation coefficient using the Pearson coefficient formula: $r = \frac{1}{n-1} \sum_{i=1}^{n} \frac{(x_i - \mu_x)}{\sigma_x} \cdot \frac{(y_i - \mu_y)}{\sigma_y}$, where $n$ is the number of observations, $\mu$ is the feature mean, and $\sigma$ is the feature's sample standard deviation.

#### Import Data

In [1]:
from sklearn import datasets
import pandas as pd

# Load the wine dataset from scikit-learn; as_frame=True requests pandas objects.
wine_data = datasets.load_wine(as_frame=True)
# Work with the combined frame: 13 measurement columns plus `target`
# (see the head() preview below).
wine_df = pd.DataFrame(wine_data.frame)

In [2]:
# Preview the first rows to check the column names and value ranges.
wine_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


#### Get feature mean & standard deviation

In [3]:
# Column names to correlate: every column of wine_df, including `target`.
features = [*wine_df.columns]
# Per-column statistics, keyed by column name and filled in by the next cell.
feature_means, feature_std_dev = dict(), dict()

In [4]:
from math import sqrt
# Compute each column's mean and sample standard deviation by hand and cache
# them in feature_means / feature_std_dev for the correlation step.
n_rows = wine_df.shape[0]
for feature in features:
    # Iterate the Series directly: the row labels are not needed, and
    # Series.iteritems() no longer exists in pandas >= 2.0.
    values = wine_df[feature]
    #================ CALCULATE FEATURE MEANS: u =================#
    feature_mean = 0
    for value in values:
        feature_mean += value
    feature_mean = feature_mean / n_rows
    feature_means[feature] = feature_mean
    #== CALCULATE FEATURE STD DEV: sqrt(sum of (x_i-u)^2 / (n-1)) ==#
    std_dev = 0
    for value in values:
        std_dev += (value - feature_mean) ** 2
    # Divide by n-1 (sample standard deviation), matching the 1/(n-1)
    # factor used in the correlation formula.
    std_dev = sqrt(std_dev / (n_rows - 1))
    feature_std_dev[feature] = std_dev

### Calculate Correlation Coefficients

In [5]:
# Build the Pearson correlation matrix by hand:
#   r(X, Y) = 1/(n-1) * sum_i ((x_i - mean_X)/std_X) * ((y_i - mean_Y)/std_Y)
# using the means and standard deviations cached in feature_means /
# feature_std_dev.
#
# Each column is standardized once up front instead of walking the frame with
# iterrows() for every feature pair: iterrows() builds a new Series per row,
# and a column's standardized values do not depend on its partner feature.
standardized = {}
for feature in features:
    mean, dev = feature_means[feature], feature_std_dev[feature]
    # to_numpy() yields NumPy scalars, so the arithmetic matches the
    # NumPy-typed values the row-wise loop operated on.
    standardized[feature] = [
        (value - mean) / dev for value in wine_df[feature].to_numpy()
    ]

corr_matrix = []
n_minus_1 = wine_df.shape[0] - 1
for feature_X in features:
    z_X = standardized[feature_X]
    column = []
    for feature_Y in features:
        z_Y = standardized[feature_Y]
        # Sum the products of paired standardized values, in row order.
        coefficient = 0
        for zx, zy in zip(z_X, z_Y):
            coefficient += zx * zy
        coefficient = coefficient / n_minus_1
        column.append(coefficient)
    corr_matrix.append(column)

In [6]:
#======== SHOW THE HAND-COMPUTED MATRIX AS A PANDAS DATAFRAME ========#
# The first column labels each row; every later column holds the
# coefficients computed for the feature it is named after.
matrix = pd.DataFrame({'Feature': features})
for name, coefficients in zip(features, corr_matrix):
    matrix[name] = coefficients
display(matrix)

Unnamed: 0,Feature,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,alcohol,1.0,0.094397,0.211545,-0.310235,0.270798,0.289101,0.236815,-0.155929,0.136698,0.546364,-0.071747,0.072343,0.64372,-0.328222
1,malic_acid,0.094397,1.0,0.164045,0.2885,-0.054575,-0.335167,-0.411007,0.292977,-0.220746,0.248985,-0.561296,-0.36871,-0.192011,0.437776
2,ash,0.211545,0.164045,1.0,0.443367,0.286587,0.12898,0.115077,0.18623,0.009652,0.258887,-0.074667,0.003911,0.223626,-0.049643
3,alcalinity_of_ash,-0.310235,0.2885,0.443367,1.0,-0.083333,-0.321113,-0.35137,0.361922,-0.197327,0.018732,-0.273955,-0.276769,-0.440597,0.517859
4,magnesium,0.270798,-0.054575,0.286587,-0.083333,1.0,0.214401,0.195784,-0.256294,0.236441,0.19995,0.055398,0.066004,0.393351,-0.209179
5,total_phenols,0.289101,-0.335167,0.12898,-0.321113,0.214401,1.0,0.864564,-0.449935,0.612413,-0.055136,0.433681,0.699949,0.498115,-0.719163
6,flavanoids,0.236815,-0.411007,0.115077,-0.35137,0.195784,0.864564,1.0,-0.5379,0.652692,-0.172379,0.543479,0.787194,0.494193,-0.847498
7,nonflavanoid_phenols,-0.155929,0.292977,0.18623,0.361922,-0.256294,-0.449935,-0.5379,1.0,-0.365845,0.139057,-0.26264,-0.50327,-0.311385,0.489109
8,proanthocyanins,0.136698,-0.220746,0.009652,-0.197327,0.236441,0.612413,0.652692,-0.365845,1.0,-0.02525,0.295544,0.519067,0.330417,-0.49913
9,color_intensity,0.546364,0.248985,0.258887,0.018732,0.19995,-0.055136,-0.172379,0.139057,-0.02525,1.0,-0.521813,-0.428815,0.3161,0.265668


#### Verify Correctness:
As we can see, the from-scratch and built-in correlation matrices are identical, as desired.

In [7]:
# Reference result from pandas' built-in pairwise correlation, for comparison
# with the hand-computed matrix above.
wine_df.corr()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
alcohol,1.0,0.094397,0.211545,-0.310235,0.270798,0.289101,0.236815,-0.155929,0.136698,0.546364,-0.071747,0.072343,0.64372,-0.328222
malic_acid,0.094397,1.0,0.164045,0.2885,-0.054575,-0.335167,-0.411007,0.292977,-0.220746,0.248985,-0.561296,-0.36871,-0.192011,0.437776
ash,0.211545,0.164045,1.0,0.443367,0.286587,0.12898,0.115077,0.18623,0.009652,0.258887,-0.074667,0.003911,0.223626,-0.049643
alcalinity_of_ash,-0.310235,0.2885,0.443367,1.0,-0.083333,-0.321113,-0.35137,0.361922,-0.197327,0.018732,-0.273955,-0.276769,-0.440597,0.517859
magnesium,0.270798,-0.054575,0.286587,-0.083333,1.0,0.214401,0.195784,-0.256294,0.236441,0.19995,0.055398,0.066004,0.393351,-0.209179
total_phenols,0.289101,-0.335167,0.12898,-0.321113,0.214401,1.0,0.864564,-0.449935,0.612413,-0.055136,0.433681,0.699949,0.498115,-0.719163
flavanoids,0.236815,-0.411007,0.115077,-0.35137,0.195784,0.864564,1.0,-0.5379,0.652692,-0.172379,0.543479,0.787194,0.494193,-0.847498
nonflavanoid_phenols,-0.155929,0.292977,0.18623,0.361922,-0.256294,-0.449935,-0.5379,1.0,-0.365845,0.139057,-0.26264,-0.50327,-0.311385,0.489109
proanthocyanins,0.136698,-0.220746,0.009652,-0.197327,0.236441,0.612413,0.652692,-0.365845,1.0,-0.02525,0.295544,0.519067,0.330417,-0.49913
color_intensity,0.546364,0.248985,0.258887,0.018732,0.19995,-0.055136,-0.172379,0.139057,-0.02525,1.0,-0.521813,-0.428815,0.3161,0.265668


Note that the diagonal of the matrix is all 1s. A correlation coefficient of 1 indicates a perfect positive linear correlation. This makes sense on the diagonal because each feature is being compared with itself (e.g. Feature A vs. Feature A): every observation has the same value in both features, and their means and standard deviations are the same, so any feature is perfectly positively correlated with itself.
