## This is an example of how to perform bivariate analysis of quantitative data using covariance and the Pearson correlation coefficient, with Python, Pandas, NumPy and Matplotlib

* the formulas shown in this notebook have been taken from the following reference:<br>
FÁVERO, L. P.; BELFIORE, P. **Manual de Análise de Dados: Estatística e Machine Learning com Excel®, SPSS®, Stata®, R® e Python®**. 2ª edição, 1288 p. Brasil: ccGEN LTC, 2024.<br>
Available in Brazil at:<br>
https://www.amazon.com.br/Manual-An-C3-A1lise-Dados-Estat-C3-ADstica-Learning-dp-8595159920/dp/8595159920

In [1]:
# importing libs and setting default plot style
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("dark_background")
import pandas as pd
import numpy as np
from scipy import stats
from functools import reduce

![covariance-correlation-coefficient](covariance-correlation-coefficient.png)

In [2]:
# Case: check whether student grades show any association between each pair of disciplines
# loading the sample data from the .csv file
from pandas import option_context

disciplines_grades_dataframe = pd.read_csv("disciplines-grades.csv")

# printing a truncated preview (at most 10 rows) without changing the global display settings
with option_context("display.max_rows", 10):
    print(disciplines_grades_dataframe)

    Observation  Math  Physics  Literature
0             1  5.50      7.5         9.0
1             2  9.00      8.5         5.5
2             3  4.50      5.0         6.5
3             4  6.50      8.0         6.5
4             5  7.50      6.0         5.0
..          ...   ...      ...         ...
25           26  7.00      7.5         8.0
26           27  5.00      4.5         9.0
27           28  6.50      8.0         5.0
28           29  8.50      6.0         6.0
29           30  9.75      5.0         6.5

[30 rows x 4 columns]


In [3]:
# univariate statistics for each discipline - arithmetic mean of the grades

# computing the three means first...
math_mean = np.mean(disciplines_grades_dataframe["Math"])
physics_mean = np.mean(disciplines_grades_dataframe["Physics"])
literature_mean = np.mean(disciplines_grades_dataframe["Literature"])

# ...then reporting them in the same order
print(f"math_mean = {math_mean}")
print(f"physics_mean = {physics_mean}")
print(f"literature_mean = {literature_mean}")

math_mean = 6.775
physics_mean = 6.866666666666666
literature_mean = 6.566666666666666


In [4]:
# univariate statistics for each discipline - sample standard deviation
# (ddof=1 makes the divisor n - 1 instead of n)

# computing the three sample standard deviations first...
math_sample_standard_deviation = np.std(disciplines_grades_dataframe["Math"], ddof=1)
physics_sample_standard_deviation = np.std(disciplines_grades_dataframe["Physics"], ddof=1)
literature_sample_standard_deviation = np.std(disciplines_grades_dataframe["Literature"], ddof=1)

# ...then reporting them in the same order
print(f"math_sample_standard_deviation = {math_sample_standard_deviation}")
print(f"physics_sample_standard_deviation = {physics_sample_standard_deviation}")
print(f"literature_sample_standard_deviation = {literature_sample_standard_deviation}")


math_sample_standard_deviation = 2.0535398369812468
physics_sample_standard_deviation = 1.71671967332515
literature_sample_standard_deviation = 1.7157150581957192


In [5]:
# calculating bivariate statistics for each pair of disciplines - covariance

# number of observations (rows) in the sample; taken from the DataFrame length rather than from
# counting the non-zero "Observation" ids, which would under-count if an id were ever 0
my_n = len(disciplines_grades_dataframe)

def calculate_sample_covariance_of_two_disciplines(discipline_1_series, discipline_2_series, discipline_1_mean, discipline_2_mean, my_n):
    """Compute the sample covariance between the grades of two disciplines.

    Accumulates ``(x_i - mean_x) * (y_i - mean_y) / (my_n - 1)`` over the first
    ``my_n`` observations of both series.

    Parameters
    ----------
    discipline_1_series, discipline_2_series : sequence of numbers
        Grades of each discipline (pandas Series, NumPy array or list). Values
        are paired by position, so the index labels of a Series are irrelevant.
    discipline_1_mean, discipline_2_mean : number
        Mean of the respective series.
    my_n : int
        Number of observations to use. The divisor is ``my_n - 1``, so
        ``my_n == 1`` divides by zero; with ``my_n == 0`` the loop does not run
        and the initial accumulator value ``0`` is returned.

    Returns
    -------
    number
        The accumulated sample covariance.
    """
    # np.asarray gives plain positional access. Indexing a pandas Series with [i] looks up
    # the *label* i, which raises KeyError or pairs the wrong rows whenever the index is
    # not 0..n-1 (for instance after filtering or sorting the DataFrame).
    discipline_1_grades = np.asarray(discipline_1_series)
    discipline_2_grades = np.asarray(discipline_2_series)

    my_accumulator = 0
    for i in range(my_n):
        discipline_1_deviation = discipline_1_grades[i] - discipline_1_mean
        discipline_2_deviation = discipline_2_grades[i] - discipline_2_mean
        # each term is divided by (my_n - 1) before being accumulated, term by term
        my_accumulator += discipline_1_deviation * discipline_2_deviation / (my_n - 1)
    return my_accumulator

In [6]:
# sample covariance for each pair of disciplines, using the manual implementation

# Math x Physics
math_physics_sample_covariance = calculate_sample_covariance_of_two_disciplines(
    disciplines_grades_dataframe["Math"],
    disciplines_grades_dataframe["Physics"],
    math_mean,
    physics_mean,
    my_n,
)
print(f"math_physics_sample_covariance = {math_physics_sample_covariance}")

# Math x Literature
math_literature_sample_covariance = calculate_sample_covariance_of_two_disciplines(
    disciplines_grades_dataframe["Math"],
    disciplines_grades_dataframe["Literature"],
    math_mean,
    literature_mean,
    my_n,
)
print(f"math_literature_sample_covariance = {math_literature_sample_covariance}")

# Physics x Literature
physics_literature_sample_covariance = calculate_sample_covariance_of_two_disciplines(
    disciplines_grades_dataframe["Physics"],
    disciplines_grades_dataframe["Literature"],
    physics_mean,
    literature_mean,
    my_n,
)
print(f"physics_literature_sample_covariance = {physics_literature_sample_covariance}")

math_physics_sample_covariance = 2.124137931034483
math_literature_sample_covariance = -1.0879310344827589
physics_literature_sample_covariance = -0.8528735632183908


In [7]:
# bivariate statistics for each pair of disciplines - Pearson correlation coefficient:
# the sample covariance of the pair divided by the product of the two sample standard deviations

# Math x Physics
math_physics_sample_pearson_correlation_coefficient = math_physics_sample_covariance / (
    math_sample_standard_deviation * physics_sample_standard_deviation
)
print(f"math_physics_sample_pearson_correlation_coefficient = {math_physics_sample_pearson_correlation_coefficient}")

# Math x Literature
math_literature_sample_pearson_correlation_coefficient = math_literature_sample_covariance / (
    math_sample_standard_deviation * literature_sample_standard_deviation
)
print(f"math_literature_sample_pearson_correlation_coefficient = {math_literature_sample_pearson_correlation_coefficient}")

# Physics x Literature
physics_literature_sample_pearson_correlation_coefficient = physics_literature_sample_covariance / (
    physics_sample_standard_deviation * literature_sample_standard_deviation
)
print(f"physics_literature_sample_pearson_correlation_coefficient = {physics_literature_sample_pearson_correlation_coefficient}")


math_physics_sample_pearson_correlation_coefficient = 0.6025321124160095
math_literature_sample_pearson_correlation_coefficient = -0.30878277832686224
physics_literature_sample_pearson_correlation_coefficient = -0.28956102521926375


In [10]:
# INTERPRETATION (the correct interpretation depends not on descriptive measures alone, but also on probability distributions and hypothesis
# tests, which are explained later in other modules of this course, available in this same GitHub repository)

# At first glance, all disciplines "appear" to be correlated in terms of student grades when compared in pairs - no pair has a zero
# correlation! The only positive correlation is the one between math and physics (correlation=0.6025), which is quite high, while the
# correlations between math and literature (correlation=-0.3088) and between physics and literature (correlation=-0.2896) are negative
# and weaker. Students who are great in Math also tend to be great in Physics, but worse in Literature. Students who are great in Physics
# also tend to be great in Math, but worse in Literature. And, finally, students who are good in Literature tend to be worse in both Math
# and Physics. For comparing statistics, the Pearson correlation coefficient is better than the covariance because its value always lies
# between -1 and 1, while the covariance has no limits.

# Unfortunately, the interpretation above turns out to be partially wrong once we test the significance of these correlations.
# As shown in another module of this course, if we test the H0 and H1 hypotheses for each pair above - through a Student's t
# probability distribution test, considering a 5% level of significance - and calculate the t statistic for each Pearson correlation
# coefficient value above, we will see that only the first correlation - between math/physics - is significant, but not the other two -
# math/literature and physics/literature. You can verify that in the Hypothesis Test module, where we get back to this example in order
# to do just that. This means that we can extrapolate our sample results for the Pearson correlation coefficient to the population only
# for the math/physics pair, at a level of significance (alpha) of 5%. But we can't do the same for the other pairs,
# considering that same alpha of 5%. In other words, extrapolating to the general population, we can say that math and physics have a
# statistically significant correlation regarding student grades, but the math/literature and physics/literature correlations are NOT
# statistically significant at a significance level of 5%. For more, check out the Hypothesis Test module in this same GitHub repository.

# That being said, we should now keep in mind that it is not sufficient to calculate descriptive measures between variables, either
# qualitative or quantitative (discrete or continuous), in univariate or bivariate analysis. We must perform hypothesis tests, based on
# adequate probability distributions, and calculate other statistics, in order to quantify and compare the significance of such
# descriptive measures' results, either within our own sample or between our sample and the population in general (e.g. the whole
# population of interest for an event, or the whole historical data of some production line, etc.). We can't tell whether the differences
# we find through descriptive measures alone (numerically) are actually significant in reality without hypothesis tests based on the
# respective probability distributions of such events. We can't apply our sample results to the real world without analysing their significance.

In [9]:
# or, alternatively, compute the same statistics directly with NumPy's built-in functions:

In [10]:
# calculating covariance directly from two series

def calculate_sample_covariance_of_two_disciplines_2(discipline_1_series, discipline_2_series):
    """Return the sample covariance between two grade series using ``np.cov``.

    ``np.cov`` is called with ``ddof=1`` and yields a matrix for the two inputs;
    the entry at row 0, column 1 is the covariance between the first and the
    second series, and that single value is what gets returned.
    """
    covariance_matrix = np.cov(discipline_1_series, discipline_2_series, ddof=1)
    return covariance_matrix[0, 1]

# sample covariance for each pair of disciplines, this time through the np.cov()-based helper

# Math x Physics
math_physics_sample_covariance_2 = calculate_sample_covariance_of_two_disciplines_2(
    disciplines_grades_dataframe["Math"],
    disciplines_grades_dataframe["Physics"],
)
print(f"math_physics_sample_covariance_2 = {math_physics_sample_covariance_2}")

# Math x Literature
math_literature_sample_covariance_2 = calculate_sample_covariance_of_two_disciplines_2(
    disciplines_grades_dataframe["Math"],
    disciplines_grades_dataframe["Literature"],
)
print(f"math_literature_sample_covariance_2 = {math_literature_sample_covariance_2}")

# Physics x Literature
physics_literature_sample_covariance_2 = calculate_sample_covariance_of_two_disciplines_2(
    disciplines_grades_dataframe["Physics"],
    disciplines_grades_dataframe["Literature"],
)
print(f"physics_literature_sample_covariance_2 = {physics_literature_sample_covariance_2}")

# P.S.: the np.cov() function above returns a 2x2 matrix holding the covariances, as follows:
# np.cov()[0][0] = cov(a,a), i.e. the variance of the 'a' variable
# np.cov()[0][1] = cov(a,b), i.e. the covariance between the two variables
# np.cov()[1][0] = cov(b,a), which is equal to cov(a,b)
# np.cov()[1][1] = cov(b,b), i.e. the variance of the 'b' variable
# That's why in the example above we take the np.cov()[0][1] element of the returned matrix: it holds only the covariance between the
# a and b variables (disciplines).

math_physics_sample_covariance_2 = 2.124137931034482
math_literature_sample_covariance_2 = -1.087931034482759
physics_literature_sample_covariance_2 = -0.8528735632183907


In [11]:
# calculating pearson correlation coefficient directly from two series

def calculate_sample_pearson_correlation_coefficient_of_two_disciplines_2(discipline_1_series, discipline_2_series):
    """Return the Pearson correlation coefficient between two grade series using ``np.corrcoef``.

    ``np.corrcoef`` yields a matrix for the two inputs; the entry at row 0,
    column 1 is the correlation between the first and the second series, and
    that single value is what gets returned.
    """
    correlation_matrix = np.corrcoef(discipline_1_series, discipline_2_series)
    return correlation_matrix[0, 1]

# Pearson correlation coefficient for each pair of disciplines, through the np.corrcoef()-based helper

# Math x Physics
math_physics_sample_pearson_correlation_coefficient_2 = calculate_sample_pearson_correlation_coefficient_of_two_disciplines_2(
    disciplines_grades_dataframe["Math"],
    disciplines_grades_dataframe["Physics"],
)
print(f"math_physics_sample_pearson_correlation_coefficient_2 = {math_physics_sample_pearson_correlation_coefficient_2}")

# Math x Literature
math_literature_sample_pearson_correlation_coefficient_2 = calculate_sample_pearson_correlation_coefficient_of_two_disciplines_2(
    disciplines_grades_dataframe["Math"],
    disciplines_grades_dataframe["Literature"],
)
print(f"math_literature_sample_pearson_correlation_coefficient_2 = {math_literature_sample_pearson_correlation_coefficient_2}")

# Physics x Literature
physics_literature_sample_pearson_correlation_coefficient_2 = calculate_sample_pearson_correlation_coefficient_of_two_disciplines_2(
    disciplines_grades_dataframe["Physics"],
    disciplines_grades_dataframe["Literature"],
)
print(f"physics_literature_sample_pearson_correlation_coefficient_2 = {physics_literature_sample_pearson_correlation_coefficient_2}")

# P.S.: what was said about the 2x2 matrix returned by np.cov() also applies to np.corrcoef(); that's why we return only the [0][1]
# position of the resulting matrix, as we want the value of corrcoef(a,b)

# P.S.: even for sample Pearson correlation coefficients you don't have to set ddof to 1 in np.corrcoef(): the parameter is deprecated
# and has no effect, because the normalization factor cancels out in the correlation formula

math_physics_sample_pearson_correlation_coefficient_2 = 0.6025321124160095
math_literature_sample_pearson_correlation_coefficient_2 = -0.30878277832686235
physics_literature_sample_pearson_correlation_coefficient_2 = -0.28956102521926375
