In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn import linear_model
from scipy import stats
import pingouin as pg

In [2]:
# Load the raw spreadsheet. The file has no header row, so pandas labels the
# columns with consecutive integers.
df = pd.read_excel(io='ELE.xlsx', header=None)
df.head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,82.23,333.65,125.95,12231.27,1110.97,105069.63,263.66,77811.79,423.5,112743.19,823200.19,403.91,982.62,3955.22,19381.56,4324.97,125876.24
1,79.17,349.29,120.41,11977.18,1069.27,107210.46,261.29,69254.01,435.26,98112.25,705245.02,400.45,962.11,3946.92,17222.31,4262.06,111311.84
2,79.92,342.56,113.92,11712.42,997.68,108138.53,262.07,55266.32,422.36,84301.39,603368.98,392.59,944.62,3981.55,14876.55,4260.74,98425.38


In [3]:
# Pairwise scatter plots of every column against every other column.
fig = px.scatter_matrix(data_frame=df, width=1500, height=1500)
fig.show()

## 1. Analizar la correlación entre pares de variables

In [18]:
# Pearson correlation between every pair of columns, shaded with a diverging
# colour map. Styler.set_precision was deprecated in pandas 1.3 and removed in
# 2.0; an explicit format string yields the same two-decimal display on every
# pandas version.
corr = (
    df.corr(method='pearson')
    .style
    .background_gradient(cmap='coolwarm')
    .format("{:.2f}")
)
corr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1.0,0.43,0.5,-0.14,-0.28,-0.52,0.66,0.55,-0.33,0.5,0.46,-0.58,0.05,0.7,0.47,0.56,0.39
1,0.43,1.0,0.55,-0.25,-0.41,-0.41,0.43,0.02,-0.37,0.04,0.1,-0.42,0.2,0.41,0.1,0.32,-0.08
2,0.5,0.55,1.0,-0.33,-0.5,-0.41,0.57,0.05,-0.5,0.05,0.09,-0.66,0.18,0.51,0.15,0.41,-0.02
3,-0.14,-0.25,-0.33,1.0,0.86,0.37,0.08,0.5,0.72,0.56,0.7,0.39,-0.29,0.18,0.61,0.32,0.74
4,-0.28,-0.41,-0.5,0.86,1.0,0.68,-0.35,0.39,0.85,0.49,0.53,0.7,-0.05,-0.21,0.43,-0.07,0.61
5,-0.52,-0.41,-0.41,0.37,0.68,1.0,-0.74,-0.25,0.57,-0.13,-0.15,0.82,0.39,-0.67,-0.26,-0.59,-0.07
6,0.66,0.43,0.57,0.08,-0.35,-0.74,1.0,0.42,-0.28,0.32,0.46,-0.83,-0.34,0.96,0.48,0.92,0.37
7,0.55,0.02,0.05,0.5,0.39,-0.25,0.42,1.0,0.17,0.95,0.92,-0.19,-0.09,0.53,0.92,0.49,0.91
8,-0.33,-0.37,-0.5,0.72,0.85,0.57,-0.28,0.17,1.0,0.26,0.34,0.67,-0.26,-0.18,0.23,0.02,0.39
9,0.5,0.04,0.05,0.56,0.49,-0.13,0.32,0.95,0.26,1.0,0.95,-0.05,-0.06,0.43,0.95,0.39,0.93


## 2. Analizar la dependencia entre una variable y todas las demás. En este caso, encontrar las betas de la regresión múltiple de la variable que mejor se puede explicar a partir de las demás.

In [5]:
def matriz_P(x):
    """
    Build the centering matrix P = I - (1/m) * 1 1^T for data with m rows.

    P is square, symmetric and idempotent (P P = P). It has rank m - 1 and
    annihilates the constant vector (P 1 = 0), so multiplying data by P
    removes the column means.

    Parameters
    ----------
    x : array-like, 2-D
        Data with one observation per row; only the number of rows is used.

    Returns
    -------
    numpy.ndarray, shape (m, m)
        The centering matrix.
    """
    # Unpacking into two names enforces a 2-D input, as the shape must have
    # exactly two entries.
    filas, _ = np.array(x).shape
    matriz_promedio = float(1.0 / filas) * np.ones((filas, filas))
    return np.identity(filas) - matriz_promedio

def Matriz_covarianzasCorregida(x):
    """
    Unbiased sample covariance matrix computed as X^T P X / (m - 1).

    P is the centering matrix returned by ``matriz_P`` and m is the number of
    rows (observations). Note the divisor is m - 1, not m; the result is
    intended to agree with ``np.cov(x, rowvar=False)``.

    Parameters
    ----------
    x : array-like, 2-D
        Data with one observation per row and one variable per column.

    Returns
    -------
    numpy.ndarray, shape (n_columns, n_columns)
        The corrected covariance matrix.
    """
    datos = np.array(x)
    n_obs, _ = datos.shape
    # P @ datos subtracts each column's mean from that column.
    centrados = matriz_P(x) @ datos
    return float(1.0 / (n_obs - 1)) * (datos.T @ centrados)

In [6]:
# Element-wise product s_jj * s^jj of the covariance diagonal and the
# inverse-covariance diagonal; R^2_j = 1 - 1 / (s_jj * s^jj).
# The covariance matrix is computed once and reused. No rounding is applied
# here: the inverse-covariance entries of large-scale variables are far below
# 1e-4, so rounding them to four decimals would zero them out and make the
# R^2 values meaningless (-inf).
S_corregida = Matriz_covarianzasCorregida(df)
s_jj__jj = np.diag(S_corregida) * np.diag(np.linalg.inv(S_corregida))

### Variable con mayor correlación múltiple

In [7]:
# Squared multiple correlation of each variable with all the others,
# rounded to four decimals and stored as a list so .index() can be used.
R2 = list(np.round(1 - 1 / s_jj__jj, 4))
print(
    "La variable que mejor se explica a partir de las demas es la",
    R2.index(max(R2)),
)

La variable que mejor se explica a partir de las demas es la 4


#### Parámetros de la regresión lineal múltiple de la variable con el resto

In [8]:
# Multiple linear regression of variable 4 on every other variable.
# A trailing column of ones ('unos') provides the intercept, so the last
# entry of `betas` is the constant term.
X = df.drop(columns=[4]).assign(unos=1)
Y = df[4]
# Solve the least-squares problem directly instead of inverting X'X:
# forming and inverting the normal equations squares the condition number,
# which is risky here because the columns span several orders of magnitude.
betas = np.linalg.lstsq(X.to_numpy(dtype=float), Y.to_numpy(dtype=float), rcond=None)[0]
betas

array([-3.77894895e-01, -9.05118155e-02, -4.69805924e-01,  1.29356546e-02,
        1.35074945e-02, -1.99159248e+00, -1.25645662e-03,  5.42446745e-02,
       -2.31968624e-04,  1.19949076e-03, -1.63896511e-02, -4.66975704e-01,
       -3.71492772e-02, -1.69051634e-02,  8.03967165e-02,  6.38030201e-04,
       -1.84162569e+02])

## 3. Analizar la correlación entre pares de variables, pero eliminando el efecto de las demás variables.

In [9]:
def _residuals_after_controls(target, controls):
    """Residuals of an intercept-including linear fit of `target` on `controls`."""
    if controls.shape[1] == 0:
        # No control variables: the intercept-only fit is just the mean.
        return target - target.mean()
    model = linear_model.LinearRegression(fit_intercept=True).fit(controls, target)
    return target - model.predict(controls)


def calculate_partial_correlation(input_df):
    """
    Returns the sample linear partial correlation coefficients between pairs of
    variables, controlling for all other remaining variables.

    For each pair (i, j), both columns are regressed (with intercept) on all the
    remaining columns and the Pearson correlation of the two residual vectors is
    stored. The matrix is symmetric, so each pair is fitted only once and the
    value is mirrored.

    Parameters
    ----------
    input_df : pandas.DataFrame, shape (n, p)
        Data with one variable per column. Columns are addressed by position,
        so a DataFrame (not a bare array) is required.

    Returns
    -------
    P : pandas.DataFrame, shape (p, p)
        P.iloc[i, j] is the partial correlation of column i and column j
        controlling for all other columns; the diagonal is 1. Index and
        columns are the column labels of ``input_df``.
    """
    n_vars = input_df.shape[1]
    positions = np.arange(n_vars)
    # Start from the identity: every variable has partial correlation 1 with itself.
    partial_corr_matrix = np.eye(n_vars)
    for i in range(n_vars):
        data_column1 = input_df.iloc[:, i].values
        for j in range(i + 1, n_vars):
            data_column2 = input_df.iloc[:, j].values
            # Every column except the pair under study acts as a control.
            data_control_variable = input_df.iloc[:, np.delete(positions, [i, j])]
            residual1 = _residuals_after_controls(data_column1, data_control_variable)
            residual2 = _residuals_after_controls(data_column2, data_control_variable)
            coefficient = stats.pearsonr(residual1, residual2)[0]
            partial_corr_matrix[i, j] = coefficient
            partial_corr_matrix[j, i] = coefficient
    return pd.DataFrame(partial_corr_matrix, columns=input_df.columns, index=input_df.columns)

In [21]:
# Partial correlations from pingouin, rounded to two decimals.
partial_corr1 = df.pcorr().round(2)
# Same quantity from the regression-residual implementation, styled for display.
# Styler.set_precision was removed in pandas 2.0; an explicit format string
# gives the same two-decimal rendering on every pandas version.
partial_corr = (
    calculate_partial_correlation(df)
    .style
    .background_gradient(cmap='coolwarm')
    .format("{:.2f}")
)
partial_corr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1.0,-0.41,-0.64,-0.87,-0.06,0.6,0.49,0.02,-0.66,-0.2,0.21,0.85,-0.13,0.49,0.41,0.14,-0.24
1,-0.41,1.0,-0.2,-0.37,-0.16,0.23,-0.02,-0.22,-0.34,-0.12,0.38,0.56,0.02,0.37,0.08,0.1,-0.42
2,-0.64,-0.2,1.0,-0.7,-0.38,0.86,0.16,-0.26,-0.42,-0.33,0.34,0.5,-0.36,0.27,0.52,0.31,-0.15
3,-0.87,-0.37,-0.7,1.0,0.12,0.59,0.59,0.03,-0.68,-0.44,0.16,0.81,-0.2,0.58,0.59,-0.03,-0.27
4,-0.06,-0.16,-0.38,0.12,1.0,0.65,-0.55,-0.34,0.19,-0.04,0.76,-0.03,-0.53,-0.38,-0.33,0.56,0.24
5,0.6,0.23,0.86,0.59,0.65,1.0,0.03,0.19,0.34,0.34,-0.61,-0.43,0.66,-0.12,-0.28,-0.4,0.1
6,0.49,-0.02,0.16,0.59,-0.55,0.03,1.0,-0.26,0.49,0.08,0.34,-0.54,-0.24,-0.21,-0.3,0.21,-0.04
7,0.02,-0.22,-0.26,0.03,-0.34,0.19,-0.26,1.0,-0.06,0.37,0.32,-0.07,-0.07,-0.23,-0.18,0.4,0.09
8,-0.66,-0.34,-0.42,-0.68,0.19,0.34,0.49,-0.06,1.0,0.03,0.09,0.71,-0.09,0.23,0.26,0.3,-0.36
9,-0.2,-0.12,-0.33,-0.44,-0.04,0.34,0.08,0.37,0.03,1.0,0.26,0.23,-0.3,0.5,0.43,-0.47,0.03


## 4. Analizar el conjunto completo de todas las variables.

### Matriz de precisión: contiene la información sobre la relación multivariante entre cada una de las variables y el resto

In [20]:
# Precision matrix: inverse of the sample covariance matrix (columns = variables).
cov = np.cov(df, rowvar=False)
cov_inv = np.linalg.inv(cov)
# Styler.set_precision was removed in pandas 2.0; an explicit format string
# gives the same two-decimal rendering on every pandas version.
df_cov_inv = pd.DataFrame(cov_inv).style.format("{:.2f}")
df_cov_inv

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.44,0.02,0.05,0.01,0.0,-0.0,-0.12,-0.0,0.01,0.0,-0.0,-0.03,0.01,-0.0,-0.0,-0.0,0.0
1,0.02,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0
2,0.05,0.0,0.02,0.0,0.0,-0.0,-0.01,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0
3,0.01,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0
4,0.0,0.0,0.0,-0.0,0.01,-0.0,0.02,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0
5,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0
6,-0.12,0.0,-0.01,-0.0,0.02,-0.0,0.13,0.0,-0.01,-0.0,-0.0,0.01,0.01,0.0,0.0,-0.0,0.0
7,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0
8,0.01,0.0,0.0,0.0,-0.0,-0.0,-0.01,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0


## Variables que se pueden eliminar por tener una correlación |r| >= 0.7:

Pares con coeficiente alto tanto en corr como en partial_corr: 11-8 y 13-15.