In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
import scipy.stats as stats
from sklearn import feature_selection

In [2]:
def gen_data(nobs=1000, a=.5, corr_factor=1, seed=None):
    """Simulate a linear model with two regressors that share a common component.

    Each regressor column is an independent standard-normal draw plus
    ``corr_factor`` times one standard-normal draw shared by both columns
    (``corr_factor=0`` therefore leaves the columns independent).  The
    response is ``y = 1 + a * (x[:, 0] + x[:, 1]) + e`` with standard-normal
    noise ``e``.

    Parameters
    ----------
    nobs : int
        Number of observations to simulate.
    a : float
        Common slope applied to the sum of the two regressors.
    corr_factor : float
        Weight of the shared component added to both regressors.
    seed : None, int or numpy.random.Generator, optional
        ``None`` (default) draws from NumPy's global ``np.random`` state in
        the same order as before, so ``np.random.seed`` keeps working.  Any
        other value is handed to ``np.random.default_rng`` to get a
        reproducible sample that does not touch global state.

    Returns
    -------
    y : ndarray, shape (nobs,)
    x : ndarray, shape (nobs, 2)
    nobs : int
        Echo of the ``nobs`` argument.
    """
    rng = np.random if seed is None else np.random.default_rng(seed)

    x = rng.normal(scale=1., size=(nobs, 2))
    shared = rng.normal(scale=1., size=(nobs, 1))
    # Broadcasting the (nobs, 1) column adds the same shared draw to both
    # regressors of a row.
    x = x + corr_factor * shared

    e = rng.normal(loc=0.0, scale=1.0, size=nobs)
    y = 1 + a * x.sum(axis=1) + e
    return y, x, nobs


def add_constant(x):
    """Prepend an intercept column of ones to a design matrix.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        Regressors with one row per observation.  A 1-D input is treated as
        a single regressor and promoted to a column.

    Returns
    -------
    ndarray, shape (nobs, k + 1)
        ``x`` with a leading column of ones.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        # A bare vector cannot be concatenated along axis=1; make it a column.
        x = x.reshape(-1, 1)
    nobs = x.shape[0]
    return np.concatenate((np.ones(shape=(nobs, 1)), x), axis=1)

def ols(y, x):
    """Fit ordinary least squares of ``y`` on ``x`` with an intercept.

    Parameters
    ----------
    y : ndarray
        Response, one entry per observation.
    x : ndarray
        Regressors without a constant column; ``add_constant`` is applied
        here.

    Returns
    -------
    ndarray
        Coefficient vector, intercept first.
    """
    design = add_constant(x)
    # Solve the least-squares problem directly instead of forming
    # inv(X'X) X'y: explicit inversion amplifies rounding error and fails
    # when the regressors are collinear, whereas lstsq returns the
    # minimum-norm solution in that case.
    beta, _, _, _ = np.linalg.lstsq(design, y, rcond=None)
    return beta

def predict(y, x):
    """Return the in-sample fitted values of an OLS regression of ``y`` on ``x``.

    The coefficients are estimated with ``ols`` on the same ``(y, x)`` that
    is then used for prediction; an intercept column is added via
    ``add_constant``.
    """
    design = add_constant(x)
    coef = ols(y, x)
    return design @ coef
    



In [3]:
def compare_mse(corr_factor=1, v=False):
    """Fit a full and a restricted OLS model on one freshly simulated sample.

    A sample is drawn with ``gen_data(corr_factor=corr_factor)``.  The full
    model uses both regressor columns, the restricted model only the first.

    Parameters
    ----------
    corr_factor : float
        Forwarded to ``gen_data``.
    v : bool
        When true, print the variance of each model's residuals (labelled
        ``mse``), full model first.

    Returns
    -------
    (y_hat1, y_hat)
        Fitted values of the restricted model, then of the full model.
    """
    y, x, nobs = gen_data(corr_factor=corr_factor)

    # Full model: both regressors.
    y_hat = predict(y, x)
    if v:
        print("mse:", np.var(y - y_hat))

    # Restricted model: first regressor only, kept as an (nobs, 1) column.
    first_regressor = x[:, 0].reshape(nobs, 1)
    y_hat1 = predict(y, first_regressor)
    if v:
        print("mse:", np.var(y - y_hat1))

    return y_hat1, y_hat

# Run the full-vs-restricted comparison twice: once with a shared component
# in the regressors (corr_factor=1) and once without (corr_factor=0), and
# report how closely the two sets of fitted values track each other.
for run_idx, corr_factor in enumerate((1, 0)):
    if run_idx:
        print('--')
    y_hat1, y_hat = compare_mse(corr_factor=corr_factor, v=True)
    nobs = y_hat1.shape[0]
    fitted_pair = (y_hat1, y_hat)
    print('cov', np.cov(fitted_pair))
    print('corr', np.corrcoef(fitted_pair))
    print('msi', feature_selection.mutual_info_regression(y_hat1.reshape(nobs, 1), y_hat))

mse: 1.0002860915064098
mse: 1.3764093182429125
cov [[1.19266353 1.19266353]
 [1.19266353 1.56916326]]
corr [[1.         0.87181614]
 [0.87181614 1.        ]]
msi [0.70481935]
--
mse: 1.0001223588186012
mse: 1.3081947864322288
cov [[0.26225417 0.26225417]
 [0.26225417 0.57063498]]
corr [[1.         0.67792556]
 [0.67792556 1.        ]]
msi [0.28758421]


In [5]:
def entropy(y, bins=25):
    """Histogram estimate of the differential entropy of ``y``, in bits.

    The sample is binned with ``np.histogram``; with ``p_i`` the share of
    observations in bin ``i`` and ``w_i`` that bin's width, the result is
    ``-sum_i p_i * log2(p_i / w_i)`` over the non-empty bins.

    Bin widths are taken from the edges ``np.histogram`` actually used, so
    constant samples and non-integer ``bins`` specifications (edge
    sequences, strategy strings) are handled.  Empty bins are skipped
    rather than padded with a small epsilon, so results can differ from an
    epsilon-padded estimate by a negligible amount.

    Parameters
    ----------
    y : array_like
        Sample values.
    bins : int, sequence or str
        Passed straight to ``np.histogram``.

    Returns
    -------
    float
        Estimated entropy in bits (may be negative, as the densities
        ``p_i / w_i`` can exceed one).

    Raises
    ------
    ValueError
        If ``y`` is empty or no observation falls inside the bins.
    """
    y = np.asarray(y, dtype=float)
    if y.size == 0:
        raise ValueError("entropy() requires at least one observation")

    counts, edges = np.histogram(y, bins=bins)
    total = counts.sum()
    if total == 0:
        raise ValueError("no observations fall inside the requested bins")

    widths = np.diff(edges)
    occupied = counts > 0  # empty bins contribute 0 and would hit log2(0)
    probs = counts[occupied] / total
    density = probs / widths[occupied]
    return -(probs * np.log2(density)).sum()

# For each correlation setting draw one sample, then print the histogram
# entropy of the restricted-model and full-model fitted values at two bin
# counts.  A '---' line separates consecutive pairs of numbers.
is_first_group = True
for corr_factor in (0, 1):
    y_hat1, y_hat = compare_mse(corr_factor=corr_factor)
    for n_bins in (10, 25):
        if not is_first_group:
            print('---')
        is_first_group = False
        print(entropy(y_hat1, bins=n_bins))
        print(entropy(y_hat, bins=n_bins))

0.975073672674982
1.503308092507954
---
0.941404222786503
1.4734511658814509
---
2.191904608451839
2.3863750009013445
---
2.1663477820242525
2.370048318813725
