------------------------
#### Multi collinearity
- effect of multi-collinearity on ML models
----------------------------

In [59]:
import pandas as pd
import numpy as np
np.set_printoptions(precision=3, suppress=True)

# for checking multi-collinearity, using eigen decomposition
from numpy.linalg import inv
import scipy 
import scipy.linalg as la

# for checking multi-collinearity, using VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# for checking multi-collinearity, using condition number
from numpy.linalg import cond

# import the ML algorithm
from sklearn.linear_model import LinearRegression

#### simulate a dataset

In [27]:
# Simulate a small regression dataset.
# The legacy global RNG is seeded so every run produces the same numbers.
np.random.seed(100)

# 10 observations x 4 integer features, each drawn from [1, 50)
X = np.random.randint(low=1, high=50, size=(10, 4))
# 10 continuous targets, drawn uniformly from [100, 200)
y = np.random.uniform(low=100, high=200, size=10)

In [3]:
X, y

(array([[ 9, 25,  4, 40],
        [24, 16, 49, 11],
        [31, 35,  3, 35],
        [15, 35, 49, 25],
        [16, 37, 44, 17],
        [10, 30, 23,  3],
        [28, 45,  5, 32],
        [ 2, 14, 20, 37],
        [ 5, 28,  4,  8],
        [48,  2, 15,  8]]),
 array([117.862, 153.257, 164.669, 114.207, 158.139, 147.919, 138.642,
        144.046, 140.476, 144.225]))

#### correlations

In [4]:
np.corrcoef(X, rowvar=False)

array([[ 1.   , -0.237, -0.069, -0.16 ],
       [-0.237,  1.   , -0.044,  0.267],
       [-0.069, -0.044,  1.   , -0.322],
       [-0.16 ,  0.267, -0.322,  1.   ]])

#### check for multi-collinearity

**using eigen method**

In [14]:
def check_mc_eigen(X):
    """Eigen-decompose the correlation matrix of the columns of ``X``.

    Each column of ``X`` is treated as one variable (``rowvar=False``).
    An eigenvalue at (or very near) zero signals that some columns are
    linearly dependent; the matching eigenvector shows which columns
    take part in that dependency.

    Returns
    -------
    (eigvals, eigvecs)
        ``eigvals`` holds the real parts of the eigenvalues, in the order
        produced by ``la.eig`` (they are not sorted). ``eigvecs`` is the
        eigenvector matrix returned by the same call.
    """
    correlation_matrix = np.corrcoef(X, rowvar=False)

    # la.eig reports eigenvalues as complex numbers; only the real part is kept
    spectrum, vectors = la.eig(correlation_matrix)

    return spectrum.real, vectors

In [15]:
check_mc_eigen(X)

(array([1.52 , 1.148, 0.741, 0.591]),
 array([[ 0.39 , -0.602,  0.661,  0.218],
        [-0.538,  0.336,  0.718, -0.286],
        [ 0.39 ,  0.685,  0.203,  0.581],
        [-0.638, -0.234, -0.077,  0.73 ]]))

**Inference :**

- If there is perfect multicollinearity (i.e., one variable can be expressed as a linear combination of others), the correlation (and covariance) matrix becomes singular, resulting in at least one eigenvalue equal to 0.
- Small but non-zero eigenvalues indicate the presence of multicollinearity to some extent.

There seems to be __no multi-collinearity__ here, as none of the eigenvalues is equal (or close) to 0.

**using VIF method**

In [26]:
# VIF = 1: No Multicollinearity
# A VIF value of 1 indicates no multicollinearity. 
# It implies that the variance of the estimated regression coefficient 
# for the corresponding predictor variable is not inflated due to correlations with 
# other predictors.

# VIF between 1 and 5: Moderate Multicollinearity
# VIF values between 1 and 5 are generally considered acceptable and suggest moderate 
# multicollinearity. While there may be some correlation among predictors, 
# it is not severe enough to cause significant issues in the interpretation of coefficients.

# VIF > 5: High Multicollinearity
# VIF values exceeding 5 are often indicative of high multicollinearity. 
# This suggests that the variance of the estimated regression coefficient is 
# significantly inflated, and interpretation of individual coefficients becomes challenging.

def check_mc_vif(X, with_constant=False):
    """Print the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : 2-D array-like
        Design matrix with one column per predictor.
    with_constant : bool, default False
        If False (the original behaviour), the VIFs are computed on ``X``
        exactly as passed in, i.e. without an intercept column. This is in
        line with the ``fit_intercept=False`` models used in this notebook,
        but the values can come out larger than the textbook (centred) VIF.
        If True, a constant column is prepended before the VIFs are
        computed, and the VIF of that constant is not reported.

    Returns
    -------
    None
        The VIFs are printed, one line per column of ``X``.
    """
    if with_constant:
        # has_constant='add' forces a new first column even if X already
        # contains a constant one, so the offset below is always valid
        design = add_constant(X, prepend=True, has_constant='add')
        offset = 1
    else:
        design = X
        offset = 0

    # Calculate VIF for each original variable (the added constant is skipped)
    vif_data = np.array(
        [variance_inflation_factor(design, i + offset) for i in range(X.shape[1])]
    )

    # Display the variables and their corresponding VIF values
    for i, vif_value in enumerate(vif_data):
        print(f"Variable {i}: VIF = {vif_value}")

In [28]:
check_mc_vif(X)

Variable 0: VIF = 2.0991502211171804
Variable 1: VIF = 4.544095986572352
Variable 2: VIF = 2.0295209337525284
Variable 3: VIF = 3.4730813003534053


**using condition number**

In [32]:
# Interpreting the condition number:

# A condition number close to 1 indicates little to no multicollinearity.
# As the condition number increases, the severity of multicollinearity also increases.
# A common rule of thumb is that a condition number above 30 may indicate moderate to 
# high multicollinearity, and above 100 suggests a severe issue.

# Calculate the condition number
condition_number = cond(X)
print(f"Condition Number: {condition_number}")

Condition Number: 4.473138245285738


#### compute the beta coefficients

$$
\beta = (X^\top X)^{-1} (X^\top y)
$$

In [33]:
# ordinary least squares via the normal equations: beta = (X'X)^-1 (X'y)
part1 = inv(X.T @ X)   # (X'X)^-1
part2 = X.T @ y        # X'y

beta_coeffs = part1 @ part2
beta_coeffs

array([1.977, 1.848, 1.018, 1.145])

In [34]:
# instantiate
linreg = LinearRegression(fit_intercept=False)

# fit the model to the training data (learn the coefficients)
linreg.fit(X, y)

# print the coefficients
print(linreg.intercept_)
print(linreg.coef_)

0.0
[1.977 1.848 1.018 1.145]


#### Scenario 1 
- 2 cols collinear
- add 1 additional col, similar to an existing one

In [35]:
# original columns
df = pd.DataFrame(X, columns=['c0', 'c1', 'c2', 'c3'])

# white noise (for the new similar column)
noise = np.random.randn(10)

df['c4'] = 2 * df['c1'] + .5 * noise  + 3

X_new = df.values

In [38]:
check_mc_eigen(X_new)

(array([2.244, 1.231, 0.923, 0.602, 0.   ]),
 array([[-0.274,  0.192, -0.885, -0.323, -0.004],
        [ 0.635, -0.168, -0.251,  0.058, -0.709],
        [-0.113, -0.772,  0.093, -0.619, -0.002],
        [ 0.33 ,  0.556,  0.278, -0.711,  0.007],
        [ 0.632, -0.176, -0.26 ,  0.062,  0.706]]))

In [39]:
check_mc_vif(X_new)

Variable 0: VIF = 3.8440103506103886
Variable 1: VIF = 5613.533089201047
Variable 2: VIF = 2.9128241789954257
Variable 3: VIF = 3.6082865443128345
Variable 4: VIF = 5893.212562059288


In [40]:
# Calculate the condition number
condition_number = cond(X_new)
print(f"Condition Number: {condition_number}")

Condition Number: 216.3459353186385


**Observation**
1. the last eigenvalue (e4) is approximately 0
2. look at the eigenvector that belongs to e4 (the last column) and find its coordinates that are clearly non-zero
    - in this case c1 = [-0.709] and c4 = [ 0.706], so c1 and c4 are collinear

In [41]:
# calculate the coefficients
part1       = inv(np.dot(X_new.T, X_new))
part2       = np.dot(X_new.T, y)

beta_coeffs = np.dot(part1, part2)
beta_coeffs

array([ -0.149, -92.862,  -0.219,   0.605,  46.355])

In [42]:
# instantiate
linreg = LinearRegression(fit_intercept=False)

# fit the model to the training data (learn the coefficients)
linreg.fit(X_new, y)

# print the coefficients
print(linreg.intercept_)
print(linreg.coef_)

0.0
[ -0.149 -92.862  -0.219   0.605  46.355]


With NO multi-collinearity the coefficients were: [1.977 1.848 1.018 1.145]

1. the magnitudes of the coefficients of the collinear columns (c1 and c4) have blown up 
2. even the signs have changed in some cases

#### Scenario 2 
- 3 cols collinear
- add 1 additional col, a (noisy) linear combination of two existing ones (c0 and c3)

In [43]:
df = pd.DataFrame(X, columns=['c0', 'c1', 'c2', 'c3'])

# white noise
noise = np.random.randn(10)

df['c4'] = 2 * df['c0'] + df['c3'] + .5 * noise 

X_new = df.values

In [44]:
check_mc_eigen(X_new)

(array([1.998, 0.   , 1.512, 0.859, 0.631]),
 array([[ 0.646, -0.656, -0.26 , -0.208, -0.201],
        [-0.175, -0.   ,  0.5  , -0.684, -0.502],
        [-0.236,  0.002, -0.466, -0.669,  0.529],
        [ 0.144, -0.321,  0.678, -0.014,  0.645],
        [ 0.689,  0.683,  0.07 , -0.204,  0.108]]))

In [45]:
check_mc_vif(X_new)

Variable 0: VIF = 12018.28726658413
Variable 1: VIF = 4.549986975786913
Variable 2: VIF = 2.110439837242003
Variable 3: VIF = 3566.340020854799
Variable 4: VIF = 24030.544844701042


In [46]:
# Calculate the condition number
condition_number = cond(X_new)
print(f"Condition Number: {condition_number}")

Condition Number: 457.4689400464041


In [47]:
# calculate the coefficients
part1       = inv(np.dot(X_new.T, X_new))
part2       = np.dot(X_new.T, y)

beta_coeffs = np.dot(part1, part2)
beta_coeffs

array([-2.357,  1.845,  1.027, -1.008,  2.159])

In [48]:
# instantiate
linreg = LinearRegression(fit_intercept=False)

# fit the model to the training data (learn the coefficients)
linreg.fit(X_new, y)

# print the coefficients
print(linreg.intercept_)
print(linreg.coef_)

0.0
[-2.357  1.845  1.027 -1.008  2.159]


[  0.451 -71.213   0.352   0.141  35.854]

#### Scenario 3 
- 2 cols perfectly collinear
- add 1 additional col, factor of an existing one

In [53]:
# start again from the four original columns
df = pd.DataFrame(X, columns=['c0', 'c1', 'c2', 'c3'])

# white noise -- drawn here but not used below (c4 has no noise term in this scenario)
noise = np.random.randn(10)

# c4 is exactly 2 * c0, i.e. perfectly collinear with c0
df['c4'] = df['c0'] * 2 

# design matrix with the extra column, as a NumPy array
X_new = df.values

In [54]:
check_mc_eigen(X_new)

(array([2.177, 0.   , 1.364, 0.861, 0.598]),
 array([[-0.647,  0.707,  0.194, -0.202, -0.05 ],
        [ 0.317, -0.   ,  0.289, -0.826,  0.366],
        [-0.004, -0.   , -0.666, -0.486, -0.566],
        [ 0.249,  0.   ,  0.63 , -0.009, -0.736],
        [-0.647, -0.707,  0.194, -0.202, -0.05 ]]))

In [51]:
check_mc_vif(X_new)

Variable 0: VIF = inf
Variable 1: VIF = 4.544095986572352
Variable 2: VIF = 2.0295209337525275
Variable 3: VIF = 3.4730813003534053
Variable 4: VIF = inf


  vif = 1. / (1. - r_squared_i)


In [52]:
# Calculate the condition number
condition_number = cond(X_new)
print(f"Condition Number: {condition_number}")

Condition Number: 2.5392895405253064e+16


In [58]:
# calculate the coefficients
# X'X has no inverse here because c4 is an exact multiple of c0. The error is
# the point of this cell, so it is caught and displayed rather than left
# unhandled, which keeps "Restart & Run All" working.
try:
    part1       = inv(np.dot(X_new.T, X_new))
except np.linalg.LinAlgError as err:
    # do not leave the previous scenario's coefficients in beta_coeffs
    beta_coeffs = None
    print(f"LinAlgError: {err}")
else:
    part2       = np.dot(X_new.T, y)
    beta_coeffs = np.dot(part1, part2)

beta_coeffs

LinAlgError: Singular matrix

To address perfect multicollinearity, consider the following strategies:

- Remove Redundant Variables:

    - Identify and remove one or more variables that are linearly dependent on others.
- Combine Variables:

    - If possible, combine highly correlated variables into a single variable.
- Collect More Data:

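    - Increasing the sample size may help mitigate multicollinearity that is an artefact of a small sample; it cannot help when one column is an exact function of others (as with c4 = 2 * c0 above).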