Reference - Nipun Batra Sir - https://nipunbatra.github.io/ml-teaching/notebooks/dummy-variables-multi-colinearity.html

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np

In [None]:
# Toy data: x2 is an exact multiple of x1, so the two features are perfectly
# collinear.
x1 = np.arange(1, 4)
x2 = x1 * 2

y = np.array([4, 6, 8])

In [None]:
# Design matrix: a bias column of ones followed by the two feature columns.
all_ones = np.ones(len(x1))
X = np.vstack((all_ones, x1, x2)).T

In [None]:
X.shape

(3, 3)

In [None]:
X

array([[1., 1., 2.],
       [1., 2., 4.],
       [1., 3., 6.]])

In [None]:
def solve_normal_equation(X, y):
    """Solve least squares through the normal equation.

    Computes ``theta = (X^T X)^{-1} X^T y`` by explicitly inverting the
    Gram matrix ``X^T X``.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix, one row per sample.
    y : numpy.ndarray
        Target vector.

    Returns
    -------
    numpy.ndarray or None
        The coefficient vector, or ``None`` when ``np.linalg.inv`` reports
        that ``X^T X`` is singular (a message and the Gram matrix are printed
        in that case).
    """
    gram = X.T @ X
    try:
        gram_inverse = np.linalg.inv(gram)
    except np.linalg.LinAlgError:
        # Exactly collinear columns make the Gram matrix non-invertible.
        print('The matrix is singular')
        print("X.T @ X = \n", gram)
        return None
    return gram_inverse @ X.T @ y

### Assignment question: Use np.linalg.solve instead of inv. Why is this better?

In [None]:
solve_normal_equation(X, y)

The matrix is singular
X.T @ X = 
 [[ 3.  6. 12.]
 [ 6. 14. 28.]
 [12. 28. 56.]]


In [None]:
np.linalg.matrix_rank(X), np.linalg.matrix_rank(X.T @ X)

(2, 2)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit scikit-learn's linear regression on the same two collinear features
# (no explicit ones column: the estimator fits its own intercept).
data = np.vstack((x1, x2)).T
lr = LinearRegression()
lr.fit(data, y)

lr.coef_, lr.intercept_


# Assignment question: figure out why sklearn is able to solve the problem

(array([0.4, 0.8]), 2.0)

In [None]:
# Regularization
# Rebuild the 3x3 design matrix and add a tiny eps to its diagonal so that the
# columns are no longer exactly linearly dependent.
# NOTE(review): this perturbs X itself, not X.T @ X as a ridge penalty would —
# confirm that this is the intended illustration.

eps = 1e-5
X = np.array([all_ones, x1, x2]).T
X = np.eye(3)*eps + X  # only the three diagonal entries change
X

array([[1.00001, 1.     , 2.     ],
       [1.     , 2.00001, 4.     ],
       [1.     , 3.     , 6.00001]])

In [None]:
np.linalg.matrix_rank(X)

3

In [None]:
solve_normal_equation(X, y)

array([2.00023248, 1.19987743, 0.40001887])

In [None]:
# Drop variables
# Keep only the bias column and x1; the redundant x2 column is left out.
X = np.vstack((all_ones, x1)).T
print(X)

[[1. 1.]
 [1. 2.]
 [1. 3.]]


In [None]:
solve_normal_equation(X, y)

array([2., 2.])

In [None]:
# Dummy variables

## dataset
# Small synthetic data set. The global seed is fixed and the four columns are
# drawn in a fixed order, so every run produces the same table.
num_records = 12
np.random.seed(42)
windspeed = np.random.randint(0, 10, size=num_records)
vehicles = np.random.randint(100, 500, size=num_records)
direction = np.random.choice(['N', 'S', 'E', 'W'], size=num_records)
pollution = np.random.randint(0, 100, size=num_records)

df = pd.DataFrame(
    dict(
        windspeed=windspeed,
        vehicles=vehicles,
        direction=direction,
        pollution=pollution,
    )
)
df

Unnamed: 0,windspeed,vehicles,direction,pollution
0,6,251,S,79
1,3,230,N,14
2,7,249,W,61
3,4,408,N,61
4,6,357,N,46
5,9,443,E,61
6,2,393,E,50
7,6,485,E,54
8,7,291,S,63
9,4,376,W,2


In [None]:
def fit_data(df, X, y):
    """Fit a linear regression and return it as a human-readable equation.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. Its column names are only used as a fallback for
        labelling coefficients when ``X`` has no ``columns`` attribute
        (e.g. a plain NumPy array).
    X : array-like of shape (n_samples, n_features)
        Feature matrix passed to ``LinearRegression.fit``.
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    str or None
        A string of the form ``"y = <intercept> + <coef>*<feature> ..."`` with
        two decimals per number, or ``None`` if fitting or formatting raises
        (the exception message is printed instead).
    """
    try:
        lr = LinearRegression()
        lr.fit(X, y)
        # Label each coefficient with the column that was actually fitted.
        # Indexing df.columns positionally is only correct when X happens to
        # be the leading columns of df in the same order.
        feature_names = getattr(X, "columns", df.columns)
        rep = f"y = {lr.intercept_:0.2f}"
        for i, coef in enumerate(lr.coef_):
            rep += f" + {coef:0.2f}*{feature_names[i]}"
        return rep
    except Exception as e:
        # Deliberately broad: the notebook uses this to display the failure
        # (e.g. non-numeric columns) without aborting the run.
        print(e)
        return None


In [None]:
fit_data(df, df[df.columns[:-1]], df['pollution'])

could not convert string to float: 'S'


In [None]:
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

In [None]:
enc = OrdinalEncoder()

In [None]:
# New frame in which 'direction' is replaced by its ordinal code; the original
# df is left untouched.
direction_codes = enc.fit_transform(df[['direction']]).ravel()
df2 = df.assign(direction=direction_codes)
df2

Unnamed: 0,windspeed,vehicles,direction,pollution
0,6,251,2.0,79
1,3,230,1.0,14
2,7,249,3.0,61
3,4,408,1.0,61
4,6,357,1.0,46
5,9,443,0.0,61
6,2,393,0.0,50
7,6,485,0.0,54
8,7,291,2.0,63
9,4,376,3.0,2


In [None]:
fit_data(df2, df2[df2.columns[:-1]], df2['pollution'])

'y = 86.27 + 4.69*windspeed + -0.14*vehicles + -11.30*direction'

In [None]:
pd.Series({x: i for i, x in enumerate(enc.categories_[0])})

Unnamed: 0,0
E,0
N,1
S,2
W,3


In [None]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)

In [None]:
direction_ohe = ohe.fit_transform(df[['direction']])
direction_ohe

array([[0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [None]:
# Take the labels from the one-hot encoder that produced direction_ohe, so the
# names are guaranteed to follow its column order (and this cell does not
# depend on the separate ordinal encoder having been fitted).
col_names_ohe = [f"Is it {x}?" for x in ohe.categories_[0]]

In [None]:
direction_ohe_df = pd.DataFrame(direction_ohe, columns=col_names_ohe)
direction_ohe_df

Unnamed: 0,Is it E?,Is it N?,Is it S?,Is it W?
0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0
5,1.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0
9,0.0,0.0,0.0,1.0


In [None]:
# Confirm that we can write Is it W? as a linear combination of the other columns
# (all zeros below means "Is it W?" equals 1 minus the sum of the other three).
other_directions = direction_ohe_df[["Is it N?", "Is it S?", "Is it E?"]].sum(axis=1)
1 - other_directions - direction_ohe_df["Is it W?"]

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0


In [None]:
X = np.hstack([df[['windspeed', 'vehicles']].values, direction_ohe])

In [None]:
X

array([[  6., 251.,   0.,   0.,   1.,   0.],
       [  3., 230.,   0.,   1.,   0.,   0.],
       [  7., 249.,   0.,   0.,   0.,   1.],
       [  4., 408.,   0.,   1.,   0.,   0.],
       [  6., 357.,   0.,   1.,   0.,   0.],
       [  9., 443.,   1.,   0.,   0.,   0.],
       [  2., 393.,   1.,   0.,   0.,   0.],
       [  6., 485.,   1.,   0.,   0.,   0.],
       [  7., 291.,   0.,   0.,   1.,   0.],
       [  4., 376.,   0.,   0.,   0.,   1.],
       [  3., 260.,   0.,   0.,   0.,   1.],
       [  7., 413.,   0.,   0.,   0.,   1.]])

In [None]:
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])

In [None]:
X_aug

array([[  1.,   6., 251.,   0.,   0.,   1.,   0.],
       [  1.,   3., 230.,   0.,   1.,   0.,   0.],
       [  1.,   7., 249.,   0.,   0.,   0.,   1.],
       [  1.,   4., 408.,   0.,   1.,   0.,   0.],
       [  1.,   6., 357.,   0.,   1.,   0.,   0.],
       [  1.,   9., 443.,   1.,   0.,   0.,   0.],
       [  1.,   2., 393.,   1.,   0.,   0.,   0.],
       [  1.,   6., 485.,   1.,   0.,   0.,   0.],
       [  1.,   7., 291.,   0.,   0.,   1.,   0.],
       [  1.,   4., 376.,   0.,   0.,   0.,   1.],
       [  1.,   3., 260.,   0.,   0.,   0.,   1.],
       [  1.,   7., 413.,   0.,   0.,   0.,   1.]])

In [None]:
X_aug.shape

(12, 7)

In [None]:
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape

(6, 6, (7, 7))

In [None]:
np.linalg.det(X_aug.T @ X_aug)

0.0

In [None]:
pd.DataFrame(X_aug.T @ X_aug)

Unnamed: 0,0,1,2,3,4,5,6
0,12.0,64.0,4156.0,3.0,3.0,2.0,4.0
1,64.0,390.0,22608.0,17.0,13.0,13.0,21.0
2,4156.0,22608.0,1521964.0,1321.0,995.0,542.0,1298.0
3,3.0,17.0,1321.0,3.0,0.0,0.0,0.0
4,3.0,13.0,995.0,0.0,3.0,0.0,0.0
5,2.0,13.0,542.0,0.0,0.0,2.0,0.0
6,4.0,21.0,1298.0,0.0,0.0,0.0,4.0


In [None]:
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe.fit_transform(df[['direction']])

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [None]:
direction_ohe_n_1 = ohe.fit_transform(df[['direction']])
# Name the columns from the one-hot encoder itself rather than the unrelated
# ordinal encoder. With drop='first' the first category is the dropped
# baseline, so the remaining columns correspond to categories_[0][1:].
col_names_ohe_n_1 = [f"Is it {x}?" for x in ohe.categories_[0][1:]]
df_ohe_n_1 = pd.DataFrame(direction_ohe_n_1, columns=col_names_ohe_n_1)
df_ohe_n_1

Unnamed: 0,Is it N?,Is it S?,Is it W?
0,0.0,1.0,0.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,1.0,0.0
9,0.0,0.0,1.0


In [None]:
# Numeric features next to the reduced (n-1) dummy columns, then a leading
# column of ones for the intercept.
X = np.column_stack((df[['windspeed', 'vehicles']].to_numpy(), df_ohe_n_1.to_numpy()))
X_aug = np.column_stack((np.ones(len(X)), X))

X_aug

array([[  1.,   6., 251.,   0.,   1.,   0.],
       [  1.,   3., 230.,   1.,   0.,   0.],
       [  1.,   7., 249.,   0.,   0.,   1.],
       [  1.,   4., 408.,   1.,   0.,   0.],
       [  1.,   6., 357.,   1.,   0.,   0.],
       [  1.,   9., 443.,   0.,   0.,   0.],
       [  1.,   2., 393.,   0.,   0.,   0.],
       [  1.,   6., 485.,   0.,   0.,   0.],
       [  1.,   7., 291.,   0.,   1.,   0.],
       [  1.,   4., 376.,   0.,   0.,   1.],
       [  1.,   3., 260.,   0.,   0.,   1.],
       [  1.,   7., 413.,   0.,   0.,   1.]])

In [None]:
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape

(6, 6, (6, 6))

In [None]:
# Interpreting dummy variables

## dataset
# One categorical feature (two levels) and a numeric target.
X = np.array(list('FFFMM'))
y = np.array([5.0, 5.2, 5.4, 5.8, 6.0])

In [None]:
from sklearn.preprocessing import LabelBinarizer
l = LabelBinarizer()
l.fit_transform(X)

array([[0],
       [0],
       [0],
       [1],
       [1]])

In [None]:
X_binary = 1 - l.fit_transform(X)

In [None]:
X_binary

array([[1],
       [1],
       [1],
       [0],
       [0]])

In [None]:
lr = LinearRegression()
lr.fit(X_binary, y)

In [None]:
lr.coef_, lr.intercept_

(array([-0.7]), 5.8999999999999995)

In [None]:
y[(X_binary==0).flatten()].mean()

5.9

In [None]:
y[(X_binary==1).flatten()].mean()

5.2