In [13]:
import os
import numpy as np
import pandas
import time
#import random
import matplotlib
import matplotlib.pyplot as plt
#import scipy.stats
from sklearn.linear_model import LinearRegression

In [2]:
# Global matplotlib defaults: larger font and thicker lines for every figure.
# Kept in its own cell: the author found this to be necessary (reason not recorded).
params = {"font.size": 15, "lines.linewidth": 5}
plt.rcParams.update(params)

In [14]:
# Load the QSAR fish-toxicity table: semicolon-separated numeric values,
# six descriptor columns followed by the LC50 response column.
my_data = np.genfromtxt('qsar_fish_toxicity.csv', delimiter=';')

# genfromtxt silently converts missing or unparseable fields to NaN, which the
# closed-form and gradient-descent cells below would propagate without
# complaint, so validate the table before fitting anything.
assert my_data.ndim == 2 and my_data.shape[1] == 7, "expected a 2-D table with 7 columns"
assert not np.isnan(my_data).any(), "data contains NaN (missing or non-numeric fields)"

6 molecular descriptors and 1 quantitative experimental response:
1) CIC0
2) SM1_Dz(Z)
3) GATS1i
4) NdsCH
5) NdssC
6) MLOGP
7) quantitative response, LC50 [-LOG(mol/L)]

Only four of the six descriptors are used as features. The linear regression model is given by LC50 = $\alpha_1$CIC0 + $\alpha_2$SM1_Dz(Z) + $\alpha_3$GATS1i + $\alpha_4$MLOGP + $\beta$

## 1) sklearn

In [41]:
# Features are columns 0, 1, 2 and 5 (CIC0, SM1_Dz(Z), GATS1i, MLOGP);
# the response LC50 is column 6.
X = my_data[:, [0, 1, 2, 5]]
y = my_data[:, 6]

# Ordinary least squares with an intercept (sklearn's default).
reg = LinearRegression()
reg.fit(X, y)

# One print call, one item per line: slopes first, then the intercept.
print('alpha 1-4:', reg.coef_, 'beta:', reg.intercept_, sep='\n')

alpha 1-4:
[ 0.44750162  1.22068139 -0.77463965  0.38310065]
beta:
2.1943526381758227


## 2) explicit formula
\begin{equation}
\mathbf{y} = \mathbf{X}\mathbf{\alpha} + \mathbf{\epsilon}
\end{equation}

\begin{equation}
\mathbf{X} = \begin{bmatrix}
    1 & x_{11}       & x_{12} & x_{13} & x_{14} \\
    1 & x_{21}       & x_{22} & x_{23} & x_{24} \\
    ... & ... & ... & ... & ...\\
    1 & x_{n1}       & x_{n2} & x_{n3} & x_{n4}
\end{bmatrix}
\end{equation}

The first column of ones multiplies the intercept $\beta$; the remaining columns correspond to the 4 features, and the rows correspond to the $n$ observations.

\begin{equation}
\mathbf{\alpha} = \begin{bmatrix}
\beta \\
\alpha_1 \\
\alpha_2 \\
\alpha_3 \\
\alpha_4
\end{bmatrix}
\end{equation}

\begin{equation}
\mathbf{\epsilon} = \begin{bmatrix}
\epsilon_1 \\
\epsilon_2 \\
... \\
\epsilon_n
\end{bmatrix}
\end{equation}

The goal is to find the $\tilde{\alpha}$ that minimizes the sum of squared residuals (least squares): $\sum_{i=1}^n \epsilon_i^2 = \epsilon'\epsilon  = (\mathbf{y} - \mathbf{X}\mathbf{\alpha})'(\mathbf{y} - \mathbf{X}\mathbf{\alpha})$

By matrix calculus, $\frac{ \partial {(\mathbf{y} - \mathbf{X}\mathbf{\alpha})'(\mathbf{y} - \mathbf{X}\mathbf{\alpha})}}{\partial{\mathbf{\alpha}} } = 2(\mathbf{y} - \mathbf{X}\mathbf{\alpha})'(-\mathbf{X})$

Setting $\frac{ \partial {(\mathbf{y} - \mathbf{X}\mathbf{\alpha})'(\mathbf{y} - \mathbf{X}\mathbf{\alpha})}}{\partial{\mathbf{\alpha}} } = 0$, we have $(\mathbf{y} - \mathbf{X}\mathbf{\alpha})'\mathbf{X} = 0$

Transposing both sides, we have $\mathbf{X}'(\mathbf{y} - \mathbf{X}\mathbf{\alpha}) = 0$

The optimal $\tilde{\alpha}$ therefore satisfies the normal equations $\mathbf{X}'\mathbf{y} = \mathbf{X}'\mathbf{X}\tilde{\alpha}$

Therefore, provided $\mathbf{X}'\mathbf{X}$ is invertible, $\tilde{\alpha} = (\mathbf{X}'\mathbf{X})^{-1}\mathbf{X}'\mathbf{y}$




In [30]:
# Build the design matrix: a leading column of ones (so the intercept is
# estimated as the first coefficient) followed by the four feature columns.
X1 = np.ones((len(y), 1))
X = np.array(X)
X_new = np.concatenate((X1, X), axis=1)
X_new.shape

(908, 5)

In [37]:
# Turn the response into an explicit (n, 1) column vector for the matrix algebra below.
y = y.reshape((len(y), 1))
y.shape

(908, 1)

In [38]:
# Closed-form least squares, alpha = (X'X)^{-1} X'y, evaluated in two factors.
alpha_left = np.linalg.inv(X_new.T @ X_new)   # (X'X)^{-1}
alpha_right = X_new.T @ y                     # X'y
alpha = alpha_left @ alpha_right              # intercept first, then the four slopes

In [42]:
# Row 0 of alpha is the intercept (it multiplies the column of ones); rows 1-4 are the slopes.
print('alpha 1-4:', alpha[1:], 'beta', alpha[0], sep='\n')

alpha 1-4:
[[ 0.44750162]
 [ 1.22068139]
 [-0.77463965]
 [ 0.38310065]]
beta
[2.19435264]


## 3) gradient descent method


In [114]:
# Batch gradient descent on the mean squared error (1/n) * eps'eps, where
# eps = y - X_new @ alpha. Its gradient with respect to alpha is -(2/n) * eps'X_new.
# Iteration stops once the sum of absolute residuals changes by at most 1e-5
# between two consecutive steps.
y = np.reshape(y, (len(y), 1))

n_params = X_new.shape[1]          # intercept + one coefficient per feature column
delta = 0.02                       # step size (learning rate)
max_iter = 10**6                   # safety cap so a non-converging run cannot loop forever
alpha_0 = np.ones((n_params, 1))   # starting guess: all coefficients equal to 1
epsilon_0 = 1e10                   # placeholder "previous residual" for the first comparison
change = 1e10
n_iter = 0
while change > 1e-5:
    if n_iter >= max_iter:
        raise RuntimeError(
            'gradient descent did not converge within {} iterations'.format(max_iter)
        )
    epsilon_1 = np.subtract(y, np.matmul(X_new, alpha_0))
    gradient = -2/len(y)*np.matmul(epsilon_1.transpose(), X_new)
    change = abs(np.sum(abs(epsilon_1)) - np.sum(abs(epsilon_0)))
    # A diverging run overflows to inf and then NaN; `nan > 1e-5` is False, so
    # without this check the loop would exit quietly with NaN coefficients.
    if not np.isfinite(change):
        raise FloatingPointError(
            'gradient descent diverged (non-finite residuals); try a smaller delta'
        )
    # gradient is a (1, n_params) row; reshape it to a column to match alpha_0.
    alpha_0 = alpha_0 - delta*np.reshape(gradient, (n_params, 1))
    epsilon_0 = epsilon_1
    n_iter += 1

In [115]:
# Show the coefficients found by gradient descent: intercept first, then alpha 1-4.
alpha_0

array([[ 2.19377227],
       [ 0.4475734 ],
       [ 1.22079906],
       [-0.77444735],
       [ 0.38311658]])

In [116]:
# Row 0 of alpha_0 is the intercept (it multiplies the column of ones); rows 1-4 are the slopes.
print('alpha 1-4:', alpha_0[1:], 'beta', alpha_0[0], sep='\n')

alpha 1-4:
[[ 0.4475734 ]
 [ 1.22079906]
 [-0.77444735]
 [ 0.38311658]]
beta
[2.19377227]
