In [3]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import scipy as sp
import sympy as sy
sy.init_printing() 

In [4]:
np.set_printoptions(precision=3)
np.set_printoptions(suppress=True)

In [5]:
def round_expr(expr, num_digits):
    return expr.xreplace({n : round(n, num_digits) for n in expr.atoms(sy.Number)})

# <font face="gotham" color="purple"> Gram-Schmidt过程

<font face="gotham" color="red">Gram-Schmidt过程</font>是生成正交或正交归一基的算法。

## <font face="gotham" color="purple"> 在$\mathbb{R}^3$中的一个例子

$$\text { Let } W=\operatorname{Span}\left\{\mathbf{x}_{1}, \mathbf{x}_{2}, \mathbf{x}_{3}\right\}, \text { where } \mathbf{x}_{1}=\left[\begin{array}{l}
3 \\
6 \\
2
\end{array}\right] \text {, } \mathbf{x}_{2}=\left[\begin{array}{l}
1 \\
2 \\
4
\end{array}\right]\text {, and }\mathbf{x}_{3}=\left[\begin{array}{l}
2 \\
-2 \\
1
\end{array}\right]$$.

它们不是正交的，但是我们可以基于$\left\{\mathbf{x}_{1}, \mathbf{x}_{2}, \mathbf{x}_{3}\right\}$构建$W$的一个正交基$\{\mathbf{v}_1, \mathbf{v}_2, \mathbf{v}_3\}$。我们将可视化这个过程。

首先，我们绘制$W=\operatorname{Span}\left\{\mathbf{x}_{1}, \mathbf{x}_{2},\mathbf{x}_{3}\right\}$。

In [9]:
######################## Subspace W ##############################
s = np.linspace(-1, 1, 10)
t = np.linspace(-1, 1, 10)
S, T = np.meshgrid(s, t)

vec = np.array([[[0,0,0,3, 6, 2]],
             [[0,0,0,1, 2, 4]],
             [[0,0,0,2, -2, 1]]])

X = vec[0,:,3] * S + vec[1,:,3] * T
Y = vec[0,:,4] * S + vec[1,:,4] * T
Z = vec[0,:,5] * S + vec[1,:,5] * T

fig = plt.figure(figsize = (7, 7))
ax = fig.add_subplot(projection='3d')
ax.plot_wireframe(X, Y, Z, linewidth = 1.5, alpha = .3)

############################# x1 and x2 ##############################
colors = ['r','b','g']
s = ['$x_1$', '$x_2$', '$x_3$']
for i in range(vec.shape[0]):
    X,Y,Z,U,V,W = zip(*vec[i,:,:])
    ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False,
              color = colors[i], alpha = .6,arrow_length_ratio = .08, pivot = 'tail',
              linestyles = 'solid',linewidths = 3)
    ax.text(vec[i,:,3][0], vec[i,:,4][0], vec[i,:,5][0], s = s[i], size = 15)

ax.set_xlabel('x-axis')
ax.set_ylabel('y-axis')
ax.set_zlabel('z-axis')
plt.show()

<IPython.core.display.Javascript object>

如果我们选择$\mathbf{v}_1= \mathbf{x}_1$，那么$\mathbf{x}_2$投影到$\mathbf{v}_1$的正交分量就是$\mathbf{v}_2$。

定义$\text{Proj}_{\mathbf{v}_1}\mathbf{x}_2 = \alpha \mathbf{x}_1$，那么$(\mathbf{x}_2 - \alpha \mathbf{x}_1)\cdot \mathbf{x}_1 = 0$，重新排列以解出$\alpha$

$$
\alpha = \frac{\mathbf{x}_2^T\mathbf{x}_1}{\mathbf{x}_1^T\mathbf{x}_1}
$$

根据上述定义

$$
\text{Proj}_{\mathbf{v}_1}\mathbf{x}_2 = \alpha \mathbf{x}_1 = \frac{\mathbf{x}_2^T\mathbf{x}_1}{\mathbf{x}_1^T\mathbf{x}_1}\mathbf{x}_1
$$

正交分量$\mathbf{v}_2$是

$$
\mathbf{x}_2- \text{Proj}_{\mathbf{v}_1}\mathbf{x}_2 =\mathbf{x}_2 - \frac{\mathbf{x}_2^T\mathbf{x}_1}{\mathbf{x}_1^T\mathbf{x}_1}\mathbf{x}_1
$$

In [12]:
x2 = np.array([1, 2, 4])
v2 = x2 - (x2@x1)/(x1@x1)*x1;v2

array([-0.408, -0.816,  3.061])

In [13]:
%matplotlib notebook

######################## Subspace W ##############################

s = np.linspace(-1, 1, 10)
t = np.linspace(-1, 1, 10)
S, T = np.meshgrid(s, t)

x1,v1 = np.array([3, 6, 2]),np.array([3, 6, 2])
x2 = np.array([1, 2, 4])
x3 = np.array([2, -2, 1])

X = x1[0] * S + x2[0] * T
Y = x1[1] * S + x2[1] * T
Z = x1[2] * S + x2[2] * T

fig = plt.figure(figsize = (7, 7))
ax = fig.add_subplot(projection='3d')
ax.plot_wireframe(X, Y, Z, linewidth = 1.5, alpha = .3)

############################# x1, x2, v2, alpha*v1 ##############################

vec = np.array([[0, 0, 0, x1[0], x1[1], x1[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'red', alpha = .6,arrow_length_ratio = .08, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)

vec = np.array([[0, 0, 0, x2[0], x2[1], x2[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'blue', alpha = .6,arrow_length_ratio = .08, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)

vec = np.array([[0, 0, 0, x3[0], x3[1], x3[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'green', alpha = .6,arrow_length_ratio = .08, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)

vec = np.array([[0, 0, 0, v2[0], v2[1], v2[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'purple', alpha = .6,arrow_length_ratio = .08, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)

alpha = (x2@x1)/(x1@x1)

vec = np.array([[0, 0, 0, alpha*x1[0], alpha*x1[1], alpha*x1[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'blue', alpha = .6,arrow_length_ratio = .12, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)


ax.text(x1[0], x1[1], x1[2], '$\mathbf{x}_1 = \mathbf{v}_1 $', size = 15)
ax.text(x2[0], x2[1], x2[2], '$\mathbf{x}_2$', size = 15)
ax.text(x3[0], x3[1], x3[2], '$\mathbf{x}_3$', size = 15)
ax.text(v2[0], v2[1], v2[2], '$\mathbf{v}_2$', size = 15)

ax.text(x=alpha*x1[0], y=alpha*x1[1], z=alpha*x1[2], s = r'$\mathbf{\hat{x}}_2$', size = 15)

ax.set_xlabel('x-axis')
ax.set_ylabel('y-axis')
ax.set_zlabel('z-axis')

################################# Dashed Line ##################################

point1 = [alpha*x1[0], alpha*x1[1], alpha*x1[2]]
point2 = [x2[0], x2[1], x2[2]]
line1 = np.array([point1, point2])
ax.plot(line1[:,0], line1[:,1], line1[:, 2], c = 'b', lw = 3.5,alpha =0.5, ls = '--')

point1 = [v2[0], v2[1], v2[2]]
point2 = [x2[0], x2[1], x2[2]]
line1 = np.array([point1, point2])
ax.plot(line1[:,0], line1[:,1], line1[:, 2], c = 'b', lw = 3.5,alpha =0.5, ls = '--')
plt.show()

  ax.text(x1[0], x1[1], x1[2], '$\mathbf{x}_1 = \mathbf{v}_1 $', size = 15)
  ax.text(x2[0], x2[1], x2[2], '$\mathbf{x}_2$', size = 15)
  ax.text(x3[0], x3[1], x3[2], '$\mathbf{x}_3$', size = 15)
  ax.text(v2[0], v2[1], v2[2], '$\mathbf{v}_2$', size = 15)


<IPython.core.display.Javascript object>

下一步，我们找到$\mathbf{v}_3$，定义$W = \text{Span}\{\mathbf{v}_1, \mathbf{v}_2\}$

$$
\mathbf{x}_3- \text{Proj}_{W}\mathbf{x}_3 =\mathbf{x}_3 - \frac{\mathbf{x}_3^T\mathbf{v}_1}{\mathbf{v}_1^T\mathbf{v}_1}\mathbf{v}_1-\frac{\mathbf{x}_3^T\mathbf{v}_2}{\mathbf{v}_2^T\mathbf{v}_2}\mathbf{v}_2
$$

再次强调，代码冗长但非常直观。

In [14]:
x3 = np.array([2, -2, 1])
projW_x3 = (x3@v1)/(v1@v1)*v1 + (x3@v2)/(v2@v2)*v2
v3 = x3 - projW_x3; v3

array([ 2.4, -1.2, -0. ])

In [24]:
%matplotlib notebook

######################## Subspace W ##############################

s = np.linspace(-1, 1, 10)
t = np.linspace(-1, 1, 10)
S, T = np.meshgrid(s, t)

x1 = np.array([3, 6, 2])
x2 = np.array([1, 2, 4])
x3 = np.array([2, -2, 1])

X = x1[0] * S + x2[0] * T
Y = x1[1] * S + x2[1] * T
Z = x1[2] * S + x2[2] * T

fig = plt.figure(figsize = (9, 9))
ax = fig.add_subplot(projection='3d')
ax.plot_wireframe(X, Y, Z, linewidth = 1.5, alpha = .3)

############################# x1, x2, v2, alpha*v1 ##############################

vec = np.array([[0, 0, 0, x1[0], x1[1], x1[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'red', alpha = .6,arrow_length_ratio = .08, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)

vec = np.array([[0, 0, 0, x2[0], x2[1], x2[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'red', alpha = .6,arrow_length_ratio = .08, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)

vec = np.array([[0, 0, 0, x3[0], x3[1], x3[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'red', alpha = .6,arrow_length_ratio = .08, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)

vec = np.array([[0, 0, 0, v2[0], v2[1], v2[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'purple', alpha = .6,arrow_length_ratio = .08, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)

vec = np.array([[0, 0, 0, projW_x3[0], projW_x3[1], projW_x3[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'black', alpha = .6,arrow_length_ratio = .08, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)



alpha = (x2@x1)/(x1@x1)
projW_x2 = np.zeros(3)
projW_x2[0], projW_x2[1], projW_x2[2] = alpha*x1[0],alpha*x1[1], alpha*x1[2]

vec = np.array([[0, 0, 0, projW_x2[0], projW_x2[1], projW_x2[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'blue', alpha = .6,arrow_length_ratio = .12, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)

vec = np.array([[0, 0, 0, v3[0], v3[1], v3[2]]])
X, Y, Z, U, V, W = zip(*vec)
ax.quiver(X, Y, Z, U, V, W, length=1, normalize=False, color = 'purple', alpha = .6,arrow_length_ratio = .08, pivot = 'tail',
          linestyles = 'solid',linewidths = 3)


ax.text(x1[0], x1[1], x1[2], '$\mathbf{x}_1 = \mathbf{v}_1 $', size = 15)
ax.text(x2[0], x2[1], x2[2], '$\mathbf{x}_2$', size = 15)
ax.text(x3[0], x3[1], x3[2], '$\mathbf{x}_3$', size = 15)
ax.text(v2[0], v2[1], v2[2], '$\mathbf{v}_2$', size = 15)
ax.text(v3[0], v3[1], v3[2], '$\mathbf{v}_3$', size = 15)
ax.text(projW_x3[0], projW_x3[1], projW_x3[2], '$\hat{\mathbf{x}}_3$', size = 15)

ax.text(x=alpha*x1[0], y=alpha*x1[1], z=alpha*x1[2], s = r'$\mathbf{\hat{x}}_2$', size = 15)

ax.set_xlabel('x-axis')
ax.set_ylabel('y-axis')
ax.set_zlabel('z-axis')

################################# Dashed Line ##################################

point1 = [alpha*x1[0], alpha*x1[1], alpha*x1[2]]
point2 = [x2[0], x2[1], x2[2]]
line1 = np.array([point1, point2])
ax.plot(line1[:,0], line1[:,1], line1[:, 2], c = 'b', lw = 3.5,alpha =0.5, ls = '--')

point1 = [v2[0], v2[1], v2[2]]
point2 = [x2[0], x2[1], x2[2]]
line1 = np.array([point1, point2])
ax.plot(line1[:,0], line1[:,1], line1[:, 2], c = 'b', lw = 3.5,alpha =0.5, ls = '--')

point1 = [projW_x3[0], projW_x3[1], projW_x3[2]]
point2 = [x3[0], x3[1], x3[2]]
line1 = np.array([point1, point2])
ax.plot(line1[:,0], line1[:,1], line1[:, 2], c = 'b', lw = 3.5,alpha =0.5, ls = '--')


################################ Axes ######################################
ax.set_xlim3d(-5, 5)
ax.set_ylim3d(-5, 5)
ax.set_zlim3d(-5, 5)
plt.show()

  ax.text(x1[0], x1[1], x1[2], '$\mathbf{x}_1 = \mathbf{v}_1 $', size = 15)
  ax.text(x2[0], x2[1], x2[2], '$\mathbf{x}_2$', size = 15)
  ax.text(x3[0], x3[1], x3[2], '$\mathbf{x}_3$', size = 15)
  ax.text(v2[0], v2[1], v2[2], '$\mathbf{v}_2$', size = 15)
  ax.text(v3[0], v3[1], v3[2], '$\mathbf{v}_3$', size = 15)
  ax.text(projW_x3[0], projW_x3[1], projW_x3[2], '$\hat{\mathbf{x}}_3$', size = 15)


<IPython.core.display.Javascript object>

现在我们有了正交基$\{\mathbf{v}_1, \mathbf{v}_2, \mathbf{v}_3\}$，接下来我们可以对它们进行归一化。矩阵$U$的列是一组正交归一基。

In [16]:
v1 = x1
u1 = v1/sp.linalg.norm(v1)
u2 = v2/sp.linalg.norm(v2)
u3 = v3/sp.linalg.norm(v3)

U = np.vstack((u1, u2, u3)).T
U

array([[ 0.429, -0.128,  0.894],
       [ 0.857, -0.256, -0.447],
       [ 0.286,  0.958, -0.   ]])

In [17]:
U.T@U

array([[ 1., -0., -0.],
       [-0.,  1., -0.],
       [-0., -0.,  1.]])

我们也可以使用SymPy内置的算法```orthogonalize```或```GramSchmidt```进行Gram-Schmidt过程。

## <font face="gotham" color="purple"> SymPy中用于Gram-Schmidt过程的函数

我们需要将所有向量准备成如下形式：

$$
L = [\mathbf v_1,\ \mathbf v_2,\ ...,\ \mathbf v_n]
$$

其中$\mathbf v_i, i\in (1,2,...n)$是一个列向量。

In [25]:
L = [sy.Matrix([x1]).T, sy.Matrix([x2]).T, sy.Matrix([x3]).T]
ort = sy.GramSchmidt(L)
ort_norm = sy.GramSchmidt(L, orthonormal = True)

ValueError: operands could not be broadcast together with shapes (3,) (0,) 

In [20]:
ort

NameError: name 'ort' is not defined

In [21]:
ort_norm

NameError: name 'ort_norm' is not defined

测试它是否已经归一化。

In [22]:
for i in range(3):
    print(ort_norm[i].T*ort_norm[i])

NameError: name 'ort_norm' is not defined

任意两个向量也是垂直的，例如

In [26]:
ort_norm[0].T*ort_norm[1]

NameError: name 'ort_norm' is not defined

In [None]:
ort_norm[1].T*ort_norm[2]

类似地，我们可以使用```sy.matrices.MatrixSubspaces.orthogonalize()```。

In [None]:
sy.matrices.MatrixSubspaces.orthogonalize(sy.Matrix([x1]).T, sy.Matrix([x2]).T, sy.Matrix([x3]).T, normalize = True)

# <font face="gotham" color="purple">QR分解

QR分解也用于解决线性系统，并且在最小二乘解中非常常见。QR分解基于我们刚刚看到的Gram-Schmidt过程。

考虑两个矩阵

$$A=\left[\mathbf{a}_{1}, \ldots, \mathbf{a}_{n}\right]\quad\text{和}\quad Q=\left[\mathbf{u}_{1}, \ldots, \mathbf{u}_{n}\right]$$

其中$Q$是$A$的正交归一化。我们定义$R = Q^TA$：

$$R=\left[\begin{array}{cccc}
\mathbf{u}_{1}^T \mathbf{a}_{1}& \mathbf{u}_{1}^T \mathbf{a}_{2} &\mathbf{u}_{1}^T\mathbf{a}_{3} & \dots &\mathbf{u}_{1}^T\mathbf{a}_{n}\\
0 &\mathbf{u}_{2}^T\mathbf{a}_{2} & \mathbf{u}_{2}^T \mathbf{a}_{3}& \dots &\mathbf{u}_{2}^T\mathbf{a}_{n}\\
0 & 0 & \mathbf{u}_{3}^T \mathbf{a}_{3}& \dots& \mathbf{u}_{3}^T\mathbf{a}_{n}\\
\vdots & \vdots & \vdots & \ddots & \vdots\\
0& 0&0 & \dots &\mathbf{u}_{n}^T\mathbf{a}_{n}
\end{array}\right]$$

由于$Q$是一个正交归一矩阵，

$$
A = QR
$$

In [27]:
A = np.round(np.random.randn(10, 5)*100); A

array([[-141.,   -2.,    3.,  -21.,  196.],
       [ -93.,  131.,    2.,    9., -179.],
       [ 322.,  106.,  -24., -123.,  153.],
       [-111.,  -48.,  -33.,   70.,   95.],
       [ -64., -127.,  -98.,  -37.,   34.],
       [ -52.,   84.,  -23.,   90.,   21.],
       [ -39.,   28.,   12.,   74.,   23.],
       [-103.,  -79.,   56., -155.,   44.],
       [  73.,  -26.,  -50.,  -91.,  -87.],
       [  47., -126.,  161.,  -14.,    2.]])

In [28]:
Q, R = np.linalg.qr(A)

In [29]:
Q

array([[-0.341, -0.086,  0.038,  0.281,  0.63 ],
       [-0.225, -0.55 ,  0.146,  0.274, -0.42 ],
       [ 0.779, -0.181, -0.079,  0.265,  0.402],
       [-0.268,  0.105, -0.187, -0.235,  0.293],
       [-0.155,  0.431, -0.589,  0.057, -0.029],
       [-0.126, -0.348, -0.027, -0.212,  0.16 ],
       [-0.094, -0.13 ,  0.092, -0.235,  0.132],
       [-0.249,  0.226,  0.222,  0.721,  0.079],
       [ 0.177,  0.145, -0.284,  0.264, -0.358],
       [ 0.114,  0.501,  0.671, -0.165, -0.03 ]])

In [30]:
R

array([[ 413.525,   73.863,    1.149, -101.048,   30.9  ],
       [   0.   , -268.349,   49.744,  -85.682,   66.486],
       [   0.   ,    0.   ,  202.574,    5.111,  -31.212],
       [   0.   ,    0.   ,    0.   , -224.543,   24.559],
       [   0.   ,    0.   ,    0.   ,    0.   ,  327.984]])

尽量不直接使用SymPy中的$QR$分解，在这里我们将分数转换为浮点数，并使用```sy.N()```进行四舍五入。

In [31]:
round_expr(sy.N(sy.matrices.MatrixBase.QRdecomposition(sy.randMatrix(4, 3))[0]),4)

⎡0.7603  -0.5625  -0.2564⎤
⎢                        ⎥
⎢0.2448  0.6919   -0.3301⎥
⎢                        ⎥
⎢0.4381  0.2142   0.8713 ⎥
⎢                        ⎥
⎣0.4124  0.3987   -0.257 ⎦

In [32]:
round_expr(sy.N(sy.matrices.MatrixBase.QRdecomposition(sy.randMatrix(4, 3))[1]),4)

⎡100.6429  115.7856  102.2725⎤
⎢                            ⎥
⎢   0      32.5377    3.3599 ⎥
⎢                            ⎥
⎣   0         0      23.6444 ⎦

大多数情况下，我们使用内置的算法而不是编写自己的算法。

# <font face="gotham" color="purple"> The Least-Squares Problem

我们不会深入讨论这个话题，我的高级计量经济学笔记本上有关于它的完整内容。对于那些还没有学习线性回归或计量经济学的人来说，知道最小二乘解是找到一个坐标$\beta$，使其成为$\text{Col}X$的基，从而形成$\hat{y}$的线性组合。

$\hat{y}$是$y$投影到$\text{Col}X$上的正交投影，记为$\hat{y} = \text{proj}_{\text{Col}X}y$。

$y$和$\hat{y}$之间的距离在向量空间中是所有可能的$\|y - X\beta \|$中最短的，即

$$
\|y - X\hat{\beta}\| \leq \|y - X\beta \|
$$

$\text{Col}X$与$y$的正交投影的分量正交，因此

\begin{align}
X^T(y-X\hat{\beta})&=0\\
X^Ty &= X^TX\hat{\beta}\\
\hat{\beta} &= (X^TX)^{-1}X^Ty 
\end{align}