In [3]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import time
import sys
from scipy.sparse import linalg
from pathlib import Path
# Make float64 the default dtype for floating-point tensors created from here on.
# (The dtype lives directly on the torch module; `torch.torch` is a redundant hop.)
torch.set_default_dtype(torch.float64)
# 0-dim tensor holding pi; created after the dtype switch, so it is float64.
pi = torch.tensor(np.pi)
import matplotlib.pyplot as plt 

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


### 1. Gradient descent for the finite neuron method: $L^2$-fitting


1.1 Neural network architecture 
    $$\sum_{i=1}^n a_i \sigma(\omega_i x + b_i)$$
 
- Access and modify parameters in a neural network 

1.2 Define a loss function
$$
\min_{\theta} E(\theta) := \int_{-\pi}^{\pi} \frac{1}{2} |f(x) - \sum_{i=1}^{n} a_i \sigma( x + b_i) |^2 dx, \quad \theta = \{a_i,b_i\}_{i=1}^n
$$
        
1.3 Create an optimizer in PyTorch and tune its parameters 

<br>

A simple one-hidden layer shallow neural network: 
$$\sum_{i=1}^n a_i \sigma(\omega_i x + b_i)$$
$$ x \rightarrow W_1x+b \rightarrow \sigma(W_1x +b) \rightarrow W_2\,\sigma(W_1x+b)$$


In [4]:
## 1.1 Neural network architecture 
class model(nn.Module):
    """Shallow network with one hidden ReLU layer: x -> fc2(relu(fc1(x))).

    ``fc1`` is an affine map (weight W1 and bias b); ``fc2`` is a linear map
    (weight W2) created without a bias term.
    """

    def __init__(self, input_size, hidden_size1, num_classes):
        """Create the two layers.

        Args:
            input_size: number of input features fed to ``fc1``.
            hidden_size1: number of hidden neurons (outputs of ``fc1``).
            num_classes: number of outputs produced by ``fc2``.
        """
        super().__init__()
        # Hidden layer: W1 x + b
        self.fc1 = nn.Linear(input_size, hidden_size1)
        # Output layer: W2 h, bias disabled
        self.fc2 = nn.Linear(hidden_size1, num_classes, bias=False)

    def forward(self, x):
        """Return fc2 applied to the ReLU-activated output of fc1."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)


In [5]:
## 1.2 Access and modify parameters in a neural network
# Plan for this part of the notebook:
#   - instantiate a `model` object called my_model
#   - access its parameters
#   - mutate (modify) its parameters
neuron_num = 4
my_model = model(input_size=1, hidden_size1=neuron_num, num_classes=1)

# my_model.parameters() iterates over the parameter tensors of the network.
# Each parameter carries two attributes of interest here:
#   .data           - the stored values
#   .requires_grad  - the gradient-tracking flag
for parameter in my_model.parameters():
    print(parameter.data, parameter.requires_grad, sep="\n")
print()


tensor([[-0.6466],
        [ 0.4610],
        [ 0.1265],
        [ 0.3587]])
True
tensor([0.1507, 0.1685, 0.1006, 0.8667])
True
tensor([[ 0.3107, -0.4210,  0.4471,  0.4535]])
True



In [6]:
# Parameters can also be reached directly through the layer attributes
# (fc1, fc2): each layer exposes its tensors as .weight and .bias.
print(
    my_model.fc1.weight,
    my_model.fc1.bias,
    my_model.fc1.weight.data,
    my_model.fc1.weight.requires_grad,
    sep="\n",
)
print()
print("+++++++++++++")

Parameter containing:
tensor([[-0.6466],
        [ 0.4610],
        [ 0.1265],
        [ 0.3587]], requires_grad=True)
Parameter containing:
tensor([0.1507, 0.1685, 0.1006, 0.8667], requires_grad=True)
tensor([[-0.6466],
        [ 0.4610],
        [ 0.1265],
        [ 0.3587]])
True

+++++++++++++


In [7]:
## 1.2 Access and modify parameters in a neural network
# Full walk-through on a freshly initialised network: create my_model,
# read its parameters, then overwrite them.
neuron_num = 4
my_model = model(input_size=1, hidden_size1=neuron_num, num_classes=1)

# (1) Iterate over my_model.parameters(); every parameter carries
#     .data (the stored values) and .requires_grad (gradient-tracking flag).
for parameter in my_model.parameters():
    print(parameter.data, parameter.requires_grad, sep="\n")
print()

# (2) Reach the same tensors through the layer attributes fc1 / fc2.
print(
    my_model.fc1.weight,
    my_model.fc1.bias,
    my_model.fc1.weight.data,
    my_model.fc1.weight.requires_grad,
    sep="\n",
)
print()
print("+++++++++++++")

# (3) Toggle gradient tracking on the fc1 parameters: switch it off by
#     assignment, then back on with requires_grad_() (which defaults to True).
for param in my_model.fc1.parameters():
    print(param)
    param.requires_grad = False
    param.requires_grad_()
print(my_model.fc1.weight, my_model.fc1.bias, sep="\n")

# (4) Overwrite a parameter. The replacement tensor must have the same shape
#     as the parameter; the values (.data) and the requires_grad flag are
#     set in two separate steps.
print("=========")
w_i = torch.full((neuron_num, 1), 1.0)  # all-ones column, same shape as fc1.weight
fc1_weight = my_model.fc1.weight
print(fc1_weight.data)
fc1_weight.data = w_i  # w_i is now the data
fc1_weight.requires_grad = False
print(fc1_weight)
print(fc1_weight.data)
print(fc1_weight.requires_grad)

tensor([[ 0.1143],
        [-0.8234],
        [-0.9260],
        [-0.9157]])
True
tensor([-0.2767,  0.7563, -0.9077,  0.3532])
True
tensor([[ 0.0165, -0.2589, -0.3392, -0.0975]])
True

Parameter containing:
tensor([[ 0.1143],
        [-0.8234],
        [-0.9260],
        [-0.9157]], requires_grad=True)
Parameter containing:
tensor([-0.2767,  0.7563, -0.9077,  0.3532], requires_grad=True)
tensor([[ 0.1143],
        [-0.8234],
        [-0.9260],
        [-0.9157]])
True

+++++++++++++
Parameter containing:
tensor([[ 0.1143],
        [-0.8234],
        [-0.9260],
        [-0.9157]], requires_grad=True)
Parameter containing:
tensor([-0.2767,  0.7563, -0.9077,  0.3532], requires_grad=True)
Parameter containing:
tensor([[ 0.1143],
        [-0.8234],
        [-0.9260],
        [-0.9157]], requires_grad=True)
Parameter containing:
tensor([-0.2767,  0.7563, -0.9077,  0.3532], requires_grad=True)
tensor([[ 0.1143],
        [-0.8234],
        [-0.9260],
        [-0.9157]])
Parameter containing:
t

#### It is easy to compute the nodal points of the relu shallow neural network. 
$$\sum_{i=1}^n a_i \sigma(\omega_i x + b_i)$$

The nodal points are $\{-\frac{b_i}{\omega_i} \}_{i=1}^n$. 

In [8]:
def compute_integration_nodes_relunn(a,b,model): 
    """Build the sorted integration mesh on [a, b] for a shallow ReLU network.

    The mesh contains the endpoints ``a`` and ``b`` together with the
    breakpoints ``-bias_i / weight_i`` of the hidden layer ``model.fc1`` that
    fall inside ``[a, b]``. When the layer has fewer than 100 neurons, 100
    equally spaced points on ``[a, b]`` are added as well.

    Args:
        a: left endpoint of the interval.
        b: right endpoint of the interval.
        model: object exposing ``fc1.weight`` and ``fc1.bias``. The weight is
            squeezed and divided element-wise into the bias, so a
            one-dimensional input layer is assumed.

    Returns:
        Tensor of shape (m, 1) holding the unique mesh nodes in ascending
        order, stored on the notebook-level ``device``.
    """
    weight = model.fc1.weight.detach().squeeze()
    bias = model.fc1.bias.detach()
    neuron_number = bias.size(0)

    # Layout: [breakpoints..., b, a]
    node = torch.empty(neuron_number + 2, device=device)
    node[:neuron_number] = -bias / weight
    node[-2] = b
    node[-1] = a

    # Keep only points inside [a, b]. Breakpoints coming from a zero weight
    # (inf / nan) fail the comparisons and are dropped here as well.
    node = node[(node >= a) & (node <= b)]

    if neuron_number < 100:
        # Create the refinement grid on the same device as `node`; a CPU grid
        # would make torch.cat fail when `device` is "cuda".
        refined_node = torch.linspace(a, b, 100, device=node.device)
        node = torch.cat([node, refined_node])

    node = torch.unique(node)
    node, _ = torch.sort(node)
    return node.view(-1, 1)

#### 1.3 Define a loss function using piecewise Gauss quadrature

1.3 Define a loss function
$$
	\min_{\theta} E(\theta) := \int_{-\pi}^{\pi} \frac{1}{2} |f(x) - \sum_{i=1}^{n} a_i \sigma( x + b_i) |^2 dx, \text{ where } \theta = \{a_i,b_i\}_{i=1}^n
$$

#### Standard p-point Gaussian quadrature rule on $[-1,1]$
\begin{equation}
    \int_{-1}^{1} f(x) dx \approx \sum_{i = 1}^p w_i f(g_i).
\end{equation}

e.g. 5 point Gaussian quadrature: 

$g_i, w_i$ (left, right):
\begin{aligned}
& 0, ~~~~ \frac{128}{225}\\
& \pm \frac{1}{3} \sqrt{5-2 \sqrt{\frac{10}{7}}}, ~ \frac{322+13 \sqrt{70}}{900} \\
& \pm \frac{1}{3} \sqrt{5+2 \sqrt{\frac{10}{7}}}, ~\frac{322-13 \sqrt{70}}{900}
\end{aligned}

On an arbitrary interval $[x_i, x_{i+1}]$: 
\begin{equation}
\begin{aligned}
        \int_{x_i}^{x_{i+1}} f(x) dx & = \frac{x_{i+1} - x_{i}}{2 } \int_{-1}^1 f(\frac{x_{i+1} - x_i}{2} \xi + \frac{x_{i+1} + x_i}{2}) d\xi \\
        & \approx \frac{x_{i+1} - x_{i}}{2 } \sum_{j =1}^p w_j f(\frac{x_{i+1} - x_i}{2}  g_j + \frac{x_{i+1} + x_i}{2}) 
\end{aligned}
\end{equation}

**Method**
- Divide the domain (interval) into several subdomains (subintervals). $[x_0,x_1], [x_1,x_2],...,[x_{N-1}, x_{N}]$
- Compute the quadrature value in each subdomain.
- Sum the quadrature values in each subdomain to get the quadrature value over the whole domain.

<br>

**Vectorization**: 

$$\int_{x_i}^{x_{i+1}} f(x) dx 
\approx \frac{x_{i+1} - x_{i}}{2 } \sum_{j =1}^p w_j f(\frac{x_{i+1} - x_i}{2}  g_j + \frac{x_{i+1} + x_i}{2}), \quad i= 0,...,N-1 $$

Sum over $i$. 

In [10]:
### 5 point Gauss Quadrature rule
def integrand(x):
    """Test integrand sin(10 x) + 1, evaluated element-wise on the tensor x."""
    oscillation = torch.sin(10 * x)
    return oscillation + 1

## The Gauss-Legendre nodes and weights on [-1, 1] are provided by NumPy.
## For reference, the hard-coded 5-point values would be:
# gx = torch.tensor([-0.9061798459386639927976, -0.5384693101056830910363, 0, 0.5384693101056830910363, 
#      0.9061798459386639927976]).to(device)
# gx = gx.view(1,-1)
# gw = torch.tensor([0.2369268850561890875143, 0.4786286704993664680413, 0.5688888888888888888889, 0.4786286704993664680413,
#      0.2369268850561890875143]).to(device)
# gw = gw.view(-1,1) 
order = 5 
x,w = np.polynomial.legendre.leggauss(order)
gx = torch.tensor(x).to(device).view(1,-1)  # quadrature nodes g_j as a row vector, 1 by p
gw = torch.tensor(w).to(device).view(-1,1)  # quadrature weights w_j as a column vector, p by 1

num_points = 12 # number of subintervals (the mesh has num_points + 1 nodes)
# The mesh is moved to `device` so it lives with gx / gw; leaving it on the CPU
# makes `coef1@gx` fail with a device mismatch when `device` is "cuda".
nodes = torch.linspace(-pi,pi,num_points+1).view(-1,1).to(device)
coef1 = ((nodes[1:,:] - nodes[:-1,:])/2) # half-lengths (x_{i+1} - x_i)/2, n by 1
coef2 = ((nodes[1:,:] + nodes[:-1,:])/2) # midpoints (x_{i+1} + x_i)/2, n by 1
# Broadcast the midpoints across the p quadrature nodes: n by p (-1 keeps dim 0 as is)
coef2_expand = coef2.expand(-1,gx.size(1))
# Row i holds the p quadrature points mapped into subinterval i
integration_points = coef1@gx + coef2_expand
integration_points = integration_points.flatten().view(-1,1) # column vector, n*p by 1
# Repeat the p weights once per subinterval so they line up with integration_points
gw_expand = torch.tile(gw,(num_points,1))
# Repeat each half-length p times so it lines up with func_values
coef1_expand = coef1.expand(coef1.size(0),gx.size(1))
coef1_expand = coef1_expand.flatten().view(-1,1)

func_values = integrand(integration_points)
# Weighted sum over all points; same value as torch.sum(func_values*gw_expand*coef1_expand)
integral_value = torch.matmul(func_values.T,gw_expand*coef1_expand)
print(integral_value)


tensor([[6.2832]])


##### 1.4 Choices of optimizers 

Suppose we have a loss function $E(\theta)$,

- Gradient descent: 
$$\theta^{k+1} = \theta^k - \eta_k \nabla_\theta E(\theta^k)$$

- Gradient descent with momentum 
$$v^{k+1} = \mu v^k + (1 - \mu)\nabla_\theta E(\theta^k), \qquad \theta^{k+1} = \theta^k - \eta\, v^{k+1}, \qquad v^0 = 0 $$

For this simple example, we can compute the gradient by hand. 

$$
\nabla_a E(\theta) = \int_{-\pi}^{\pi} \big( \sum_{i=1}^{n} a_i \sigma( x + b_i) -  f(x)\big) \begin{pmatrix}
\sigma( x + b_1)\\
\sigma( x + b_2)\\
\vdots\\
\sigma( x + b_n)
\end{pmatrix} dx
$$

$$
\nabla_b E(\theta) = \int_{-\pi}^{\pi} \big( \sum_{i=1}^{n} a_i \sigma( x + b_i) -  f(x)\big) \begin{pmatrix}
a_1\sigma'( x + b_1)\\
a_2\sigma'( x + b_2)\\
\vdots\\
a_n\sigma'( x + b_n)
\end{pmatrix} dx
$$




##### Create an optimizer in PyTorch and tune its parameters 
- import torch.optim as optim
- link: https://pytorch.org/docs/stable/optim.html, https://pytorch.org/docs/stable/generated/torch.optim.SGD.html
- Schedule learning rate

- Basic syntax:
        
        optimizer = optim.SGD(my_model.parameters(), lr=0.02) # Create an optimizer
        
        optimizer.zero_grad() # clear the gradient wrt parameters 
    
        output = model(input) 
    
        loss = loss_fn(output, target) # define the loss 
    
        loss.backward() # compute the gradients 
    
        optimizer.step() # gradient step: $\theta^{k+1} = \theta^k - \eta \nabla_\theta E(\theta^k)$
        

In [None]:
## 1.4 Create an optimizer in PyTorch and tune its parameters

# Base learning rate, used both to build the optimizer and by the schedule below
lr = 0.02

# Plain SGD over all parameters of my_model
# (alternative: optimizer = optim.Adam(my_model.parameters(), lr=0.02))
optimizer = optim.SGD(my_model.parameters(), lr=lr)

# Learning rate schedule
for epoch in range(20):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr * (0.98 ** ((epoch + 1) // 1000)) # Learning rate schedule
