# Math 110A Homework 4
### Francisco Banuelos

### 1)



In [18]:
import numpy as np
import matplotlib.pyplot as plt



In [19]:
a,b = 1,100    # parameters for Rosenbrock function
f = lambda x,y: (a-x)**2+b*(y-x**2)**2
Df = lambda x,y: np.array([2*(x-a)-4*b*x*(y-x**2),
                           2*b*(y-x**2)])



In [20]:
def approx_convergence_rate(path, minimizer, numToAvg=100, showPlot=False):
    '''Given a path defined by an iteration and a known minimizer, approximates convergence rate'''
    err = np.linalg.norm(path-np.array(minimizer),axis=1) # ||x_k-x*||=e_k
    
    # if converged in very few steps, return infinite order
    if len(err)<=3:
        return np.inf
    
    pp = np.zeros(len(err)-3)
    for i in range(len(err)-3):
        pp[i] = np.log(err[i+2]/err[i+1])/np.log(err[i+1]/err[i])
    
    if numToAvg>len(pp):
        # if not enough iterations to average, just average all
        p=np.mean(pp)
    else:
        # return mean of last few iterations
        p=np.mean(pp[-numToAvg:])
        
    # plot
    if showPlot:
        plt.plot(pp)
        plt.plot(pp*0+p)
        plt.xlabel('k')
        plt.ylabel('p')
        plt.title(f'p={p}')
        plt.show()
        
    return p

In [21]:
def PR_beta(Dfk,Dfk1):
    return ((Dfk1-Dfk)@Dfk1) / (Dfk@Dfk)

In [22]:
def FR_beta(Dfk,Dfk1):
    return (Dfk1@Dfk1) / (Dfk@Dfk)

Exact line search means finding the value of $\alpha$. As for the direction of descent, we choose the negative of the gradient at the inital point. The whole procedure is summarized with the finding the values of $\alpha$ such that 

$$\min_{\alpha_{k}>0}\phi(\alpha)= \min_{\alpha>0}f(\vec{x_{k}}+\alpha \vec{p_{k}})$$

where $\vec{p_{k}}=-\nabla f_{k}$. The initial point is considered at $\vec{x_{0}}=(1.2,1.2)$ and $\vec{x_{0}}=(-1.2,1)$

To find the optimal value, solve for the critical points. 

It is found that for the initial point at (1.2,1.2) $\alpha=0.0122$,$\alpha=0.0236$, and $\alpha=0.00076$. These are the roots of:
$$\cssId{diff-var-order-mathjax}{\tfrac{\mathrm{d}}{\mathrm{d}{\alpha}}}\left[{100\left(1.2-200{\alpha}\left(1.2-1.2^2\right)-\left(1.2+400{\cdot}1.2{\alpha}\left(1.2-1.2^2\right)+2{\alpha}\left(1-1.2\right)\right)^2\right)^2+\left(1-1.2-400{\alpha}{\cdot}1.2\left(1.2-1.2^2\right)-2{\alpha}\left(1-1.2\right)\right)^2}\right]=0$$


The point corresponding the minimum is the last value of $\alpha$. Hence, this value is used. This is verified using the code below.

In [23]:
from scipy.optimize import minimize_scalar
x,y=1.2,1.2 # initial point
dx=Df(x,y)
pk=-dx

phi=lambda c:f(x + c*pk[0], y + c*pk[1]) # phi function as given above
res = minimize_scalar(phi) # find optimal value
res.x

0.0007626486503493179

In [24]:
x,y = 1.2,1.2
path_PR = [[x,y]]
tol = 1e-8            # stop when gradient is smaller than this amount
max_steps = 10000     # Maximum number of steps to run the iteration
i=0                   # iteration count
dx = Df(x,y)          # current gradient
pk = -dx
while np.linalg.norm(dx)>tol and i<max_steps:    
    
    alpha = 0.00076 # alpha from exact line search
    

    xnew,ynew = x + alpha*pk[0], y + alpha*pk[1]
    dx1 = Df(xnew,ynew)      # Df_{k+1}
    bk = PR_beta(dx,dx1)     # beta_k
    
    pk = -dx1 + bk*pk
    
    path_PR.append([xnew,ynew])
    x,y = xnew,ynew
    dx = dx1
    i += 1

path_PR=np.array(path_PR)
print(f'After {i} iterations, approximate minimum is {f(x,y)} at {x,y}')
R4=approx_convergence_rate(path_PR, [1,1], numToAvg=100, showPlot=False)
err=np.linalg.norm(np.diff(path_PR,axis=0),axis=1)
L=err[-1]/err[-2] # Approximate limit L of the ratio of the erros
print(f' The approximate convergence rate is {R4}')
print(f' The approximate limit L is {L}')

After 10000 iterations, approximate minimum is 4.1011671962380484e-05 at (1.0063989536600613, 1.0128643615776578)
 The approximate convergence rate is 1.0000023316842594
 The approximate limit L is 0.9997011881242253


In [25]:
x,y = 1.2,1.2
path_FR = [[x,y]]
tol = 1e-8            # stop when gradient is smaller than this amount
max_steps = 10000     # Maximum number of steps to run the iteration
i=0                   # iteration count
dx = Df(x,y)          # current gradient
pk = -dx
while np.linalg.norm(dx)>tol and i<max_steps:    
    
    alpha = 0.00076 # alpha from exact line search

    xnew,ynew = x + alpha*pk[0], y + alpha*pk[1]
    dx1 = Df(xnew,ynew)      # Df_{k+1}
    bk = FR_beta(dx,dx1)  # beta_k
    
    pk = -dx1 + bk*pk
    
    path_FR.append([xnew,ynew])
    x,y = xnew,ynew
    dx = dx1
    i += 1

path_FR=np.array(path_FR)
print(f'After {i} iterations, approximate minimum is {f(x,y)} at {x,y}')
R5=approx_convergence_rate(path_FR, [1,1], numToAvg=100, showPlot=False)
err=np.linalg.norm(np.diff(path_FR,axis=0),axis=1)
L=err[-1]/err[-2] # Approximate limit L of the ratio of errors
print(f' The approximate convergence rate is {R5}')
print(f' Approximate limit is {L}')

After 964 iterations, approximate minimum is 1.2462183404613065e-16 at (1.0000000111542098, 1.000000022353745)
 The approximate convergence rate is 1.0000043372524952
 Approximate limit is 0.9788417693199295


It is seen that both converge successfully to the minimizer and their approximate convergence rate is $1$. Furthermore, since $L<1$, this confirms that both converge linearly but Fletcher-Reeves is faster since it only took $964$ iterations.


Now, we restart every 3 iterations. That means setting $\beta_{k}=0$.

In [26]:
x,y = 1.2,1.2
path_PR = [[x,y]]
tol = 1e-8            # stop when gradient is smaller than this amount
max_steps = 10000     # Maximum number of steps to run the iteration
i=0                   # iteration count
dx = Df(x,y)          # current gradient
pk = -dx
while np.linalg.norm(dx)>tol and i<max_steps:    
    
    alpha = 0.00076 # alpha from exact line search
    

    xnew,ynew = x + alpha*pk[0], y + alpha*pk[1]
    dx1 = Df(xnew,ynew)      # Df_{k+1}
    bk = PR_beta(dx,dx1)     # beta_k
    if i%3 == 0:     # restart every 3 iterations
        bk=0
    pk = -dx1 + bk*pk
    
    path_PR.append([xnew,ynew])
    x,y = xnew,ynew
    dx = dx1
    i += 1

path_PR=np.array(path_PR)
print(f'After {i} iterations, approximate minimum is {f(x,y)} at {x,y}')
R=approx_convergence_rate(path_PR, [1,1], numToAvg=100, showPlot=False)
err=np.linalg.norm(np.diff(path_PR,axis=0),axis=1)
L=err[-1]/err[-2] # Limit L of the erros
print(f' The approximate convergence rate is {R}')
print(f' The approximate limit L is {L} ')

After 10000 iterations, approximate minimum is 4.09856790792875e-05 at (1.0063969255224006, 1.0128602712970445)
 The approximate convergence rate is 0.999999372600341
 The approximate limit L is 0.9997013653917546 


In [27]:
x,y = 1.2,1.2
path_FR = [[x,y]]
tol = 1e-8            # stop when gradient is smaller than this amount
max_steps = 10000     # Maximum number of steps to run the iteration
i=0                   # iteration count
dx = Df(x,y)          # current gradient
pk = -dx
while np.linalg.norm(dx)>tol and i<max_steps:    
    
    alpha = 0.00076

    xnew,ynew = x + alpha*pk[0], y + alpha*pk[1]
    dx1 = Df(xnew,ynew)      # Df_{k+1}
    bk = FR_beta(dx,dx1)  # beta_k
    if i%3 == 0:     # restarting every 3 iterations
        bk=0
    pk = -dx1 + bk*pk
    
    path_FR.append([xnew,ynew])
    x,y = xnew,ynew
    dx = dx1
    i += 1

path_FR=np.array(path_FR)
print(f'After {i} iterations, approximate minimum is {f(x,y)} at {x,y}')
R1=approx_convergence_rate(path_FR, [1,1], numToAvg=100, showPlot=False)
err=np.linalg.norm(np.diff(path_FR,axis=0),axis=1)
L=err[-1]/err[-2]
print(f' The approximate convergence rate is {R1}')
print(f' The approximate limit L is {L} ')

After 10000 iterations, approximate minimum is 1.5135521143124254e-07 at (1.000388570769531, 1.000779210776803)
 The approximate convergence rate is 1.2644368369249688
 The approximate limit L is 1.553674087780001 


The convergence is nearly linear for the PR method. For Fletcher-Reeves, the convergence is super linear which is an improvement from Fletcher-Reeves without restarting.

The second point,at $(-1.2,1)$, has 3 critical points:$\alpha=0.0122$, $\alpha=0.0650$, and $\alpha=0.00079$. These are the roots of:

$$\cssId{diff-var-order-mathjax}{\tfrac{\mathrm{d}}{\mathrm{d}{\alpha}}}\left[{100\left(1-200{\alpha}\left(1-1.2^2\right)-\left(-1.2-400{\cdot}1.2{\alpha}\left(1-1.2^2\right)+2{\alpha}\left(1+1.2\right)\right)^2\right)^2+\left(1+1.2+400{\alpha}{\cdot}1.2\left(1-1.2^2\right)-2{\alpha}\left(1+1.2\right)\right)^2}\right]=0$$

The value of $\alpha$ satisfying the minimization is $\alpha=0.00079$. This is verified in the code below.




In [28]:
from scipy.optimize import minimize_scalar
x,y=-1.2,1 # initial value
dx=Df(x,y)
pk=-dx

phi=lambda c:f(x + c*pk[0], y + c*pk[1]) # phi function to be minimized
res = minimize_scalar(phi) # minimize
res.x

0.0007880024518644726

In [29]:
x,y = -1.2,1 # initial point
path_PR = [[x,y]]
tol = 1e-8            # stop when gradient is smaller than this amount
max_steps = 10000     # Maximum number of steps to run the iteration
i=0                   # iteration count
dx = Df(x,y)          # current gradient
pk = -dx
while np.linalg.norm(dx)>tol and i<max_steps:    
    
    alpha = 0.00079 # alpha from exact line search
    

    xnew,ynew = x + alpha*pk[0], y + alpha*pk[1]
    dx1 = Df(xnew,ynew)      # Df_{k+1}
    bk = PR_beta(dx,dx1)     # beta_k
    
    pk = -dx1 + bk*pk
    
    path_PR.append([xnew,ynew])
    x,y = xnew,ynew
    dx = dx1
    i += 1

path_PR=np.array(path_PR)
print(f'After {i} iterations, approximate minimum is {f(x,y)} at {x,y}')
R2=approx_convergence_rate(path_PR, [1,1], numToAvg=100, showPlot=False)
err=np.linalg.norm(np.diff(path_PR,axis=0),axis=1)
L=err[-1]/err[-2]
print(f' The approximate convergence rate is {R2}')
print(f' The approximate limit L is {L}')

After 10000 iterations, approximate minimum is 0.0003254246072593479 at (0.9819752124736729, 0.9642024041355456)
 The approximate convergence rate is 0.9999928043011022
 The approximate limit L is 0.999670612325752


In [30]:
x,y = -1.2,1 # initial point
path_FR = [[x,y]]
tol = 1e-8            # stop when gradient is smaller than this amount
max_steps = 10000     # Maximum number of steps to run the iteration
i=0                   # iteration count
dx = Df(x,y)          # current gradient
pk = -dx
while np.linalg.norm(dx)>tol and i<max_steps:    
    
    alpha = 0.00079

    xnew,ynew = x + alpha*pk[0], y + alpha*pk[1]
    dx1 = Df(xnew,ynew)      # Df_{k+1}
    bk = FR_beta(dx,dx1)  # beta_k
   
    pk = -dx1 + bk*pk
    
    path_FR.append([xnew,ynew])
    x,y = xnew,ynew
    dx = dx1
    i += 1

path_FR=np.array(path_FR)
print(f'After {i} iterations, approximate minimum is {f(x,y)} at {x,y}')
R3=approx_convergence_rate(path_FR, [1,1], numToAvg=100, showPlot=False)
err=np.linalg.norm(np.diff(path_FR,axis=0),axis=1)
L=err[-1]/err[-2]
print(f' The approximate convergence rate is {R3}')
print(f' The approximate limit L is {L}')

After 37 iterations, approximate minimum is nan at (nan, nan)
 The approximate convergence rate is inf
 The approximate limit L is nan


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  app.launch_new_instance()
  s = (x.conj() * x).real


It is seen that the PR method is very close to being linear or even sub-linear. The Fletcher-Reeves method diverges from this point.

In [31]:
x,y = -1.2,1 # initial point
path_PR = [[x,y]]
tol = 1e-8            # stop when gradient is smaller than this amount
max_steps = 10000     # Maximum number of steps to run the iteration
i=0                   # iteration count
dx = Df(x,y)          # current gradient
pk = -dx
while np.linalg.norm(dx)>tol and i<max_steps:    
    
    alpha = 0.00079 # alpha from exact line search
    

    xnew,ynew = x + alpha*pk[0], y + alpha*pk[1]
    dx1 = Df(xnew,ynew)      # Df_{k+1}
    bk = PR_beta(dx,dx1)     # beta_k
    if i%3 == 0:     # restarting every 3 iterations
        bk=0
    pk = -dx1 + bk*pk
    
    path_PR.append([xnew,ynew])
    x,y = xnew,ynew
    dx = dx1
    i += 1

path_PR=np.array(path_PR)
print(f'After {i} iterations, approximate minimum is {f(x,y)} at {x,y}')
R2=approx_convergence_rate(path_PR, [1,1], numToAvg=100, showPlot=False)
err=np.linalg.norm(np.diff(path_PR,axis=0),axis=1)
L=err[-1]/err[-2]
print(f' The approximate convergence rate is {R2}')
print(f' The approximate limit L is {L}')

After 10000 iterations, approximate minimum is 0.0003250974444066023 at (0.9819842751250115, 0.9642202398725522)
 The approximate convergence rate is 0.9999895428800052
 The approximate limit L is 0.9996708367290676


In [32]:
x,y = -1.2,1 # initial point
path_FR = [[x,y]]
tol = 1e-8            # stop when gradient is smaller than this amount
max_steps = 10000     # Maximum number of steps to run the iteration
i=0                   # iteration count
dx = Df(x,y)          # current gradient
pk = -dx
while np.linalg.norm(dx)>tol and i<max_steps:    
    
    alpha = 0.00079

    xnew,ynew = x + alpha*pk[0], y + alpha*pk[1]
    dx1 = Df(xnew,ynew)      # Df_{k+1}
    bk = FR_beta(dx,dx1)  # beta_k
    if i%3 == 0:     # restarting every 3 iterations
        bk=0
    pk = -dx1 + bk*pk
    
    path_FR.append([xnew,ynew])
    x,y = xnew,ynew
    dx = dx1
    i += 1

path_FR=np.array(path_FR)
print(f'After {i} iterations, approximate minimum is {f(x,y)} at {x,y}')
R3=approx_convergence_rate(path_FR, [1,1], numToAvg=100, showPlot=False)
err=np.linalg.norm(np.diff(path_FR,axis=0),axis=1)
L=err[-1]/err[-2]
print(f' The approximate convergence rate is {R3}')
print(f' The approximate limit L is {L}')

After 10000 iterations, approximate minimum is 7.848259836416086e-07 at (0.9991145040972794, 0.9982271034503895)
 The approximate convergence rate is 1.2659157565027892
 The approximate limit L is 1.5700289936439011


Restarting every  3 iterations, the convergence rate for the PR method remains very much unchanged. However, the situation improves for the Fletcher-Reeves as it no longer diverges and it converges in a superlinear fashion. 