In [1]:
%matplotlib inline
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt

In [2]:
# Problem setup: a tall N x 5 random matrix and a random right-hand side.
# The global NumPy generator is seeded first so both draws are repeatable.
np.random.seed(1)
N = 20
A = np.random.standard_normal(size=(N, 5))
b = np.random.standard_normal(size=N)

In [3]:
# Build a symmetric N x N system matrix from the N x 5 draw; the 0.01 shift on
# the diagonal keeps the otherwise rank-deficient product invertible.
# NOTE: A is overwritten from itself, so re-running this cell without first
# re-running the cell that draws A yields a different matrix.
A = A.dot(A.T) + 0.01*np.eye(N)
# Reference solution from a direct solve, used to judge the iterative one.
x_true = la.solve(A, b)
la.matrix_rank(A)

20

In [4]:
# Conjugate gradient iteration for A x = b, started from a random guess.
# At most N steps are taken; each iterate is printed next to x_true.
x = np.random.randn(N)
s_past = np.zeros(N)  # previous search direction (none before step 1)
gt_g_past = 1         # previous g.g; its value is irrelevant on step 1 because s_past is 0

for i in range(N):
    # gradient (the residual A x - b)
    g = np.dot(A, x) - b
    gt_g = np.dot(g, g)
    if gt_g == 0:
        # x already solves the system exactly. Continuing would make s the
        # zero vector and the step size 0/0, turning every later x into NaN.
        break
    # search direction: steepest descent corrected by the previous direction
    c = gt_g/gt_g_past
    s = -g + c*s_past
    # step size: exact minimiser of the quadratic along s
    gam = -np.dot(s, g)/np.dot(s, np.dot(A, s))
    # update
    x = x + gam*s
    s_past = s
    gt_g_past = gt_g
    print('=========== STEP {} ==========='.format(i + 1))
    print(np.column_stack([x, x_true]))
    print()

[[ -4.57881953e-01  -6.30207724e+01]
 [ -2.00749438e-01   5.56479040e+01]
 [  6.46650012e-01   9.17719835e+01]
 [  1.95763879e+00   4.54763936e+01]
 [ -1.31159511e+00  -1.00517086e+02]
 [  1.14732499e+00  -8.75536447e+00]
 [  1.40340437e+00   3.60549762e+01]
 [  2.79541099e-01  -1.06441008e+00]
 [ -1.32369227e+00   3.19441532e+01]
 [  1.28037513e+00   5.71768108e+01]
 [ -5.31971532e-01  -1.67285577e+02]
 [ -2.44976903e-01   4.70891219e+01]
 [ -7.76500933e-01   5.40208206e+01]
 [  1.15131344e+00  -2.43522677e+01]
 [  9.32252398e-01   7.95283833e+01]
 [ -4.00585765e-01  -9.34671522e+01]
 [  5.66519705e-01   2.00982520e+01]
 [ -1.24926689e+00  -1.20409387e+02]
 [  6.89950574e-01   8.36525899e+01]
 [ -5.13607895e-02   3.83211460e+01]]

[[ -2.02541019e-01  -6.30207724e+01]
 [ -1.75774453e-01   5.56479040e+01]
 [  7.10923492e-01   9.17719835e+01]
 [  1.86625856e+00   4.54763936e+01]
 [ -1.11630348e+00  -1.00517086e+02]
 [  1.07887949e+00  -8.75536447e+00]
 [  1.04320484e+00   3.60549762e+01]

Conjugate gradient does not need $A$ explicitly. We don't need to know its entries, its shape, or its structure; we only need to evaluate its action on a vector. If $A$ corresponds to the linear transformation $f$, then all we need is the result $f(x) = Ax$ for whatever vector $x$ the iteration asks for.

* The matrix $A$ may be too large to construct or store explicitly.
* Matrix $A$ can be on the other side of the world (literally); we don't need it. We simply need the result $Ax$.
* In the above cases (when you don't or can't have $A$ explicitly), solving a linear system via conjugate gradient is a very good idea.