# 100k DOFs case

From the Kratos outputs it seems that on every "Fluid Dynamics Analysis" step 6
problems are solved using AMGCL: 2 larger ones, and 4 smaller ones (a larger
problem has exactly 4 times as many DOFs as a smaller one). Riccardo says these
are all physically different problems, which seems to be confirmed by the fact
that each problem takes its own distinctive number of iterations to converge,
and that number repeats across time steps. I'll consider each of these as a
separate sequence of problems.

AMGCL parameters:

- `solver.type=bicgstab`
- `solver.tol=1e-6`
- `precond.coarse_enough=500`
- `precond.coarsening.type=aggregation`
- `precond.relax.type=damped_jacobi`

The larger problems were solved with 4x4 block values, with both the OpenMP and VexCL(CUDA) backends.

In [1]:
import pandas
import re
from pathlib import Path
def csv(fname):
    """Load a headerless, tab-separated file into a pandas DataFrame.

    No header row is consumed, so the resulting columns are labelled with
    the integers 0, 1, 2, ...
    """
    read_options = {'delimiter': '\t', 'header': None}
    return pandas.read_csv(fname, **read_options)

def speedup(full, part):
    """Print the speedup of the `part` run relative to the `full` run.

    Both arguments are paths to profiler dumps. Each file is scanned for an
    ``amgcl:`` line (reported as "Total") and a ``setup:`` line (reported as
    "Setup"); if a label occurs more than once in a file, the last
    occurrence is used. The speedup is printed as
    ``100 * (full / part - 1)`` percent, so a positive value means the
    `part` run took less time.

    If a timing is missing from either file, or the `part` timing is zero,
    ``n/a`` is printed for that entry instead of a misleading percentage
    or a ZeroDivisionError.

    Returns None; the result is only printed.
    """
    # Compiled once up front rather than re-resolved for every line.
    patterns = {
        'total': re.compile(r'amgcl:\s+(\d+\.\d+) s'),
        'setup': re.compile(r'setup:\s+(\d+\.\d+) s'),
    }

    data = {}
    for fname in (full, part):
        # The context manager closes the handle as soon as the scan ends.
        with open(fname) as stream:
            for line in stream:
                for key, pattern in patterns.items():
                    if m := pattern.search(line):
                        data[(fname, key)] = float(m.group(1))
                        # A line contributes to at most one timing.
                        break

    for key, label in (('total', 'Total'), ('setup', 'Setup')):
        numer = data.get((full, key))
        denom = data.get((part, key))
        # `not denom` covers both a missing value and a parsed 0.000 s.
        if numer is None or not denom:
            print(f"{label} speedup: n/a")
        else:
            print(f"{label} speedup: {100 * (numer / denom - 1):.2f}%")
    
def _print_run(time_file):
    """Print one run's profile dump followed by a summary of its log.

    The log is the sibling of `time_file` with a ``.log`` suffix. Column 2
    is averaged and printed as "Average iterations"; column 1 is summed and
    printed as "Full rebuilds". Each column is selected before aggregating,
    so the other columns are never reduced and need not be numeric.
    """
    print(time_file.read_text().strip())
    log = csv(time_file.with_suffix('.log'))
    print(f"\nAverage iterations: {log[2].mean():.1f}")
    print(f"Full rebuilds: {log[1].sum():.0f}")


def report(size, device, num, mode):
    """Print the baseline run and the `mode` run, then their speedup.

    The baseline is read from ``{size}/{device}-{num}-none.time`` and the
    compared run from ``{size}/{device}-{num}-{mode}.time``; each has a
    matching ``.log`` file next to it. Returns None; everything is printed.
    """
    full = Path(f'{size}/{device}-{num}-none.time')
    part = Path(f'{size}/{device}-{num}-{mode}.time')

    print('*** Full AMGCL rebuild on every step ***\n')
    _print_run(full)

    # NOTE(review): this heading is fixed and does not depend on `mode` --
    # confirm it describes every mode this function is called with.
    print('\n\n*** Full AMGCL rebuild on every 10th step, partial rebuild elsewhere ***\n')
    _print_run(part)
    speedup(full, part)

## Small problem 0, OpenMP

In [2]:
report('small', 'cpu', 0, 'full')

*** Full AMGCL rebuild on every step ***

[Profile:                       21.468 s] (100.00%)
[ self:                          0.035 s] (  0.16%)
[  CSR copy:                     0.172 s] (  0.80%)
[  amgcl:                        4.128 s] ( 19.23%)
[    setup:                      1.235 s] (  5.75%)
[      full:                     1.235 s] (  5.75%)
[        coarse operator:        0.441 s] (  2.06%)
[        coarsest level:         0.003 s] (  0.01%)
[        move to backend:        0.005 s] (  0.02%)
[        relaxation:             0.066 s] (  0.31%)
[        transfer operators:     0.699 s] (  3.26%)
[         self:                  0.036 s] (  0.17%)
[          aggregates:           0.640 s] (  2.98%)
[          interpolation:        0.023 s] (  0.10%)
[            tentative:          0.016 s] (  0.08%)
[    solve:                      2.893 s] ( 13.48%)
[      axpby:                    0.028 s] (  0.13%)
[      axpbypcz:                 0.053 s] (  0.25%)
[      clear:         

In [3]:
report('small', 'cpu', 0, 'part')

*** Full AMGCL rebuild on every step ***

[Profile:                       21.468 s] (100.00%)
[ self:                          0.035 s] (  0.16%)
[  CSR copy:                     0.172 s] (  0.80%)
[  amgcl:                        4.128 s] ( 19.23%)
[    setup:                      1.235 s] (  5.75%)
[      full:                     1.235 s] (  5.75%)
[        coarse operator:        0.441 s] (  2.06%)
[        coarsest level:         0.003 s] (  0.01%)
[        move to backend:        0.005 s] (  0.02%)
[        relaxation:             0.066 s] (  0.31%)
[        transfer operators:     0.699 s] (  3.26%)
[         self:                  0.036 s] (  0.17%)
[          aggregates:           0.640 s] (  2.98%)
[          interpolation:        0.023 s] (  0.10%)
[            tentative:          0.016 s] (  0.08%)
[    solve:                      2.893 s] ( 13.48%)
[      axpby:                    0.028 s] (  0.13%)
[      axpbypcz:                 0.053 s] (  0.25%)
[      clear:         

## Small problem 0, VexCL(CUDA)

In [4]:
report('small', 'gpu', 0, 'full')

*** Full AMGCL rebuild on every step ***

[Profile:                        3.941 s] (100.00%)
[ self:                          0.472 s] ( 11.97%)
[  CSR copy:                     0.139 s] (  3.54%)
[  amgcl:                        3.009 s] ( 76.34%)
[    setup:                      2.064 s] ( 52.37%)
[      full:                     2.064 s] ( 52.37%)
[       self:                    0.060 s] (  1.53%)
[        coarse operator:        0.331 s] (  8.41%)
[        coarsest level:         0.002 s] (  0.06%)
[        move to backend:        1.132 s] ( 28.73%)
[        relaxation:             0.064 s] (  1.62%)
[        transfer operators:     0.474 s] ( 12.04%)
[         self:                  0.032 s] (  0.81%)
[          aggregates:           0.418 s] ( 10.62%)
[          interpolation:        0.024 s] (  0.61%)
[            tentative:          0.021 s] (  0.54%)
[    solve:                      0.944 s] ( 23.96%)
[      axpby:                    0.035 s] (  0.88%)
[      axpbypcz:      

In [5]:
report('small', 'gpu', 0, 'part')

*** Full AMGCL rebuild on every step ***

[Profile:                        3.941 s] (100.00%)
[ self:                          0.472 s] ( 11.97%)
[  CSR copy:                     0.139 s] (  3.54%)
[  amgcl:                        3.009 s] ( 76.34%)
[    setup:                      2.064 s] ( 52.37%)
[      full:                     2.064 s] ( 52.37%)
[       self:                    0.060 s] (  1.53%)
[        coarse operator:        0.331 s] (  8.41%)
[        coarsest level:         0.002 s] (  0.06%)
[        move to backend:        1.132 s] ( 28.73%)
[        relaxation:             0.064 s] (  1.62%)
[        transfer operators:     0.474 s] ( 12.04%)
[         self:                  0.032 s] (  0.81%)
[          aggregates:           0.418 s] ( 10.62%)
[          interpolation:        0.024 s] (  0.61%)
[            tentative:          0.021 s] (  0.54%)
[    solve:                      0.944 s] ( 23.96%)
[      axpby:                    0.035 s] (  0.88%)
[      axpbypcz:      

## Small problem 2, OpenMP

In [6]:
report('small', 'cpu', 2, 'full')

*** Full AMGCL rebuild on every step ***

[Profile:                       28.346 s] (100.00%)
[ self:                          0.056 s] (  0.20%)
[  CSR copy:                     0.170 s] (  0.60%)
[  amgcl:                       10.504 s] ( 37.05%)
[    setup:                      1.227 s] (  4.33%)
[      full:                     1.227 s] (  4.33%)
[        coarse operator:        0.378 s] (  1.33%)
[        coarsest level:         0.011 s] (  0.04%)
[        move to backend:        0.004 s] (  0.02%)
[        relaxation:             0.065 s] (  0.23%)
[        transfer operators:     0.750 s] (  2.65%)
[         self:                  0.031 s] (  0.11%)
[          aggregates:           0.699 s] (  2.46%)
[          interpolation:        0.021 s] (  0.07%)
[            tentative:          0.015 s] (  0.05%)
[    solve:                      9.277 s] ( 32.73%)
[      axpby:                    0.096 s] (  0.34%)
[      axpbypcz:                 0.173 s] (  0.61%)
[      clear:         

In [7]:
report('small', 'cpu', 2, 'part')

*** Full AMGCL rebuild on every step ***

[Profile:                       28.346 s] (100.00%)
[ self:                          0.056 s] (  0.20%)
[  CSR copy:                     0.170 s] (  0.60%)
[  amgcl:                       10.504 s] ( 37.05%)
[    setup:                      1.227 s] (  4.33%)
[      full:                     1.227 s] (  4.33%)
[        coarse operator:        0.378 s] (  1.33%)
[        coarsest level:         0.011 s] (  0.04%)
[        move to backend:        0.004 s] (  0.02%)
[        relaxation:             0.065 s] (  0.23%)
[        transfer operators:     0.750 s] (  2.65%)
[         self:                  0.031 s] (  0.11%)
[          aggregates:           0.699 s] (  2.46%)
[          interpolation:        0.021 s] (  0.07%)
[            tentative:          0.015 s] (  0.05%)
[    solve:                      9.277 s] ( 32.73%)
[      axpby:                    0.096 s] (  0.34%)
[      axpbypcz:                 0.173 s] (  0.61%)
[      clear:         

## Small problem 2, VexCL(CUDA)

In [8]:
report('small', 'gpu', 2, 'full')

*** Full AMGCL rebuild on every step ***

[Profile:                        5.008 s] (100.00%)
[ self:                          0.311 s] (  6.20%)
[  CSR copy:                     0.170 s] (  3.39%)
[  amgcl:                        4.246 s] ( 84.79%)
[    setup:                      1.610 s] ( 32.15%)
[      full:                     1.610 s] ( 32.15%)
[       self:                    0.044 s] (  0.88%)
[        coarse operator:        0.332 s] (  6.63%)
[        coarsest level:         0.010 s] (  0.19%)
[        move to backend:        0.647 s] ( 12.92%)
[        relaxation:             0.066 s] (  1.32%)
[        transfer operators:     0.511 s] ( 10.21%)
[         self:                  0.029 s] (  0.58%)
[          aggregates:           0.465 s] (  9.28%)
[          interpolation:        0.018 s] (  0.35%)
[            tentative:          0.015 s] (  0.29%)
[    solve:                      2.636 s] ( 52.63%)
[      axpby:                    0.005 s] (  0.10%)
[      axpbypcz:      

In [9]:
report('small', 'gpu', 2, 'part')

*** Full AMGCL rebuild on every step ***

[Profile:                        5.008 s] (100.00%)
[ self:                          0.311 s] (  6.20%)
[  CSR copy:                     0.170 s] (  3.39%)
[  amgcl:                        4.246 s] ( 84.79%)
[    setup:                      1.610 s] ( 32.15%)
[      full:                     1.610 s] ( 32.15%)
[       self:                    0.044 s] (  0.88%)
[        coarse operator:        0.332 s] (  6.63%)
[        coarsest level:         0.010 s] (  0.19%)
[        move to backend:        0.647 s] ( 12.92%)
[        relaxation:             0.066 s] (  1.32%)
[        transfer operators:     0.511 s] ( 10.21%)
[         self:                  0.029 s] (  0.58%)
[          aggregates:           0.465 s] (  9.28%)
[          interpolation:        0.018 s] (  0.35%)
[            tentative:          0.015 s] (  0.29%)
[    solve:                      2.636 s] ( 52.63%)
[      axpby:                    0.005 s] (  0.10%)
[      axpbypcz:      

## Large problem 0, OpenMP

In [10]:
report('large', 'cpu', 0, 'full')

*** Full AMGCL rebuild on every step ***

[Profile:                      309.560 s] (100.00%)
[  CSR copy:                     2.398 s] (  0.77%)
[  amgcl:                       74.320 s] ( 24.01%)
[    setup:                      3.756 s] (  1.21%)
[      full:                     3.756 s] (  1.21%)
[        coarse operator:        1.637 s] (  0.53%)
[        coarsest level:         0.005 s] (  0.00%)
[        move to backend:        0.042 s] (  0.01%)
[        relaxation:             0.303 s] (  0.10%)
[        transfer operators:     1.460 s] (  0.47%)
[          aggregates:           1.077 s] (  0.35%)
[          interpolation:        0.075 s] (  0.02%)
[            tentative:          0.071 s] (  0.02%)
[    solve:                     70.564 s] ( 22.79%)
[      axpby:                    0.546 s] (  0.18%)
[      axpbypcz:                 1.472 s] (  0.48%)
[      clear:                    0.343 s] (  0.11%)
[      coarse:                   0.021 s] (  0.01%)
[      copy:          

In [11]:
report('large', 'cpu', 0, 'part')

*** Full AMGCL rebuild on every step ***

[Profile:                      309.560 s] (100.00%)
[  CSR copy:                     2.398 s] (  0.77%)
[  amgcl:                       74.320 s] ( 24.01%)
[    setup:                      3.756 s] (  1.21%)
[      full:                     3.756 s] (  1.21%)
[        coarse operator:        1.637 s] (  0.53%)
[        coarsest level:         0.005 s] (  0.00%)
[        move to backend:        0.042 s] (  0.01%)
[        relaxation:             0.303 s] (  0.10%)
[        transfer operators:     1.460 s] (  0.47%)
[          aggregates:           1.077 s] (  0.35%)
[          interpolation:        0.075 s] (  0.02%)
[            tentative:          0.071 s] (  0.02%)
[    solve:                     70.564 s] ( 22.79%)
[      axpby:                    0.546 s] (  0.18%)
[      axpbypcz:                 1.472 s] (  0.48%)
[      clear:                    0.343 s] (  0.11%)
[      coarse:                   0.021 s] (  0.01%)
[      copy:          

## Large problem 0, VexCL(CUDA)

In [12]:
report('large', 'gpu', 0, 'full')

*** Full AMGCL rebuild on every step ***

[Profile:                      271.478 s] (100.00%)
[ self:                          0.643 s] (  0.24%)
[  CSR copy:                     2.661 s] (  0.98%)
[  amgcl:                       31.195 s] ( 11.49%)
[    setup:                      9.766 s] (  3.60%)
[      full:                     9.766 s] (  3.60%)
[       self:                    0.494 s] (  0.18%)
[        coarse operator:        1.840 s] (  0.68%)
[        coarsest level:         0.006 s] (  0.00%)
[        move to backend:        5.400 s] (  1.99%)
[        relaxation:             0.547 s] (  0.20%)
[        transfer operators:     1.479 s] (  0.54%)
[         self:                  0.300 s] (  0.11%)
[          aggregates:           1.100 s] (  0.41%)
[          interpolation:        0.078 s] (  0.03%)
[            tentative:          0.075 s] (  0.03%)
[    solve:                     21.429 s] (  7.89%)
[      axpby:                    0.023 s] (  0.01%)
[      axpbypcz:      

In [13]:
report('large', 'gpu', 0, 'part')

*** Full AMGCL rebuild on every step ***

[Profile:                      271.478 s] (100.00%)
[ self:                          0.643 s] (  0.24%)
[  CSR copy:                     2.661 s] (  0.98%)
[  amgcl:                       31.195 s] ( 11.49%)
[    setup:                      9.766 s] (  3.60%)
[      full:                     9.766 s] (  3.60%)
[       self:                    0.494 s] (  0.18%)
[        coarse operator:        1.840 s] (  0.68%)
[        coarsest level:         0.006 s] (  0.00%)
[        move to backend:        5.400 s] (  1.99%)
[        relaxation:             0.547 s] (  0.20%)
[        transfer operators:     1.479 s] (  0.54%)
[         self:                  0.300 s] (  0.11%)
[          aggregates:           1.100 s] (  0.41%)
[          interpolation:        0.078 s] (  0.03%)
[            tentative:          0.075 s] (  0.03%)
[    solve:                     21.429 s] (  7.89%)
[      axpby:                    0.023 s] (  0.01%)
[      axpbypcz:      