In [1]:
import pandas as pd
import numpy as np
from loaders import *

from scripts.utils import *

In [2]:
# Display the architecture configuration registered as ConfigRegistry.PIM_ARCH
# (the captured output below is the `architecture:` YAML).
show_config(ConfigRegistry.PIM_ARCH)

architecture:
  version: 0.3
  subtree:                           # "subtree" contains abstract hierarchies
    - name: system_arch              # name of the design
      attributes:                    # shared attributes that will propagate down all lower levels
        technology: 32nm
        latency: 1ns
      local:
        - name: DRAM                 # offchip DRAM is the source of all datatypes
          class: DRAM                # assume DRAM is large enough to store all the data, so no depth specification needed
          attributes:
            width: 64                # width in bits
            datawidth: 16            # datawidth in bits
            block-size: 4
      subtree:
        - name: chip                 # all components below this level are on-chip components
          local:
            - name: shared_glb
              class: smartbuffer_SRAM
              attributes:
                memory_depth: 16384
                memory_width: 64
                n_bank

In [3]:
# Display the compound-component definitions registered as
# ConfigRegistry.PIM_COMPONENTS_DIR (the captured output below is the
# `compound_components:` YAML).
show_config(ConfigRegistry.PIM_COMPONENTS_DIR)

compound_components:
  version: 0.3
  classes:
    - name: A2D_conversion_system
      attributes:  # default attributes that can be overridden
        technology: 32nm
        latency: 1ns
        datawidth: 16
        cell_datawidth: 2
        n_DAC_cycles: 16
        ADC_datawidth: 8
        n_PE_cols: 8
        # derived parameters (automatically derived, but can be overridden from high-level)
        nCells: datawidth/cell_datawidth
        SH_datawidth: ADC_datawidth
        total_SHs: nCells * n_PE_cols # each physical column has a sample and hold
        ADC_converts_per_psum: nCells * n_DAC_cycles # temporal: n_DAC_cycles times for each column of cells
      subcomponents:
        - name: SH[0..total_SHs-1] # this is all SHs in the system
          class: sample_and_hold
          attributes:
            technology: technology
            latency: latency
            datawidth: SH_datawidth
        - name: ADC #  assume this ADC is fast enough to convert the output of the SHs 

In [4]:
# Display the mapspace constraints registered as ConfigRegistry.PIM_CONSTRAINTS
# (the captured output below is the `mapspace:` YAML).
show_config(ConfigRegistry.PIM_CONSTRAINTS)

mapspace:
  targets:
    # DRAM
    - target: DRAM
      type: bypassing
      keep: [Inputs, Outputs]
      bypass: [Weights]  # weights are stored locally in the memcells, not loaded from DRAM
    - target: DRAM
      type: temporal
      factors: R=1 S=1 M=1 C=1
      permutation: RSCPQMN
    # Global Buffer Constraints
    - target: shared_glb
      type: bypassing
      keep: [Inputs, Outputs]
      bypass: [Weights] # weights are stored locally in the memcells, not loaded from DRAM
    - target: shared_glb
      type: temporal
      # factors: R=1 S=1 M=1 C=1 # TODO(girfan): Ask TA if this makes sense.
    # Dummy Buffer Constraints
    - target: dummy_buffer
      type: bypassing
      keep: []
      bypass: [Inputs, Outputs, Weights]  # no data should be stored in dummy, it is just a transition buffer
    - target: dummy_buffer
      type: spatial
      factors: Q=1 P=1
      permutation: RSC M
      split: 3  # M cannot be mapped in the same dimension as others
    - target: d

In [5]:
# Display the mapper settings registered as ConfigRegistry.PIM_MAPPER
# (the captured output below is the `mapper:` YAML).
show_config(ConfigRegistry.PIM_MAPPER)

mapper:
  optimization-metric: [ delay, energy ]
  live-status: False
  num-threads: 8
  timeout: 15000
  victory-condition: 800
  diagnostics: True



In [6]:
# Register the PIM estimation tables directory as a root for Accelergy's
# table-based plug-in (IPython shell escape).
# NOTE(review): the absolute path is specific to this workspace — confirm or
# make it configurable before running the notebook elsewhere.
!accelergyTables -r /home/workspace/final-project/example_designs/simple_pim/processing-in-memory-design/PIM_estimation_tables

# Run Accelergy on the PIM architecture and compound-component configs. The
# result's `art` and `ert` attributes are passed to the Timeloop mapper in a
# later cell.
pim_accelergy_result = run_accelergy(
    ConfigRegistry.PIM_ARCH,
    ConfigRegistry.PIM_COMPONENTS_DIR
)
# Print the verbose energy reference table summary (`ERT_summary` below).
print(pim_accelergy_result.ert_verbose)

/home/workspace/final-project/example_designs/simple_pim/processing-in-memory-design/PIM_estimation_tables is already added as a root for table based plug-in
ERT_summary:
  version: 0.3
  table_summary:
  - name: system_arch.DRAM
    actions:
    - name: read
      energy: 512
    - name: write
      energy: 512
    - name: idle
      energy: 0
    primitive_estimation(s):
    - system_arch.DRAM:
        estimator: Cacti
  - name: system_arch.chip.PE[0..16383].scratchpad
    actions:
    - name: read
      energy: 0.0
    primitive_estimation(s):
    - action_name: read
      arguments: null
      energy: 0.0
      subaction_estimations:
      - subcomponent_name: storage
        subaction_name: idle
        arguments: null
        energy: 0.0
        action_share: 1
        interpreted_energy: 0.0
        percentage: 0%
        estimator: table-based-plug-ins
  - name: system_arch.chip.PE[0..16383].mac
    actions:
    - name: compute
      energy: 0.23424
    primitive_estimation(s):

In [7]:
# AlexNet layer configs ConfigRegistry.ALEXNET_LAYER1 .. ALEXNET_LAYER5, in
# layer order; consumed by the mapper loop in the next cell.
alexnet_layers = [
    getattr(ConfigRegistry, f"ALEXNET_LAYER{layer_number}")
    for layer_number in range(1, 6)
]

In [8]:
# Run the Timeloop mapper once per AlexNet layer and collect its two outputs:
# the stats and the loop nest, one entry per layer in each list.
alexnet_stats, alexnet_loops = [], []

for layer_number, layer_config in enumerate(alexnet_layers, start=1):
    mapper_output = run_timeloop_mapper(
        ConfigRegistry.PIM_ARCH,
        ConfigRegistry.PIM_COMPONENTS_DIR,
        pim_accelergy_result.art,
        pim_accelergy_result.ert,
        ConfigRegistry.PIM_CONSTRAINTS,
        ConfigRegistry.PIM_MAPPER,
        layer_config,
    )
    print(f"Layer: {layer_number}")
    # The mapper returns a (stats, loops) pair.
    layer_stats, layer_loops = mapper_output
    print(layer_loops)

    alexnet_stats.append(layer_stats)
    alexnet_loops.append(layer_loops)

input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
Found Accelergy ERT (energy reference table), replacing internal energy model.
Found Accelergy ART (area reference table), replacing internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along problem dimension C = 2
  Factorization options along problem dimension M = 12
  Factorization options along problem dimension R = 2
  Factorization options along problem dimension S = 2
  Factorization options along problem dimension N = 1
  Factorization options along problem dimension P = 16
  Factorization options along

[  3] Utilization = 0.00 | pJ/Compute =  255.849 | L3[IO] Q5 P11 - L2[IO] S11 R11 M96 C3 - L1[] Q1 - L0[W] Q11 P5 
[  6] Utilization = 0.00 | pJ/Compute =   87.635 | L3[IO] Q5 P11 - L2[IO] S11 R11 M48 - L1[] Q1 M2Y C3X - L0[W] Q11 P5 
[  2] Utilization = 0.00 | pJ/Compute =   93.059 | L3[IO] Q5 P11 - L2[IO] S11 R11 M96 - L1[] Q1 C3X - L0[W] Q11 P5 
[  0] Utilization = 0.00 | pJ/Compute =   82.324 | L3[IO] Q5 P11 - L2[IO] S11 R11 - L1[] Q1 M96Y C3X - L0[W] Q11 P5 
[  1] Utilization = 0.00 | pJ/Compute =  245.114 | L3[IO] Q5 P11 - L2[IO] S11 R11 C3 - L1[] Q1 M96Y - L0[W] Q11 P5 
[  4] Utilization = 0.00 | pJ/Compute =   82.437 | L3[IO] Q5 P11 - L2[IO] S11 R11 M2 - L1[] Q1 M48Y C3X - L0[W] Q11 P5 
[  5] Utilization = 0.00 | pJ/Compute =  245.227 | L3[IO] Q5 P11 - L2[IO] S11 R11 M2 C3 - L1[] Q1 M48Y - L0[W] Q11 P5 
[  7] Utilization = 0.00 | pJ/Compute =  250.425 | L3[IO] Q5 P11 - L2[IO] S11 R11 M48 C3 - L1[] Q1 M2Y - L0[W] Q11 P5 
[  3] Utilization = 0.00 | pJ/Compute =  248.617 | L3[IO] 


               BEGIN DIAGNOSTICS               
-----------------------------------------------
Fail class: Fanout

  Level: dummy_buffer
    Fail count: 1206
    Sample mapping that experienced this fail class:

      DRAM [ Inputs:154587 (154587) Outputs:290400 (290400) ] 
      -------------------------------------------------------
      | for Q in [0:1)

      shared_glb [ Inputs:154587 (154587) Outputs:290400 (290400) ] 
      -------------------------------------------------------------
      |   for M in [0:6)
      |     for P in [0:55)
      |       for Q in [0:11)

      dummy_buffer [ ] 
      ----------------
      |         for Q in [0:5)
      |           for M in [0:16) (Spatial-Y)
      |             for C in [0:3) (Spatial-X)
      |               for S in [0:11) (Spatial-X)
      |                 for R in [0:11) (Spatial-X)

      scratchpad [ Weights:1 (1) ] 
      ----------------------------
      |                   for Q in [0:1)

    Fail reason: mapped fanou

input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
Found Accelergy ERT (energy reference table), replacing internal energy model.
Found Accelergy ART (area reference table), replacing internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along problem dimension C = 12
  Factorization options along problem dimension M = 9
  Factorization options along problem dimension R = 2
  Factorization options along problem dimension S = 2
  Factorization options along problem dimension N = 1
  Factorization options along problem dimension P = 20
  Factorization options along

[  1] Utilization = 0.00 | pJ/Compute =    9.201 | L3[IO] Q27 P3 - L2[IO] M128 C32 - L1[] P9 M2Y C3X S5X R5X - L0[W] Q1 
[  3] Utilization = 0.00 | pJ/Compute =    8.360 | L3[IO] Q27 P3 - L2[IO] M128 C24 - L1[] P9 M2Y C4X S5X R5X - L0[W] Q1 
[  5] Utilization = 0.01 | pJ/Compute =   41.098 | L3[IO] P9 - L2[IO] S5 R5 M2 C16 - L1[] P3 M128Y C6X - L0[W] Q27 
[  7] Utilization = 0.01 | pJ/Compute =   30.924 | L3[IO] P9 - L2[IO] S5 R5 M2 C12 - L1[] P3 M128Y C8X - L0[W] Q27 
[  6] Utilization = 0.01 | pJ/Compute =   20.750 | L3[IO] P9 - L2[IO] S5 R5 M2 C8 - L1[] P3 M128Y C12X - L0[W] Q27 
[  4] Utilization = 0.01 | pJ/Compute =   15.663 | L3[IO] P9 - L2[IO] S5 R5 M2 C6 - L1[] P3 M128Y C16X - L0[W] Q27 
[  1] Utilization = 0.01 | pJ/Compute =   17.959 | L3[IO] Q3 P3 - L2[IO] S5 M32 C32 - L1[] Q9 M8Y C3X R5X - L0[W] P9 
[  2] Utilization = 0.02 | pJ/Compute =   10.575 | L3[IO] P9 - L2[IO] S5 R5 M2 C4 - L1[] P3 M128Y C24X - L0[W] Q27 
[  3] Utilization = 0.01 | pJ/Compute =   13.889 | L3[IO] Q3


               BEGIN DIAGNOSTICS               
-----------------------------------------------
Fail class: Fanout

  Level: dummy_buffer
    Fail count: 9702
    Sample mapping that experienced this fail class:

      DRAM [ Inputs:92256 (92256) Outputs:186624 (186624) ] 
      -----------------------------------------------------
      | for Q in [0:3)
      |   for P in [0:3)

      shared_glb [ Inputs:16224 (16224) Outputs:20736 (20736) ] 
      ---------------------------------------------------------
      |     for M in [0:8)
      |       for Q in [0:3)
      |         for R in [0:5)
      |           for P in [0:3)
      |             for C in [0:3)

      dummy_buffer [ ] 
      ----------------
      |               for Q in [0:1)
      |                 for M in [0:32) (Spatial-Y)
      |                   for C in [0:32) (Spatial-X)
      |                     for S in [0:5) (Spatial-X)

      scratchpad [ Weights:1 (1) ] 
      ----------------------------
      |       

input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
Found Accelergy ERT (energy reference table), replacing internal energy model.
Found Accelergy ART (area reference table), replacing internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along problem dimension C = 9
  Factorization options along problem dimension M = 16
  Factorization options along problem dimension R = 2
  Factorization options along problem dimension S = 2
  Factorization options along problem dimension N = 1
  Factorization options along problem dimension P = 4
  Factorization options along 

[  0] Utilization = 0.00 | pJ/Compute =   17.525 | L3[IO] Q13 - L2[IO] M128 C128 - L1[] Q1 M3Y C2X S3X R3X - L0[W] P13 
[  2] Utilization = 0.00 | pJ/Compute =   10.742 | L3[IO] Q13 - L2[IO] M128 C64 - L1[] Q1 M3Y C4X S3X R3X - L0[W] P13 
[  6] Utilization = 0.00 | pJ/Compute =  245.433 | L3[IO] Q13 - L2[IO] P13 S3 R3 M32 C256 - L1[] Q1 M12Y - L0[W] Q1 
[  1] Utilization = 0.00 | pJ/Compute =   61.729 | L3[IO] Q13 - L2[IO] P13 S3 R3 M12 C64 - L1[] Q1 M32Y C4X - L0[W] Q1 
[  4] Utilization = 0.00 | pJ/Compute =    7.351 | L3[IO] Q13 - L2[IO] M128 C32 - L1[] Q1 M3Y C8X S3X R3X - L0[W] P13 
[  5] Utilization = 0.00 | pJ/Compute =    9.165 | L3[IO] P13 - L2[IO] R3 M128 C16 - L1[] Q13 M3Y C16X S3X - L0[W] Q1 
[  7] Utilization = 0.01 | pJ/Compute =   27.588 | L3[IO] Q13 - L2[IO] M4 C256 - L1[] Q1 M96Y S3X R3X - L0[W] P13 
[  3] Utilization = 0.00 | pJ/Compute =    6.621 | L3[IO] P13 - L2[IO] R3 M128 C8 - L1[] Q13 M3Y C32X S3X - L0[W] Q1 
[  4] Utilization = 0.01 | pJ/Compute =   15.944 | L3


               BEGIN DIAGNOSTICS               
-----------------------------------------------
Fail class: Fanout

  Level: dummy_buffer
    Fail count: 5802
    Sample mapping that experienced this fail class:

      DRAM [ Inputs:57600 (57600) Outputs:64896 (64896) ] 
      ---------------------------------------------------
      | for Q in [0:1)

      shared_glb [ Inputs:57600 (57600) Outputs:64896 (64896) ] 
      ---------------------------------------------------------
      |   for M in [0:16)
      |     for C in [0:2)
      |       for Q in [0:13)
      |         for R in [0:3)

      dummy_buffer [ ] 
      ----------------
      |           for Q in [0:1)
      |             for M in [0:24) (Spatial-Y)
      |               for C in [0:128) (Spatial-X)
      |                 for S in [0:3) (Spatial-X)

      scratchpad [ Weights:1 (1) ] 
      ----------------------------
      |                   for P in [0:13)

    Fail reason: mapped fanoutX 384 exceeds hardware fan

input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
Found Accelergy ERT (energy reference table), replacing internal energy model.
Found Accelergy ART (area reference table), replacing internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along problem dimension C = 16
  Factorization options along problem dimension M = 16
  Factorization options along problem dimension R = 2
  Factorization options along problem dimension S = 2
  Factorization options along problem dimension N = 1
  Factorization options along problem dimension P = 4
  Factorization options along

[  1] Utilization = 0.00 | pJ/Compute =    8.463 | L3[IO] Q13 - L2[IO] M128 C64 - L1[] Q1 M3Y C6X S3X R3X - L0[W] P13 
[  3] Utilization = 0.00 | pJ/Compute =    7.332 | L3[IO] Q13 - L2[IO] M128 C48 - L1[] Q1 M3Y C8X S3X R3X - L0[W] P13 
[  5] Utilization = 0.00 | pJ/Compute =    6.202 | L3[IO] Q13 - L2[IO] M128 C32 - L1[] Q1 M3Y C12X S3X R3X - L0[W] P13 
[  6] Utilization = 0.00 | pJ/Compute =    7.412 | L3[IO] P13 - L2[IO] R3 M128 C16 - L1[] Q13 M3Y C24X S3X - L0[W] Q1 
[  7] Utilization = 0.00 | pJ/Compute =    9.146 | L3[IO] P13 - L2[IO] R3 M128 C24 - L1[] Q13 M3Y C16X S3X - L0[W] Q1 
[  4] Utilization = 0.00 | pJ/Compute =    6.603 | L3[IO] P13 - L2[IO] R3 M128 C12 - L1[] Q13 M3Y C32X S3X - L0[W] Q1 
[  6] Utilization = 0.01 | pJ/Compute =    3.772 | L3[IO] Q13 - L2[IO] P13 S3 R3 M32 C4 - L1[] Q1 M12Y C96X - L0[W] Q1 
[  4] Utilization = 0.01 | pJ/Compute =    3.136 | L3[IO] Q13 - L2[IO] P13 S3 R3 M32 C3 - L1[] Q1 M12Y C128X - L0[W] Q1 
[  2] Utilization = 0.03 | pJ/Compute =    5


               BEGIN DIAGNOSTICS               
-----------------------------------------------
Fail class: Fanout

  Level: dummy_buffer
    Fail count: 19650
    Sample mapping that experienced this fail class:

      DRAM [ Inputs:86400 (86400) Outputs:64896 (64896) ] 
      ---------------------------------------------------
      | for Q in [0:1)

      shared_glb [ Inputs:86400 (86400) Outputs:64896 (64896) ] 
      ---------------------------------------------------------
      |   for M in [0:32)
      |     for R in [0:3)
      |       for P in [0:13)
      |         for C in [0:6)
      |           for Q in [0:13)

      dummy_buffer [ ] 
      ----------------
      |             for Q in [0:1)
      |               for M in [0:12) (Spatial-Y)
      |                 for C in [0:64) (Spatial-X)
      |                   for S in [0:3) (Spatial-X)

      scratchpad [ Weights:1 (1) ] 
      ----------------------------
      |                     for Q in [0:1)

    Fail reas

input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
Found Accelergy ERT (energy reference table), replacing internal energy model.
Found Accelergy ART (area reference table), replacing internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along problem dimension C = 16
  Factorization options along problem dimension M = 9
  Factorization options along problem dimension R = 2
  Factorization options along problem dimension S = 2
  Factorization options along problem dimension N = 1
  Factorization options along problem dimension P = 4
  Factorization options along 

[  1] Utilization = 0.00 | pJ/Compute =   32.907 | L3[IO] Q13 - L2[IO] M128 C384 - L1[] Q1 M2Y S3X R3X - L0[W] P13 
[  7] Utilization = 0.00 | pJ/Compute =   12.558 | L3[IO] Q13 - L2[IO] M128 C96 - L1[] Q1 M2Y C4X S3X R3X - L0[W] P13 
[  5] Utilization = 0.00 | pJ/Compute =   14.819 | L3[IO] Q13 - L2[IO] M128 C128 - L1[] Q1 M2Y C3X S3X R3X - L0[W] P13 
[  3] Utilization = 0.00 | pJ/Compute =   19.341 | L3[IO] Q13 - L2[IO] M128 C192 - L1[] Q1 M2Y C2X S3X R3X - L0[W] P13 
[  6] Utilization = 0.01 | pJ/Compute =    4.251 | L3[IO] Q13 - L2[IO] P13 S3 R3 M32 C4 - L1[] Q1 M8Y C96X - L0[W] Q1 
[  4] Utilization = 0.01 | pJ/Compute =    3.615 | L3[IO] Q13 - L2[IO] P13 S3 R3 M32 C3 - L1[] Q1 M8Y C128X - L0[W] Q1 
[  1] Utilization = 0.00 | pJ/Compute =   24.765 | L3[IO] Q13 - L2[IO] R3 M256 C64 - L1[] P13 C6X S3X - L0[W] Q1 
[  1] Utilization = 0.01 | pJ/Compute =   41.218 | L3[IO] Q13 - L2[IO] P13 S3 R3 M4 C64 - L1[] Q1 M64Y C6X - L0[W] Q1 
[  7] STATEMENT: 800 suboptimal mappings found since 


               BEGIN DIAGNOSTICS               
-----------------------------------------------
Fail class: Fanout

  Level: dummy_buffer
    Fail count: 19710
    Sample mapping that experienced this fail class:

      DRAM [ Inputs:86400 (86400) Outputs:43264 (43264) ] 
      ---------------------------------------------------
      | for Q in [0:1)

      shared_glb [ Inputs:86400 (86400) Outputs:43264 (43264) ] 
      ---------------------------------------------------------
      |   for R in [0:3)

      dummy_buffer [ ] 
      ----------------
      |     for Q in [0:13)
      |       for P in [0:13)
      |         for M in [0:256) (Spatial-Y)
      |           for C in [0:384) (Spatial-X)
      |             for S in [0:3) (Spatial-X)

      scratchpad [ Weights:1 (1) ] 
      ----------------------------
      |               for Q in [0:1)

    Fail reason: mapped fanoutX 1152 exceeds hardware fanoutX 128mapped fanoutY 256 exceeds hardware fanoutY 128

Fail class: Capacity


In [None]:
# Persist the per-layer mapper results under one output directory.
# NOTE(review): `fname` holds a directory prefix, not a file name; the name is
# kept because other cells may read it.
fname = './results/alexnet_pim/mapper/'
stats_pkl = fname + 'stats.pkl'
loops_txt = fname + 'loops.txt'

# presumably extract_numbers writes the parsed stats to the pickle and
# load_data reads it back as a check — confirm against the helper module.
extract_numbers(alexnet_stats, stats_pkl)
_ = load_data(stats_pkl)

save_loop_nests(alexnet_loops, loops_txt)

In [9]:
# Print the "Summary Stats" section of each layer's mapper statistics.
#
# Iterates over `alexnet_stats` (filled by the mapper loop earlier in this
# notebook) rather than `alexnet_pim_layerwise_results`, which no cell defines
# and which therefore raised NameError on a fresh kernel.
# Assumes each entry is the mapper's stats text as a single string — confirm
# against run_timeloop_mapper.
for i, stats in enumerate(alexnet_stats):
    print(f"Layer: {i+1}")
    in_summary = False
    for line in stats.split("\n"):
        # Everything from the "Summary Stats" header to the end is echoed.
        if "Summary Stats" in line:
            in_summary = True
        if in_summary:
            print(line)

Layer: 1
Summary Stats
-------------
GFLOPs (@1GHz): 2006.48
Utilization: 0.06
Cycles: 104930
Energy: 310.61 uJ
EDP(J*cycle): 3.26e+01
Area: 0.00 mm^2

Computes = 105415200
pJ/Compute
    mac                          = 0.23
    scratchpad                   = 0.00
    dummy_buffer                 = 0.00
    shared_glb                   = 0.30
    DRAM                         = 0.56
    A2D_NoC                      = 1.84
    D2A_NoC                      = 0.00
    DRAM <==> shared_glb         = 0.00
    dummy_buffer <==> scratchpad = 0.00
    scratchpad <==> mac          = 0.00
    Total                        = 2.95


Layer: 2
Summary Stats
-------------
GFLOPs (@1GHz): 1359.01
Utilization: 0.04
Cycles: 659016
Energy: 1132.24 uJ
EDP(J*cycle): 7.46e+02
Area: 0.00 mm^2

Computes = 447897600
pJ/Compute
    mac                          = 0.23
    scratchpad                   = 0.00
    dummy_buffer                 = 0.00
    shared_glb                   = 0.34
    DRAM                     

In [12]:
# Sum the cycle counts reported in each layer's mapper statistics.
#
# Iterates over `alexnet_stats` (filled by the mapper loop earlier in this
# notebook) rather than `alexnet_pim_layerwise_results`, which no cell defines.
# Assumes each entry is the mapper's stats text as a single string — confirm
# against run_timeloop_mapper.
total_cycles = 0
for i, stats in enumerate(alexnet_stats):
    for line in stats.split("\n"):
        if "Cycles: " in line:
            # Parse what follows the marker, so a prefix on the line cannot
            # shift the slice; int() tolerates surrounding whitespace.
            cycles = int(line.partition("Cycles: ")[2])
            total_cycles += cycles
            print(f"Cycles in layer {i+1}: {cycles}")
            # Only the first "Cycles: " line per layer is counted.
            break
print(f"Total cycles: {total_cycles}")

Cycles in layer 1: 104930
Cycles in layer 2: 659016
Cycles in layer 3: 214968
Cycles in layer 4: 271752
Cycles in layer 5: 289328
Total cycles: 1539994


In [15]:
# Sum the energy (in uJ) reported in each layer's mapper statistics.
#
# Iterates over `alexnet_stats` (filled by the mapper loop earlier in this
# notebook) rather than `alexnet_pim_layerwise_results`, which no cell defines.
# Assumes each entry is the mapper's stats text as a single string and that
# every layer reports energy in uJ — a different unit suffix makes float()
# raise rather than silently mixing units.
total_energy = 0
for i, stats in enumerate(alexnet_stats):
    for line in stats.split("\n"):
        if "Energy: " in line:
            # Parse what follows the marker, so a prefix on the line cannot
            # shift the slice; then drop the unit suffix.
            energy = float(line.partition("Energy: ")[2].split(' uJ')[0])
            total_energy += energy
            print(f"Energy in layer {i+1}: {energy}")
            # Only the first "Energy: " line per layer is counted.
            break
# Two decimals match the precision of the inputs and hide binary-float
# accumulation noise (e.g. 3058.3700000000003).
print(f"Total energy: {total_energy:.2f} uJ")

Energy in layer 1: 310.61
Energy in layer 2: 1132.24
Energy in layer 3: 361.9
Energy in layer 4: 605.26
Energy in layer 5: 648.36
Total energy: 3058.3700000000003 uJ
