In [1]:
import pandas as pd
import numpy as np
from loaders import *

In [2]:
# Display the PIM architecture configuration registered as ConfigRegistry.PIM_ARCH
# (the YAML printed below starts with an `architecture:` section).
# NOTE(review): show_config / ConfigRegistry presumably come from `from loaders import *` — confirm.
show_config(ConfigRegistry.PIM_ARCH)

architecture:
  version: 0.3
  subtree:                           # "subtree" contains abstract hierarchies
    - name: system_arch              # name of the design
      attributes:                    # shared attributes that will propagate down all lower levels
        technology: 32nm
        latency: 1ns
      local:
        - name: DRAM                 # offchip DRAM is the source of all datatypes
          class: DRAM                # assume DRAM is large enough to store all the data, so no depth specification needed
          attributes:
            width: 64                # width in bits
            datawidth: 16            # datawidth in bits
            block-size: 4
      subtree:
        - name: chip                 # all components below this level are on-chip components
          local:
            - name: shared_glb
              class: smartbuffer_SRAM
              attributes:
                memory_depth: 16384
                memory_width: 64
                n_bank

In [3]:
# Display the compound-component definitions registered as ConfigRegistry.PIM_COMPONENTS_DIR
# (the YAML printed below starts with a `compound_components:` section).
show_config(ConfigRegistry.PIM_COMPONENTS_DIR)

compound_components:
  version: 0.3
  classes:
    - name: A2D_conversion_system
      attributes:  # default attributes that can be overridden
        technology: 32nm
        latency: 1ns
        datawidth: 16
        cell_datawidth: 2
        n_DAC_cycles: 16
        ADC_datawidth: 8
        n_PE_cols: 8
        # derived parameters (automatically derived, but can be overridden from high-level)
        nCells: datawidth/cell_datawidth
        SH_datawidth: ADC_datawidth
        total_SHs: nCells * n_PE_cols # each physical column has a sample and hold
        ADC_converts_per_psum: nCells * n_DAC_cycles # temporal: n_DAC_cycles times for each column of cells
      subcomponents:
        - name: SH[0..total_SHs-1] # this is all SHs in the system
          class: sample_and_hold
          attributes:
            technology: technology
            latency: latency
            datawidth: SH_datawidth
        - name: ADC #  assume this ADC is fast enough to convert the output of the SHs 

In [4]:
# Display the mapping constraints registered as ConfigRegistry.PIM_CONSTRAINTS
# (the YAML printed below starts with a `mapspace:` section).
show_config(ConfigRegistry.PIM_CONSTRAINTS)

mapspace:
  targets:
    # DRAM
    - target: DRAM
      type: bypassing
      keep: [Inputs, Outputs]
      bypass: [Weights]  # weights are stored locally in the memcells, not loaded from DRAM
    - target: DRAM
      type: temporal
      factors: R=1 S=1 M=1 C=1
      permutation: RSCPQMN
    # Global Buffer Constraints
    - target: shared_glb
      type: bypassing
      keep: [Inputs, Outputs]
      bypass: [Weights] # weights are stored locally in the memcells, not loaded from DRAM
    - target: shared_glb
      type: temporal
      # factors: R=1 S=1 M=1 C=1 # TODO(girfan): Ask TA if this makes sense.
    # Dummy Buffer Constraints
    - target: dummy_buffer
      type: bypassing
      keep: []
      bypass: [Inputs, Outputs, Weights]  # no data should be stored in dummy, it is just a transition buffer
    - target: dummy_buffer
      type: spatial
      factors: Q=1 P=1
      permutation: RSC M
      split: 3  # M cannot be mapped in the same dimension as others
    - target: d

In [5]:
# Display the mapper settings registered as ConfigRegistry.PIM_MAPPER
# (the YAML printed below starts with a `mapper:` section).
show_config(ConfigRegistry.PIM_MAPPER)

mapper:
  optimization-metric: [ delay, energy ]
  live-status: False
  num-threads: 8
  timeout: 15000
  victory-condition: 800
  diagnostics: True



In [6]:
# Register the PIM estimation tables directory with Accelergy; the output below
# reports it being added as a new root for the table-based plug-in.
# NOTE(review): this is an absolute, machine-specific path — it must be adjusted
# before the notebook can be re-run in a different workspace.
!accelergyTables -r /home/workspace/final-project/example_designs/simple_pim/processing-in-memory-design/PIM_estimation_tables

# Run Accelergy on the PIM architecture and its compound components. The result's
# `.art` and `.ert` attributes are passed to the Timeloop mapper in a later cell.
pim_accelergy_result = run_accelergy(
    ConfigRegistry.PIM_ARCH,
    ConfigRegistry.PIM_COMPONENTS_DIR
)
# Print the verbose ERT report (the output below begins with `ERT_summary:`).
print(pim_accelergy_result.ert_verbose)

/home/workspace/final-project/example_designs/simple_pim/processing-in-memory-design/PIM_estimation_tables is added as a new root for table based plug-in.
ERT_summary:
  version: 0.3
  table_summary:
  - name: system_arch.DRAM
    actions:
    - name: read
      energy: 512
    - name: write
      energy: 512
    - name: idle
      energy: 0
    primitive_estimation(s):
    - system_arch.DRAM:
        estimator: Cacti
  - name: system_arch.chip.PE[0..16383].scratchpad
    actions:
    - name: read
      energy: 0.0
    primitive_estimation(s):
    - action_name: read
      arguments: null
      energy: 0.0
      subaction_estimations:
      - subcomponent_name: storage
        subaction_name: idle
        arguments: null
        energy: 0.0
        action_share: 1
        interpreted_energy: 0.0
        percentage: 0%
        estimator: table-based-plug-ins
  - name: system_arch.chip.PE[0..16383].mac
    actions:
    - name: compute
      energy: 0.23424
    primitive_estimation(s):
  

In [8]:
# Layer configurations VGG01_LAYER1 .. VGG01_LAYER8 from ConfigRegistry, in layer order.
vgg_layers = [
    getattr(ConfigRegistry, f"VGG01_LAYER{layer_number}")
    for layer_number in range(1, 9)
]

In [None]:
# One mapper result per VGG layer, appended in layer order. Each result is the
# two-element value returned by run_timeloop_mapper, unpacked below as (stats, loops).
vgg_pim_layerwise_results = []

for layer_number, layer_config in enumerate(vgg_layers, start=1):
    # Map this layer onto the PIM design, reusing the ART/ERT computed by Accelergy.
    mapper_output = run_timeloop_mapper(
        ConfigRegistry.PIM_ARCH,
        ConfigRegistry.PIM_COMPONENTS_DIR,
        pim_accelergy_result.art,
        pim_accelergy_result.ert,
        ConfigRegistry.PIM_CONSTRAINTS,
        ConfigRegistry.PIM_MAPPER,
        layer_config,
    )
    print(f"Layer: {layer_number}")
    layer_stats, layer_loops = mapper_output
    # Loops are printed before the stats, matching the established output order.
    print(layer_loops)
    print(layer_stats)
    vgg_pim_layerwise_results.append(mapper_output)

input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
Found Accelergy ERT (energy reference table), replacing internal energy model.
Found Accelergy ART (area reference table), replacing internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along problem dimension C = 2
  Factorization options along problem dimension M = 7
  Factorization options along problem dimension R = 2
  Factorization options along problem dimension S = 2
  Factorization options along problem dimension N = 1
  Factorization options along problem dimension P = 224
  Factorization options along

[  0] Utilization = 0.00 | pJ/Compute =   92.050 | L3[IO] Q4 P56 - L2[IO] P2 S3 R3 M32 - L1[] Q28 P2 M2Y C3X - L0[W] Q2 
[  4] Utilization = 0.00 | pJ/Compute =   89.338 | L3[IO] Q4 P56 - L2[IO] P2 S3 R3 M16 - L1[] Q28 P2 M4Y C3X - L0[W] Q2 
[  6] Utilization = 0.00 | pJ/Compute =   87.982 | L3[IO] Q4 P56 - L2[IO] P2 S3 R3 M8 - L1[] Q28 P2 M8Y C3X - L0[W] Q2 
[  2] Utilization = 0.00 | pJ/Compute =   87.304 | L3[IO] Q4 P56 - L2[IO] P2 S3 R3 M4 - L1[] Q28 P2 M16Y C3X - L0[W] Q2 
[  5] Utilization = 0.00 | pJ/Compute =  252.128 | L3[IO] Q4 P56 - L2[IO] P2 S3 R3 M16 C3 - L1[] Q28 P2 M4Y - L0[W] Q2 
[  7] Utilization = 0.00 | pJ/Compute =  250.772 | L3[IO] Q4 P56 - L2[IO] P2 S3 R3 M8 C3 - L1[] Q28 P2 M8Y - L0[W] Q2 
[  1] Utilization = 0.00 | pJ/Compute =  254.840 | L3[IO] Q4 P56 - L2[IO] P2 S3 R3 M32 C3 - L1[] Q28 P2 M2Y - L0[W] Q2 
[  3] Utilization = 0.00 | pJ/Compute =  250.094 | L3[IO] Q4 P56 - L2[IO] P2 S3 R3 M4 C3 - L1[] Q28 P2 M16Y - L0[W] Q2 
[  0] Utilization = 0.00 | pJ/Compute 


               BEGIN DIAGNOSTICS               
-----------------------------------------------
Fail class: Capacity

  Level: shared_glb
    Fail count: 195
    Sample mapping that experienced this fail class:

      DRAM [ Inputs:153228 (153228) Outputs:3211264 (3211264) ] 
      ---------------------------------------------------------
      | for P in [0:7)

      shared_glb [ Inputs:23052 (23052) Outputs:458752 (458752) ] 
      -----------------------------------------------------------
      |   for P in [0:4)
      |     for M in [0:4)

      dummy_buffer [ ] 
      ----------------
      |       for Q in [0:224)
      |         for P in [0:8)
      |           for M in [0:16) (Spatial-Y)
      |             for C in [0:3) (Spatial-X)
      |               for S in [0:3) (Spatial-X)
      |                 for R in [0:3) (Spatial-X)

      scratchpad [ Weights:1 (1) ] 
      ----------------------------
      |                   for Q in [0:1)

    Fail reason: mapped tile siz

input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
Found Accelergy ERT (energy reference table), replacing internal energy model.
Found Accelergy ART (area reference table), replacing internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along problem dimension C = 7
  Factorization options along problem dimension M = 8
  Factorization options along problem dimension R = 2
  Factorization options along problem dimension S = 2
  Factorization options along problem dimension N = 1
  Factorization options along problem dimension P = 140
  Factorization options along

[  0] Utilization = 0.00 | pJ/Compute =   38.574 | L3[IO] Q7 P7 - L2[IO] P2 M128 C64 - L1[] Q4 P4 S3X R3X - L0[W] Q4 P2 
[  4] Utilization = 0.00 | pJ/Compute =   18.225 | L3[IO] Q7 P7 - L2[IO] P2 M128 C16 - L1[] Q4 P4 C4X S3X R3X - L0[W] Q4 P2 
[  5] Utilization = 0.00 | pJ/Compute =   14.834 | L3[IO] Q7 P7 - L2[IO] P2 M128 C8 - L1[] Q4 P4 C8X S3X R3X - L0[W] Q4 P2 
[  6] Utilization = 0.01 | pJ/Compute =    8.749 | L3[IO] Q14 P16 - L2[IO] P7 M16 C16 - L1[] Q1 M8Y C4X S3X R3X - L0[W] Q8 
[  2] Utilization = 0.00 | pJ/Compute =   25.008 | L3[IO] Q7 P7 - L2[IO] P2 M128 C32 - L1[] Q4 P4 C2X S3X R3X - L0[W] Q4 P2 
[  3] Utilization = 0.00 | pJ/Compute =   82.657 | L3[IO] Q2 P28 - L2[IO] Q8 P2 S3 M8 C64 - L1[] Q7 M16Y R3X - L0[W] P2 
[  1] Utilization = 0.00 | pJ/Compute =   13.470 | L3[IO] Q2 P28 - L2[IO] Q8 P2 S3 M32 C8 - L1[] Q7 M4Y C8X R3X - L0[W] P2 
[  7] Utilization = 0.01 | pJ/Compute =   27.896 | L3[IO] Q7 P7 - L2[IO] P2 M2 C64 - L1[] Q4 P4 M64Y S3X R3X - L0[W] Q4 P2 
[  4] Utiliz


               BEGIN DIAGNOSTICS               
-----------------------------------------------
Fail class: Fanout

  Level: dummy_buffer
    Fail count: 3258
    Sample mapping that experienced this fail class:

      DRAM [ Inputs:831744 (831744) Outputs:1605632 (1605632) ] 
      ---------------------------------------------------------
      | for P in [0:2)

      shared_glb [ Inputs:423168 (423168) Outputs:802816 (802816) ] 
      -------------------------------------------------------------
      |   for M in [0:4)
      |     for C in [0:4)
      |       for P in [0:2)
      |         for Q in [0:2)

      dummy_buffer [ ] 
      ----------------
      |           for P in [0:7)
      |             for Q in [0:14)
      |               for M in [0:32) (Spatial-Y)
      |                 for C in [0:16) (Spatial-X)
      |                   for S in [0:3) (Spatial-X)
      |                     for R in [0:3) (Spatial-X)

      scratchpad [ Weights:1 (1) ] 
      --------------

input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
Found Accelergy ERT (energy reference table), replacing internal energy model.
Found Accelergy ART (area reference table), replacing internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along problem dimension C = 8
  Factorization options along problem dimension M = 9
  Factorization options along problem dimension R = 2
  Factorization options along problem dimension S = 2
  Factorization options along problem dimension N = 1
  Factorization options along problem dimension P = 80
  Factorization options along 

[  1] Utilization = 0.00 | pJ/Compute =  245.954 | L3[IO] Q2 P28 - L2[IO] Q4 S3 R3 M32 C128 - L1[] Q1 M8Y - L0[W] Q7 P2 
[  7] Utilization = 0.00 | pJ/Compute =   32.291 | L3[IO] Q2 P28 - L2[IO] Q4 S3 R3 M32 C16 - L1[] Q1 M8Y C8X - L0[W] Q7 P2 
[  5] Utilization = 0.00 | pJ/Compute =   62.815 | L3[IO] Q2 P28 - L2[IO] Q4 S3 R3 M32 C32 - L1[] Q1 M8Y C4X - L0[W] Q7 P2 
[  3] Utilization = 0.00 | pJ/Compute =  123.861 | L3[IO] Q2 P28 - L2[IO] Q4 S3 R3 M32 C64 - L1[] Q1 M8Y C2X - L0[W] Q7 P2 
[  6] Utilization = 0.01 | pJ/Compute =   17.030 | L3[IO] Q2 P28 - L2[IO] Q4 S3 R3 M32 C8 - L1[] Q1 M8Y C16X - L0[W] Q7 P2 
[  4] Utilization = 0.01 | pJ/Compute =    9.399 | L3[IO] Q2 P28 - L2[IO] Q4 S3 R3 M32 C4 - L1[] Q1 M8Y C32X - L0[W] Q7 P2 
[  2] Utilization = 0.01 | pJ/Compute =    5.584 | L3[IO] Q2 P28 - L2[IO] Q4 S3 R3 M32 C2 - L1[] Q1 M8Y C64X - L0[W] Q7 P2 
[  0] Utilization = 0.01 | pJ/Compute =    3.676 | L3[IO] Q2 P28 - L2[IO] Q4 S3 R3 M32 - L1[] Q1 M8Y C128X - L0[W] Q7 P2 
[  1] Utiliza

In [None]:
# For every layer, print the tail of its stats text, starting at the first line
# that mentions "Summary Stats". Nothing beyond the layer header is printed when
# no such line exists.
for layer_idx, (stats, _loops) in enumerate(vgg_pim_layerwise_results):
    print(f"Layer: {layer_idx+1}")
    stat_lines = stats.split("\n")
    summary_start = next(
        (pos for pos, text in enumerate(stat_lines) if "Summary Stats" in text),
        len(stat_lines),  # no marker -> empty slice below
    )
    for text in stat_lines[summary_start:]:
        print(text)

In [None]:
# Sum the cycle count reported in each layer's stats text.
#
# Only the first line that *starts with* "Cycles: " (ignoring surrounding
# whitespace) is used per layer. The match and the extraction now agree: the
# previous substring test (`"Cycles: " in line`) combined with a fixed-offset
# slice would feed garbage to int() whenever the marker was not at column 0.
cycles_prefix = "Cycles: "
total_cycles = 0
for i, (stats, loops) in enumerate(vgg_pim_layerwise_results):
    for line in stats.split("\n"):
        entry = line.strip()
        if entry.startswith(cycles_prefix):
            cycles = int(entry[len(cycles_prefix):])
            total_cycles += cycles
            print(f"Cycles in layer {i+1}: {cycles}")
            break
    else:
        # No cycle line for this layer: say so instead of silently
        # reporting a total that omits it.
        print(f"Warning: no '{cycles_prefix}' line found for layer {i+1}; not included in total")
print(f"Total cycles: {total_cycles}")

In [None]:
# Sum the energy reported in each layer's stats text.
#
# Only the first line that *starts with* "Energy: " (ignoring surrounding
# whitespace) is used per layer; the number is the text between that prefix and
# the " uJ" suffix. The match and the extraction now agree: the previous
# substring test (`"Energy: " in line`) combined with a fixed-offset slice would
# feed garbage to float() whenever the marker was not at column 0.
energy_prefix = "Energy: "
total_energy = 0
for i, (stats, loops) in enumerate(vgg_pim_layerwise_results):
    for line in stats.split("\n"):
        entry = line.strip()
        if entry.startswith(energy_prefix):
            energy = float(entry[len(energy_prefix):].split(' uJ')[0])
            total_energy += energy
            print(f"Energy in layer {i+1}: {energy}")
            break
    else:
        # No energy line for this layer: say so instead of silently
        # reporting a total that omits it.
        print(f"Warning: no '{energy_prefix}' line found for layer {i+1}; not included in total")
print(f"Total energy: {total_energy} uJ")