
codegen error: RuntimeError: producer->getMemoryType() == MemoryType::Global || producer->getMemoryType() == MemoryType::Shared INTERNAL ASSERT FAILED #2559

Open
jjsjann123 opened this issue Mar 8, 2023 · 3 comments · May be fixed by #2563
@jjsjann123 (Collaborator)

🐛 Describe the bug

This issue surfaced in a number of PyTorch slow_tests.

%kernel {
T25_l[ ithreadIdx.x212{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) ), 4) )}, iS207{4}, iUS209{1}, iUR211{4} ]
   = T4_g[ iS220{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) ), 4) )}, iS215{4}, iS217{1}, iS219{4} ];
T5_l[ ithreadIdx.x204{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) ), 4) )}, iS199{4}, iUS201{1}, iS203{4} ]
   = T25_l[ ithreadIdx.x212{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) ), 4) )}, iS207{4}, iUS209{1}, iUR211{4} ];
T14_l[ ithreadIdx.x188{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) ), 4) )}, iS183{4}, iUS185{1}, iS187{4} ] ca_pos( 4 )
   = T5_l[ ithreadIdx.x204{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) ), 4) )}, iS199{4}, iUS201{1}, iS203{4} ];
T23_l[ ithreadIdx.x100{( ceilDiv(( ceilDiv(( ceilDiv(( i0 * ( i1 * i2 ) ), 4) ), 1) ), 4) )}, iS95{4}, iUS97{1}, iUR99{4} ]
   = T2_g[ iS108{( ceilDiv(( ceilDiv(( ceilDiv(( i0 * ( i1 * i2 ) ), 4) ), 1) ), 4) )}, iS103{4}, iS105{1}, iS107{4} ];
T22_l[ 0 ]
   = T1_g[ 0 ];
T6_l[ bthreadIdx.x92{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS87{4}, bUS89{1}, bS91{4} ]
   = broadcast( T22_l[ 0 ] )
T7_l[ ithreadIdx.x84{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )}, iS79{4}, iUS81{1}, iS83{4} ] ca_pos( 4 )
   = where(T5_l[ ithreadIdx.x204{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) ), 4) )}, iS199{4}, iUS201{1}, iS203{4} ]
  , T23_l[ ithreadIdx.x100{( ceilDiv(( ceilDiv(( ceilDiv(( i0 * ( i1 * i2 ) ), 4) ), 1) ), 4) )}, iS95{4}, iUS97{1}, iUR99{4} ]
  , T6_l[ bthreadIdx.x92{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS87{4}, bUS89{1}, bS91{4} ]);
T27_l[ ithreadIdx.x75{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )}rf, rS70{4}rf, rUS72{1}rf, rS74{4}rf ] ca_pos( 1 ) produce_pos( 4 )
   = reduction( T7_l[ ithreadIdx.x84{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )}, iS79{4}, iUS81{1}, iS83{4} ] ca_pos( 4 ), op = add, initial value = double(0), allreduce = false )
T8_l[ rthreadIdx.x76{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )} ] produce_pos( 1 )
   = reduction( T27_l[ ithreadIdx.x75{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )}rf, rS70{4}rf, rUS72{1}rf, rS74{4}rf ] ca_pos( 1 ) produce_pos( 4 ), op = add, initial value = double(0), allreduce = false )
T10_l[ bthreadIdx.x156{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS151{4}, bUS153{1}, bS155{4} ]
   = broadcast( T8_l[ rthreadIdx.x76{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )} ] produce_pos( 1 ) )
T24_l[ bthreadIdx.x140{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS135{4}, bUS137{1}, bUR139{4} ]
   = T3_g[ bS148{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS143{4}, bS145{1}, bS147{4} ];
T9_l[ bthreadIdx.x132{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS127{4}, bUS129{1}, bS131{4} ]
   = (float)(T24_l[ bthreadIdx.x140{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS135{4}, bUS137{1}, bUR139{4} ]);
T11_l[ bthreadIdx.x124{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS119{4}, bUS121{1}, bS123{4} ]
   = T10_l[ bthreadIdx.x156{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS151{4}, bUS153{1}, bS155{4} ]
   / T9_l[ bthreadIdx.x132{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS127{4}, bUS129{1}, bS131{4} ];
T12_l[ ithreadIdx.x116{( ceilDiv(( ceilDiv(( ceilDiv(( i0 * ( i1 * i2 ) ), 4) ), 1) ), 4) )}, iS111{4}, iUS113{1}, iS115{4} ] ca_pos( 4 )
   = T23_l[ ithreadIdx.x100{( ceilDiv(( ceilDiv(( ceilDiv(( i0 * ( i1 * i2 ) ), 4) ), 1) ), 4) )}, iS95{4}, iUS97{1}, iUR99{4} ]
   - T11_l[ bthreadIdx.x124{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS119{4}, bUS121{1}, bS123{4} ];
T13_l[ ithreadIdx.x164{( ceilDiv(( ceilDiv(( ceilDiv(( i0 * ( i1 * i2 ) ), 4) ), 1) ), 4) )}, iS159{4}, iUS161{1}, iS163{4} ] ca_pos( 4 ) produce_pos( 4 )
   = T12_l[ ithreadIdx.x116{( ceilDiv(( ceilDiv(( ceilDiv(( i0 * ( i1 * i2 ) ), 4) ), 1) ), 4) )}, iS111{4}, iUS113{1}, iS115{4} ] ca_pos( 4 )
   * T12_l[ ithreadIdx.x116{( ceilDiv(( ceilDiv(( ceilDiv(( i0 * ( i1 * i2 ) ), 4) ), 1) ), 4) )}, iS111{4}, iUS113{1}, iS115{4} ] ca_pos( 4 );
T15_l[ bthreadIdx.x180{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS175{4}, bUS177{1}, bS179{4} ]
   = broadcast( T22_l[ 0 ] )
T16_l[ ithreadIdx.x172{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )}, iS167{4}, iUS169{1}, iS171{4} ] ca_pos( 4 ) produce_pos( 4 )
   = where(T14_l[ ithreadIdx.x188{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) ), 4) )}, iS183{4}, iUS185{1}, iS187{4} ] ca_pos( 4 )
  , T13_l[ ithreadIdx.x164{( ceilDiv(( ceilDiv(( ceilDiv(( i0 * ( i1 * i2 ) ), 4) ), 1) ), 4) )}, iS159{4}, iUS161{1}, iS163{4} ] ca_pos( 4 ) produce_pos( 4 )
  , T15_l[ bthreadIdx.x180{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 4) ), 1) ), 4) )}, bS175{4}, bUS177{1}, bS179{4} ]);
T28_l[ ithreadIdx.x231{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )}rf, rS226{4}rf, rUS228{1}rf, rS230{4}rf ] ca_pos( 1 ) produce_pos( 4 )
   = reduction( T16_l[ ithreadIdx.x172{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )}, iS167{4}, iUS169{1}, iS171{4} ] ca_pos( 4 ) produce_pos( 4 ), op = add, initial value = double(0), allreduce = false )
T17_l[ rthreadIdx.x232{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )} ] produce_pos( 1 )
   = reduction( T28_l[ ithreadIdx.x231{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )}rf, rS226{4}rf, rUS228{1}rf, rS230{4}rf ] ca_pos( 1 ) produce_pos( 4 ), op = add, initial value = double(0), allreduce = false )
T21_l[ 0 ]
   = T0_g[ 0 ];
T18_l[ 0 ]
   = T17_l[ rthreadIdx.x232{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * i2 ) ), 4) ), 1) ), 4) )} ] produce_pos( 1 )
   / T21_l[ 0 ];
T19_l[ 0 ]
   = T18_l[ 0 ];
T26_l[ 0 ]
   = sqrtf(T19_l[ 0 ]);
T20_g[ 0 ]
   = T26_l[ 0 ];

TransformPrinter :
T4_g[ iS220{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) ), 4) )}, iS215{4}, iS217{1}, iS219{4} ]
 root domain : (iS6{i5},iS7{i6},bS8{1})
 contiguity: 1 1
  Merge: iS7{i6} and bS8{1} -> iS213{( i6 * 1 )}
  Merge: iS6{i5} and iS213{( i6 * 1 )} -> iS214{( i5 * ( i6 * 1 ) )}
  Outer split: iS214{( i5 * ( i6 * 1 ) )} by factor 4 -> iS215{4}, iS216{( ceilDiv(( i5 * ( i6 * 1 ) ), 4) )}, start offset: 0, stop offset: 0
  Outer split: iS216{( ceilDiv(( i5 * ( i6 * 1 ) ), 4) )} by factor 1 -> iS217{1}, iS218{( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) )}, start offset: 0, stop offset: 0
  Outer split: iS218{( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) )} by factor 4 -> iS219{4}, iS220{( ceilDiv(( ceilDiv(( ceilDiv(( i5 * ( i6 * 1 ) ), 4) ), 1) ), 4) )}, start offset: 0, stop offset: 0

RuntimeError: producer->getMemoryType() == MemoryType::Global || producer->getMemoryType() == MemoryType::Shared INTERNAL ASSERT FAILED at "/opt/pytorch/pytorch-jit/third_party/nvfuser/csrc/lower_sync_information.cpp":772, please report a bug to PyTorch. Inconsistent parallelization found between TV5 (T5_l[ ithreadIdx.x204{( ceilDiv(( ceilDiv(( ceilDiv(( T2.size[0] * ( T2.size[1] * 1 ) ), 4) ), 1) ), 4) )}, iS199{4}, iUS201{1}, iS203{4} ]) and TV14(T14_l[ ithreadIdx.x188{( ceilDiv(( ceilDiv(( ceilDiv(( T2.size[0] * ( T2.size[1] * 1 ) ), 4) ), 1) ), 4) )}, iS183{4}, iUS185{1}, iS187{4} ] ca_pos( 4 )). Producer is required to be in Global or Shared Memory based on parallelization strategy. RAW flags: (threadIdx.x)

Repro script with the Python API:

import torch
from nvfuser import FusionDefinition, DataType


inputs = [
    torch.tensor(1.5, device="cuda"),
    torch.tensor(2.5, device="cuda"),
    torch.randn(5, 5, 5, device='cuda'),
    torch.randint(0, 23, (1, 1, 1), device="cuda"),
    #torch.randint(0, 1, (5, 5), device="cuda").bool().unsqueeze(-1).expand(5, 5, 5),
    torch.randint(0, 1, (5, 5), device="cuda").bool().unsqueeze(-1),
]

print(inputs[4].shape)
print(inputs[4].stride())

def nvfuser_fusion(fd : FusionDefinition) -> None :
    T0 = fd.from_pytorch(inputs[0])
    T1 = fd.from_pytorch(inputs[1])
    T2 = fd.from_pytorch(inputs[2])
    T3 = fd.from_pytorch(inputs[3])
    T4 = fd.from_pytorch(inputs[4])
    #T4 = fd.define_tensor((-1, -1, 1), (True, True), dtype=DataType.Bool)
    T5 = fd.ops.set(T4)
    T7 = fd.ops.where(T5, T2, T1)
    T8 = fd.ops.sum(T7)
    T10 = fd.ops.cast(T3, DataType.Float)
    T11 = fd.ops.div(T8, T10)
    T12 = fd.ops.sub(T2, T11)
    T13 = fd.ops.mul(T12, T12)
    T14 = fd.ops.set(T5)
    T16 = fd.ops.where(T14, T13, T1)
    T17 = fd.ops.sum(T16)
    T18 = fd.ops.div(T17, T0)
    T19 = fd.ops.set(T18)
    T20 = fd.ops.sqrt(T19)
    fd.add_output(T20)

with FusionDefinition() as fd:
    nvfuser_fusion(fd)

for _ in range(5) :
    o = fd.execute(inputs)

print(fd)
print(o[0])

Versions

Reproduces on devel.

@jjsjann123 (Collaborator, Author)

I'm pinging @naoyam on this one to assign. 🙇

@jjsjann123 (Collaborator, Author)

Note that if T4 in the example above is passed in as non-broadcasting,

torch.randint(0, 1, (5, 5), device="cuda").bool().unsqueeze(-1).expand(5, 5, 5),

and defined in the fusion accordingly,

T4 = fd.define_tensor((-1, -1, 1), (True, True, True), dtype=DataType.Bool)

then everything seems to work fine (see the sketch below).
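
For reference, a minimal sketch of that workaround applied to the Python repro above (illustrative variant, not the exact script that was run; only these two lines change, everything else stays the same):

# Workaround variant (illustrative): pass T4 as an expanded, non-broadcast bool
# tensor instead of one with a size-1 trailing dimension.
inputs[4] = torch.randint(0, 1, (5, 5), device="cuda").bool().unsqueeze(-1).expand(5, 5, 5)

# ...and inside nvfuser_fusion, define T4 explicitly instead of using
# fd.from_pytorch(inputs[4]):
T4 = fd.define_tensor((-1, -1, 1), (True, True, True), dtype=DataType.Bool)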

naoyam added a commit that referenced this issue Mar 9, 2023
naoyam linked a pull request (#2563) on Mar 9, 2023 that will close this issue
@naoyam (Collaborator) commented Mar 11, 2023

C++ repro (#2563):

  // Fusion object assumed by the snippet; declared here so it is self-contained.
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1, DataType::Half);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(2, DataType::Half);
  fusion.addInput(tv1);

  auto tv2 = castOp(DataType::Float, tv0);
  auto tv3 = castOp(DataType::Float, tv1);

  auto tv4 = broadcast(tv2, {false, true});
  auto tv5 = add(tv4, tv3);

  auto tv6 = sum(tv5, {0, 1});
  auto tv7 = broadcast(tv6, {true, true});

  auto tv8 = add(tv3, tv7);
  auto tv9 = add(tv4, tv7);
  auto tv10 = add(tv8, tv9);

  fusion.addOutput(tv10);

  fusion.printMath();
  fusion.printKernel();

  const int tidx = 256;
  const int pb = 8;
  const int bidx = 2;

  tv6->merge(0);
  // tidx
  tv6->split(0, tidx);
  // persistent buffer
  tv6->split(0, pb);
  // [BIDx, PB, TIDX]

  MaxRootDomainInfoSpanningTree tree(tv6);
  TransformPropagator tp(tv6);
  tree.traverse(&tp);

  auto tv6_rf = tv6->rFactor({1});

  inlineMost();

  tv6_rf->axis(0)->parallelize(ParallelType::BIDx);
  tv6_rf->axis(2)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(tv6_rf);
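
Not part of the original repro, but for completeness, a hedged sketch of how the scheduled fusion above could be compiled and run to hit the lowering-time assert, assuming the standard FusionExecutor API (includes/namespaces omitted) and purely illustrative input shapes:

  // Illustrative only: tv0 broadcasts along the second dimension of tv1,
  // so t0 is {N} and t1 is {N, M}; the concrete sizes are arbitrary.
  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({8}, options);
  at::Tensor t1 = at::randn({8, 1024}, options);

  // The assert fires during lowering, i.e. inside compileFusion.
  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t1});
  auto cg_outputs = fe.runFusion({t0, t1});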
