Skip to content

[BUG] OpenMP does not work anymore with function call on cython 3.0 #5573

Closed
@arthurlm

Description

@arthurlm

Describe the bug

After upgrading from Cython 0.29.36 to Cython 3.0.0, the generated code hangs indefinitely when OpenMP is used and a sub-function is called.

Looking at the code generated for the lines below:

for j in prange(J):
    foo_vec(x[:, j], y[:, j])

With Cython 0.29.36, the generated code is:

// for j in prange(J):

  __pyx_t_1 = __pyx_v_J;
  if ((1 == 0)) abort();
  {
      #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
          #undef likely
          #undef unlikely
          #define likely(x)   (x)
          #define unlikely(x) (x)
      #endif
      __pyx_t_3 = (__pyx_t_1 - 0 + 1 - 1/abs(1)) / 1;
      if (__pyx_t_3 > 0)
      {
          #ifdef _OPENMP
          #pragma omp parallel
          #endif /* _OPENMP */
          {
              #ifdef _OPENMP
              #pragma omp for firstprivate(__pyx_v_j) lastprivate(__pyx_v_j)
              #endif /* _OPENMP */
              for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_3; __pyx_t_2++){
                  {
                      __pyx_v_j = (Py_ssize_t)(0 + 1 * __pyx_t_2);

//     foo_vec(x[:, j], y[:, j])

                      __pyx_t_4.data = __pyx_v_x.data;
                      __pyx_t_4.memview = __pyx_v_x.memview;
                      __PYX_INC_MEMVIEW(&__pyx_t_4, 0);
                      __pyx_t_4.shape[0] = __pyx_v_x.shape[0];
__pyx_t_4.strides[0] = __pyx_v_x.strides[0];
    __pyx_t_4.suboffsets[0] = -1;

{
    Py_ssize_t __pyx_tmp_idx = __pyx_v_j;
    Py_ssize_t __pyx_tmp_stride = __pyx_v_x.strides[1];
        __pyx_t_4.data += __pyx_tmp_idx * __pyx_tmp_stride;
}

__pyx_t_5.data = __pyx_v_y.data;
                      __pyx_t_5.memview = __pyx_v_y.memview;
                      __PYX_INC_MEMVIEW(&__pyx_t_5, 0);
                      __pyx_t_5.shape[0] = __pyx_v_y.shape[0];
__pyx_t_5.strides[0] = __pyx_v_y.strides[0];
    __pyx_t_5.suboffsets[0] = -1;

{
    Py_ssize_t __pyx_tmp_idx = __pyx_v_j;
    Py_ssize_t __pyx_tmp_stride = __pyx_v_y.strides[1];
        __pyx_t_5.data += __pyx_tmp_idx * __pyx_tmp_stride;
}

__pyx_f_6my_ext_foo_vec(__pyx_t_4, __pyx_t_5);
                      __PYX_XDEC_MEMVIEW(&__pyx_t_4, 0);
                      __pyx_t_4.memview = NULL;
                      __pyx_t_4.data = NULL;
                      __PYX_XDEC_MEMVIEW(&__pyx_t_5, 0);
                      __pyx_t_5.memview = NULL;
                      __pyx_t_5.data = NULL;
                  }
              }
          }
      }
  }
  #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
      #undef likely
      #undef unlikely
      #define likely(x)   __builtin_expect(!!(x), 1)
      #define unlikely(x) __builtin_expect(!!(x), 0)
  #endif

With Cython 3.0, the generated code is:

// for j in prange(J):

  __pyx_t_1 = __pyx_v_J;
  {
      #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
          #undef likely
          #undef unlikely
          #define likely(x)   (x)
          #define unlikely(x) (x)
      #endif
      __pyx_t_3 = (__pyx_t_1 - 0 + 1 - 1/abs(1)) / 1;
      if (__pyx_t_3 > 0)
      {
          #ifdef _OPENMP
          #pragma omp parallel
          #endif /* _OPENMP */
          {
              #ifdef _OPENMP
              #pragma omp for firstprivate(__pyx_v_j) lastprivate(__pyx_v_j)
              #endif /* _OPENMP */
              for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_3; __pyx_t_2++){
                  {
                      __pyx_v_j = (Py_ssize_t)(0 + 1 * __pyx_t_2);

//    foo_vec(x[:, j], y[:, j])

                      __pyx_t_4.data = __pyx_v_x.data;
                      __pyx_t_4.memview = __pyx_v_x.memview;
                      __PYX_INC_MEMVIEW(&__pyx_t_4, 0);
                      __pyx_t_4.shape[0] = __pyx_v_x.shape[0];
__pyx_t_4.strides[0] = __pyx_v_x.strides[0];
    __pyx_t_4.suboffsets[0] = -1;

{
    Py_ssize_t __pyx_tmp_idx = __pyx_v_j;
    Py_ssize_t __pyx_tmp_stride = __pyx_v_x.strides[1];
        __pyx_t_4.data += __pyx_tmp_idx * __pyx_tmp_stride;
}

__pyx_t_5.data = __pyx_v_y.data;
                      __pyx_t_5.memview = __pyx_v_y.memview;
                      __PYX_INC_MEMVIEW(&__pyx_t_5, 0);
                      __pyx_t_5.shape[0] = __pyx_v_y.shape[0];
__pyx_t_5.strides[0] = __pyx_v_y.strides[0];
    __pyx_t_5.suboffsets[0] = -1;

{
    Py_ssize_t __pyx_tmp_idx = __pyx_v_j;
    Py_ssize_t __pyx_tmp_stride = __pyx_v_y.strides[1];
        __pyx_t_5.data += __pyx_tmp_idx * __pyx_tmp_stride;
}

__pyx_f_6my_ext_foo_vec(__pyx_t_4, __pyx_t_5); if (unlikely(__Pyx_ErrOccurredWithGIL())) __PYX_ERR(0, 40, __pyx_L5_error)
                      __PYX_XCLEAR_MEMVIEW(&__pyx_t_4, 0);
                      __pyx_t_4.memview = NULL; __pyx_t_4.data = NULL;
                      __PYX_XCLEAR_MEMVIEW(&__pyx_t_5, 0);
                      __pyx_t_5.memview = NULL; __pyx_t_5.data = NULL;
                      goto __pyx_L8;
                      __pyx_L5_error:;
                      {
                          #ifdef WITH_THREAD
                          PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
                          #endif
                          #ifdef _OPENMP
                          #pragma omp flush(__pyx_parallel_exc_type)
                          #endif /* _OPENMP */
                          if (!__pyx_parallel_exc_type) {
                            __Pyx_ErrFetchWithState(&__pyx_parallel_exc_type, &__pyx_parallel_exc_value, &__pyx_parallel_exc_tb);
                            __pyx_parallel_filename = __pyx_filename; __pyx_parallel_lineno = __pyx_lineno; __pyx_parallel_clineno = __pyx_clineno;
                            __Pyx_GOTREF(__pyx_parallel_exc_type);
                          }
                          #ifdef WITH_THREAD
                          __Pyx_PyGILState_Release(__pyx_gilstate_save);
                          #endif
                      }
                      __pyx_parallel_why = 4;
                      goto __pyx_L7;
                      __pyx_L7:;
                      #ifdef _OPENMP
                      #pragma omp critical(__pyx_parallel_lastprivates0)
                      #endif /* _OPENMP */
                      {
                          __pyx_parallel_temp0 = __pyx_v_j;
                      }
                      __pyx_L8:;
                      #ifdef _OPENMP
                      #pragma omp flush(__pyx_parallel_why)
                      #endif /* _OPENMP */
                  }
              }
              #ifdef _OPENMP
              Py_END_ALLOW_THREADS
              #else
{
#ifdef WITH_THREAD
              PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
              #endif
              #endif /* _OPENMP */
              /* Clean up any temporaries */
              __PYX_XCLEAR_MEMVIEW(&__pyx_t_4, 0);
              __pyx_t_4.memview = NULL; __pyx_t_4.data = NULL;
              __PYX_XCLEAR_MEMVIEW(&__pyx_t_5, 0);
              __pyx_t_5.memview = NULL; __pyx_t_5.data = NULL;
              #ifdef WITH_THREAD
              __Pyx_PyGILState_Release(__pyx_gilstate_save);
              #endif
              #ifndef _OPENMP
}
#endif /* _OPENMP */
          }
      }
      if (__pyx_parallel_exc_type) {
        /* This may have been overridden by a continue, break or return in another thread. Prefer the error. */
        __pyx_parallel_why = 4;
      }
      if (__pyx_parallel_why) {
        __pyx_v_j = __pyx_parallel_temp0;
        switch (__pyx_parallel_why) {
              case 4:
          {
              #ifdef WITH_THREAD
              PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
              #endif
              __Pyx_GIVEREF(__pyx_parallel_exc_type);
              __Pyx_ErrRestoreWithState(__pyx_parallel_exc_type, __pyx_parallel_exc_value, __pyx_parallel_exc_tb);
              __pyx_filename = __pyx_parallel_filename; __pyx_lineno = __pyx_parallel_lineno; __pyx_clineno = __pyx_parallel_clineno;
              #ifdef WITH_THREAD
              __Pyx_PyGILState_Release(__pyx_gilstate_save);
              #endif
          }
          goto __pyx_L1_error;
        }
      }
  }
  #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
      #undef likely
      #undef unlikely
      #define likely(x)   __builtin_expect(!!(x), 1)
      #define unlikely(x) __builtin_expect(!!(x), 0)
  #endif

I do not expect any GIL check, or any other check, in the prange body.

Code to reproduce the behaviour:

my_ext.pyx :

cimport cython
from cython.parallel cimport prange

cimport numpy as np

np.import_array()


def foo(np.ndarray x not None):
    cdef int ndim = np.PyArray_NDIM(x)
    y = np.PyArray_EMPTY(ndim, np.PyArray_DIMS(x), np.NPY_FLOAT64, 0)

    if ndim == 1:
        foo_vec(x, y)
    elif ndim == 2:
        foo_mat(x, y)

    return y


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void foo_vec(const double[:] x, double[:] y) nogil:
    cdef:
        Py_ssize_t I = x.shape[0]
        Py_ssize_t i

    for i in range(I):
        y[i] = x[i] * 2.0 + 1.0  # Do whatever computation on vec


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void foo_mat(const double[:, :] x,  double[:, :] y) nogil:
    cdef:
        Py_ssize_t J = x.shape[1]
        Py_ssize_t j
    
    for j in prange(J):
        foo_vec(x[:, j], y[:, j])

setup.py :

from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy as np

setup(
    ext_modules=cythonize(
        [Extension(
            "my_ext", ["my_ext.pyx"],
            include_dirs=[np.get_include()],
            extra_compile_args=["-fopenmp"],
            extra_link_args=["-fopenmp"],
            define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
        )],
        language_level=3,
        annotate=True,
    )
)

Expected behaviour

I expect generated code that:

  • does not require any GIL check
  • works correctly on the latest Cython version 😀

OS

Linux

Python version

3.10.6

Cython version

3.0.0

Additional context

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions