[BUG] OpenMP does not work anymore with function call on cython 3.0

### Describe the bug

After upgrading from Cython 0.29.36 to Cython 3.0.0, the generated code hangs indefinitely when OpenMP is used and a sub-function is called inside the parallel loop.

Looking at the code generated for the lines below:
```python
for j in prange(J):
    foo_vec(x[:, j], y[:, j])
```

With Cython 0.29.36, the generated code is:
```c
// for j in prange(J):

  __pyx_t_1 = __pyx_v_J;
  if ((1 == 0)) abort();
  {
      #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
          #undef likely
          #undef unlikely
          #define likely(x)   (x)
          #define unlikely(x) (x)
      #endif
      __pyx_t_3 = (__pyx_t_1 - 0 + 1 - 1/abs(1)) / 1;
      if (__pyx_t_3 > 0)
      {
          #ifdef _OPENMP
          #pragma omp parallel
          #endif /* _OPENMP */
          {
              #ifdef _OPENMP
              #pragma omp for firstprivate(__pyx_v_j) lastprivate(__pyx_v_j)
              #endif /* _OPENMP */
              for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_3; __pyx_t_2++){
                  {
                      __pyx_v_j = (Py_ssize_t)(0 + 1 * __pyx_t_2);

//     foo_vec(x[:, j], y[:, j])

                      __pyx_t_4.data = __pyx_v_x.data;
                      __pyx_t_4.memview = __pyx_v_x.memview;
                      __PYX_INC_MEMVIEW(&__pyx_t_4, 0);
                      __pyx_t_4.shape[0] = __pyx_v_x.shape[0];
__pyx_t_4.strides[0] = __pyx_v_x.strides[0];
    __pyx_t_4.suboffsets[0] = -1;

{
    Py_ssize_t __pyx_tmp_idx = __pyx_v_j;
    Py_ssize_t __pyx_tmp_stride = __pyx_v_x.strides[1];
        __pyx_t_4.data += __pyx_tmp_idx * __pyx_tmp_stride;
}

__pyx_t_5.data = __pyx_v_y.data;
                      __pyx_t_5.memview = __pyx_v_y.memview;
                      __PYX_INC_MEMVIEW(&__pyx_t_5, 0);
                      __pyx_t_5.shape[0] = __pyx_v_y.shape[0];
__pyx_t_5.strides[0] = __pyx_v_y.strides[0];
    __pyx_t_5.suboffsets[0] = -1;

{
    Py_ssize_t __pyx_tmp_idx = __pyx_v_j;
    Py_ssize_t __pyx_tmp_stride = __pyx_v_y.strides[1];
        __pyx_t_5.data += __pyx_tmp_idx * __pyx_tmp_stride;
}

__pyx_f_6my_ext_foo_vec(__pyx_t_4, __pyx_t_5);
                      __PYX_XDEC_MEMVIEW(&__pyx_t_4, 0);
                      __pyx_t_4.memview = NULL;
                      __pyx_t_4.data = NULL;
                      __PYX_XDEC_MEMVIEW(&__pyx_t_5, 0);
                      __pyx_t_5.memview = NULL;
                      __pyx_t_5.data = NULL;
                  }
              }
          }
      }
  }
  #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
      #undef likely
      #undef unlikely
      #define likely(x)   __builtin_expect(!!(x), 1)
      #define unlikely(x) __builtin_expect(!!(x), 0)
  #endif
```

With Cython 3.0, the generated code is:
```c
// for j in prange(J):

  __pyx_t_1 = __pyx_v_J;
  {
      #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
          #undef likely
          #undef unlikely
          #define likely(x)   (x)
          #define unlikely(x) (x)
      #endif
      __pyx_t_3 = (__pyx_t_1 - 0 + 1 - 1/abs(1)) / 1;
      if (__pyx_t_3 > 0)
      {
          #ifdef _OPENMP
          #pragma omp parallel
          #endif /* _OPENMP */
          {
              #ifdef _OPENMP
              #pragma omp for firstprivate(__pyx_v_j) lastprivate(__pyx_v_j)
              #endif /* _OPENMP */
              for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_3; __pyx_t_2++){
                  {
                      __pyx_v_j = (Py_ssize_t)(0 + 1 * __pyx_t_2);

//    foo_vec(x[:, j], y[:, j])

                      __pyx_t_4.data = __pyx_v_x.data;
                      __pyx_t_4.memview = __pyx_v_x.memview;
                      __PYX_INC_MEMVIEW(&__pyx_t_4, 0);
                      __pyx_t_4.shape[0] = __pyx_v_x.shape[0];
__pyx_t_4.strides[0] = __pyx_v_x.strides[0];
    __pyx_t_4.suboffsets[0] = -1;

{
    Py_ssize_t __pyx_tmp_idx = __pyx_v_j;
    Py_ssize_t __pyx_tmp_stride = __pyx_v_x.strides[1];
        __pyx_t_4.data += __pyx_tmp_idx * __pyx_tmp_stride;
}

__pyx_t_5.data = __pyx_v_y.data;
                      __pyx_t_5.memview = __pyx_v_y.memview;
                      __PYX_INC_MEMVIEW(&__pyx_t_5, 0);
                      __pyx_t_5.shape[0] = __pyx_v_y.shape[0];
__pyx_t_5.strides[0] = __pyx_v_y.strides[0];
    __pyx_t_5.suboffsets[0] = -1;

{
    Py_ssize_t __pyx_tmp_idx = __pyx_v_j;
    Py_ssize_t __pyx_tmp_stride = __pyx_v_y.strides[1];
        __pyx_t_5.data += __pyx_tmp_idx * __pyx_tmp_stride;
}

__pyx_f_6my_ext_foo_vec(__pyx_t_4, __pyx_t_5); if (unlikely(__Pyx_ErrOccurredWithGIL())) __PYX_ERR(0, 40, __pyx_L5_error)
                      __PYX_XCLEAR_MEMVIEW(&__pyx_t_4, 0);
                      __pyx_t_4.memview = NULL; __pyx_t_4.data = NULL;
                      __PYX_XCLEAR_MEMVIEW(&__pyx_t_5, 0);
                      __pyx_t_5.memview = NULL; __pyx_t_5.data = NULL;
                      goto __pyx_L8;
                      __pyx_L5_error:;
                      {
                          #ifdef WITH_THREAD
                          PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
                          #endif
                          #ifdef _OPENMP
                          #pragma omp flush(__pyx_parallel_exc_type)
                          #endif /* _OPENMP */
                          if (!__pyx_parallel_exc_type) {
                            __Pyx_ErrFetchWithState(&__pyx_parallel_exc_type, &__pyx_parallel_exc_value, &__pyx_parallel_exc_tb);
                            __pyx_parallel_filename = __pyx_filename; __pyx_parallel_lineno = __pyx_lineno; __pyx_parallel_clineno = __pyx_clineno;
                            __Pyx_GOTREF(__pyx_parallel_exc_type);
                          }
                          #ifdef WITH_THREAD
                          __Pyx_PyGILState_Release(__pyx_gilstate_save);
                          #endif
                      }
                      __pyx_parallel_why = 4;
                      goto __pyx_L7;
                      __pyx_L7:;
                      #ifdef _OPENMP
                      #pragma omp critical(__pyx_parallel_lastprivates0)
                      #endif /* _OPENMP */
                      {
                          __pyx_parallel_temp0 = __pyx_v_j;
                      }
                      __pyx_L8:;
                      #ifdef _OPENMP
                      #pragma omp flush(__pyx_parallel_why)
                      #endif /* _OPENMP */
                  }
              }
              #ifdef _OPENMP
              Py_END_ALLOW_THREADS
              #else
{
#ifdef WITH_THREAD
              PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
              #endif
              #endif /* _OPENMP */
              /* Clean up any temporaries */
              __PYX_XCLEAR_MEMVIEW(&__pyx_t_4, 0);
              __pyx_t_4.memview = NULL; __pyx_t_4.data = NULL;
              __PYX_XCLEAR_MEMVIEW(&__pyx_t_5, 0);
              __pyx_t_5.memview = NULL; __pyx_t_5.data = NULL;
              #ifdef WITH_THREAD
              __Pyx_PyGILState_Release(__pyx_gilstate_save);
              #endif
              #ifndef _OPENMP
}
#endif /* _OPENMP */
          }
      }
      if (__pyx_parallel_exc_type) {
        /* This may have been overridden by a continue, break or return in another thread. Prefer the error. */
        __pyx_parallel_why = 4;
      }
      if (__pyx_parallel_why) {
        __pyx_v_j = __pyx_parallel_temp0;
        switch (__pyx_parallel_why) {
              case 4:
          {
              #ifdef WITH_THREAD
              PyGILState_STATE __pyx_gilstate_save = __Pyx_PyGILState_Ensure();
              #endif
              __Pyx_GIVEREF(__pyx_parallel_exc_type);
              __Pyx_ErrRestoreWithState(__pyx_parallel_exc_type, __pyx_parallel_exc_value, __pyx_parallel_exc_tb);
              __pyx_filename = __pyx_parallel_filename; __pyx_lineno = __pyx_parallel_lineno; __pyx_clineno = __pyx_parallel_clineno;
              #ifdef WITH_THREAD
              __Pyx_PyGILState_Release(__pyx_gilstate_save);
              #endif
          }
          goto __pyx_L1_error;
        }
      }
  }
  #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
      #undef likely
      #undef unlikely
      #define likely(x)   __builtin_expect(!!(x), 1)
      #define unlikely(x) __builtin_expect(!!(x), 0)
  #endif
```

I do not expect any GIL check, or any other check, in the `prange` body.

### Code to reproduce the behaviour:

my_ext.pyx :

```cython
cimport cython
from cython.parallel cimport prange

cimport numpy as np

np.import_array()


def foo(np.ndarray x not None):
    cdef int ndim = np.PyArray_NDIM(x)
    y = np.PyArray_EMPTY(ndim, np.PyArray_DIMS(x), np.NPY_FLOAT64, 0)

    if ndim == 1:
        foo_vec(x, y)
    elif ndim == 2:
        foo_mat(x, y)

    return y


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void foo_vec(const double[:] x, double[:] y) nogil:
    cdef:
        Py_ssize_t I = x.shape[0]
        Py_ssize_t i

    for i in range(I):
        y[i] = x[i] * 2.0 + 1.0  # Do whatever computation on vec


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void foo_mat(const double[:, :] x,  double[:, :] y) nogil:
    cdef:
        Py_ssize_t J = x.shape[1]
        Py_ssize_t j
    
    for j in prange(J):
        foo_vec(x[:, j], y[:, j])

```

setup.py :

```python
from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy as np

setup(
    ext_modules=cythonize(
        [Extension(
            "my_ext", ["my_ext.pyx"],
            include_dirs=[np.get_include()],
            extra_compile_args=["-fopenmp"],
            extra_link_args=["-fopenmp"],
            define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
        )],
        language_level=3,
        annotate=True,
    )
)
```


### Expected behaviour

I expect the generated code to:
- not require any GIL check
- work correctly on the latest Cython version 😀

### OS

Linux

### Python version

3.10.6

### Cython version

3.0.0

### Additional context

_No response_

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

[BUG] OpenMP does not work anymore with function call on cython 3.0 #5573

Describe the bug

Code to reproduce the behaviour:

Expected behaviour

OS

Python version

Cython version

Additional context

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Uh oh!

[BUG] OpenMP does not work anymore with function call on cython 3.0 #5573

Description

Describe the bug

Code to reproduce the behaviour:

Expected behaviour

OS

Python version

Cython version

Additional context

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions