Merge pull request #931 from danieldk/maintenance/merge-v9-thincai
Merge `v9` into `thinc.ai`
danieldk committed Apr 18, 2024
2 parents 4c84103 + f348090 commit 98a3118
Showing 44 changed files with 1,306 additions and 1,241 deletions.
15 changes: 3 additions & 12 deletions .github/workflows/tests.yml
@@ -87,7 +87,9 @@ jobs:
      - name: Run mypy
        run: python -m mypy thinc --no-implicit-reexport
-        if: matrix.python_version != '3.6'
+        if: |
+          matrix.python_version != '3.6' &&
+          matrix.python_version != '3.7'
      - name: Delete source directory
        run: rm -rf thinc
@@ -150,14 +152,3 @@ jobs:

      - name: Run tests with extras
        run: python -m pytest --pyargs thinc --cov=thinc --cov-report=term -p thinc.tests.enable_tensorflow -p thinc.tests.enable_mxnet
-
-      - name: Run tests for thinc-apple-ops
-        run: |
-          pip uninstall -y tensorflow
-          pip install thinc-apple-ops
-          python -m pytest --pyargs thinc_apple_ops
-        if: matrix.os == 'macos-latest' && matrix.python_version == '3.10'
-
-      - name: Run tests with thinc-apple-ops
-        run: python -m pytest --pyargs thinc
-        if: matrix.os == 'macos-latest' && matrix.python_version == '3.10'
2 changes: 1 addition & 1 deletion requirements.txt
@@ -25,7 +25,7 @@ pytest-cov>=2.7.0,<5.0.0
coverage>=5.0.0,<8.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0
-mypy>=1.0.0,<1.1.0; python_version >= "3.7"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
types-mock>=0.1.1
types-contextvars>=0.1.2; python_version < "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
18 changes: 14 additions & 4 deletions setup.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
import platform
import sys
from setuptools.command.build_ext import build_ext
from sysconfig import get_path
@@ -13,16 +14,16 @@
# http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options
Options.docstrings = True

ACCELERATE = "thinc.backends._accelerate"
APPLE_OPS = ["thinc.backends.apple_ops", ACCELERATE]

PACKAGES = find_packages()
MOD_NAMES = [
    "thinc.backends.cblas",
    "thinc.backends.linalg",
    "thinc.backends.numpy_ops",
    "thinc.extra.search",
    "thinc.layers.sparselinear",
    "thinc.layers.premap_ids",
-]
+] + (APPLE_OPS if platform.system() == "Darwin" else [])
COMPILE_OPTIONS = {
    "msvc": ["/Ox", "/EHsc"],
    "other": ["-O3", "-Wno-strict-prototypes", "-Wno-unused-function", "-std=c++11"],
@@ -80,7 +81,16 @@ def setup_package():
    ext_modules = []
    for name in MOD_NAMES:
        mod_path = name.replace(".", "/") + ".pyx"
-        ext = Extension(name, [mod_path], language="c++", include_dirs=include_dirs)
+        if name == ACCELERATE:
+            ext = Extension(
+                name,
+                [mod_path],
+                language="c++",
+                include_dirs=include_dirs,
+                libraries=["blas"],
+            )
+        else:
+            ext = Extension(name, [mod_path], language="c++", include_dirs=include_dirs)
        ext_modules.append(ext)
    print("Cythonizing sources")
    ext_modules = cythonize(
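The hunk above only adds the Accelerate-backed modules to the build list on macOS. A minimal smoke-test sketch for an installed build (assuming thinc built from this commit; the module path comes from `ACCELERATE` above):

# Hypothetical smoke test: the _accelerate extension is only compiled
# when platform.system() == "Darwin", so guard the import accordingly.
import platform

if platform.system() == "Darwin":
    from thinc.backends import _accelerate  # built with libraries=["blas"]
    print("Accelerate gemm available:", hasattr(_accelerate, "gemm"))
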
2 changes: 1 addition & 1 deletion thinc/about.py
@@ -1,2 +1,2 @@
__version__ = "8.2.2"
__version__ = "9.0.0"
__release__ = True
13 changes: 10 additions & 3 deletions thinc/api.py
@@ -119,11 +119,13 @@
)
from .optimizers import SGD, Adam, Optimizer, RAdam
from .schedules import (
    Schedule,
    compounding,
    constant,
    constant_then,
    cyclic_triangular,
    decaying,
    plateau,
    slanted_triangular,
    warmup_linear,
)
@@ -160,6 +162,11 @@
    xp2torch,
)

try:
    from .backends import AppleOps
except ImportError:
    AppleOps = None

# fmt: off
__all__ = [
    # .config
@@ -179,8 +186,8 @@
    # .optimizers
    "Adam", "RAdam", "SGD", "Optimizer",
    # .schedules
-    "cyclic_triangular", "warmup_linear", "constant", "constant_then",
-    "decaying", "slanted_triangular", "compounding",
+    "Schedule", "cyclic_triangular", "warmup_linear", "constant", "constant_then",
+    "decaying", "slanted_triangular", "compounding", "plateau",
    # .types
    "Ragged", "Padded", "ArgsKwargs", "Unserializable",
    # .util
@@ -196,7 +203,7 @@
    "has_cupy",
    # .backends
    "get_ops", "set_current_ops", "get_current_ops", "use_ops",
-    "Ops", "CupyOps", "MPSOps", "NumpyOps", "set_gpu_allocator",
+    "Ops", "AppleOps", "CupyOps", "MPSOps", "NumpyOps", "set_gpu_allocator",
    "use_pytorch_for_gpu_memory", "use_tensorflow_for_gpu_memory",
    # .layers
    "Dropout", "Embed", "expand_window", "HashEmbed", "LayerNorm", "Linear",
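Downstream code can rely on the guarded export above: `thinc.api.AppleOps` is the class on macOS builds and `None` elsewhere. A minimal usage sketch under that assumption:

# Sketch: fall back to the current ops when AppleOps did not build.
import numpy
from thinc.api import AppleOps, get_current_ops

ops = AppleOps() if AppleOps is not None else get_current_ops()
x = numpy.ones((4, 2), dtype="f")
y = numpy.ones((2, 3), dtype="f")
print(type(ops).__name__, ops.gemm(x, y).shape)  # e.g. ("AppleOps", (4, 3))
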
33 changes: 15 additions & 18 deletions thinc/backends/__init__.py
@@ -19,13 +21,21 @@
from .numpy_ops import NumpyOps
from .ops import Ops

try:
    from .apple_ops import AppleOps
except ImportError:
    AppleOps = None

context_ops: ContextVar[Optional[Ops]] = ContextVar("context_ops", default=None)
context_pools: ContextVar[dict] = ContextVar("context_pools", default={})

# Internal use of thread-local storage only for detecting cases where a Jupyter
# notebook might not have preserved contextvars across cells.
_GLOBAL_STATE = {"ops": None}

# Thread-local state.
_LOCAL_STATE = threading.local()


def set_gpu_allocator(allocator: str) -> None:  # pragma: no cover
    """Route GPU memory allocation via PyTorch or tensorflow.
@@ -80,10 +88,6 @@ def use_tensorflow_for_gpu_memory() -> None:  # pragma: no cover


def _import_extra_cpu_backends():
-    try:
-        from thinc_apple_ops import AppleOps
-    except ImportError:
-        pass
    try:
        from thinc_bigendian_ops import BigEndianOps
    except ImportError:
@@ -152,22 +156,14 @@ def contextvars_eq_thread_ops() -> bool:
    return False


-def _get_thread_state():
+def _get_thread_state() -> threading.local:
    """Get a thread-specific state variable that inherits from a global
    state when it's created."""
-    thread: threading.Thread = threading.current_thread()
-    if not hasattr(thread, "__local"):
-        thread.__local = _create_thread_local(_GLOBAL_STATE)
-    return thread.__local
-
-
-def _create_thread_local(
-    attrs: Dict[str, Any], local_class: Type[threading.local] = threading.local
-):
-    obj = local_class()
-    for name, value in attrs.items():
-        setattr(obj, name, value)
-    return obj
+    if not hasattr(_LOCAL_STATE, "initialized") or not _LOCAL_STATE.initialized:
+        for name, value in _GLOBAL_STATE.items():
+            setattr(_LOCAL_STATE, name, value)
+        _LOCAL_STATE.initialized = True
+    return _LOCAL_STATE


__all__ = [
@@ -176,6 +172,7 @@ def _create_thread_local(
    "use_ops",
    "ParamServer",
    "Ops",
    "AppleOps",
    "CupyOps",
    "MPSOps",
    "NumpyOps",
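For reference, a short sketch of the ops-selection helpers this module exports (standard thinc API; the "apple" backend only exists on macOS builds of this commit, so the sketch uses "numpy"):

# get_current_ops() reads the contextvar set by use_ops()/set_current_ops().
from thinc.api import get_current_ops, use_ops

with use_ops("numpy"):
    ops = get_current_ops()
    print(ops.name)  # "numpy"
# Leaving the block restores the previously active ops.
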
40 changes: 40 additions & 0 deletions thinc/backends/_accelerate.pxd
@@ -0,0 +1,40 @@
cdef extern from "Accelerate/Accelerate.h":
    enum CBLAS_ORDER: CblasRowMajor, CblasColMajor
    enum CBLAS_TRANSPOSE: CblasNoTrans, CblasTrans, CblasConjTrans
    enum CBLAS_UPLO: CblasUpper, CblasLower
    enum CBLAS_DIAG: CblasNonUnit, CblasUnit
    enum CBLAS_SIDE: CblasLeft, CblasRight

    # BLAS level 1 routines

    void cblas_sswap(int M, float *x, int incX, float *y, int incY) nogil
    void cblas_sscal(int N, float alpha, float *x, int incX) nogil
    void cblas_scopy(int N, float *x, int incX, float *y, int incY) nogil
    void cblas_saxpy(int N, float alpha, float *x, int incX, float *y, int incY) nogil
    float cblas_sdot(int N, float *x, int incX, float *y, int incY) nogil
    float cblas_snrm2(int N, float *x, int incX) nogil
    float cblas_sasum(int N, float *x, int incX) nogil
    int cblas_isamax(int N, float *x, int incX) nogil

    # BLAS level 2 routines
    void cblas_sgemv(CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA, int M, int N,
                     float alpha, float *A, int lda, float *x, int incX,
                     float beta, float *y, int incY) nogil

    void cblas_sger(CBLAS_ORDER Order, int M, int N, float alpha, float *x,
                    int incX, float *y, int incY, float *A, int lda) nogil

    # BLAS level 3 routines
    void cblas_sgemm(CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA,
                     CBLAS_TRANSPOSE TransB, int M, int N, int K,
                     float alpha, float *A, int lda, float *B, int ldb,
                     float beta, float *C, int ldc) nogil


cdef void sgemm(bint TransA, bint TransB, int M, int N, int K,
                float alpha, const float* A, int lda, const float *B,
                int ldb, float beta, float* C, int ldc) nogil


cdef void saxpy(int N, float alpha, const float* X, int incX,
                float *Y, int incY) nogil
75 changes: 75 additions & 0 deletions thinc/backends/_accelerate.pyx
@@ -0,0 +1,75 @@
cimport numpy as np
from libc.stdint cimport uintptr_t

import numpy


cpdef np.ndarray gemm(float[:, ::1] A, float[:, ::1] B,
                      bint trans1=False, bint trans2=False,
                      np.ndarray out=None):
    cdef int nM = A.shape[0] if not trans1 else A.shape[1]
    cdef int nK = A.shape[1] if not trans1 else A.shape[0]
    cdef int nK_b = B.shape[0] if not trans2 else B.shape[1]
    cdef int nN = B.shape[1] if not trans2 else B.shape[0]

    cdef float[:, ::1] C = out

    if out is None:
        out = numpy.empty((nM, nN), dtype="f")
        C = out
    else:
        if C.shape[0] != nM or C.shape[1] != nN:
            msg = "Shape mismatch for output matrix, was: (%d, %d), expected (%d, %d)"
            raise ValueError(msg % (C.shape[0], C.shape[1], nM, nN))

    if nK != nK_b:
        msg = "Shape mismatch for gemm: (%d, %d), (%d, %d)"
        raise ValueError(msg % (nM, nK, nK_b, nN))

    if nM == 0 or nK == 0 or nN == 0:
        return out

    cblas_sgemm(
        CblasRowMajor,
        CblasTrans if trans1 else CblasNoTrans,
        CblasTrans if trans2 else CblasNoTrans,
        nM,
        nN,
        nK,
        1.0,
        &A[0, 0],
        A.shape[1],
        &B[0, 0],
        B.shape[1],
        0.0,
        &C[0, 0],
        C.shape[1]
    )
    return out


cdef void sgemm(bint TransA, bint TransB, int M, int N, int K,
                float alpha, const float* A, int lda, const float *B,
                int ldb, float beta, float* C, int ldc) nogil:
    cblas_sgemm(
        CblasRowMajor,
        CblasTrans if TransA else CblasNoTrans,
        CblasTrans if TransB else CblasNoTrans,
        M,
        N,
        K,
        alpha,
        A,
        lda,
        B,
        ldb,
        beta,
        C,
        ldc
    )


cdef void saxpy(int N, float alpha, const float* X, int incX,
                float *Y, int incY) nogil:
    cblas_saxpy(N, alpha, X, incX, Y, incY)
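
A short usage sketch for the cpdef `gemm` entry point above (macOS-only build; per the shape checks, `A` is `(M, K)` and `B` is `(K, N)` unless the transpose flags are set):

# Sketch: call the Accelerate-backed gemm directly with C-contiguous
# float32 arrays, optionally reusing a preallocated output buffer.
import numpy
from thinc.backends._accelerate import gemm

A = numpy.random.rand(2, 3).astype("f")
B = numpy.random.rand(3, 4).astype("f")
C = gemm(A, B)       # allocates a new (2, 4) output
gemm(A, B, out=C)    # writes into C, shape-checked against (nM, nN)
print(numpy.allclose(C, A @ B))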
39 changes: 39 additions & 0 deletions thinc/backends/apple_ops.pyx
@@ -0,0 +1,39 @@
from typing import Optional

import numpy

from ._accelerate import gemm

from ._accelerate cimport saxpy, sgemm
from .cblas cimport CBlas, set_saxpy, set_sgemm

from .. import registry
from ..types import Floats2d
from .numpy_ops import NumpyOps


@registry.ops("AppleOps")
class AppleOps(NumpyOps):
    """Thinc Ops class that calls into Apple's native libraries for some
    operations. Other operations fall back to numpy."""

    name = "apple"
    xp = numpy

    def cblas(self) -> CBlas:
        cdef CBlas cblas = CBlas()
        set_saxpy(cblas, saxpy)
        set_sgemm(cblas, sgemm)
        return cblas

    def gemm(
        self,
        x: Floats2d,
        y: Floats2d,
        out: Optional[Floats2d] = None,
        trans1: bool = False,
        trans2: bool = False,
    ) -> Floats2d:
        """Perform General Matrix Multiplication (GeMM) and optionally store
        the result in the specified output variable.
        """
        return gemm(x, y, out=out, trans1=trans1, trans2=trans2)
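
Because the class is registered as "AppleOps", it can also be resolved by name through thinc's registry, e.g. from a config file. A sketch, assuming a macOS build where this module compiled:

# Sketch: look the backend up in the ops registry instead of importing it.
from thinc.api import registry

ops_cls = registry.ops.get("AppleOps")
ops = ops_cls()
print(ops.name)  # "apple"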
13 changes: 12 additions & 1 deletion thinc/backends/cblas.pxd
@@ -1,8 +1,11 @@
from libcpp.memory cimport shared_ptr

ctypedef void (*sgemm_ptr)(bint transA, bint transB, int M, int N, int K,
-                          float alpha, const float* A, int lda, const float *B,
+                          float alpha, const float* A, int lda, const float* B,
                           int ldb, float beta, float* C, int ldc) nogil
ctypedef void (*dgemm_ptr)(bint transA, bint transB, int M, int N, int K,
                           double alpha, const double* A, int lda, const double* B,
                           int ldb, double beta, double* C, int ldc) nogil


ctypedef void (*saxpy_ptr)(int N, float alpha, const float* X, int incX,
@@ -12,6 +15,8 @@ ctypedef void (*saxpy_ptr)(int N, float alpha, const float* X, int incX,
ctypedef void (*daxpy_ptr)(int N, double alpha, const double* X, int incX,
                           double *Y, int incY) nogil

ctypedef void (*sscal_ptr)(int N, float alpha, float* X, int incX) nogil
ctypedef void (*dscal_ptr)(int N, double alpha, double* X, int incX) nogil

# Forward-declaration of the BlasFuncs struct. This struct must be opaque, so
# that consumers of the CBlas class cannot become dependent on its size or
@@ -32,6 +37,12 @@ cdef class CBlas:
cdef daxpy_ptr daxpy(CBlas cblas) nogil
cdef saxpy_ptr saxpy(CBlas cblas) nogil
cdef sgemm_ptr sgemm(CBlas cblas) nogil
cdef dgemm_ptr dgemm(CBlas cblas) nogil
cdef sscal_ptr sscal(CBlas cblas) nogil
cdef dscal_ptr dscal(CBlas cblas) nogil
cdef void set_daxpy(CBlas cblas, daxpy_ptr daxpy) nogil
cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil
cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil
cdef void set_dgemm(CBlas cblas, dgemm_ptr dgemm) nogil
cdef void set_sscal(CBlas cblas, sscal_ptr sscal) nogil
cdef void set_dscal(CBlas cblas, dscal_ptr dscal) nogil