/
PaddedMatrices.jl
114 lines (93 loc) · 3.44 KB
/
PaddedMatrices.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
module PaddedMatrices
using VectorizationBase, ArrayInterface,
SLEEFPirates, VectorizedRNG,
LoopVectorization, LinearAlgebra,
Random, Base.Threads#, StackPointers#,
# SpecialFunctions # Perhaps there is a better way to support erf?
using VectorizationBase: align, gep, AbstractStridedPointer, AbstractSIMDVector, vnoaliasstore!, staticm1,
static_sizeof, lazymul, vmul, vadd, vsub, StridedPointer, gesp, zero_offsets, pause,
CACHE_COUNT, NUM_CORES, CACHE_INCLUSIVITY, zstridedpointer
using LoopVectorization: maybestaticsize, mᵣ, nᵣ, preserve_buffer, CloseOpen
using ArrayInterface: StaticInt, Zero, One, OptionallyStaticUnitRange, size, strides, offsets, indices,
static_length, static_first, static_last, axes,
dense_dims, DenseDims, stride_rank, StrideRank
# using Threads: @spawn
# import ReverseDiffExpressionsBase:
# RESERVED_INCREMENT_SEED_RESERVED!, ∂getindex,
# alloc_adjoint, uninitialized, initialized, isinitialized
# import LoopVectorization: isdense
# using Parameters: @unpack
export @StrideArray, @gc_preserve, # @Constant,
AbstractStrideArray, AbstractStrideVector, AbstractStrideMatrix,
StrideArray, StrideVector, StrideMatrix,
PtrArray,# PtrVector, PtrMatrix,
# ConstantArray, ConstantVector, ConstantMatrix, allocarray,
matmul!, matmul_serial!, mul!, *ˡ, StaticInt,
matmul, matmul_serial
# LazyMap,
include("type_declarations.jl")
include("l3_cache_buffer.jl")
include("size_and_strides.jl")
include("adjoints.jl")
include("stridedpointers.jl")
include("indexing.jl")
include("initialization.jl")
include("views.jl")
include("rand.jl")
include("kernels.jl")
include("blas.jl")
include("broadcast.jl")
include("miscellaneous.jl")
# Commented, because I don't want this to be the only doc string.
# """
# To find a mode, define methods for `logdensity` and `∂logdensity!` dispatching on obj, and evaluating at the position `q`.
# logdensity(obj, q, [::StackPointer])
# ∂logdensity!(∇, obj, q, [::StackPointer])
# These must return a value (e.g., a logdensity). `∂logdensity!` should store the gradient in ∇.
# """
# API stubs: downstream code extends these with methods dispatching on its own
# objective types (see the commented template above). `logdensity` returns the
# log density at `q`; `∂logdensity!` additionally stores the gradient in its
# first argument `∇`.
function logdensity end
function ∂logdensity! end
"""
    calc_factors(::Val{nc} = Val{NUM_CORES}())

Build, at compile time, the tuple of all `(divisor, quotient)` pairs
`(f, nc ÷ f)` for every `f` that evenly divides `nc`, ordered by decreasing
divisor. For example, `calc_factors(Val(12))` yields
`((12, 1), (6, 2), (4, 3), (3, 4), (2, 6), (1, 12))`.
"""
@generated function calc_factors(::Val{nc} = Val{NUM_CORES}()) where {nc}
    pairs = Expr(:tuple)
    divisor = nc
    while divisor ≥ 1
        quotient, remainder = divrem(nc, divisor)
        # Keep only exact divisors of `nc`.
        remainder == 0 && push!(pairs.args, (divisor, quotient))
        divisor -= 1
    end
    # The returned expression is a literal tuple of `(divisor, quotient)` pairs.
    return pairs
end
# All (divisor, quotient) factor pairs of NUM_CORES, largest divisor first,
# as produced by `calc_factors`.
const CORE_FACTORS = calc_factors()
# Shared Float64 scratch buffer; empty until `__init__` resizes it to
# BSIZE * BCACHE_COUNT.
const BCACHE = Float64[]
"""
Length is one less than `Base.nthreads()`

Task slots for worker threads: slot `tid` holds the task most recently launched
on that worker by `runfunc!`. Sized in `__init__` (note: only when more than one
extra thread is available — it remains empty otherwise).
"""
const MULTASKS = Task[]
# Usable worker-thread count: the task-slot count, capped at the physical core count.
_nthreads() = min(NUM_CORES, length(MULTASKS));
# Pin task `t` to thread slot `tid` and schedule it there directly, bypassing
# the normal scheduler: mark the task sticky (so it cannot migrate), set its
# thread id via `jl_set_task_tid`, push it onto that thread's workqueue, and
# wake the thread. Returns `t`.
# `tid` appears to be zero-based (note the `tid+1` workqueue index) — i.e.
# `threadid() - 1`; callers index `MULTASKS[tid]` with the same value.
# NOTE(review): relies on Julia runtime internals (`Base.Workqueues`,
# `jl_set_task_tid`, `jl_wakeup_thread`) that may change between Julia versions.
function runfunc(t::Task, tid)
    t.sticky = true
    ccall(:jl_set_task_tid, Cvoid, (Any, Cint), t, tid)
    push!(@inbounds(Base.Workqueues[tid+1]), t)
    ccall(:jl_wakeup_thread, Cvoid, (Int16,), tid % Int16)
    t
end
# Convenience method: wrap a callable in a `Task` and run it on thread slot `tid`.
runfunc(func, tid) = runfunc(Task(func), tid)
# Launch `ft` on thread slot `tid` via `runfunc` and record the resulting
# task in `MULTASKS[tid]` so it can be waited on later.
function runfunc!(ft, tid)
    task = runfunc(ft, tid)
    @inbounds MULTASKS[tid] = task
    return nothing
end
"""
    __init__()

Module initialization hook: allocates the shared `BCACHE` buffer and sizes the
`MULTASKS` worker-task table, warning the user when fewer Julia threads are
available than physical cores.
"""
function __init__()
    # Allocate the shared cache buffer (BSIZE / BCACHE_COUNT are defined in
    # "l3_cache_buffer.jl").
    resize!(BCACHE, BSIZE * BCACHE_COUNT)
    # One task slot per thread beyond thread 1 — tasks are never spawned on
    # `threadid() == 1`.
    _nt = nthreads() - 1
    # NOTE(review): with exactly two Julia threads (`_nt == 1`) the table is not
    # resized, leaving `MULTASKS` empty and `_nthreads()` at 0 — confirm whether
    # `_nt ≥ 1` was intended here.
    _nt > 1 && resize!(MULTASKS, _nt)
    if _nt < NUM_CORES
        msg = string(
            "Your system has $NUM_CORES physical cores, but `PaddedMatrices.jl` only has ",
            "$(_nt > 1 ? "$(_nt) threads" : "1 thread") available ",
            "(it doesn't spawn tasks on `threadid() == 1`). ",
            "You should start Julia with at least $(NUM_CORES + 1) threads.",
        )
        # Bug fix: previously `msg` was built but never shown to the user.
        @warn msg
    end
    return nothing
end
# include("precompile.jl")
# _precompile_()
end # module