Skip to content

Commit

Permalink
cse stores; assumes stores can be cse-ed
Browse files Browse the repository at this point in the history
  • Loading branch information
chriselrod committed Jan 13, 2020
1 parent f560133 commit 09ff723
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 58 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LoopVectorization"
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
authors = ["Chris Elrod <elrodc@gmail.com>"]
version = "0.3.5"
version = "0.3.6"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down
64 changes: 53 additions & 11 deletions src/graphs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,10 @@ struct LoopOrder <: AbstractArray{Vector{Operation},5}
bestorder::Vector{Symbol}
end
# Construct a `LoopOrder` for `N` loops: `32N` empty `Operation` buckets plus two
# uninitialized length-`N` symbol vectors (filled in later by the caller).
# NOTE(review): 32 presumably is 4 op kinds x 2 (unrolled) x 2 (tiled) x 2 (after loop),
# matching the five-index access `lo[optype,isunrolled,istiled,after_loop,n]` — confirm.
function LoopOrder(N::Int)
    LoopOrder(
        [ Operation[] for _ ∈ 1:32N ],
        Vector{Symbol}(undef, N), Vector{Symbol}(undef, N)
    )
end
# Zero-argument constructor: a `LoopOrder` with no buckets and empty symbol vectors.
function LoopOrder()
    return LoopOrder(Vector{Operation}[], Symbol[], Symbol[])
end
# Empty every bucket in `lo.oporder` in place; the buckets themselves are kept.
# Returns `nothing`.
function Base.empty!(lo::LoopOrder)
    for bucket ∈ lo.oporder
        empty!(bucket)
    end
    return nothing
end
Expand Down Expand Up @@ -87,6 +90,8 @@ struct LoopSet
refs_aliasing_syms::Vector{ArrayReference}
cost_vec::Matrix{Float64}
reg_pres::Matrix{Int}
included_vars::Vector{Bool}
place_after_loop::Vector{Bool}
# sym_to_ref_aliases::Dict{Symbol,ArrayReference}
# ref_to_sym_aliases::Dict{ArrayReference,Symbol}
end
Expand Down Expand Up @@ -139,7 +144,8 @@ function LoopSet()
Symbol[],
ArrayReference[],
Matrix{Float64}(undef, 4, 2),
Matrix{Int}(undef, 4, 2)
Matrix{Int}(undef, 4, 2),
Bool[], Bool[]
)
end
# Number of loops registered in the loop set.
function num_loops(ls::LoopSet)
    return length(ls.loops)
end
Expand Down Expand Up @@ -246,16 +252,19 @@ function register_single_loop!(ls::LoopSet, looprange::Expr)
lower = r.args[2]
upper = r.args[3]
lii::Bool = lower isa Integer
@assert lii
liiv::Int = convert(Int, lii)
@assert liiv == 1 "Currently only loops starting from the first index are supported."
uii::Bool = upper isa Integer
if lii & uii
Loop(itersym, 1 + convert(Int,upper) - convert(Int,lower))
Loop(itersym, 1 + convert(Int,upper) - liiv)
else
N = gensym(Symbol(:loop, itersym))
ex = if lii
if lower == 1
pushpreamble!(ls, Expr(:(=), N, upper))
else
pushpreamble!(ls, Expr(:(=), N, Expr(:call, :-, upper, lower - 1)))
pushpreamble!(ls, Expr(:(=), N, Expr(:call, :-, upper, liiv - 1)))
end
else
ex = if uii
Expand Down Expand Up @@ -437,6 +446,7 @@ function maybe_cse_load!(ls::LoopSet, expr::Expr, elementbytes::Int = 8)
@view(expr.args[2+offset:end]),
Ref(false)
)::ArrayReference
# whether this finds load or store, we use that
id = findfirst(r -> r == ref, ls.refs_aliasing_syms)
if id === nothing
add_load!( ls, gensym(:temporary), ref, elementbytes )
Expand Down Expand Up @@ -470,6 +480,7 @@ function add_reduction_update_parent!(
)
parent = getop(ls, var, elementbytes)
setdiffv!(reduceddeps, deps, loopdependencies(parent))
mergesetv!(reduceddependencies(parent), reduceddeps)
pushparent!(parents, deps, reduceddeps, parent) # deps and reduced deps will not be disjoint
op = Operation(length(operations(ls)), var, elementbytes, instr, compute, deps, reduceddeps, parents)
parent.instruction === LOOPCONSTANT && push!(ls.outer_reductions, identifier(op))
Expand Down Expand Up @@ -502,6 +513,19 @@ function add_compute!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int = 8,
pushop!(ls, op, var)
end
end
# Append a new `:setindex!` store of `parent` into `ref` to the loop set.
# The store takes its loop dependencies from `ref` and its reduced dependencies from
# `parent`; the array's pointer is registered via `add_vptr!` before the op is pushed.
# Returns whatever `pushop!` returns.
function add_unique_store!(ls::LoopSet, ref::ArrayReference, parent::Operation, elementbytes::Int = 8)
    storedeps = loopdependencies(ref, ls)
    arraysym = ref.array
    storeop = Operation(
        length(operations(ls)), arraysym, elementbytes, :setindex!, memstore,
        storedeps, reduceddependencies(parent), [parent], ref
    )
    add_vptr!(ls, arraysym, identifier(storeop), ref.ptr)
    pushop!(ls, storeop, arraysym)
end
# Replace the store already recorded in slot `id` of `ls.operations` with a new
# `:setindex!` store of `parent` into `ref`, rather than appending a second store to the
# same reference. Also re-points `ls.opdict` at the replacement. Returns the new operation.
function cse_store!(ls::LoopSet, id::Int, ref::ArrayReference, parent::Operation, elementbytes::Int = 8)
    ldref = loopdependencies(ref, ls)
    # Other constructors pass `length(operations(ls))` *before* pushing, so the stored
    # identifier is zero-based; the op occupying slot `id` therefore carries `id - 1`.
    # Using `length(operations(ls)) - 1` would duplicate the last operation's identifier
    # whenever the replaced store is not the final op.
    # NOTE(review): assumes `identifier(op)` yields the one-based slot index — confirm.
    op = Operation( id - 1, ref.array, elementbytes, :setindex!, memstore, ldref, reduceddependencies(parent), [parent], ref )
    ls.operations[id] = op
    ls.opdict[op.variable] = op
    op
end
function add_store!(
ls::LoopSet, var::Symbol, ref::ArrayReference, elementbytes::Int = 8
)
Expand All @@ -511,10 +535,16 @@ function add_store!(
if pvar ls.syms_aliasing_refs
push!(ls.syms_aliasing_refs, pvar)
push!(ls.refs_aliasing_syms, ref)
add_unique_store!(ls, ref, parent, elementbytes)
else
# try to cse store
# different from cse load, because the other op here must be a store
for opp operations(ls)
isstore(opp) || continue
ref == opp.ref && return cse_store!(ls, identifier(opp), ref, parent, elementbytes)
end
add_unique_store!(ls, ref, parent, elementbytes)
end
op = Operation( length(operations(ls)), ref.array, elementbytes, :setindex!, memstore, ldref, reduceddependencies(parent), [parent], ref )
add_vptr!(ls, ref.array, identifier(op), ref.ptr)
pushop!(ls, op, ref.array)
end
function add_store_ref!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int = 8)
ref = ref_from_ref(ex)::ArrayReference
Expand Down Expand Up @@ -626,8 +656,10 @@ function fillorder!(ls::LoopSet, order::Vector{Symbol}, loopistiled::Bool)
end
ops = operations(ls)
nops = length(ops)
included_vars = fill(false, nops)
place_after_loop = fill(true, nops)
included_vars = resize!(ls.included_vars, nops)
fill!(included_vars, false)
place_after_loop = resize!(ls.place_after_loop, nops)
fill!(ls.place_after_loop, true)
# to go inside out, we just have to include all those not-yet included depending on the current sym
empty!(lo)
for _n 1:nloops
Expand All @@ -642,13 +674,23 @@ function fillorder!(ls::LoopSet, order::Vector{Symbol}, loopistiled::Bool)
istiled = (loopistiled ? (tiled loopdependencies(op)) : false) + 1
optype = Int(op.node_type) + 1
after_loop = place_after_loop[id] + 1
push!(lo[optype,isunrolled,istiled,after_loop,_n], ops[id])
push!(lo[optype,isunrolled,istiled,after_loop,_n], op)
set_upstream_family!(place_after_loop, op, false) # parents that have already been included are not moved, so no need to check included_vars to filter
end
end
end


# Lower into `ls.preamble` every operation whose entry in `ls.included_vars` is `false`
# and which has no reduced dependencies. Operations with reduced dependencies are skipped.
# Returns `nothing`.
function define_remaining_ops!(
    ls::LoopSet, vectorized::Symbol, W, unrolled, tiled, U::Int
)
    ops = operations(ls)
    for (id, incl) ∈ enumerate(ls.included_vars)
        incl && continue
        op = ops[id]
        if isempty(reduceddependencies(op))
            lower!( ls.preamble, op, vectorized, W, unrolled, tiled, U, nothing, nothing )
        end
    end
end
# function depends_on_assigned(op::Operation, assigned::Vector{Bool})
# for p ∈ parents(op)
# p === op && continue # don't fall into recursive loop when we have updates, eg a = a + b
Expand Down
77 changes: 32 additions & 45 deletions src/lowering.jl
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,6 @@ function lower_load!(
q::Expr, op::Operation, vectorized::Symbol, W::Symbol, unrolled::Symbol, U::Int,
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned} = nothing
)
# @show op.instruction
# @show unrolled, loopdependencies(op)
if vectorized loopdependencies(op)
lower_load_vectorized!(q, op, vectorized, W, unrolled, U, suffix, mask)
else
Expand Down Expand Up @@ -205,7 +203,6 @@ function reduce_expr!(q::Expr, toreduct::Symbol, instr::Instruction, U::Int)
Uh = Uh2 >> 1
reduce_range!(q, toreduct, instr, Uh, Uh2)
Uh == 1 && break
# @show Uh
Uh2 = Uh
iter += 1; iter > 4 && throw("Oops! This seems to be excessive unrolling.")
end
Expand All @@ -220,11 +217,10 @@ pvariable_name(op::Operation, suffix) = Symbol(pvariable_name(op, nothing), suff
function reduce_unroll!(q, op, U, unrolled)
loopdeps = loopdependencies(op)
isunrolled = unrolled loopdeps
if unrolled reduceddependencies(op)
if (unrolled reduceddependencies(op))
U = isunrolled ? U : 1
return U, isunrolled
end
unrolled reduceddependencies(op) || return U
var = mangledvar(op)
instr = first(parents(op)).instruction
reduce_expr!(q, var, instr, U) # assigns reduction to storevar
Expand Down Expand Up @@ -278,7 +274,6 @@ function lower_store_vectorized!(
for u 0:U-1
name, mo = name_mo(var, op, u, W, vecnotunrolled, unrolled)
instrcall = Expr(:call,lv(:vstore!), ptr, name, mo)
# @show mask, vecnotunrolled, u, U
if mask !== nothing && (vecnotunrolled || u == U - 1)
push!(instrcall.args, mask)
end
Expand Down Expand Up @@ -365,7 +360,6 @@ function lower_compute!(
# cache unroll and tiling check of parents
# not broadcasted, because we use frequent checks of individual bools
# making BitArrays inefficient.
# @show instr parentsunrolled
# parentsyms = [opp.variable for opp ∈ parents(op)]
Uiter = opunrolled ? U - 1 : 0
maskreduct = mask !== nothing && isreduction(op) && any(opp -> opp.variable === var, parents_op)
Expand Down Expand Up @@ -401,27 +395,13 @@ function lower_compute!(
end
push!(instrcall.args, parent)
end
if maskreduct && u == Uiter # only mask last
if maskreduct && (u == Uiter || unrolled !== vectorized) # only mask last
push!(q.args, Expr(:(=), varsym, Expr(:call, lv(:vifelse), mask, instrcall, varsym)))
else
push!(q.args, Expr(:(=), varsym, instrcall))
end
end
end
# Lower a single operation into `q`, picking the routine by operation kind:
# load, then store, then compute; anything else is lowered as a constant.
# Only the store path receives `tiled`.
function lower!(
    q::Expr, op::Operation, vectorized::Symbol, W::Symbol, unrolled::Symbol, tiled::Symbol, U::Int,
    suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned} = nothing
)
    isload(op) && return lower_load!(q, op, vectorized, W, unrolled, U, suffix, mask)
    isstore(op) && return lower_store!(q, op, vectorized, W, unrolled, tiled, U, suffix, mask)
    iscompute(op) && return lower_compute!(q, op, vectorized, W, unrolled, U, suffix, mask)
    return lower_constant!(q, op, vectorized, W, unrolled, U, suffix, mask)
end
function lower_constant!(
q::Expr, op::Operation, vectorized::Symbol, W::Symbol, unrolled::Symbol, U::Int,
suffix::Union{Nothing,Int}, mask::Any = nothing
Expand Down Expand Up @@ -470,9 +450,23 @@ function lower_constant!(
q::Expr, ops::AbstractVector{Operation}, vectorized::Symbol, W::Symbol, unrolled::Symbol, U::Int,
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned} = nothing
)
foreach(op -> lower_constan!(q, op, vectorized, W, unrolled, U, suffix, mask), ops)
foreach(op -> lower_constant!(q, op, vectorized, W, unrolled, U, suffix, mask), ops)
end

# Lower a single operation into `q`, picking the routine by operation kind:
# constant, then load, then compute; whatever remains is lowered as a store.
# Only the compute path receives `tiled`.
function lower!(
    q::Expr, op::Operation, vectorized::Symbol, W::Symbol, unrolled::Symbol, tiled::Symbol, U::Int,
    suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned} = nothing
)
    isconstant(op) && return lower_constant!(q, op, vectorized, W, unrolled, U, suffix, mask)
    isload(op) && return lower_load!(q, op, vectorized, W, unrolled, U, suffix, mask)
    iscompute(op) && return lower_compute!(q, op, vectorized, W, unrolled, tiled, U, suffix, mask)
    # no explicit isstore(op) check: every remaining op is handled as a store
    return lower_store!(q, op, vectorized, W, unrolled, U, suffix, mask)
end
function lower!(
q::Expr, ops::AbstractVector{<:AbstractVector{Operation}}, vectorized::Symbol, W::Symbol, unrolled::Symbol, tiled::Symbol, U::Int,
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned} = nothing
Expand Down Expand Up @@ -501,7 +495,6 @@ function lower_nest(
nisvectorized = loopsym === vectorized
nisunrolled = false
nistiled = false
# @show n, mask
if istiled
if n == nloops
loopsym = tiledsym(loopsym)
Expand All @@ -514,7 +507,6 @@ function lower_nest(
unrolled = last(order)
nisunrolled = n == nloops
end
# @show unrolled, order
blockq = Expr(:block)
n == 1 || push!(blockq.args, Expr(:(=), order[n-1], loopstart))
loopq = if exprtype === :block
Expand Down Expand Up @@ -581,7 +573,6 @@ function add_vec_rem_iter(
loopq
end
function lower_set(ls::LoopSet, vectorized::Symbol, U::Int, T::Int, W::Symbol, ::Nothing, Uexprtype::Symbol)
# @show U, T, W
loopstart = 0
istiled = T != -1
order = names(ls)
Expand Down Expand Up @@ -620,11 +611,11 @@ function lower_set_unrolled_is_vectorized(ls::LoopSet, vectorized::Symbol, U::In
loopq
end
function initialize_outer_reductions!(
q::Expr, op::Operation, Umin::Int, Umax::Int, W::Symbol, typeT::Symbol, unrolled::Symbol, suffix::Union{Symbol,Nothing} = nothing
q::Expr, op::Operation, Umin::Int, Umax::Int, W::Symbol, typeT::Symbol, vectorized::Symbol, suffix::Union{Symbol,Nothing} = nothing
)
# T = op.elementbytes == 8 ? :Float64 : :Float32
z = Expr(:call, REDUCTION_ZERO[op.instruction], typeT)
if unrolled reduceddependencies(op)
if vectorized reduceddependencies(op)
z = Expr(:call, lv(:vbroadcast), W, z)
end
mvar = variable_name(op, suffix)
Expand All @@ -633,16 +624,15 @@ function initialize_outer_reductions!(
end
nothing
end
function initialize_outer_reductions!(q::Expr, ls::LoopSet, Umin::Int, Umax::Int, W::Symbol, typeT::Symbol, unrolled::Symbol, suffix::Union{Symbol,Nothing} = nothing)
foreach(or -> initialize_outer_reductions!(q, ls.operations[or], Umin, Umax, W, typeT, unrolled, suffix), ls.outer_reductions)
function initialize_outer_reductions!(q::Expr, ls::LoopSet, Umin::Int, Umax::Int, W::Symbol, typeT::Symbol, vectorized::Symbol, suffix::Union{Symbol,Nothing} = nothing)
foreach(or -> initialize_outer_reductions!(q, ls.operations[or], Umin, Umax, W, typeT, vectorized, suffix), ls.outer_reductions)
end
function initialize_outer_reductions!(ls::LoopSet, Umin::Int, Umax::Int, W::Symbol, typeT::Symbol, unrolled::Symbol, suffix::Union{Symbol,Nothing} = nothing)
initialize_outer_reductions!(ls.preamble, ls, Umin, Umax, W, typeT, unrolled, suffix)
function initialize_outer_reductions!(ls::LoopSet, Umin::Int, Umax::Int, W::Symbol, typeT::Symbol, vectorized::Symbol, suffix::Union{Symbol,Nothing} = nothing)
initialize_outer_reductions!(ls.preamble, ls, Umin, Umax, W, typeT, vectorized, suffix)
end
function add_upper_outer_reductions(ls::LoopSet, loopq::Expr, Ulow::Int, Uhigh::Int, W::Symbol, typeT::Symbol, unrolledloop::Loop)
unrolled = unrolledloop.itersymbol
function add_upper_outer_reductions(ls::LoopSet, loopq::Expr, Ulow::Int, Uhigh::Int, W::Symbol, typeT::Symbol, unrolledloop::Loop, vectorized::Symbol)
ifq = Expr(:block)
initialize_outer_reductions!(ifq, ls, Ulow, Uhigh, W, typeT, unrolled)
initialize_outer_reductions!(ifq, ls, Ulow, Uhigh, W, typeT, vectorized)
push!(ifq.args, loopq)
reduce_range!(ifq, ls, Ulow, Uhigh)
comparison = if unrolledloop.hintexact
Expand Down Expand Up @@ -786,7 +776,7 @@ function lower_unrolled_dynamic!(
if manageouterreductions
# Umax = (!static_unroll && U > 2) ? U >> 1 : U
Ureduct = U > 6 ? 4 : U
initialize_outer_reductions!(q, ls, 0, Ureduct, W, typeT, last(names(ls)))
initialize_outer_reductions!(q, ls, 0, Ureduct, W, typeT, vectorized)#last(names(ls)))
else
Ureduct = -1
end
Expand All @@ -798,7 +788,7 @@ function lower_unrolled_dynamic!(
if firstiter # first iter
loopq = lower_set(ls, vectorized, Ut, T, W, nothing, Uexprtype)
if T == -1 && manageouterreductions && U > 4
loopq = add_upper_outer_reductions(ls, loopq, Ureduct, U, W, typeT, unrolledloop)
loopq = add_upper_outer_reductions(ls, loopq, Ureduct, U, W, typeT, unrolledloop, vectorized)
end
push!(q.args, loopq)
elseif U == 1 #
Expand Down Expand Up @@ -863,10 +853,11 @@ function definemask(loop::Loop, W::Symbol, allon::Bool)
maskexpr(W, loop.rangesym, allon)
end
end
function setup_Wmask!(ls::LoopSet, W::Symbol, typeT::Symbol, vectorized::Symbol, unrolled::Symbol, U::Int)
function setup_Wmask!(ls::LoopSet, W::Symbol, typeT::Symbol, vectorized::Symbol, unrolled::Symbol, tiled::Symbol, U::Int)
pushpreamble!(ls, Expr(:(=), typeT, determine_eltype(ls)))
pushpreamble!(ls, Expr(:(=), W, determine_width(ls, typeT, unrolled)))
pushpreamble!(ls, definemask(ls.loops[vectorized], W, U > 1 && unrolled === vectorized))
# define_remaining_ops!( ls, vectorized, W, unrolled, tiled, U )
end
function lower_tiled(ls::LoopSet, vectorized::Symbol, U::Int, T::Int)
order = ls.loop_order.loopnames
Expand All @@ -875,11 +866,11 @@ function lower_tiled(ls::LoopSet, vectorized::Symbol, U::Int, T::Int)
mangledtiled = tiledsym(tiled)
W = gensym(:W)
typeT = gensym(:T)
setup_Wmask!(ls, W, typeT, vectorized, unrolled, U)
setup_Wmask!(ls, W, typeT, vectorized, unrolled, tiled, U)
tiledloop = ls.loops[tiled]
static_tile = tiledloop.hintexact
unrolledloop = ls.loops[unrolled]
initialize_outer_reductions!(ls, 0, 4, W, typeT, unrolled)
initialize_outer_reductions!(ls, 0, 4, W, typeT, vectorized)#unrolled)
q = Expr(:block, Expr(:(=), mangledtiled, 0))
# we build up the loop expression.
Trem = Tt = T
Expand Down Expand Up @@ -932,12 +923,11 @@ function lower_tiled(ls::LoopSet, vectorized::Symbol, U::Int, T::Int)
end
# Build the complete expression for the non-tiled case.
# The last loop in `ls.loop_order.loopnames` is the unrolled one. Fresh symbols are
# generated for the vector width and element type, `setup_Wmask!` pushes their definitions
# (and the mask) onto the preamble, and `lower_unrolled!` is called with -1 as the tile
# argument. Returns a block holding `ls.preamble` followed by the lowered loop.
function lower_unrolled(ls::LoopSet, vectorized::Symbol, U::Int)
    order = ls.loop_order.loopnames
    unrolled = last(order)
    W = gensym(:W)
    typeT = gensym(:T)
    # Call setup exactly once, with the 7-argument form; there is no separate tiled
    # loop here, so the last loop is passed in the `tiled` position as well.
    setup_Wmask!(ls, W, typeT, vectorized, unrolled, last(order), U)
    q = lower_unrolled!(Expr(:block, Expr(:(=), unrolled, 0)), ls, vectorized, U, -1, W, typeT, ls.loops[unrolled])
    Expr(:block, ls.preamble, q)
end
Expand All @@ -950,11 +940,8 @@ end
# Requires sorting
# Entry point for lowering a loop set: choose the loop order, vectorized loop and
# unroll/tile factors, fill the operation order, then lower with the tiled or the
# unrolled strategy. A tile factor of -1 means "not tiled".
function lower(ls::LoopSet)
    (looporder, vecsym, unrollfactor, tilefactor) = choose_order(ls)
    tiling = tilefactor != -1
    fillorder!(ls, looporder, tiling)
    if tiling
        return lower_tiled(ls, vecsym, unrollfactor, tilefactor)
    end
    return lower_unrolled(ls, vecsym, unrollfactor)
end

Expand Down
Loading

0 comments on commit 09ff723

Please sign in to comment.