Skip to content

Commit

Permalink
Nvidia GPU slicing and support for opening Intel GPU binaries (#865)
Browse files Browse the repository at this point in the history
* Add cuda registers and ops

* Make appendOperand available to external usage

* Add cuda register size (>=sm_50)

* Refactor slicing to use Block::getInsns to get instructions inside a basic block

* Add a placeholder for pc on cuda and remove a few asserts

* Do not decode cuda instructions

* Fix register naming bug

* Change constant to 500 to find more dependencies

* Remove slicing constraint

* Add an interface to use external AssignmentConverter for Slicing

* Add support to share instruction cache between different slices

* Adding predicates fields to Operand classes

* Add predicate registers and methods to get predicate registers

* Add barrier registers

* Initial implementation of slicing support for predicated instructions

* Add missed code for copy predicate information in Operand class's copy constructor and equal operator

* Comment logs

* Add Turing registers

* Fix reg prefix

* Added support for intel gen9 gpu

* Remove tab and logs

* Add support for intel gpu instructions

* Add slicing size limit factor

* 1. Avoid using Block pointer as key to a cache. Use block start address
2. Change the default value of bound size limit factor
3. Changes to suppress warnings regarding the intel gpu
4. Other code cleanup

Co-authored-by: Jokeren <robinho364@gmail.com>
Co-authored-by: Aaron Cherian <aarontcopal2@jlselogin2.ftm.alcf.anl.gov>
  • Loading branch information
3 people committed Sep 28, 2020
1 parent 46db3aa commit 3fae9d9
Show file tree
Hide file tree
Showing 18 changed files with 725 additions and 123 deletions.
374 changes: 373 additions & 1 deletion common/h/dyn_regs.h

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions common/h/entryIDs.h
Expand Up @@ -3002,6 +3002,9 @@ power_op_dxex,
aarch64_op_yield_hint,
aarch64_op_zip1_advsimd,
aarch64_op_zip2_advsimd,
cuda_op_general,
cuda_op_call,
intel_gpu_op_general,
_entry_ids_max_
};
enum prefixEntryID : unsigned int {
Expand Down
28 changes: 22 additions & 6 deletions common/src/dyn_regs.C
Expand Up @@ -89,11 +89,13 @@ MachRegister MachRegister::getBaseRegister() const {
case Arch_none:
return *this;
case Arch_cuda:
assert(0);
return *this;
case Arch_aarch32:
case Arch_aarch64:
//not verified
return *this;
case Arch_intelGen9:
return *this;
}
return InvalidReg;
}
Expand Down Expand Up @@ -201,8 +203,14 @@ unsigned int MachRegister::size() const {
return 16;
return 8;
case Arch_aarch32:
{
assert(0);
break;
}
case Arch_cuda:
assert(0);
{
return 8;
}
case Arch_aarch64:
if((reg & 0x00ff0000) == aarch64::FPR)
{
Expand Down Expand Up @@ -230,6 +238,11 @@ unsigned int MachRegister::size() const {
}
else
return 4;
case Arch_intelGen9:
{
assert(0);
break;
}
case Arch_none:
return 0;
}
Expand Down Expand Up @@ -268,9 +281,10 @@ MachRegister MachRegister::getPC(Dyninst::Architecture arch)
case Arch_aarch64: //aarch64: pc is not writable
return aarch64::pc;
case Arch_aarch32:
return InvalidReg;
case Arch_cuda:
assert(0);
case Arch_none:
return cuda::pc;
default:
return InvalidReg;
}
return InvalidReg;
Expand All @@ -294,7 +308,7 @@ MachRegister MachRegister::getReturnAddress(Dyninst::Architecture arch)
case Arch_aarch32:
case Arch_cuda:
assert(0);
case Arch_none:
default:
return InvalidReg;
}
return InvalidReg;
Expand Down Expand Up @@ -527,7 +541,9 @@ bool MachRegister::isFlag() const
// and all lower 32 bits are base ID
int baseID = reg & 0x0000FFFF;
return (baseID <= 731 && baseID >= 700) || (baseID <= 629 && baseID >= 621);
}
}
case Arch_cuda:
return false;
default:
assert(!"Not implemented!");
}
Expand Down
78 changes: 36 additions & 42 deletions dataflowAPI/h/Absloc.h
Expand Up @@ -64,6 +64,7 @@ class Absloc {
Register,
Stack,
Heap,
PredicatedRegister,
Unknown } Type;

DATAFLOW_EXPORT static Absloc makePC(Dyninst::Architecture arch);
Expand All @@ -83,14 +84,18 @@ class Absloc {
off_(-1),
region_(-1),
func_(NULL),
addr_(-1) {};
addr_(-1),
preg_(),
trueCond_(false) {};
DATAFLOW_EXPORT Absloc(MachRegister reg) :
type_(Register),
reg_(reg),
off_(-1),
region_(-1),
func_(NULL),
addr_(-1)
addr_(-1),
preg_(),
trueCond_(false)
{};

DATAFLOW_EXPORT Absloc(Address addr) :
Expand All @@ -99,7 +104,10 @@ class Absloc {
off_(-1),
region_(-1),
func_(NULL),
addr_(addr) {};
addr_(addr),
preg_(),
trueCond_(false)
{};
DATAFLOW_EXPORT Absloc(int o,
int r,
ParseAPI::Function *f) :
Expand All @@ -108,59 +116,39 @@ class Absloc {
off_(o),
region_(r),
func_(f),
addr_(-1) {};
addr_(-1),
preg_(),
trueCond_(false)
{};
// Construct a PredicatedRegister location: register r whose definition/use
// is guarded by predicate register p, taken when p evaluates to condition c.
// NOTE(review): appears intended for predicated GPU (e.g. CUDA) instructions
// per this commit — confirm p is always a predicate register at call sites.
DATAFLOW_EXPORT Absloc(MachRegister r, MachRegister p, bool c):
type_(PredicatedRegister),
reg_(r),
off_(-1),
region_(-1),
func_(NULL),
addr_(-1),
preg_(p),
trueCond_(c) {};

DATAFLOW_EXPORT std::string format() const;

DATAFLOW_EXPORT const Type &type() const { return type_; };

DATAFLOW_EXPORT bool isValid() const { return type_ != Unknown; };

DATAFLOW_EXPORT const MachRegister &reg() const { assert(type_ == Register); return reg_; };
DATAFLOW_EXPORT const MachRegister &reg() const { assert(type_ == Register || type_ == PredicatedRegister); return reg_; };

DATAFLOW_EXPORT int off() const { assert(type_ == Stack); return off_; };
DATAFLOW_EXPORT int region() const { assert(type_ == Stack); return region_; };
DATAFLOW_EXPORT ParseAPI::Function *func() const { assert(type_ == Stack); return func_; };

DATAFLOW_EXPORT Address addr() const { assert(type_ == Heap); return addr_; };
DATAFLOW_EXPORT const MachRegister &predReg() const { assert(type_ == PredicatedRegister); return preg_;};
DATAFLOW_EXPORT bool isTrueCondition() const { assert(type_ == PredicatedRegister); return trueCond_;};
DATAFLOW_EXPORT void flipPredicateCondition() { assert(type_ == PredicatedRegister); trueCond_ = !trueCond_; }

DATAFLOW_EXPORT bool operator<(const Absloc &rhs) const {
if (type_ != rhs.type_)
return type_ < rhs.type_;
switch(type_) {
case Register:
return reg_ < rhs.reg_;
case Stack:
if (off_ != rhs.off_)
return off_ < rhs.off_;
// Now we get arbitrary
if (region_ != rhs.region_)
return region_ < rhs.region_;
return func_ < rhs.func_;
case Heap:
return addr_ < rhs.addr_;
case Unknown:
return false; // everything is less than an unknown
}
assert(0);
return true;
}

DATAFLOW_EXPORT bool operator==(const Absloc &rhs) const {
if (type_ != rhs.type_) return false;
switch(type_) {
case Register:
return reg_ == rhs.reg_;
case Stack:
return ((off_ == rhs.off_) &&
(region_ == rhs.region_) &&
(func_ == rhs.func_));
case Heap:
return addr_ == rhs.addr_;
default:
return true;
}
}
DATAFLOW_EXPORT bool operator<(const Absloc &rhs) const;
DATAFLOW_EXPORT bool operator==(const Absloc &rhs) const;

DATAFLOW_EXPORT bool operator!=(const Absloc &rhs) const {
return !(*this == rhs);
Expand All @@ -174,6 +162,8 @@ class Absloc {
return 's';
case Heap:
return 'h';
case PredicatedRegister:
return 'p';
default:
return 'u';
}
Expand All @@ -194,6 +184,9 @@ class Absloc {
ParseAPI::Function *func_;

Address addr_;

MachRegister preg_;
bool trueCond_;
};

class AbsRegion {
Expand Down Expand Up @@ -258,6 +251,7 @@ class AbsRegion {
DATAFLOW_EXPORT AST::Ptr generator() const { return generator_; }

DATAFLOW_EXPORT bool isImprecise() const { return type_ != Absloc::Unknown; }
DATAFLOW_EXPORT void flipPredicateCondition() { absloc_.flipPredicateCondition(); }
friend std::ostream &operator<<(std::ostream &os, const AbsRegion &a) {
os << a.format();
return os;
Expand Down
4 changes: 4 additions & 0 deletions dataflowAPI/h/AbslocInterface.h
Expand Up @@ -80,6 +80,10 @@ class AbsRegionConverter {
ParseAPI::Function *func,
ParseAPI::Block *block);

DATAFLOW_EXPORT AbsRegion convertPredicatedRegister(InstructionAPI::RegisterAST::Ptr r,
InstructionAPI::RegisterAST::Ptr p,
bool c);

// Cons up a stack reference at the current addr
DATAFLOW_EXPORT AbsRegion stack(Address addr,
ParseAPI::Function *func,
Expand Down
30 changes: 27 additions & 3 deletions dataflowAPI/h/slicing.h
Expand Up @@ -137,11 +137,30 @@ class Slicer {
typedef std::pair<InstructionAPI::Instruction, Address> InsnInstance;
typedef std::vector<InsnInstance> InsnVec;

// An instruction cache to avoid redundant instruction decoding.
// A user can optionally provide a cache shared by multiple slicers.
// The cache is keyed with basic block starting address.
typedef dyn_hash_map<Address, InsnVec> InsnCache;

DATAFLOW_EXPORT Slicer(AssignmentPtr a,
ParseAPI::Block *block,
ParseAPI::Function *func,
bool cache = true,
bool stackAnalysis = true);

DATAFLOW_EXPORT Slicer(AssignmentPtr a,
ParseAPI::Block *block,
ParseAPI::Function *func,
AssignmentConverter *ac);

DATAFLOW_EXPORT Slicer(AssignmentPtr a,
ParseAPI::Block *block,
ParseAPI::Function *func,
AssignmentConverter *ac,
InsnCache *c);


DATAFLOW_EXPORT ~Slicer();

DATAFLOW_EXPORT static bool isWidenNode(Node::Ptr n);

Expand Down Expand Up @@ -261,11 +280,15 @@ class Slicer {
public:
typedef std::pair<ParseAPI::Function *, int> StackDepth_t;
typedef std::stack<StackDepth_t> CallStack_t;

DATAFLOW_EXPORT bool performCacheClear() { if (clearCache) {clearCache = false; return true;} else return false; }
DATAFLOW_EXPORT void setClearCache(bool b) { clearCache = b; }
DATAFLOW_EXPORT bool searchForControlFlowDep() { return controlFlowDep; }
DATAFLOW_EXPORT void setSearchForControlFlowDep(bool cfd) { controlFlowDep = cfd; }

// A negative number means that we do not bound slicing size.
DATAFLOW_EXPORT virtual int slicingSizeLimitFactor() { return -1; }

DATAFLOW_EXPORT virtual bool allowImprecision() { return false; }
DATAFLOW_EXPORT virtual bool widenAtPoint(AssignmentPtr) { return false; }
DATAFLOW_EXPORT virtual bool endAtPoint(AssignmentPtr) { return false; }
Expand Down Expand Up @@ -314,7 +337,6 @@ class Slicer {
forward,
backward } Direction;

typedef std::map<ParseAPI::Block *, InsnVec> InsnCache;

// Our slicing is context-sensitive; that is, if we enter
// a function foo from a caller bar, all return edges
Expand Down Expand Up @@ -688,7 +710,8 @@ class Slicer {

void mergeRecursiveCaches(std::map<Address, DefCache>& sc, std::map<Address, DefCache>& c, Address a);

InsnCache insnCache_;
InsnCache* insnCache_;
bool own_insnCache;

AssignmentPtr a_;
ParseAPI::Block *b_;
Expand Down Expand Up @@ -720,7 +743,8 @@ class Slicer {
std::deque<Address> addrStack;
std::set<Address> addrSet;

AssignmentConverter converter;
AssignmentConverter* converter;
bool own_converter;

SliceNode::Ptr widen_;
public:
Expand Down
56 changes: 56 additions & 0 deletions dataflowAPI/src/Absloc.C
Expand Up @@ -99,6 +99,11 @@ std::string Absloc::format() const {
case Heap:
ret << "_" << std::hex << addr_ << std::dec;
break;
case PredicatedRegister:
ret << "PRED_REG[";
if (!trueCond_) ret << "!";
ret << preg_.name() << "," << reg_.name() << "]";
break;
default:
ret << "(UNKNOWN)";
break;
Expand All @@ -107,6 +112,53 @@ std::string Absloc::format() const {
return ret.str();
}

// Strict weak ordering over abstract locations: order first by location
// type, then by the identifying fields of that type.
bool Absloc::operator<(const Absloc & rhs) const {
    if (type_ != rhs.type_)
        return type_ < rhs.type_;

    if (type_ == Register)
        return reg_ < rhs.reg_;

    if (type_ == Stack) {
        if (off_ != rhs.off_)
            return off_ < rhs.off_;
        // Now we get arbitrary
        if (region_ != rhs.region_)
            return region_ < rhs.region_;
        return func_ < rhs.func_;
    }

    if (type_ == Heap)
        return addr_ < rhs.addr_;

    if (type_ == PredicatedRegister) {
        if (reg_ != rhs.reg_)
            return reg_ < rhs.reg_;
        if (preg_ != rhs.preg_)
            return preg_ < rhs.preg_;
        return trueCond_ < rhs.trueCond_;
    }

    if (type_ == Unknown)
        return false; // everything is less than an unknown

    assert(0);
    return true;
}

// Equality over abstract locations: same type and, within that type,
// identical identifying fields. Unknown locations all compare equal.
bool Absloc::operator==(const Absloc & rhs) const {
    if (type_ != rhs.type_)
        return false;

    if (type_ == Register)
        return reg_ == rhs.reg_;

    if (type_ == Stack)
        return (off_ == rhs.off_) &&
               (region_ == rhs.region_) &&
               (func_ == rhs.func_);

    if (type_ == Heap)
        return addr_ == rhs.addr_;

    if (type_ == PredicatedRegister)
        return (reg_ == rhs.reg_) &&
               (preg_ == rhs.preg_) &&
               (trueCond_ == rhs.trueCond_);

    return true;
}


bool AbsRegion::contains(const Absloc::Type t) const {
// Abslocs, if they exist, must be specific.
// So just check our type
Expand Down Expand Up @@ -152,6 +204,10 @@ bool AbsRegion::contains(const AbsRegion &rhs) const {

if (absloc_ == rhs.absloc_) return true;

// If rhs is a predicated register and the lhs is a non-predicated register,
// then the lhs contains the rhs when the base registers are the same.
if (rhs.absloc_.type() == Absloc::PredicatedRegister && rhs.absloc_.reg() == absloc_.reg()) return true;

// Stack slots operate kinda... odd...
if ((absloc_.type() == Absloc::Stack) &&
(rhs.absloc_.type() == Absloc::Stack)) {
Expand Down

0 comments on commit 3fae9d9

Please sign in to comment.