Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
[ARM] Use NEON for loading the values from psq_l, which gives a minimal performance increase.
This change also begins a new NEONXEmitter to provide cleaner support for NEON.
  • Loading branch information
Sonicadvance1 committed Sep 8, 2013
1 parent 40f848d commit e6af497
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 68 deletions.
144 changes: 96 additions & 48 deletions Source/Core/Common/Src/ArmEmitter.cpp
Expand Up @@ -892,54 +892,6 @@ ARMReg ARMXEmitter::SubBase(ARMReg Reg)
return Reg;
}

// NEON Specific
// Emits a three-register NEON VABD instruction (Vd, Vn, Vm).
//
// Size - raw value shifted into bits 21:20 of the instruction word.
//        NOTE(review): the 0xF3 / 0xD opcode fields appear to select the
//        floating-point form, where bit 21 must be set; confirm callers only
//        pass a size that encodes to 2.
// Vd   - destination; must be a D or Q register. A Q register sets the Q flag.
// Vn   - first source operand.
// Vm   - second source operand.
void ARMXEmitter::VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VABD(float)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VABD(float) when CPU doesn't support it");
	// Must be sampled before SubBase() strips the register-class base.
	bool register_quad = Vd >= Q0;

	// Gets encoded as a double register
	Vd = SubBase(Vd);
	Vn = SubBase(Vn);
	Vm = SubBase(Vm);

	// Each 5-bit register index is split: the low four bits go into the
	// register field and the high bit into a separate flag
	// (D -> bit 22, N -> bit 7, M -> bit 5). Bit 6 is the Q flag, so the high
	// bit of Vm has to be shifted by 1; shifting by 2 would overwrite Q and
	// drop M, mis-encoding any Vm above D15.
	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16)
		| ((Vd & 0xF) << 12) | (0xD << 8) | ((Vn & 0x10) << 3) | (register_quad << 6)
		| ((Vm & 0x10) << 1) | (Vm & 0xF));
}
// Emits a three-register NEON integer VADD instruction (Vd, Vn, Vm).
//
// Size - element size, shifted into bits 21:20 of the instruction word.
// Vd   - destination; must be a D or Q register. A Q register sets the Q flag.
// Vn   - first source operand.
// Vm   - second source operand.
void ARMXEmitter::VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VADD(integer)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VADD(integer) when CPU doesn't support it");

	// Decide on quad operation before the register-class base is removed.
	const bool isQuad = Vd >= Q0;

	// Gets encoded as a double register
	const ARMReg dst = SubBase(Vd);
	const ARMReg lhs = SubBase(Vn);
	const ARMReg rhs = SubBase(Vm);

	// Per-operand fields: low four bits of the index plus its separate high bit.
	const int dstBits = ((dst & 0x10) << 18) | ((dst & 0xF) << 12); // D:Vd
	const int lhsBits = ((lhs & 0xF) << 16) | ((lhs & 0x10) << 3);  // Vn:N
	const int rhsBits = ((rhs & 0x10) << 1) | (rhs & 0xF);          // M:Vm
	const int sizeBits = Size << 20;
	const int quadBit = isQuad << 6;

	Write32((0xF2 << 24) | (0x8 << 8) | sizeBits | quadBit | dstBits | lhsBits | rhsBits);
}
// Emits a three-register NEON integer VSUB instruction (Vd, Vn, Vm).
//
// Size - element size, shifted into bits 21:20 of the instruction word.
// Vd   - destination; must be a D or Q register. A Q register sets the Q flag,
//        matching how VADD handles its destination.
// Vn   - first source operand.
// Vm   - second source operand.
void ARMXEmitter::VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VSUB(integer)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VSUB(integer) when CPU doesn't support it");

	// Must be sampled before SubBase() strips the register-class base.
	bool register_quad = Vd >= Q0;

	// Gets encoded as a double register
	Vd = SubBase(Vd);
	Vn = SubBase(Vn);
	Vm = SubBase(Vm);

	// D -> bit 22, N -> bit 7, Q -> bit 6, M -> bit 5. The high bit of Vm is
	// shifted by 1 so that it lands on M; a shift of 2 would collide with Q
	// and lose the upper-bank selection for Vm.
	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16)
		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (register_quad << 6)
		| ((Vm & 0x10) << 1) | (Vm & 0xF));
}

// Double/single, Neon
extern const VFPEnc VFPOps[16][2] = {
{{0xE0, 0xA0}, {0x20, 0xD1}}, // 0: VMLA
Expand Down Expand Up @@ -1269,4 +1221,100 @@ void ARMXEmitter::VCVT(ARMReg Dest, ARMReg Source, int flags)
}
}

// Emits a three-register NEON VABD instruction (Vd, Vn, Vm).
//
// Size - element type flags; encodedSize(Size) is shifted into bits 21:20.
//        NOTE(review): the 0xF3 / 0xD opcode fields appear to select the
//        floating-point form, where bit 21 must be set; confirm callers only
//        pass a size that encodes to 2 (I_32).
// Vd   - destination; must be a D or Q register. A Q register sets the Q flag.
// Vn   - first source operand.
// Vm   - second source operand.
void NEONXEmitter::VABD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VABD(float)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VABD(float) when CPU doesn't support it");
	// Must be sampled before SubBase() strips the register-class base.
	bool register_quad = Vd >= Q0;

	// Gets encoded as a double register
	Vd = SubBase(Vd);
	Vn = SubBase(Vn);
	Vm = SubBase(Vm);

	// Each 5-bit register index is split: the low four bits go into the
	// register field and the high bit into a separate flag
	// (D -> bit 22, N -> bit 7, M -> bit 5). Bit 6 is the Q flag, so the high
	// bit of Vm has to be shifted by 1; shifting by 2 would overwrite Q and
	// drop M, mis-encoding any Vm above D15.
	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (encodedSize(Size) << 20) | ((Vn & 0xF) << 16)
		| ((Vd & 0xF) << 12) | (0xD << 8) | ((Vn & 0x10) << 3) | (register_quad << 6)
		| ((Vm & 0x10) << 1) | (Vm & 0xF));
}
// Emits a three-register NEON integer VADD instruction (Vd, Vn, Vm).
//
// Size - element type flags; encodedSize(Size) is shifted into bits 21:20.
// Vd   - destination; must be a D or Q register. A Q register sets the Q flag.
// Vn   - first source operand.
// Vm   - second source operand.
void NEONXEmitter::VADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VADD(integer)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VADD(integer) when CPU doesn't support it");

	// Decide on quad operation before the register-class base is removed.
	const bool isQuad = Vd >= Q0;

	// Gets encoded as a double register
	const ARMReg dst = SubBase(Vd);
	const ARMReg lhs = SubBase(Vn);
	const ARMReg rhs = SubBase(Vm);

	// Per-operand fields: low four bits of the index plus its separate high bit.
	const int dstBits = ((dst & 0x10) << 18) | ((dst & 0xF) << 12); // D:Vd
	const int lhsBits = ((lhs & 0xF) << 16) | ((lhs & 0x10) << 3);  // Vn:N
	const int rhsBits = ((rhs & 0x10) << 1) | (rhs & 0xF);          // M:Vm
	const int quadBit = isQuad << 6;

	Write32((0xF2 << 24) | (0x8 << 8) | (encodedSize(Size) << 20) | quadBit
		| dstBits | lhsBits | rhsBits);
}
// Emits a three-register NEON integer VSUB instruction (Vd, Vn, Vm).
//
// Size - element type flags; encodedSize(Size) is shifted into bits 21:20.
// Vd   - destination; must be a D or Q register. A Q register sets the Q flag,
//        matching how VADD handles its destination.
// Vn   - first source operand.
// Vm   - second source operand.
void NEONXEmitter::VSUB(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VSUB(integer)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VSUB(integer) when CPU doesn't support it");

	// Must be sampled before SubBase() strips the register-class base.
	bool register_quad = Vd >= Q0;

	// Gets encoded as a double register
	Vd = SubBase(Vd);
	Vn = SubBase(Vn);
	Vm = SubBase(Vm);

	// D -> bit 22, N -> bit 7, Q -> bit 6, M -> bit 5. The high bit of Vm is
	// shifted by 1 so that it lands on M; a shift of 2 would collide with Q
	// and lose the upper-bank selection for Vm.
	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (encodedSize(Size) << 20) | ((Vn & 0xF) << 16)
		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (register_quad << 6)
		| ((Vm & 0x10) << 1) | (Vm & 0xF));
}

// Emits a NEON VLD1 (multiple single elements) load into one register.
//
// Size  - element type flags; encodedSize(Size) goes into bits 7:6.
// Vd    - destination register; encoded as a double register.
// Rn    - base address register (bits 19:16).
// align - alignment hint, written to bits 5:4.
// Rm    - written as-is to bits 3:0.
void NEONXEmitter::VLD1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
{
	// Value for bits 11:8; only the single-register form is supported.
	const u32 listType = 0x7;

	// Gets encoded as a double register
	const ARMReg dst = SubBase(Vd);

	// High bit of the destination index lives at bit 22, the rest at 15:12.
	const int dstBits = ((dst & 0x10) << 18) | ((dst & 0xF) << 12);
	const int baseBits = Rn << 16;

	Write32((0xF4 << 24) | (1 << 21) | dstBits | baseBits
		| (listType << 8) | (encodedSize(Size) << 6)
		| (align << 4) | Rm);
}

// Emits a NEON VLD2 (multiple 2-element structures) load using single-spaced
// registers.
//
// Size  - element type flags; encodedSize(Size) goes into bits 7:6.
// Vd    - first destination register; encoded as a double register.
// Rn    - base address register (bits 19:16).
// align - alignment hint, written to bits 5:4.
// Rm    - written as-is to bits 3:0.
void NEONXEmitter::VLD2(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
{
	// Value for bits 11:8; selects single-spaced registers.
	const u32 listType = 0x8;

	// Gets encoded as a double register
	const ARMReg dst = SubBase(Vd);

	// High bit of the destination index lives at bit 22, the rest at 15:12.
	const int dstBits = ((dst & 0x10) << 18) | ((dst & 0xF) << 12);
	const int baseBits = Rn << 16;

	Write32((0xF4 << 24) | (1 << 21) | dstBits | baseBits
		| (listType << 8) | (encodedSize(Size) << 6)
		| (align << 4) | Rm);
}

// Shared encoder behind VREV16 / VREV32 / VREV64.
//
// size - value written to the op field starting at bit 7; it selects which
//        of the VREV variants is emitted.
// Size - element type flags; encodedSize(Size) goes into bits 19:18.
// Vd   - destination D or Q register. A Q register sets the Q flag.
// Vm   - source register.
void NEONXEmitter::VREVX(u32 size, NEONElementType Size, ARMReg Vd, ARMReg Vm)
{
	// Must be sampled before SubBase() strips the register-class base.
	bool register_quad = Vd >= Q0;
	Vd = SubBase(Vd);
	Vm = SubBase(Vm);

	// D -> bit 22, Q -> bit 6, M -> bit 5. The high bit of Vm is shifted by 1
	// so that it lands on M; a shift of 2 would collide with the Q flag and
	// mis-encode any Vm above D15.
	Write32((0xF3 << 24) | (1 << 23) | ((Vd & 0x10) << 18) | (0x3 << 20)
		| (encodedSize(Size) << 18) | ((Vd & 0xF) << 12) | (size << 7)
		| (register_quad << 6) | ((Vm & 0x10) << 1) | (Vm & 0xF));
}

// Emits VREV64: reverses the order of Size-wide elements within each 64-bit
// region of Vm and writes the result to Vd.
void NEONXEmitter::VREV64(NEONElementType Size, ARMReg Vd, ARMReg Vm)
{
	// The architectural op field is 0 for VREV64 (region size = 8 >> op bytes);
	// passing 2 here would emit VREV16 instead.
	VREVX(0, Size, Vd, Vm);
}

// Emits VREV32 by delegating to VREVX with an op field value of 1.
void NEONXEmitter::VREV32(NEONElementType Size, ARMReg Vd, ARMReg Vm)
{
	const int op = 1; // op field value passed to the shared encoder
	VREVX(op, Size, Vd, Vm);
}

// Emits VREV16: reverses the order of Size-wide elements within each 16-bit
// region of Vm and writes the result to Vd.
void NEONXEmitter::VREV16(NEONElementType Size, ARMReg Vd, ARMReg Vm)
{
	// The architectural op field is 2 for VREV16 (region size = 8 >> op bytes);
	// passing 0 here would emit VREV64 instead.
	VREVX(2, Size, Vd, Vm);
}

}

73 changes: 61 additions & 12 deletions Source/Core/Common/Src/ArmEmitter.h
Expand Up @@ -104,13 +104,6 @@ enum ShiftType
ST_ROR = 3,
ST_RRX = 4
};
// Element sizes for integer NEON operations. The numeric value is what the
// emitters shift directly into the instruction's size field.
enum IntegerSize
{
	I_I8  = 0,
	I_I16 = 1,
	I_I32 = 2,
	I_I64 = 3
};

enum
{
Expand Down Expand Up @@ -349,6 +342,7 @@ typedef const u8* JumpTarget;
class ARMXEmitter
{
friend struct OpArg; // for Write8 etc
friend class NEONXEmitter;
private:
u8 *code, *startcode;
u8 *lastCacheFlushEnd;
Expand Down Expand Up @@ -533,11 +527,7 @@ class ARMXEmitter

// Subtracts the base from the register to give us the real one
ARMReg SubBase(ARMReg Reg);
// NEON Only
// Three-operand NEON emitters: Vd is the destination, Vn and Vm are the
// sources, and Size is the IntegerSize value placed in the instruction's
// size field. The definitions live in ArmEmitter.cpp.
void VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);


// VFP Only
void VLDR(ARMReg Dest, ARMReg Base, s16 offset);
void VSTR(ARMReg Src, ARMReg Base, s16 offset);
Expand Down Expand Up @@ -584,6 +574,65 @@ class ARMXEmitter

}; // class ARMXEmitter

// Bit flags describing the element type of a NEON operation. Each enumerator
// occupies its own bit so that values can be tested with a bitwise AND.
enum NEONElementType
{
	I_8        = 0x01,
	I_16       = 0x02,
	I_32       = 0x04,
	I_64       = 0x08,
	I_SIGNED   = 0x10,
	I_UNSIGNED = 0x20,
	F_32       = 0x40
};

// Alignment hint for NEON loads. Enumerators count up from zero; the numeric
// value is what the load emitters write into the instruction's align field.
enum NEONAlignment
{
	ALIGN_NONE = 0,
	ALIGN_64,
	ALIGN_128,
	ALIGN_256
};


class NEONXEmitter
{
private:
ARMXEmitter *_emit;
ARMReg SubBase(ARMReg Reg) { return _emit->SubBase(Reg); }
inline void Write32(u32 value) { _emit->Write32(value); }

inline u32 encodedSize(u32 value)
{
if (value & I_8)
return 0;
else if (value & I_16)
return 1;
else if (value & I_32)
return 2;
else if (value & I_64)
return 3;
else
_dbg_assert_msg_(DYNA_REC, false, "Passed invalid size to integer NEON instruction");
return 0;
}

void VREVX(u32 size, NEONElementType Size, ARMReg Vd, ARMReg Vm);

public:
NEONXEmitter(ARMXEmitter *emit)
: _emit(emit)
{}

void VABD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VSUB(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VREV64(NEONElementType Size, ARMReg Vd, ARMReg Vm);
void VREV32(NEONElementType Size, ARMReg Vd, ARMReg Vm);
void VREV16(NEONElementType Size, ARMReg Vd, ARMReg Vm);

void VLD1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
void VLD2(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
};

// Everything that needs to generate X86 code should inherit from this.
// You get memory management for free, plus, you can use all the MOV etc functions without
Expand Down
3 changes: 2 additions & 1 deletion Source/Core/Core/Src/PowerPC/JitArm32/Jit.cpp
Expand Up @@ -467,7 +467,8 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
MOVI2R(RB, (u32)&One);
VLDR(VA, RA, 0);
VLDR(VB, RB, 0);
VADD(I_I64, VA, VA, VB);
NEONXEmitter nemit(this);
nemit.VADD(I_64, VA, VA, VB);
VSTR(VA, RA, 0);
gpr.Unlock(RA, RB);
fpr.Unlock(VA);
Expand Down
10 changes: 3 additions & 7 deletions Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp
Expand Up @@ -157,13 +157,9 @@ void JitArmAsmRoutineManager::GenerateCommon()
MOVI2R(R14, (u32)Memory::base);
ADD(R10, R10, R14);

LDR(R12, R10);
REV(R12, R12);
VMOV(S0, R12);

LDR(R12, R10, 4);
REV(R12, R12);
VMOV(S1, R12);
NEONXEmitter nemit(this);
nemit.VLD1(I_32, D0, R10);
nemit.VREV32(I_8, D0, D0);

POP(2, R12, _PC);
const u8* loadPairedFloatOne = GetCodePtr();
Expand Down

0 comments on commit e6af497

Please sign in to comment.