Expand Up
@@ -41,6 +41,64 @@ using namespace ArmGen;
JitArmAsmRoutineManager asm_routines;
// Multipliers used by the quantized store routines: the generated code loads
// one entry (table base + the offset held in R11) and multiplies the value(s)
// by it before converting to an integer.
//
// Layout (64 entries):
//   index  0..31 -> 2^index             (1, 2, 4, ... 2^31)
//   index 32..63 -> 2^(index - 64)      (2^-32, 2^-31, ... 2^-1)
// NOTE(review): this looks like a 6-bit two's-complement exponent; confirm
// against the code that computes the offset in R11.
//
// Every shift uses an unsigned 64-bit one. With a 32-bit int, `1 << 31` lands
// in the sign bit, which would make entries 31 and 33 negative.
static const float GC_ALIGNED16(m_quantizeTableS[]) =
{
	(1ULL <<  0), (1ULL <<  1), (1ULL <<  2), (1ULL <<  3),
	(1ULL <<  4), (1ULL <<  5), (1ULL <<  6), (1ULL <<  7),
	(1ULL <<  8), (1ULL <<  9), (1ULL << 10), (1ULL << 11),
	(1ULL << 12), (1ULL << 13), (1ULL << 14), (1ULL << 15),
	(1ULL << 16), (1ULL << 17), (1ULL << 18), (1ULL << 19),
	(1ULL << 20), (1ULL << 21), (1ULL << 22), (1ULL << 23),
	(1ULL << 24), (1ULL << 25), (1ULL << 26), (1ULL << 27),
	(1ULL << 28), (1ULL << 29), (1ULL << 30), (1ULL << 31),
	1.0 / (1ULL << 32), 1.0 / (1ULL << 31), 1.0 / (1ULL << 30), 1.0 / (1ULL << 29),
	1.0 / (1ULL << 28), 1.0 / (1ULL << 27), 1.0 / (1ULL << 26), 1.0 / (1ULL << 25),
	1.0 / (1ULL << 24), 1.0 / (1ULL << 23), 1.0 / (1ULL << 22), 1.0 / (1ULL << 21),
	1.0 / (1ULL << 20), 1.0 / (1ULL << 19), 1.0 / (1ULL << 18), 1.0 / (1ULL << 17),
	1.0 / (1ULL << 16), 1.0 / (1ULL << 15), 1.0 / (1ULL << 14), 1.0 / (1ULL << 13),
	1.0 / (1ULL << 12), 1.0 / (1ULL << 11), 1.0 / (1ULL << 10), 1.0 / (1ULL <<  9),
	1.0 / (1ULL <<  8), 1.0 / (1ULL <<  7), 1.0 / (1ULL <<  6), 1.0 / (1ULL <<  5),
	1.0 / (1ULL <<  4), 1.0 / (1ULL <<  3), 1.0 / (1ULL <<  2), 1.0 / (1ULL <<  1),
};
// Multipliers used by the quantized load routines: the generated code loads
// one entry (table base + the offset held in R11) and multiplies the
// integer-to-float converted value(s) by it.
//
// Layout (64 entries), the reciprocal of the quantize table entry by entry:
//   index  0..31 -> 2^-index            (1, 1/2, 1/4, ... 2^-31)
//   index 32..63 -> 2^(64 - index)      (2^32, 2^31, ... 2^1)
// NOTE(review): this looks like a 6-bit two's-complement exponent; confirm
// against the code that computes the offset in R11.
//
// Every shift uses an unsigned 64-bit one. With a 32-bit int, `1 << 31` lands
// in the sign bit, which would make entries 31 and 33 negative.
static const float GC_ALIGNED16(m_dequantizeTableS[]) =
{
	1.0 / (1ULL <<  0), 1.0 / (1ULL <<  1), 1.0 / (1ULL <<  2), 1.0 / (1ULL <<  3),
	1.0 / (1ULL <<  4), 1.0 / (1ULL <<  5), 1.0 / (1ULL <<  6), 1.0 / (1ULL <<  7),
	1.0 / (1ULL <<  8), 1.0 / (1ULL <<  9), 1.0 / (1ULL << 10), 1.0 / (1ULL << 11),
	1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15),
	1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19),
	1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23),
	1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27),
	1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31),
	(1ULL << 32), (1ULL << 31), (1ULL << 30), (1ULL << 29),
	(1ULL << 28), (1ULL << 27), (1ULL << 26), (1ULL << 25),
	(1ULL << 24), (1ULL << 23), (1ULL << 22), (1ULL << 21),
	(1ULL << 20), (1ULL << 19), (1ULL << 18), (1ULL << 17),
	(1ULL << 16), (1ULL << 15), (1ULL << 14), (1ULL << 13),
	(1ULL << 12), (1ULL << 11), (1ULL << 10), (1ULL <<  9),
	(1ULL <<  8), (1ULL <<  7), (1ULL <<  6), (1ULL <<  5),
	(1ULL <<  4), (1ULL <<  3), (1ULL <<  2), (1ULL <<  1),
};
// Stores a pair of 32-bit values back to back through Memory::Write_U32:
// value1 goes to `address`, value2 to the word immediately after it.
// The generated paired-float store code takes this function's address and
// branches to it on its non-fast path.
static void WriteDual32(u32 value1, u32 value2, u32 address)
{
	// Second element sits one 32-bit word (4 bytes) past the first.
	const u32 second_address = address + 4;

	Memory::Write_U32(value1, address);
	Memory::Write_U32(value2, second_address);
}
// Stores a pair of values back to back through Memory::Write_U16:
// value1 goes to `address`, value2 to the halfword immediately after it.
// Both values arrive as u32 and are converted implicitly at the call
// (presumably truncated to their low 16 bits; confirm against the
// Memory::Write_U16 signature).
static void WriteDual16(u32 value1, u32 value2, u32 address)
{
	// Second element sits one 16-bit halfword (2 bytes) past the first.
	const u32 second_address = address + 2;

	Memory::Write_U16(value1, address);
	Memory::Write_U16(value2, second_address);
}
// Stores a pair of values back to back through Memory::Write_U8:
// value1 goes to `address`, value2 to the byte immediately after it.
// Both values arrive as u32 and are converted implicitly at the call
// (presumably truncated to their low 8 bits; confirm against the
// Memory::Write_U8 signature).
static void WriteDual8(u32 value1, u32 value2, u32 address)
{
	// Second element sits one byte past the first.
	const u32 second_address = address + 1;

	Memory::Write_U8(value1, address);
	Memory::Write_U8(value2, second_address);
}
void JitArmAsmRoutineManager::Generate ()
{
enterCode = GetCodePtr ();
Expand Down
Expand Up
@@ -150,48 +208,221 @@ void JitArmAsmRoutineManager::GenerateCommon()
// R11 is scale
// R10 is the address
Operand2 mask (3 , 1 ); // ~(Memory::MEMVIEW32_MASK)
Operand2 arghmask (3 , 3 ); // 0x0C000000
NEONXEmitter nemit (this );
const u8* loadPairedIllegal = GetCodePtr ();
BKPT (0x10 );
const u8* loadPairedFloatTwo = GetCodePtr ();
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
nemit.VLD1 (I_32, D0, R10);
nemit.VREV32 (I_8, D0, D0);
MOV (_PC, _LR);
{
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
nemit.VLD1 (I_32, D0, R10);
nemit.VREV32 (I_8, D0, D0);
MOV (_PC, _LR);
}
const u8* loadPairedFloatOne = GetCodePtr ();
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base );
ADD (R10, R10, R12 );
nemit. VLD1 (I_32, D0, R10);
nemit.VREV32 (I_8 , D0, D0 );
MOVI2F (S1, 1 . 0f , INVALID_REG); // Temp reg isn't used for 1.0f
MOV (_PC, _LR);
{
BIC (R10, R10, mask );
MOVI2R (R12, (u32)Memory::base );
ADD (R10, R10, R12);
nemit.VLD1 (I_32 , D0, R10 );
nemit. VREV32 (I_8, D0, D0);
MOV (_PC, _LR);
}
const u8* loadPairedU8Two = GetCodePtr ();
BKPT (0x13 );
{
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
LDRH (R12, R10);
SXTB (R12, R12);
VMOV (S0, R12);
LDRH (R12, R10, 2 );
SXTB (R12, R12);
VMOV (S1, R12);
MOVI2R (R10, (u32)&m_dequantizeTableS);
ADD (R10, R10, R11);
VLDR (S2, R10, 0 );
VCVT (S0, S0, TO_FLOAT);
VCVT (S1, S1, TO_FLOAT);
VMUL (S0, S0, S2);
VMUL (S1, S1, S2);
MOV (_PC, _LR);
}
const u8* loadPairedU8One = GetCodePtr ();
BKPT (0x14 );
{
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
LDRB (R12, R10);
SXTB (R12, R12);
VMOV (S0, R12);
MOVI2R (R10, (u32)&m_dequantizeTableS);
ADD (R10, R10, R11);
VLDR (S2, R10, 0 );
VCVT (S0, S0, TO_FLOAT);
VMUL (S0, S0, S2);
MOV (_PC, _LR);
}
const u8* loadPairedS8Two = GetCodePtr ();
BKPT (0x15 );
{
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
LDRH (R12, R10);
SXTB (R12, R12);
VMOV (S0, R12);
LDRH (R12, R10, 2 );
SXTB (R12, R12);
VMOV (S1, R12);
MOVI2R (R10, (u32)&m_dequantizeTableS);
ADD (R10, R10, R11);
VLDR (S2, R10, 0 );
VCVT (S0, S0, TO_FLOAT | IS_SIGNED);
VCVT (S1, S1, TO_FLOAT | IS_SIGNED);
VMUL (S0, S0, S2);
VMUL (S1, S1, S2);
MOV (_PC, _LR);
}
const u8* loadPairedS8One = GetCodePtr ();
BKPT (0x16 );
{
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
LDRB (R12, R10);
SXTB (R12, R12);
VMOV (S0, R12);
MOVI2R (R10, (u32)&m_dequantizeTableS);
ADD (R10, R10, R11);
VLDR (S2, R10, 0 );
VCVT (S0, S0, TO_FLOAT | IS_SIGNED);
VMUL (S0, S0, S2);
MOV (_PC, _LR);
}
const u8* loadPairedU16Two = GetCodePtr ();
BKPT (0x17 );
{
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
LDRH (R12, R10);
REV16 (R12, R12);
SXTH (R12, R12);
VMOV (S0, R12);
LDRH (R12, R10, 2 );
REV16 (R12, R12);
SXTH (R12, R12);
VMOV (S1, R12);
MOVI2R (R10, (u32)&m_dequantizeTableS);
ADD (R10, R10, R11);
VLDR (S2, R10, 0 );
VCVT (S0, S0, TO_FLOAT);
VCVT (S1, S1, TO_FLOAT);
VMUL (S0, S0, S2);
VMUL (S1, S1, S2);
MOV (_PC, _LR);
}
const u8* loadPairedU16One = GetCodePtr ();
BKPT (0x18 );
{
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
LDRH (R12, R10);
REV16 (R12, R12);
VMOV (S0, R12);
MOVI2R (R10, (u32)&m_dequantizeTableS);
ADD (R10, R10, R11);
VLDR (S2, R10, 0 );
VCVT (S0, S0, TO_FLOAT);
VMUL (S0, S0, S2);
MOV (_PC, _LR);
}
const u8* loadPairedS16Two = GetCodePtr ();
BKPT (0x19 );
{
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
LDRH (R12, R10);
REV16 (R12, R12);
SXTH (R12, R12);
VMOV (S0, R12);
LDRH (R12, R10, 2 );
REV16 (R12, R12);
SXTH (R12, R12);
VMOV (S1, R12);
MOVI2R (R10, (u32)&m_dequantizeTableS);
ADD (R10, R10, R11);
VLDR (S2, R10, 0 );
VCVT (S0, S0, TO_FLOAT | IS_SIGNED);
VCVT (S1, S1, TO_FLOAT | IS_SIGNED);
VMUL (S0, S0, S2);
VMUL (S1, S1, S2);
MOV (_PC, _LR);
}
const u8* loadPairedS16One = GetCodePtr ();
BKPT (0x20 );
{
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
LDRH (R12, R10);
MOVI2R (R10, (u32)&m_dequantizeTableS);
ADD (R10, R10, R11);
VLDR (S2, R10, 0 );
REV16 (R12, R12);
SXTH (R12, R12);
VMOV (S0, R12);
VCVT (S0, S0, TO_FLOAT | IS_SIGNED);
VMUL (S0, S0, S2);
MOV (_PC, _LR);
}
pairedLoadQuantized = reinterpret_cast <const u8**>(const_cast <u8*>(AlignCode16 ()));
ReserveCodeSpace (16 * sizeof (u8*));
Expand All
@@ -214,4 +445,182 @@ void JitArmAsmRoutineManager::GenerateCommon()
pairedLoadQuantized[14 ] = loadPairedS8One;
pairedLoadQuantized[15 ] = loadPairedS16One;
// Stores
const u8* storePairedIllegal = GetCodePtr ();
BKPT (0x21 );
const u8* storePairedFloat = GetCodePtr ();
{
TST (R10, arghmask);
FixupBranch argh = B_CC (CC_NEQ);
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
nemit.VREV32 (I_8, D0, D0);
nemit.VST1 (I_32, D0, R10);
MOV (_PC, _LR);
SetJumpTarget (argh);
PUSH (5 , R0, R1, R2, R3, _LR);
VMOV (R0, S0);
VMOV (R1, S1);
MOV (R2, R10);
MOVI2R (R12, (u32)&WriteDual32);
BL (R12);
POP (5 , R0, R1, R2, R3, _PC);
}
const u8* storePairedU8 = GetCodePtr ();
const u8* storePairedS8 = GetCodePtr ();
{
// R10 is the addr
// R11 is the scale
// R12 is scratch
// S0, S1 is the values
PUSH (5 , R0, R1, R2, R3, _LR);
MOVI2R (R12, (u32)&m_quantizeTableS);
ADD (R12, R12, R11);
VLDR (S2, R12, 0 );
VMUL (S0, S0, S2);
VMUL (S1, S1, S2);
VCVT (S0, S0, TO_INT | ROUND_TO_ZERO);
VCVT (S1, S1, TO_INT | ROUND_TO_ZERO);
VMOV (R0, S0);
VMOV (R1, S1);
MOV (R2, R10);
MOVI2R (R12, (u32)&WriteDual8);
BL (R12);
POP (5 , R0, R1, R2, R3, _PC);
}
const u8* storePairedU16 = GetCodePtr ();
const u8* storePairedS16 = GetCodePtr ();
{
PUSH (5 , R0, R1, R2, R3, _LR);
MOVI2R (R12, (u32)&m_quantizeTableS);
ADD (R12, R12, R11);
VLDR (S2, R12, 0 );
VMUL (S0, S0, S2);
VMUL (S1, S1, S2);
VCVT (S0, S0, TO_INT | ROUND_TO_ZERO);
VCVT (S1, S1, TO_INT | ROUND_TO_ZERO);
VMOV (R0, S0);
VMOV (R1, S1);
MOV (R2, R10);
MOVI2R (R12, (u32)&WriteDual16);
BL (R12);
POP (5 , R0, R1, R2, R3, _PC);
}
const u8* storeSingleIllegal = GetCodePtr ();
BKPT (0x27 );
const u8* storeSingleFloat = GetCodePtr ();
{
TST (R10, arghmask);
FixupBranch argh = B_CC (CC_NEQ);
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
VMOV (R12, S0);
REV (R12, R12);
STR (R12, R10);
MOV (_PC, _LR);
SetJumpTarget (argh);
PUSH (5 , R0, R1, R2, R3, _LR);
VMOV (R0, S0);
MOV (R1, R10);
MOVI2R (R10, (u32)&Memory::Write_U32);
BL (R10);
POP (5 , R0, R1, R2, R3, _PC);
}
const u8* storeSingleU8 = GetCodePtr (); // Used by MKWii
const u8* storeSingleS8 = GetCodePtr ();
{
MOVI2R (R12, (u32)&m_quantizeTableS);
ADD (R12, R12, R11);
VLDR (S2, R12, 0 );
VMUL (S0, S0, S2);
TST (R10, arghmask);
FixupBranch argh = B_CC (CC_NEQ);
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
VCVT (S0, S0, TO_INT | ROUND_TO_ZERO);
VMOV (R12, S0);
STRB (R12, R10);
MOV (_PC, _LR);
SetJumpTarget (argh);
PUSH (5 , R0, R1, R2, R3, _LR);
VMOV (R0, S0);
MOV (R1, R10);
MOVI2R (R10, (u32)&Memory::Write_U8);
BL (R10);
POP (5 , R0, R1, R2, R3, _PC);
}
const u8* storeSingleU16 = GetCodePtr (); // Used by MKWii
const u8* storeSingleS16 = GetCodePtr ();
{
MOVI2R (R12, (u32)&m_quantizeTableS);
ADD (R12, R12, R11);
VLDR (S2, R12, 0 );
VMUL (S0, S0, S2);
TST (R10, arghmask);
FixupBranch argh = B_CC (CC_NEQ);
BIC (R10, R10, mask);
MOVI2R (R12, (u32)Memory::base);
ADD (R10, R10, R12);
VCVT (S0, S0, TO_INT | ROUND_TO_ZERO);
VMOV (R12, S0);
REV16 (R12, R12);
STRH (R12, R10);
MOV (_PC, _LR);
SetJumpTarget (argh);
PUSH (5 , R0, R1, R2, R3, _LR);
VMOV (R0, S0);
MOV (R1, R10);
MOVI2R (R10, (u32)&Memory::Write_U16);
BL (R10);
POP (5 , R0, R1, R2, R3, _PC);
}
pairedStoreQuantized = reinterpret_cast <const u8**>(const_cast <u8*>(AlignCode16 ()));
ReserveCodeSpace (16 * sizeof (u8*));
pairedStoreQuantized[0 ] = storePairedFloat;
pairedStoreQuantized[1 ] = storePairedIllegal;
pairedStoreQuantized[2 ] = storePairedIllegal;
pairedStoreQuantized[3 ] = storePairedIllegal;
pairedStoreQuantized[4 ] = storePairedU8;
pairedStoreQuantized[5 ] = storePairedU16;
pairedStoreQuantized[6 ] = storePairedS8;
pairedStoreQuantized[7 ] = storePairedS16;
pairedStoreQuantized[8 ] = storeSingleFloat;
pairedStoreQuantized[9 ] = storeSingleIllegal;
pairedStoreQuantized[10 ] = storeSingleIllegal;
pairedStoreQuantized[11 ] = storeSingleIllegal;
pairedStoreQuantized[12 ] = storeSingleU8;
pairedStoreQuantized[13 ] = storeSingleU16;
pairedStoreQuantized[14 ] = storeSingleS8;
pairedStoreQuantized[15 ] = storeSingleS16;
}