463 changes: 436 additions & 27 deletions Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp
Expand Up @@ -41,6 +41,64 @@ using namespace ArmGen;

JitArmAsmRoutineManager asm_routines;

// Scale factors applied (by multiplication) to float values before they are
// converted to integers in the quantized store routines.
//
// Layout, 64 entries:
//   index  0..31 -> 2^index
//   index 32..63 -> 2^(index - 64)   (i.e. 2^-32 down to 2^-1)
// NOTE(review): this looks like a 6-bit signed scale value used as the index;
// confirm against the code that fills R11.
//
// The emitted code adds R11 directly to this table's address before the VLDR,
// so the register is used as a byte offset rather than an element index.
//
// All shifts use 1ULL: `1 << 31` on a 32-bit int shifts into the sign bit and
// produces a negative value, which would flip the sign of entries 31 and 33.
static const float GC_ALIGNED16(m_quantizeTableS[]) =
{
	(1ULL <<  0), (1ULL <<  1), (1ULL <<  2), (1ULL <<  3),
	(1ULL <<  4), (1ULL <<  5), (1ULL <<  6), (1ULL <<  7),
	(1ULL <<  8), (1ULL <<  9), (1ULL << 10), (1ULL << 11),
	(1ULL << 12), (1ULL << 13), (1ULL << 14), (1ULL << 15),
	(1ULL << 16), (1ULL << 17), (1ULL << 18), (1ULL << 19),
	(1ULL << 20), (1ULL << 21), (1ULL << 22), (1ULL << 23),
	(1ULL << 24), (1ULL << 25), (1ULL << 26), (1ULL << 27),
	(1ULL << 28), (1ULL << 29), (1ULL << 30), (1ULL << 31),
	1.0 / (1ULL << 32), 1.0 / (1ULL << 31), 1.0 / (1ULL << 30), 1.0 / (1ULL << 29),
	1.0 / (1ULL << 28), 1.0 / (1ULL << 27), 1.0 / (1ULL << 26), 1.0 / (1ULL << 25),
	1.0 / (1ULL << 24), 1.0 / (1ULL << 23), 1.0 / (1ULL << 22), 1.0 / (1ULL << 21),
	1.0 / (1ULL << 20), 1.0 / (1ULL << 19), 1.0 / (1ULL << 18), 1.0 / (1ULL << 17),
	1.0 / (1ULL << 16), 1.0 / (1ULL << 15), 1.0 / (1ULL << 14), 1.0 / (1ULL << 13),
	1.0 / (1ULL << 12), 1.0 / (1ULL << 11), 1.0 / (1ULL << 10), 1.0 / (1ULL <<  9),
	1.0 / (1ULL <<  8), 1.0 / (1ULL <<  7), 1.0 / (1ULL <<  6), 1.0 / (1ULL <<  5),
	1.0 / (1ULL <<  4), 1.0 / (1ULL <<  3), 1.0 / (1ULL <<  2), 1.0 / (1ULL <<  1),
};

// Scale factors applied (by multiplication) to integer values after they are
// converted to float in the quantized load routines. Each entry is the
// reciprocal of the entry at the same index in the quantize table.
//
// Layout, 64 entries:
//   index  0..31 -> 2^-index
//   index 32..63 -> 2^(64 - index)   (i.e. 2^32 down to 2^1)
// NOTE(review): this looks like a 6-bit signed scale value used as the index;
// confirm against the code that fills R11.
//
// The emitted code adds R11 directly to this table's address before the VLDR,
// so the register is used as a byte offset rather than an element index.
//
// All shifts use 1ULL: `1 << 31` on a 32-bit int shifts into the sign bit and
// produces a negative value, which would flip the sign of entries 31 and 33.
static const float GC_ALIGNED16(m_dequantizeTableS[]) =
{
	1.0 / (1ULL <<  0), 1.0 / (1ULL <<  1), 1.0 / (1ULL <<  2), 1.0 / (1ULL <<  3),
	1.0 / (1ULL <<  4), 1.0 / (1ULL <<  5), 1.0 / (1ULL <<  6), 1.0 / (1ULL <<  7),
	1.0 / (1ULL <<  8), 1.0 / (1ULL <<  9), 1.0 / (1ULL << 10), 1.0 / (1ULL << 11),
	1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15),
	1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19),
	1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23),
	1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27),
	1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31),
	(1ULL << 32), (1ULL << 31), (1ULL << 30), (1ULL << 29),
	(1ULL << 28), (1ULL << 27), (1ULL << 26), (1ULL << 25),
	(1ULL << 24), (1ULL << 23), (1ULL << 22), (1ULL << 21),
	(1ULL << 20), (1ULL << 19), (1ULL << 18), (1ULL << 17),
	(1ULL << 16), (1ULL << 15), (1ULL << 14), (1ULL << 13),
	(1ULL << 12), (1ULL << 11), (1ULL << 10), (1ULL <<  9),
	(1ULL <<  8), (1ULL <<  7), (1ULL <<  6), (1ULL <<  5),
	(1ULL <<  4), (1ULL <<  3), (1ULL <<  2), (1ULL <<  1),
};

// Stores two 32-bit values to emulated memory through Memory::Write_U32:
// value1 at `address`, value2 in the word that follows it.
// The address of this helper is loaded into a register and called from the
// emitted paired-float store routine, with the arguments passed in R0/R1/R2.
static void WriteDual32(u32 value1, u32 value2, u32 address)
{
	const u32 first_slot = address;
	const u32 second_slot = address + 4; // one 32-bit word further on

	Memory::Write_U32(value1, first_slot);
	Memory::Write_U32(value2, second_slot);
}

// Stores two values to emulated memory through Memory::Write_U16:
// value1 at `address`, value2 two bytes after it.
// The parameters stay u32 because the emitted paired 16-bit store routine
// passes them in R0/R1/R2 when it calls this helper.
static void WriteDual16(u32 value1, u32 value2, u32 address)
{
	const u32 first_slot = address;
	const u32 second_slot = address + 2; // one halfword further on

	Memory::Write_U16(value1, first_slot);
	Memory::Write_U16(value2, second_slot);
}

// Stores two values to emulated memory through Memory::Write_U8:
// value1 at `address`, value2 in the byte that follows it.
// The parameters stay u32 because the emitted paired 8-bit store routine
// passes them in R0/R1/R2 when it calls this helper.
static void WriteDual8(u32 value1, u32 value2, u32 address)
{
	const u32 first_slot = address;
	const u32 second_slot = address + 1; // the very next byte

	Memory::Write_U8(value1, first_slot);
	Memory::Write_U8(value2, second_slot);
}

void JitArmAsmRoutineManager::Generate()
{
enterCode = GetCodePtr();
Expand Down Expand Up @@ -150,48 +208,221 @@ void JitArmAsmRoutineManager::GenerateCommon()
// R11 is scale
// R10 is the address
Operand2 mask(3, 1); // ~(Memory::MEMVIEW32_MASK)
Operand2 arghmask(3, 3); // 0x0C000000
NEONXEmitter nemit(this);

const u8* loadPairedIllegal = GetCodePtr();
BKPT(0x10);

const u8* loadPairedFloatTwo = GetCodePtr();
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

nemit.VLD1(I_32, D0, R10);
nemit.VREV32(I_8, D0, D0);

MOV(_PC, _LR);

{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

nemit.VLD1(I_32, D0, R10);
nemit.VREV32(I_8, D0, D0);

MOV(_PC, _LR);
}
const u8* loadPairedFloatOne = GetCodePtr();
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

nemit.VLD1(I_32, D0, R10);
nemit.VREV32(I_8, D0, D0);

MOVI2F(S1, 1.0f, INVALID_REG); // Temp reg isn't used for 1.0f
MOV(_PC, _LR);

{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

nemit.VLD1(I_32, D0, R10);
nemit.VREV32(I_8, D0, D0);
MOV(_PC, _LR);
}
const u8* loadPairedU8Two = GetCodePtr();
BKPT(0x13);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

LDRH(R12, R10);
SXTB(R12, R12);
VMOV(S0, R12);

LDRH(R12, R10, 2);
SXTB(R12, R12);
VMOV(S1, R12);

MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);

VCVT(S0, S0, TO_FLOAT);
VCVT(S1, S1, TO_FLOAT);

VMUL(S0, S0, S2);
VMUL(S1, S1, S2);

MOV(_PC, _LR);
}
const u8* loadPairedU8One = GetCodePtr();
BKPT(0x14);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

LDRB(R12, R10);
SXTB(R12, R12);
VMOV(S0, R12);

MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);

VCVT(S0, S0, TO_FLOAT);

VMUL(S0, S0, S2);

MOV(_PC, _LR);
}
const u8* loadPairedS8Two = GetCodePtr();
BKPT(0x15);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

LDRH(R12, R10);
SXTB(R12, R12);
VMOV(S0, R12);

LDRH(R12, R10, 2);
SXTB(R12, R12);
VMOV(S1, R12);

MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);

VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
VCVT(S1, S1, TO_FLOAT | IS_SIGNED);

VMUL(S0, S0, S2);
VMUL(S1, S1, S2);

MOV(_PC, _LR);
}
const u8* loadPairedS8One = GetCodePtr();
BKPT(0x16);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

LDRB(R12, R10);
SXTB(R12, R12);
VMOV(S0, R12);

MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);

VCVT(S0, S0, TO_FLOAT | IS_SIGNED);

VMUL(S0, S0, S2);

MOV(_PC, _LR);
}
const u8* loadPairedU16Two = GetCodePtr();
BKPT(0x17);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

LDRH(R12, R10);
REV16(R12, R12);
SXTH(R12, R12);
VMOV(S0, R12);

LDRH(R12, R10, 2);
REV16(R12, R12);
SXTH(R12, R12);
VMOV(S1, R12);

MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);

VCVT(S0, S0, TO_FLOAT);
VCVT(S1, S1, TO_FLOAT);

VMUL(S0, S0, S2);
VMUL(S1, S1, S2);

MOV(_PC, _LR);
}
const u8* loadPairedU16One = GetCodePtr();
BKPT(0x18);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

LDRH(R12, R10);
REV16(R12, R12);
VMOV(S0, R12);

MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);

VCVT(S0, S0, TO_FLOAT);

VMUL(S0, S0, S2);
MOV(_PC, _LR);
}
const u8* loadPairedS16Two = GetCodePtr();
BKPT(0x19);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

LDRH(R12, R10);
REV16(R12, R12);
SXTH(R12, R12);
VMOV(S0, R12);

LDRH(R12, R10, 2);
REV16(R12, R12);
SXTH(R12, R12);
VMOV(S1, R12);

MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);

VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
VCVT(S1, S1, TO_FLOAT | IS_SIGNED);

VMUL(S0, S0, S2);
VMUL(S1, S1, S2);

MOV(_PC, _LR);
}
const u8* loadPairedS16One = GetCodePtr();
BKPT(0x20);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

LDRH(R12, R10);

MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);

REV16(R12, R12);
SXTH(R12, R12);
VMOV(S0, R12);
VCVT(S0, S0, TO_FLOAT | IS_SIGNED);

VMUL(S0, S0, S2);
MOV(_PC, _LR);
}

pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));
Expand All @@ -214,4 +445,182 @@ void JitArmAsmRoutineManager::GenerateCommon()
pairedLoadQuantized[14] = loadPairedS8One;
pairedLoadQuantized[15] = loadPairedS16One;

// Stores
const u8* storePairedIllegal = GetCodePtr();
BKPT(0x21);
const u8* storePairedFloat = GetCodePtr();
{
TST(R10, arghmask);
FixupBranch argh = B_CC(CC_NEQ);
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

nemit.VREV32(I_8, D0, D0);
nemit.VST1(I_32, D0, R10);
MOV(_PC, _LR);

SetJumpTarget(argh);

PUSH(5, R0, R1, R2, R3, _LR);
VMOV(R0, S0);
VMOV(R1, S1);
MOV(R2, R10);
MOVI2R(R12, (u32)&WriteDual32);
BL(R12);
POP(5, R0, R1, R2, R3, _PC);
}
const u8* storePairedU8 = GetCodePtr();
const u8* storePairedS8 = GetCodePtr();
{
// R10 is the addr
// R11 is the scale
// R12 is scratch
// S0, S1 is the values
PUSH(5, R0, R1, R2, R3, _LR);

MOVI2R(R12, (u32)&m_quantizeTableS);
ADD(R12, R12, R11);
VLDR(S2, R12, 0);
VMUL(S0, S0, S2);
VMUL(S1, S1, S2);

VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
VCVT(S1, S1, TO_INT | ROUND_TO_ZERO);

VMOV(R0, S0);
VMOV(R1, S1);
MOV(R2, R10);
MOVI2R(R12, (u32)&WriteDual8);
BL(R12);

POP(5, R0, R1, R2, R3, _PC);
}
const u8* storePairedU16 = GetCodePtr();
const u8* storePairedS16 = GetCodePtr();
{
PUSH(5, R0, R1, R2, R3, _LR);

MOVI2R(R12, (u32)&m_quantizeTableS);
ADD(R12, R12, R11);
VLDR(S2, R12, 0);
VMUL(S0, S0, S2);
VMUL(S1, S1, S2);

VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
VCVT(S1, S1, TO_INT | ROUND_TO_ZERO);

VMOV(R0, S0);
VMOV(R1, S1);
MOV(R2, R10);
MOVI2R(R12, (u32)&WriteDual16);
BL(R12);

POP(5, R0, R1, R2, R3, _PC);
}
const u8* storeSingleIllegal = GetCodePtr();
BKPT(0x27);
const u8* storeSingleFloat = GetCodePtr();
{
TST(R10, arghmask);
FixupBranch argh = B_CC(CC_NEQ);
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

VMOV(R12, S0);
REV(R12, R12);
STR(R12, R10);
MOV(_PC, _LR);

SetJumpTarget(argh);

PUSH(5, R0, R1, R2, R3, _LR);
VMOV(R0, S0);
MOV(R1, R10);
MOVI2R(R10, (u32)&Memory::Write_U32);
BL(R10);

POP(5, R0, R1, R2, R3, _PC);
}
const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii
const u8* storeSingleS8 = GetCodePtr();
{
MOVI2R(R12, (u32)&m_quantizeTableS);
ADD(R12, R12, R11);
VLDR(S2, R12, 0);
VMUL(S0, S0, S2);

TST(R10, arghmask);
FixupBranch argh = B_CC(CC_NEQ);
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
VMOV(R12, S0);
STRB(R12, R10);
MOV(_PC, _LR);

SetJumpTarget(argh);

PUSH(5, R0, R1, R2, R3, _LR);
VMOV(R0, S0);
MOV(R1, R10);
MOVI2R(R10, (u32)&Memory::Write_U8);
BL(R10);
POP(5, R0, R1, R2, R3, _PC);
}
const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii
const u8* storeSingleS16 = GetCodePtr();
{
MOVI2R(R12, (u32)&m_quantizeTableS);
ADD(R12, R12, R11);
VLDR(S2, R12, 0);
VMUL(S0, S0, S2);

TST(R10, arghmask);
FixupBranch argh = B_CC(CC_NEQ);
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);

VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
VMOV(R12, S0);
REV16(R12, R12);
STRH(R12, R10);
MOV(_PC, _LR);

SetJumpTarget(argh);

PUSH(5, R0, R1, R2, R3, _LR);
VMOV(R0, S0);
MOV(R1, R10);
MOVI2R(R10, (u32)&Memory::Write_U16);
BL(R10);

POP(5, R0, R1, R2, R3, _PC);
}

pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));

pairedStoreQuantized[0] = storePairedFloat;
pairedStoreQuantized[1] = storePairedIllegal;
pairedStoreQuantized[2] = storePairedIllegal;
pairedStoreQuantized[3] = storePairedIllegal;
pairedStoreQuantized[4] = storePairedU8;
pairedStoreQuantized[5] = storePairedU16;
pairedStoreQuantized[6] = storePairedS8;
pairedStoreQuantized[7] = storePairedS16;

pairedStoreQuantized[8] = storeSingleFloat;
pairedStoreQuantized[9] = storeSingleIllegal;
pairedStoreQuantized[10] = storeSingleIllegal;
pairedStoreQuantized[11] = storeSingleIllegal;
pairedStoreQuantized[12] = storeSingleU8;
pairedStoreQuantized[13] = storeSingleU16;
pairedStoreQuantized[14] = storeSingleS8;
pairedStoreQuantized[15] = storeSingleS16;

}