davidgiven · davidgiven · Oct 26, 2017 · Feb 11, 2017 · Feb 12, 2017 · Feb 12, 2017
diff --git a/h/out.h b/h/out.h
@@ -61,13 +61,13 @@ struct outname {
 /*
  * relocation type bits
  */
-#define RELSZ	0x0fff			/* relocation length */
-#define RELO1	   1			/* 1 byte */
-#define RELO2	   2			/* 2 bytes */
-#define RELO4	   3			/* 4 bytes */
-#define RELOPPC    4			/* PowerPC 26-bit address */
-#define RELOLIS    5			/* PowerPC lis */
-#define RELOVC4    6	/* VideoCore IV address in 32-bit instruction */
+#define RELSZ   0x0fff          /* relocation length */
+#define RELO1        1          /* 1 byte */
+#define RELO2        2          /* 2 bytes */
+#define RELO4        3          /* 4 bytes */
+#define RELOPPC      4          /* PowerPC 26-bit address */
+#define RELOPPC_LIS  5          /* PowerPC lis */
+#define RELOVC4      6  /* VideoCore IV address in 32-bit instruction */
 
 #define RELPC	0x2000			/* pc relative */
 #define RELBR	0x4000			/* High order byte lowest address. */

diff --git a/mach/powerpc/as/mach5.c b/mach/powerpc/as/mach5.c
@@ -42,8 +42,8 @@ void emit_hl(word_t in)
 	case OP_HA:  /* ha16[expr] */
 		if (PASS_RELO && (hl_expr.typ & S_TYP) != S_ABS) {
 			/*
-			 * RELOLIS only works with lis _, _ (same as
-			 * addis _, r0, _).  Check if instruction
+			 * RELOPPC_LIS only works with lis _, _ (same
+			 * as addis _, r0, _).  Check if instruction
 			 * isn't addis or register RA isn't r0.
 			 */
 			if ((in & 0xfc1f0000) != (0x3c000000))
@@ -55,7 +55,7 @@ void emit_hl(word_t in)
 			 * Low 26 bits: signed offset
 			 */
 			fit(fitx(hl_expr.val, 26));
-			newrelo(hl_expr.typ, RELOLIS | FIXUPFLAGS);
+			newrelo(hl_expr.typ, RELOPPC_LIS | FIXUPFLAGS);
 			reg = (in >> 21) & 0x1f;
 			in = (hl_token == OP_HA) << 31;
 			in |= reg << 26;

diff --git a/mach/powerpc/libem/aar4.s b/mach/powerpc/libem/aar4.s
@@ -1,14 +1,9 @@
 .sect .text
 
-! Index into a bounds-checked array.
+! Get address of element of bounds-checked array.
 !
-! On entry:
-!    r3 = ptr to descriptor
-!    r4 = index
-!    r5 = address of array
-! Yields:
-!    r3 = address of element
-!    r0 = size of element (used by .lar4, .sar4)
+! Stack: ( array-adr index descriptor-adr -- element-adr )
+! Sets r3 = size of element for .los4, .sts4
 ! Preserves r10 for .lar4, .sar4
 
 .define .aar4
@@ -17,16 +12,21 @@
 	ori r0, r0, lo16[.trap_earray]
 	mtspr ctr, r0            ! load CTR with trap address
 
-	lwz r0, 0(r3)
-	subf. r4, r0, r4         ! adjust range
-	bltctr                   ! check lower bound
+	lwz r4, 0(sp)            ! r4 = address of descriptor
+	lwz r5, 4(sp)            ! r5 = index
+	lwz r6, 8(sp)            ! r6 = address of array
 
-	lwz r0, 4(r3)
-	cmplw r4, r3
-	bgectr                   ! check upper bound
+	lwz r0, 0(r4)            ! r0 = lower bound
+	subf. r5, r0, r5         ! subtract lower bound from index
+	bltctr                   ! trap if the difference is negative
 
-	lwz r0, 8(r3)
-	mullw r4, r4, r0         ! scale index
-	add r3, r4, r5           ! calculate element address
+	lwz r0, 4(r4)            ! r0 = largest valid adjusted index
+	cmplw r5, r0             ! unsigned compare
+	bgtctr                   ! trap if adjusted index > r0
 
+	lwz r3, 8(r4)            ! r3 = size of element
+	mullw r5, r5, r3         ! scale index by size
+	add r6, r6, r5           ! r6 = address of element
+	stw r6, 8(sp)            ! result replaces array-adr slot
+	addi sp, sp, 8           ! drop index and descriptor-adr
 	blr
diff --git a/mach/powerpc/libem/and.s b/mach/powerpc/libem/and.s
@@ -1,24 +1,20 @@
 .sect .text
 
 ! Set intersection.
-!  Stack: ( b a size -- a*b )
+!  Stack: ( a b size -- a*b )
 
 .define .and
 .and:
-	lwz r3, 0 (sp)      ! r3 = size
-	addi sp, sp, 4
+	lwz	r3, 0(sp)		! r3 = size
+	srwi	r7, r3, 2
+	mtspr	ctr, r7			! ctr = size / 4
+	add	r4, sp, r3		! r4 = pointer before set a
 
-	mr	r4, sp			! r4 = ptr to set a
-	add	r5, sp, r3		! r5 = ptr to set b
-	srwi	r6, r3, 2
-	mtspr	ctr, r6			! ctr = r3 / 4
-1:
-	lwz	r7, 0(r4)
-	lwz	r8, 0(r5)
-	and	r8, r7, r8		! intersection of words
-	stw	r8, 0(r5)
-	addi	r4, r4, 4
-	addi	r5, r5, 4
+	! Loop with r4 in set a and sp in set b.
+1:	lwzu	r5, 4(r4)
+	lwzu	r6, 4(sp)
+	and	r7, r5, r6		! intersection of words
+	stw	r7, 0(r4)
 	bdnz	1b			! loop ctr times
-	add	sp, sp, r3
+	addi	sp, sp, 4		! drop last word of set b
 	blr
diff --git a/mach/powerpc/libem/build.lua b/mach/powerpc/libem/build.lua
@@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s",
+			"./*.s", -- rm ret.s
 		},
 		vars = { plat = plat },
 		deps = {

diff --git a/mach/powerpc/libem/cif8.s b/mach/powerpc/libem/cif8.s
@@ -8,25 +8,24 @@
 
 .define .cif8
 .cif8:
-	addi sp, sp, -4          ! make space for the double
+	! Conversion uses the pivot value
+	!   2**52, whose IEEE bit pattern is 0x4330 0000 0000 0000
+	!
+	! From signed integer i, we compute
+	!   ((1 << 52) + (1 << 31) + i) - ((1 << 52) + (1 << 31))
+
+	lis r3, 0x4330           ! r3 = high word of pivot
+	stwu r3, -4(sp)          ! push it; 0(sp)..7(sp) now holds a double
 
 	lwz r3, 4(sp)
 	xoris r3, r3, 0x8000
-	stw r3, 4(sp)            ! flip sign of integer value
-
-	addis r3, r0, 0x4330
-	stw r3, 0(sp)            ! set high word to construct a double
+	stw r3, 4(sp)            ! flip sign bit to get (1 << 31) + i
 
-	lfd f0, 0(sp)            ! load value
-
-	lis r3, ha16[pivot]
-	lfd f1, lo16[pivot](r3)  ! load pivot value
-	fsub f0, f0, f1          ! adjust
+	lfd f0, 0(sp)            ! f0 = (1 << 52) + (1 << 31) + i
+	lis r3, 0x8000           ! r3 = 1 << 31
+	stw r3, 4(sp)            ! low word = 1 << 31, high word unchanged
+	lfd f1, 0(sp)            ! f1 = (1 << 52) + (1 << 31)
+	fsub f0, f0, f1          ! f0 = i as a double
 
 	stfd f0, 0(sp)           ! save value again...
 	blr                      ! ...and return 
-
-.sect .rom
-pivot:
-	.data4 0x43300000
-	.data4 0x80000000
diff --git a/mach/powerpc/libem/cms.s b/mach/powerpc/libem/cms.s
@@ -1,30 +1,27 @@
 .sect .text
 
 ! Compare sets a, b.
-!  Stack: ( b a -- )
-!  With r3 = size of each set
-!  Yields r3 = 0 if equal, nonzero if not equal
+!  Stack: ( a b size -- result )
+!  Result is 0 if equal, nonzero if not equal.
 
 .define .cms
 .cms:
-	mr	r4, sp			! r4 = ptr to set a
-	add	r5, sp, r3		! r5 = ptr to set b
-	mr	r6, r3			! r6 = size
-	srwi	r3, r3, 2
-	mtspr	ctr, r3			! ctr = size / 4
-1:
-	lwz	r7, 0(r4)
-	lwz	r8, 0(r5)
-	cmpw	cr0, r7, r8		! compare words in sets
-	addi	r4, r4, 4
-	addi	r5, r5, 4
-	bne	cr0, 2f			! branch if not equal
+	lwz	r3, 0(sp)		! r3 = size of each set
+	srwi	r7, r3, 2
+	mtspr	ctr, r7			! ctr = size / 4
+	add	r4, sp, r3		! r4 = pointer before set a
+	add	r7, r4, r3		! r7 = sp + 2 * size = result slot
+
+	! Loop with r4 in set a and sp in set b.
+1:	lwzu	r5, 4(r4)		! r5 = next word of set a
+	lwzu	r6, 4(sp)		! r6 = next word of set b
+	cmpw	r5, r6			! compare words
+	bne	2f			! branch if not equal
 	bdnz	1b			! loop ctr times
-	addi	r3, r0, 0		! equal: return 0
+
+	li	r3, 0			! equal: return 0
 	b	3f
-2:
-	addi	r3, r0, 1		! not equal: return 1
-3:
-	slwi	r6, r6, 1		! r6 = size * 2
-	add	sp, sp, r6		! remove sets from stack
+2:	li	r3, 1			! not equal: return 1
+3:	mr	sp, r7			! drop size and sets, keep one slot
+	stw	r3, 0(sp)		! push result
 	blr
diff --git a/mach/powerpc/libem/com.s b/mach/powerpc/libem/com.s
@@ -5,16 +5,15 @@
 
 .define .com
 .com:
-	lwz r3, 0 (sp)       ! size
-	addi sp, sp, 4
+	lwz	r3, 0(sp)		! r3 = size
+	srwi	r7, r3, 2
+	mtspr	ctr, r7			! ctr = size / 4
+	mr	r4, sp			! r4 = pointer before set a
 
-	mr	r4, sp			! r4 = pointer to set a
-	srwi	r5, r3, 2
-	mtspr	ctr, r5			! ctr = r3 / 4
-1:
-	lwz	r6, 0(r4)
-	nor	r6, r6, r6		! complement of word
-	stw	r6, 0(r4)
-	addi	r4, r4, 4
+	! Loop with r4 in set a.
+1:	lwzu	r5, 4(r4)
+	nor	r7, r5, r5		! complement of word
+	stw	r7, 0(r4)
 	bdnz	1b			! loop ctr times
+	addi	sp, sp, 4		! drop size from stack
 	blr
diff --git a/mach/powerpc/libem/cuf8.s b/mach/powerpc/libem/cuf8.s
@@ -6,21 +6,20 @@
 
 .define .cuf8
 .cuf8:
-	addi sp, sp, -4          ! make space for the double
+	! Conversion uses the pivot value
+	!   2**52, whose IEEE bit pattern is 0x4330 0000 0000 0000
+	!
+	! From unsigned integer u, we compute
+	!   ((1 << 52) + u) - (1 << 52)
 
 	lis r3, 0x4330
-	stw r3, 0(sp)            ! set high word to construct a double
+	stwu r3, -4(sp)          ! push high word; 0(sp)..7(sp) is a double
 
-	lfd f0, 0(sp)            ! load value
-
-	lis r3, ha16[pivot]
-	lfd f1, lo16[pivot](r3)  ! load pivot value
-	fsub f0, f0, f1          ! adjust
+	lfd f0, 0(sp)            ! f0 = (1 << 52) + u
+	li r3, 0x0000
+	stw r3, 4(sp)            ! low word = 0, high word unchanged
+	lfd f1, 0(sp)            ! f1 = (1 << 52)
+	fsub f0, f0, f1          ! f0 = u as a double
 
 	stfd f0, 0(sp)           ! save value again...
 	blr                      ! ...and return
-
-.sect .rom
-pivot:
-	.data4 0x43300000
-	.data4 0x00000000
diff --git a/mach/powerpc/libem/fef8.s b/mach/powerpc/libem/fef8.s
@@ -3,35 +3,48 @@
 .sect .text
 
 ! Split a double-precision float into fraction and exponent, like
-! frexp(3) in C.  On entry:
-!  r3 = float, high word (bits 0..31)
-!  r4 = float, low word (bits 32..63)
-! Yields:
-!  r3 = fraction, high word (bits 0..31)
-!  r4 = fraction, low word (bits 32..63)
-!  r5 = exponent
+! frexp(3) in C.
+!
+! Stack: ( double -- fraction exponent )
 
 .define .fef8
 .fef8:
+	lwz r3, 0(sp)			! r3 = high word (bits 0..31)
+	lwz r4, 4(sp)			! r4 = low word (bits 32..63)
+
 	! IEEE double-precision format:
 	!   sign  exponent  fraction
 	!   0     1..11     12..63
-	extrwi r6, r3, 11, 1		! r6 = IEEE exponent
-	addi r5, r6, -1022		! r5 = true exponent
+	!
+	! To get fraction in [0.5, 1) or (-1, -0.5], we subtract 1022
+	! from the IEEE exponent.
+
+	extrwi. r6, r3, 11, 1		! r6 = IEEE exponent
+	addi r5, r6, -1022		! r5 = our exponent
+	beq 2f				! jump if zero or denormalized
 	cmpwi r6, 2047
-	beqlr				! return if infinity or NaN
-	cmpwi r6, 0
-	bne 1f				! jump if normalized number
+	beq 1f				! jump if infinity or NaN
+	! fall through if normalized
+
+	! Normalized values, and denormals rescaled at label 3, come here.
+	! Put fraction in [0.5, 1) or (-1, -0.5]: set IEEE exponent 1022.
+4:	rlwinm r3, r3, 0, 12, 0		! clear old exponent
+	oris r3, r3, 1022 << 4		! set new exponent
+	! fall through
 
-	! Got denormalized number or zero, probably zero.
+1:	stw r3, 0(sp)
+	stw r4, 4(sp)			! push fraction
+	stwu r5, -4(sp)			! push exponent
+	blr
+
+2:	! Got denormalized number or zero, probably zero.
-	extrwi r6, r3, 22, 12
-	addi r5, r0, 0			! r5 = true exponent = 0
+	extrwi r6, r3, 20, 12		! fraction bits 12..31, no sign bit
 	or. r6, r6, r4			! r6 = high|low fraction
-	beqlr				! return if zero
+	bne 3f				! jump if not zero
+	li r5, 0			! exponent = 0
+	b 1b				! push zero unchanged
 
-	! Got denormalized number, not zero.
-	stwu r4, -4(sp)
-	stwu r3, -4(sp)
+3:	! Got denormalized number, not zero.
 	lfd f0, 0(sp)
 	lis r6, ha16[_2_64]
 	lfd f1, lo16[_2_64](r6)
@@ -40,14 +53,8 @@
 	lwz r3, 0(sp)
 	lwz r4, 4(sp)
 	extrwi r6, r3, 11, 1		! r6 = IEEE exponent
-	addi sp, sp, 8
-	addi r5, r6, -1022 - 64		! r5 = true exponent
-1:
-	! Put fraction in [0.5, 1) or (-1, -0.5] by setting its
-	! exponent to true 0, IEEE 1022.
-	rlwinm r3, r3, 0, 12, 0		! clear old exponent
-	oris r3, r3, 1022 << 4		! set new exponent
-	blr
+	addi r5, r6, -1022 - 64		! r5 = our exponent
+	b 4b				! set fraction exponent, then push
 
 .sect .rom
 _2_64: