bandersnatch/multiexp.go

// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Code generated by consensys/gnark-crypto DO NOT EDIT
//
// This code has been edited to be suitable for inner curves

package bandersnatch

import (
	"errors"
	"math"
	"runtime"
	"sync"

	"github.com/crate-crypto/go-ipa/bandersnatch/fr"
	"github.com/crate-crypto/go-ipa/common/parallel"
)

// MultiExpConfig holds the optional settings accepted by MultiExp and MultiExpAffine.
// The zero value is usable: MultiExp replaces a non-positive NbTasks by runtime.NumCPU()
// and treats the scalars as being in regular (non-Montgomery) form.
type MultiExpConfig struct {
	NbTasks     int  // number of go routines to use in the multiexp; may exceed the number of CPUs. <= 0 means "use all CPUs".
	ScalarsMont bool // true if the provided scalars are in Montgomery form. Defaults to false.
}

// selector stores the word index, masks and shifts needed to extract one c-bit window
// (digit) from a multi-word scalar.
// It is used by the multiExp algorithm and by the scalar partitioning step.
type selector struct {
	index uint64 // index of the word, in the multi-word scalar, holding the low bits of the window
	mask  uint64 // c-bit wide mask, already shifted left by shift (applied before shifting right)
	shift uint64 // right shift bringing the selected bits down to the low positions

	multiWordSelect bool   // true if the window straddles 2 words (only possible when c doesn't divide 64)
	maskHigh        uint64 // mask selecting the remaining (high) bits of the window in word index+1
	shiftHigh       uint64 // left shift placing the bits taken from word index+1 above the low bits
}

// partitionScalars recodes each scalar into nbChunks signed digits over c-bit wide windows.
//
// If a digit is >= 2^{c-1}, we borrow 2^c from the next window and subtract 2^c from the
// current digit, making it negative. A negative digit d is stored as (-d-1) with the most
// significant bit of the window set. Negative digits can be processed in a later step by
// adding -G into the bucket instead of G (computing -G is cheap, and this saves half of
// the buckets in the MultiExp).
//
// Parameters:
//   - scalars: the scalars to recode; they are not modified.
//   - c: the window width, in bits.
//   - scalarsMont: indicates whether the provided scalars are in Montgomery form.
//   - nbTasks: forwarded as-is to parallel.Execute to bound the parallelism.
//
// It returns the recoded scalars (same length as scalars) and smallValues, the number of
// scalars such that 0 < scalar < 2^c (in other words, scalars where only the c least
// significant bits are non zero).
func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
	toReturn := make([]fr.Element, len(scalars))

	// number of c-bit radixes in a scalar
	nbChunks := fr.Limbs * 64 / c
	if (fr.Limbs*64)%c != 0 {
		nbChunks++
	}

	mask := uint64((1 << c) - 1)      // low c bits are 1
	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
	maxDigit := int(1 << (c - 1))     // digits >= maxDigit are turned into negative digits
	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

	// compute offset and word selector / shift to select the right bits of our windows
	selectors := make([]selector, nbChunks)
	for chunk := uint64(0); chunk < nbChunks; chunk++ {
		jc := uint64(chunk * c)
		d := selector{}
		d.index = jc / 64
		d.shift = jc - (d.index * 64)
		d.mask = mask << d.shift
		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
		if d.multiWordSelect {
			nbBitsHigh := d.shift - uint64(64-c)
			d.maskHigh = (1 << nbBitsHigh) - 1
			d.shiftHigh = (c - nbBitsHigh)
		}
		selectors[chunk] = d
	}

	// for each chunk, we could track the number of non-zero points we will need to process;
	// this way, if a chunk has more work to do than others, we could spawn off more go routines
	// (at the cost of more buckets allocated).
	// a simplified approach is to track the small values where only the low c bits are set:
	// if they represent a significant share of the points, the caller splits the processing
	// of the first chunk in 2, to ensure all go routines finish at ~same time.
	//
	// each worker counts locally and merges its count under a mutex exactly once, so the
	// aggregation cannot block, whatever the number of go routines parallel.Execute spawns.
	var (
		mu          sync.Mutex
		smallValues int
	)

	parallel.Execute(len(scalars), func(start, end int) {
		localSmallValues := 0
		for i := start; i < end; i++ {
			var carry int

			// work on a copy, the caller's scalars are left untouched
			scalar := scalars[i]
			if scalarsMont {
				scalar.FromMont()
			}
			if scalar.IsUint64() {
				// everything is 0, no need to process this scalar
				if scalar[0] == 0 {
					continue
				}
				// low c-bits are 1 in mask
				if scalar[0]&mask == scalar[0] {
					localSmallValues++
				}
			}

			// for each chunk in the scalar, compute the current digit, and an eventual carry
			for chunk := uint64(0); chunk < nbChunks; chunk++ {
				s := selectors[chunk]

				// init with carry if any
				digit := carry
				carry = 0

				// digit = value of the c-bit window
				digit += int((scalar[s.index] & s.mask) >> s.shift)

				if s.multiWordSelect {
					// we are selecting bits over 2 words
					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
				}

				// if digit is zero, no impact on result
				if digit == 0 {
					continue
				}

				// if the digit is >= 2^{c-1}, we borrow 2^c from the next window and subtract
				// 2^c from the current digit, making it negative.
				if digit >= maxDigit {
					digit -= (1 << c)
					carry = 1
				}

				// encode the signed digit: positive digits are stored as-is, a negative
				// digit d is stored as (-d-1) with the msb of the window set
				var bits uint64
				if digit >= 0 {
					bits = uint64(digit)
				} else {
					bits = uint64(-digit-1) | msbWindow
				}

				toReturn[i][s.index] |= (bits << s.shift)
				if s.multiWordSelect {
					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
				}

			}
		}

		// aggregate small values
		mu.Lock()
		smallValues += localSmallValues
		mu.Unlock()

	}, nbTasks)

	return toReturn, smallValues
}

// MultiExpAffine runs MultiExp on the given points and scalars and converts the
// projective result to affine coordinates.
// If MultiExp fails, the zero PointAffine is returned together with its error.
func MultiExpAffine(points []PointAffine, scalars []fr.Element, config MultiExpConfig) (PointAffine, error) {
	var (
		projective PointProj
		affine     PointAffine
	)

	_, err := MultiExp(&projective, points, scalars, config)
	if err != nil {
		return PointAffine{}, err
	}

	affine.FromProj(&projective)
	return affine, nil
}

// MultiExp computes the multi-exponentiation of points by scalars, following section 4 of
// https://eprint.iacr.org/2012/549.pdf. The result is stored in p, which is also returned.
// A nil point and an error are returned when len(points) != len(scalars).
//
// Note: we rely on this algorithm not using the Equal functionality, since it is called
// by a banderwagon element.
func MultiExp(p *PointProj, points []PointAffine, scalars []fr.Element, config MultiExpConfig) (*PointProj, error) {
	// note:
	// every msmCX method is the same, except for the c constant it declares.
	// duplicating these methods (through template generation) allows the buckets to be
	// declared as fixed-size arrays.
	// the choice of c needs to be improved:
	// there is a theoretical value that gives optimal asymptotics,
	// but in practice other factors come into play, including:
	// * if c doesn't divide 64 (the word size), we are bound to select bits over 2 words
	//   of our scalars, instead of 1
	// * the number of CPUs
	// * cache friendliness (which depends on the host, G1 or G2... )
	//	--> for example, on BN254, a G1 point fits into one 64-byte cache line, but a G2 point doesn't.
	//
	// for each msmCX:
	// step 1
	// we compute, for each scalar, nbChunk digits over c-bit wide windows.
	// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
	// 2^{c} from the current digit, making it negative.
	// negative digits are processed in the next step by adding -G into the bucket instead of G
	// (computing -G is cheap, and this saves us half of the buckets)
	// step 2
	// buckets are declared as arrays local to each chunk worker;
	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1).
	// the chunk processing places points into buckets based on their selector and produces
	// the weighted bucket sum of the chunk
	// step 3
	// the weighted bucket sums are reduced into our result (msmReduceChunk)

	nbPoints := len(points)
	if len(scalars) != nbPoints {
		return nil, errors.New("len(points) != len(scalars)")
	}

	// default to all available CPUs when the number of tasks is not set
	if config.NbTasks <= 0 {
		config.NbTasks = runtime.NumCPU()
	}

	// windowFor returns, among the implemented msmC methods, the window width with the
	// lowest approximate cost (in group operations) for n points:
	// cost = bits/c * (n + 2^{c})
	// this needs to be verified empirically; for example, on a MBP 2016, for G2 MultiExp
	// > 8M points, hand picking c gives better results.
	windowFor := func(n int) uint64 {
		var best uint64
		lowest := math.MaxFloat64
		// the returned value must be one of these (implemented msmC methods)
		for _, w := range []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} {
			ops := fr.Limbs * 64 * (n + (1 << (w)))
			if estimate := float64(ops) / float64(w); estimate < lowest {
				lowest, best = estimate, w
			}
		}
		// empirical, needs to be tuned.
		// if best > 16 && n < 1 << 23 {
		// 	best = 16
		// }
		return best
	}

	// halve the input (doubling the number of splits) until there are at least as many
	// chunks as tasks; the window width is re-evaluated for every candidate size
	var C uint64
	nbSplits := 1
	for nbChunks := 0; nbChunks < config.NbTasks; {
		C = windowFor(nbPoints)
		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
		if (fr.Limbs*64)%C != 0 {
			nbChunks++
		}
		nbChunks *= nbSplits
		if nbChunks < config.NbTasks {
			nbSplits <<= 1
			nbPoints >>= 1
		}
	}

	// partition the scalars
	// note: this happens before the actual chunk processing since, for each c-bit window
	// (starting from the least significant one), a digit larger than 2^{c-1} produces a
	// carry that must be propagated to the higher window
	recoded, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)

	// with more than 10% of small values, the processing of the first chunk is split in 2.
	// this could be decided in msmInnerPointProj, but that would cost one more loop over all scalars
	splitFirstChunk := (float64(smallValues) / float64(len(recoded))) >= 0.1

	// the first nbSplits-1 slices of the input are handled by dedicated go routines,
	// the last one (which also absorbs the remainder) by the calling go routine;
	// the intermediate results are then summed into p.
	partials := make([]PointProj, nbSplits-1)
	finished := make(chan int, nbSplits-1)
	for i := range partials {
		lo := i * nbPoints
		hi := lo + nbPoints
		go func(lo, hi, i int) {
			msmInnerPointProj(&partials[i], int(C), points[lo:hi], recoded[lo:hi], splitFirstChunk)
			finished <- i
		}(lo, hi, i)
	}

	tail := (nbSplits - 1) * nbPoints
	msmInnerPointProj(p, int(C), points[tail:], recoded[tail:], splitFirstChunk)
	for range partials {
		idx := <-finished
		p.Add(p, &partials[idx])
	}
	close(finished)
	return p, nil
}

// msmInnerPointProj dispatches to the msmCX implementation matching the window width c
// and lets it store the multi-exponentiation of points by scalars in p.
// It panics with "not implemented" if no implementation exists for c.
func msmInnerPointProj(p *PointProj, c int, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) {
	switch c {
	case 4:
		msmC4(p, points, scalars, splitFirstChunk)
	case 5:
		msmC5(p, points, scalars, splitFirstChunk)
	case 6:
		msmC6(p, points, scalars, splitFirstChunk)
	case 7:
		msmC7(p, points, scalars, splitFirstChunk)
	case 8:
		msmC8(p, points, scalars, splitFirstChunk)
	case 9:
		msmC9(p, points, scalars, splitFirstChunk)
	case 10:
		msmC10(p, points, scalars, splitFirstChunk)
	case 11:
		msmC11(p, points, scalars, splitFirstChunk)
	case 12:
		msmC12(p, points, scalars, splitFirstChunk)
	case 13:
		msmC13(p, points, scalars, splitFirstChunk)
	case 14:
		msmC14(p, points, scalars, splitFirstChunk)
	case 15:
		msmC15(p, points, scalars, splitFirstChunk)
	case 16:
		msmC16(p, points, scalars, splitFirstChunk)
	case 20:
		msmC20(p, points, scalars, splitFirstChunk)
	case 21:
		msmC21(p, points, scalars, splitFirstChunk)
	case 22:
		msmC22(p, points, scalars, splitFirstChunk)
	default:
		panic("not implemented")
	}
}

// msmReduceChunkPointAffine combines the per-chunk weighted bucket sums, received from
// chChunks, into the result of the multiExp. Chunks are consumed from the most significant
// (last channel) to the least significant (first channel); between two chunks the
// accumulator is doubled c times. The result is stored in p, which is also returned.
func msmReduceChunkPointAffine(p *PointProj, c int, chChunks []chan PointProj) *PointProj {
	last := len(chChunks) - 1

	var acc PointProj
	top := <-chChunks[last]
	acc.Set(&top)

	for j := last - 1; j >= 0; j-- {
		// shift the accumulator by one window: acc = 2^c * acc
		for remaining := c; remaining > 0; remaining-- {
			acc.Double(&acc)
		}
		chunkSum := <-chChunks[j]
		acc.Add(&acc, &chunkSum)
	}

	p.Set(&acc)
	return p
}

// msmReduceChunkPointAffineDMA combines the per-chunk weighted bucket sums, read directly
// from the chChunks slice, into the result of the multiExp. Chunks are consumed from the
// most significant (last entry) to the least significant (first entry); between two chunks
// the accumulator is doubled c times. The result is stored in p, which is also returned.
func msmReduceChunkPointAffineDMA(p *PointProj, c int, chChunks []PointProj) *PointProj {
	last := len(chChunks) - 1

	var acc PointProj
	top := chChunks[last]
	acc.Set(&top)

	for j := last - 1; j >= 0; j-- {
		// shift the accumulator by one window: acc = 2^c * acc
		for remaining := c; remaining > 0; remaining-- {
			acc.Double(&acc)
		}
		chunkSum := chChunks[j]
		acc.Add(&acc, &chunkSum)
	}

	p.Set(&acc)
	return p
}

// msmProcessChunkPointAffine computes the weighted bucket sum of the given chunk with
// msmProcessChunkPointAffineDMA and sends it on chRes.
func msmProcessChunkPointAffine(chunk uint64,
	chRes chan<- PointProj,
	buckets []PointProj,
	c uint64,
	points []PointAffine,
	scalars []fr.Element) {
	var chunkSum PointProj
	msmProcessChunkPointAffineDMA(chunk, &chunkSum, buckets, c, points, scalars)
	chRes <- chunkSum
}

// msmProcessChunkPointAffineDMA computes the weighted bucket sum of one c-bit window
// (chunk) over all the scalars and writes it to *res.
//
// buckets is scratch space supplied by the caller; every entry is reset to Identity first.
// The digits are read in the signed encoding: when the msb of the window (bit c-1) is set,
// the negated point is added to the bucket indexed by the remaining bits; otherwise the
// point is added to bucket digit-1.
func msmProcessChunkPointAffineDMA(chunk uint64,
	res *PointProj,
	buckets []PointProj,
	c uint64,
	points []PointAffine,
	scalars []fr.Element) {

	msbWindow := uint64(1 << (c - 1))

	for k := range buckets {
		buckets[k] = Identity
	}

	// locate the bits of this window inside the multi-word scalar
	bitOffset := uint64(chunk * c)
	var sel selector
	sel.index = bitOffset / 64
	sel.shift = bitOffset % 64
	sel.mask = uint64((1<<c)-1) << sel.shift // low c bits are 1, moved to the window position
	sel.multiWordSelect = (64%c) != 0 && sel.shift > (64-c) && sel.index < (fr.Limbs-1)
	if sel.multiWordSelect {
		highBits := sel.shift - uint64(64-c)
		sel.maskHigh = (1 << highBits) - 1
		sel.shiftHigh = (c - highBits)
	}

	// for each scalar, get the digit corresponding to the chunk we're processing
	for i := range scalars {
		digit := (scalars[i][sel.index] & sel.mask) >> sel.shift
		if sel.multiWordSelect {
			digit += (scalars[i][sel.index+1] & sel.maskHigh) << sel.shiftHigh
		}

		// a zero digit has no impact on the result
		if digit == 0 {
			continue
		}

		var pt PointProj
		pt.FromAffine(&points[i])

		if digit&msbWindow != 0 {
			// msbWindow bit is set: negative digit, add -point
			pt.Neg(&pt)
			slot := &buckets[digit&^msbWindow]
			slot.Add(slot, &pt)
			continue
		}

		// positive digit: add the point
		slot := &buckets[digit-1]
		slot.Add(&pt, slot)
	}

	// reduce buckets into the chunk sum
	// sum = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
	running, sum := Identity, Identity
	for k := len(buckets); k > 0; k-- {
		running.Add(&running, &buckets[k-1])
		sum.Add(&sum, &running)
	}

	*res = sum
}

// msmC4 computes the multi-exponentiation of points by scalars using 4-bit windows,
// stores the result in p and returns p. The scalars are expected to be already recoded
// into signed digits (see partitionScalars). When splitFirstChunk is true, the first
// (least significant) chunk is processed as two halves of the input, in parallel.
func msmC4(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 4                   // scalars partitioned into c-bit radixes
		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
	)

	// for each chunk, spawn one go routine that'll loop through all the scalars in the
	// corresponding bit-window
	// note that buckets is a fixed-size array local to each call (the original authors
	// rely on it being stack allocated for most sizes of c, which is critical for performance)

	// each go routine writes its result directly into its own chChunks[j] entry
	var chChunks [nbChunks]PointProj
	processChunk := func(j int, points []PointAffine, scalars []fr.Element, pointProj *PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffineDMA(uint64(j), pointProj, buckets[:], c, points, scalars)
	}

	var wg sync.WaitGroup
	wg.Add(int(nbChunks - 1))
	for j := int(nbChunks - 1); j > 0; j-- {
		j := j
		go func() {
			defer wg.Done()
			processChunk(j, points, scalars, &chChunks[j])
		}()
	}

	// process the first chunk while the other chunks are running: every worker writes to
	// a distinct chChunks entry, so there is no need to wait for them before starting
	if !splitFirstChunk {
		processChunk(0, points, scalars, &chChunks[0])
	} else {
		chSplits := make([]PointProj, 2)
		split := len(points) / 2
		var wgSplit sync.WaitGroup
		wgSplit.Add(2)
		go func() {
			defer wgSplit.Done()
			processChunk(0, points[:split], scalars[:split], &chSplits[0])
		}()
		go func() {
			defer wgSplit.Done()
			processChunk(0, points[split:], scalars[split:], &chSplits[1])
		}()
		wgSplit.Wait()
		chSplits[0].Add(&chSplits[0], &chSplits[1])
		chChunks[0] = chSplits[0]
	}

	// all chunks must be available before the reduction reads chChunks
	wg.Wait()

	return msmReduceChunkPointAffineDMA(p, c, chChunks[:])
}

// msmC5 computes the multi-exponentiation of points by scalars using 5-bit windows,
// stores the result in p and returns p. The scalars are expected to be already recoded
// into signed digits (see partitionScalars). When splitFirstChunk is true, the first
// (least significant) window is processed as two halves of the input, in parallel.
func msmC5(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 5                   // window width, in bits
		nbChunks = (fr.Limbs * 64 / c) // number of full c-bit windows in a scalar
		// c doesn't divide the scalar bit size: the trailing window is narrower,
		// so fewer buckets are allocated for it
		lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	)

	// one buffered result channel per window: nbChunks full windows plus the trailing one
	var results [nbChunks + 1]chan PointProj
	for w := range results {
		results[w] = make(chan PointProj, 1)
	}

	// runWindow loops through all the given scalars for window w and publishes the
	// weighted bucket sum on out. buckets is a fixed-size array local to each call
	// (the original authors rely on it being stack allocated for most sizes of c).
	runWindow := func(w int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(w), out, buckets[:], c, pts, scs)
	}

	// trailing (narrower) window
	go func(w uint64, pts []PointAffine, scs []fr.Element) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(w, results[w], buckets[:], c, pts, scs)
	}(uint64(nbChunks), points, scalars)

	// full windows, one go routine each; window 0 is handled separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// process each half of the input on its own and add the two partial sums
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC6 computes the multi-exponentiation of points by scalars using 6-bit windows,
// stores the result in p and returns p. The scalars are expected to be already recoded
// into signed digits (see partitionScalars). When splitFirstChunk is true, the first
// (least significant) window is processed as two halves of the input, in parallel.
func msmC6(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 6                   // window width, in bits
		nbChunks = (fr.Limbs * 64 / c) // number of full c-bit windows in a scalar
		// c doesn't divide the scalar bit size: the trailing window is narrower,
		// so fewer buckets are allocated for it
		lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	)

	// one buffered result channel per window: nbChunks full windows plus the trailing one
	var results [nbChunks + 1]chan PointProj
	for w := range results {
		results[w] = make(chan PointProj, 1)
	}

	// runWindow loops through all the given scalars for window w and publishes the
	// weighted bucket sum on out. buckets is a fixed-size array local to each call
	// (the original authors rely on it being stack allocated for most sizes of c).
	runWindow := func(w int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(w), out, buckets[:], c, pts, scs)
	}

	// trailing (narrower) window
	go func(w uint64, pts []PointAffine, scs []fr.Element) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(w, results[w], buckets[:], c, pts, scs)
	}(uint64(nbChunks), points, scalars)

	// full windows, one go routine each; window 0 is handled separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// process each half of the input on its own and add the two partial sums
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC7 computes the multi-exponentiation of points by scalars using 7-bit windows,
// stores the result in p and returns p. The scalars are expected to be already recoded
// into signed digits (see partitionScalars). When splitFirstChunk is true, the first
// (least significant) window is processed as two halves of the input, in parallel.
func msmC7(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 7                   // window width, in bits
		nbChunks = (fr.Limbs * 64 / c) // number of full c-bit windows in a scalar
		// c doesn't divide the scalar bit size: the trailing window is narrower,
		// so fewer buckets are allocated for it
		lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	)

	// one buffered result channel per window: nbChunks full windows plus the trailing one
	var results [nbChunks + 1]chan PointProj
	for w := range results {
		results[w] = make(chan PointProj, 1)
	}

	// runWindow loops through all the given scalars for window w and publishes the
	// weighted bucket sum on out. buckets is a fixed-size array local to each call
	// (the original authors rely on it being stack allocated for most sizes of c).
	runWindow := func(w int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(w), out, buckets[:], c, pts, scs)
	}

	// trailing (narrower) window
	go func(w uint64, pts []PointAffine, scs []fr.Element) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(w, results[w], buckets[:], c, pts, scs)
	}(uint64(nbChunks), points, scalars)

	// full windows, one go routine each; window 0 is handled separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// process each half of the input on its own and add the two partial sums
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC8 computes the multi-exponentiation of points by scalars using 8-bit windows,
// stores the result in p and returns p. The scalars are expected to be already recoded
// into signed digits (see partitionScalars). When splitFirstChunk is true, the first
// (least significant) window is processed as two halves of the input, in parallel.
func msmC8(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 8                   // window width, in bits
		nbChunks = (fr.Limbs * 64 / c) // number of c-bit windows in a scalar
	)

	// one buffered result channel per window
	var results [nbChunks]chan PointProj
	for w := range results {
		results[w] = make(chan PointProj, 1)
	}

	// runWindow loops through all the given scalars for window w and publishes the
	// weighted bucket sum on out. buckets is a fixed-size array local to each call
	// (the original authors rely on it being stack allocated for most sizes of c).
	runWindow := func(w int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(w), out, buckets[:], c, pts, scs)
	}

	// one go routine per window; window 0 is handled separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// process each half of the input on its own and add the two partial sums
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC9 computes the multi-exponentiation of points by scalars using 9-bit windows,
// stores the result in p and returns p. The scalars are expected to be already recoded
// into signed digits (see partitionScalars). When splitFirstChunk is true, the first
// (least significant) window is processed as two halves of the input, in parallel.
func msmC9(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 9                   // window width, in bits
		nbChunks = (fr.Limbs * 64 / c) // number of full c-bit windows in a scalar
		// c doesn't divide the scalar bit size: the trailing window is narrower,
		// so fewer buckets are allocated for it
		lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	)

	// one buffered result channel per window: nbChunks full windows plus the trailing one
	var results [nbChunks + 1]chan PointProj
	for w := range results {
		results[w] = make(chan PointProj, 1)
	}

	// runWindow loops through all the given scalars for window w and publishes the
	// weighted bucket sum on out. buckets is a fixed-size array local to each call
	// (the original authors rely on it being stack allocated for most sizes of c).
	runWindow := func(w int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(w), out, buckets[:], c, pts, scs)
	}

	// trailing (narrower) window
	go func(w uint64, pts []PointAffine, scs []fr.Element) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(w, results[w], buckets[:], c, pts, scs)
	}(uint64(nbChunks), points, scalars)

	// full windows, one go routine each; window 0 is handled separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// process each half of the input on its own and add the two partial sums
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC10 computes the multi-exponentiation of points by scalars using 10-bit windows,
// stores the result in p and returns p. The scalars are expected to be already recoded
// into signed digits (see partitionScalars). When splitFirstChunk is true, the first
// (least significant) window is processed as two halves of the input, in parallel.
func msmC10(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 10                  // window width, in bits
		nbChunks = (fr.Limbs * 64 / c) // number of full c-bit windows in a scalar
		// c doesn't divide the scalar bit size: the trailing window is narrower,
		// so fewer buckets are allocated for it
		lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	)

	// one buffered result channel per window: nbChunks full windows plus the trailing one
	var results [nbChunks + 1]chan PointProj
	for w := range results {
		results[w] = make(chan PointProj, 1)
	}

	// runWindow loops through all the given scalars for window w and publishes the
	// weighted bucket sum on out. buckets is a fixed-size array local to each call
	// (the original authors rely on it being stack allocated for most sizes of c).
	runWindow := func(w int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(w), out, buckets[:], c, pts, scs)
	}

	// trailing (narrower) window
	go func(w uint64, pts []PointAffine, scs []fr.Element) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(w, results[w], buckets[:], c, pts, scs)
	}(uint64(nbChunks), points, scalars)

	// full windows, one go routine each; window 0 is handled separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// process each half of the input on its own and add the two partial sums
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC11 computes the multi-exponentiation of points by scalars using 11-bit windows,
// stores the result in p and returns p. The scalars are expected to be already recoded
// into signed digits (see partitionScalars). When splitFirstChunk is true, the first
// (least significant) window is processed as two halves of the input, in parallel.
func msmC11(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 11                  // window width, in bits
		nbChunks = (fr.Limbs * 64 / c) // number of full c-bit windows in a scalar
		// c doesn't divide the scalar bit size: the trailing window is narrower,
		// so fewer buckets are allocated for it
		lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	)

	// one buffered result channel per window: nbChunks full windows plus the trailing one
	var results [nbChunks + 1]chan PointProj
	for w := range results {
		results[w] = make(chan PointProj, 1)
	}

	// runWindow loops through all the given scalars for window w and publishes the
	// weighted bucket sum on out. buckets is a fixed-size array local to each call
	// (the original authors rely on it being stack allocated for most sizes of c).
	runWindow := func(w int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(w), out, buckets[:], c, pts, scs)
	}

	// trailing (narrower) window
	go func(w uint64, pts []PointAffine, scs []fr.Element) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(w, results[w], buckets[:], c, pts, scs)
	}(uint64(nbChunks), points, scalars)

	// full windows, one go routine each; window 0 is handled separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// process each half of the input on its own and add the two partial sums
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC12 computes the multi-exponentiation of points by scalars using 12-bit windows,
// stores the result in p and returns p. The scalars are expected to be already recoded
// into signed digits (see partitionScalars). When splitFirstChunk is true, the first
// (least significant) window is processed as two halves of the input, in parallel.
func msmC12(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 12                  // window width, in bits
		nbChunks = (fr.Limbs * 64 / c) // number of full c-bit windows in a scalar
		// c doesn't divide the scalar bit size: the trailing window is narrower,
		// so fewer buckets are allocated for it
		lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	)

	// one buffered result channel per window: nbChunks full windows plus the trailing one
	var results [nbChunks + 1]chan PointProj
	for w := range results {
		results[w] = make(chan PointProj, 1)
	}

	// runWindow loops through all the given scalars for window w and publishes the
	// weighted bucket sum on out. buckets is a fixed-size array local to each call
	// (the original authors rely on it being stack allocated for most sizes of c).
	runWindow := func(w int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(w), out, buckets[:], c, pts, scs)
	}

	// trailing (narrower) window
	go func(w uint64, pts []PointAffine, scs []fr.Element) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(w, results[w], buckets[:], c, pts, scs)
	}(uint64(nbChunks), points, scalars)

	// full windows, one go routine each; window 0 is handled separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// process each half of the input on its own and add the two partial sums
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC13 computes the multi-exponentiation of points by scalars using 13-bit windows,
// stores the result in p and returns p. The scalars are expected to be already recoded
// into signed digits (see partitionScalars). When splitFirstChunk is true, the first
// (least significant) window is processed as two halves of the input, in parallel.
func msmC13(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 13                  // window width, in bits
		nbChunks = (fr.Limbs * 64 / c) // number of full c-bit windows in a scalar
		// c doesn't divide the scalar bit size: the trailing window is narrower,
		// so fewer buckets are allocated for it
		lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	)

	// one buffered result channel per window: nbChunks full windows plus the trailing one
	var results [nbChunks + 1]chan PointProj
	for w := range results {
		results[w] = make(chan PointProj, 1)
	}

	// runWindow loops through all the given scalars for window w and publishes the
	// weighted bucket sum on out. buckets is a fixed-size array local to each call
	// (the original authors rely on it being stack allocated for most sizes of c).
	runWindow := func(w int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(w), out, buckets[:], c, pts, scs)
	}

	// trailing (narrower) window
	go func(w uint64, pts []PointAffine, scs []fr.Element) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(w, results[w], buckets[:], c, pts, scs)
	}(uint64(nbChunks), points, scalars)

	// full windows, one go routine each; window 0 is handled separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// process each half of the input on its own and add the two partial sums
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC14 runs the windowed multi-exponentiation of points by scalars with the
// scalars partitioned into 14-bit radixes. Every window is handed to
// msmProcessChunkPointAffine in its own goroutine, and the per-window results
// are combined by msmReduceChunkPointAffine, whose return value is returned
// unchanged. When splitFirstChunk is set, window 0 is processed as two halves of
// the input whose partial results are added before being sent to the reduction.
func msmC14(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 14                // scalars partitioned into c-bit radixes
		nbChunks = fr.Limbs * 64 / c // number of full c-bit radixes in a scalar
		// c doesn't divide 256: the leftover top window is narrower, so fewer
		// buckets are allocated for it
		lastC = fr.Limbs*64 - c*nbChunks
	)

	// one buffered result channel per window (the narrower top window included);
	// each goroutine sends its result on its own channel
	var results [nbChunks + 1]chan PointProj
	for i := range results {
		results[i] = make(chan PointProj, 1)
	}

	// note that the bucket arrays below are fixed-size arrays allocated on the
	// stack (for most sizes of c) and this is critical for performance

	// top, narrower window
	go func(window uint64, out chan PointProj) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(window, out, buckets[:], c, points, scalars)
	}(uint64(nbChunks), results[nbChunks])

	// runWindow processes one full-width window over the given inputs
	runWindow := func(window int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(window), out, buckets[:], c, pts, scs)
	}

	// windows nbChunks-1 down to 1; window 0 is dispatched separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// window 0 is computed on each half of the input concurrently, then
		// the two partial results are added and forwarded as window 0's result
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC15 runs the windowed multi-exponentiation of points by scalars with the
// scalars partitioned into 15-bit radixes. Every window is handed to
// msmProcessChunkPointAffine in its own goroutine, and the per-window results
// are combined by msmReduceChunkPointAffine, whose return value is returned
// unchanged. When splitFirstChunk is set, window 0 is processed as two halves of
// the input whose partial results are added before being sent to the reduction.
func msmC15(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 15                // scalars partitioned into c-bit radixes
		nbChunks = fr.Limbs * 64 / c // number of full c-bit radixes in a scalar
		// c doesn't divide 256: the leftover top window is narrower, so fewer
		// buckets are allocated for it
		lastC = fr.Limbs*64 - c*nbChunks
	)

	// one buffered result channel per window (the narrower top window included);
	// each goroutine sends its result on its own channel
	var results [nbChunks + 1]chan PointProj
	for i := range results {
		results[i] = make(chan PointProj, 1)
	}

	// note that the bucket arrays below are fixed-size arrays allocated on the
	// stack (for most sizes of c) and this is critical for performance

	// top, narrower window
	go func(window uint64, out chan PointProj) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(window, out, buckets[:], c, points, scalars)
	}(uint64(nbChunks), results[nbChunks])

	// runWindow processes one full-width window over the given inputs
	runWindow := func(window int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(window), out, buckets[:], c, pts, scs)
	}

	// windows nbChunks-1 down to 1; window 0 is dispatched separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// window 0 is computed on each half of the input concurrently, then
		// the two partial results are added and forwarded as window 0's result
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC16 runs the windowed multi-exponentiation of points by scalars with the
// scalars partitioned into 16-bit radixes. Every window is handed to
// msmProcessChunkPointAffine in its own goroutine, and the per-window results
// are combined by msmReduceChunkPointAffine, whose return value is returned
// unchanged. When splitFirstChunk is set, window 0 is processed as two halves of
// the input whose partial results are added before being sent to the reduction.
func msmC16(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 16                // scalars partitioned into c-bit radixes
		nbChunks = fr.Limbs * 64 / c // number of c-bit radixes in a scalar
	)

	// one buffered result channel per window; each goroutine sends its result
	// on its own channel
	var results [nbChunks]chan PointProj
	for i := range results {
		results[i] = make(chan PointProj, 1)
	}

	// note that the bucket array below is a fixed-size array allocated on the
	// stack (for most sizes of c) and this is critical for performance

	// runWindow processes one window over the given inputs
	runWindow := func(window int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(window), out, buckets[:], c, pts, scs)
	}

	// windows nbChunks-1 down to 1; window 0 is dispatched separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// window 0 is computed on each half of the input concurrently, then
		// the two partial results are added and forwarded as window 0's result
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC20 runs the windowed multi-exponentiation of points by scalars with the
// scalars partitioned into 20-bit radixes. Every window is handed to
// msmProcessChunkPointAffine in its own goroutine, and the per-window results
// are combined by msmReduceChunkPointAffine, whose return value is returned
// unchanged. When splitFirstChunk is set, window 0 is processed as two halves of
// the input whose partial results are added before being sent to the reduction.
func msmC20(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 20                // scalars partitioned into c-bit radixes
		nbChunks = fr.Limbs * 64 / c // number of full c-bit radixes in a scalar
		// c doesn't divide 256: the leftover top window is narrower, so fewer
		// buckets are allocated for it
		lastC = fr.Limbs*64 - c*nbChunks
	)

	// one buffered result channel per window (the narrower top window included);
	// each goroutine sends its result on its own channel
	var results [nbChunks + 1]chan PointProj
	for i := range results {
		results[i] = make(chan PointProj, 1)
	}

	// note that the bucket arrays below are fixed-size arrays allocated on the
	// stack (for most sizes of c) and this is critical for performance

	// top, narrower window
	go func(window uint64, out chan PointProj) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(window, out, buckets[:], c, points, scalars)
	}(uint64(nbChunks), results[nbChunks])

	// runWindow processes one full-width window over the given inputs
	runWindow := func(window int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(window), out, buckets[:], c, pts, scs)
	}

	// windows nbChunks-1 down to 1; window 0 is dispatched separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// window 0 is computed on each half of the input concurrently, then
		// the two partial results are added and forwarded as window 0's result
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC21 runs the windowed multi-exponentiation of points by scalars with the
// scalars partitioned into 21-bit radixes. Every window is handed to
// msmProcessChunkPointAffine in its own goroutine, and the per-window results
// are combined by msmReduceChunkPointAffine, whose return value is returned
// unchanged. When splitFirstChunk is set, window 0 is processed as two halves of
// the input whose partial results are added before being sent to the reduction.
func msmC21(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 21                // scalars partitioned into c-bit radixes
		nbChunks = fr.Limbs * 64 / c // number of full c-bit radixes in a scalar
		// c doesn't divide 256: the leftover top window is narrower, so fewer
		// buckets are allocated for it
		lastC = fr.Limbs*64 - c*nbChunks
	)

	// one buffered result channel per window (the narrower top window included);
	// each goroutine sends its result on its own channel
	var results [nbChunks + 1]chan PointProj
	for i := range results {
		results[i] = make(chan PointProj, 1)
	}

	// note that the bucket arrays below are fixed-size arrays allocated on the
	// stack (for most sizes of c) and this is critical for performance

	// top, narrower window
	go func(window uint64, out chan PointProj) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(window, out, buckets[:], c, points, scalars)
	}(uint64(nbChunks), results[nbChunks])

	// runWindow processes one full-width window over the given inputs
	runWindow := func(window int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(window), out, buckets[:], c, pts, scs)
	}

	// windows nbChunks-1 down to 1; window 0 is dispatched separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// window 0 is computed on each half of the input concurrently, then
		// the two partial results are added and forwarded as window 0's result
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}

// msmC22 runs the windowed multi-exponentiation of points by scalars with the
// scalars partitioned into 22-bit radixes. Every window is handed to
// msmProcessChunkPointAffine in its own goroutine, and the per-window results
// are combined by msmReduceChunkPointAffine, whose return value is returned
// unchanged. When splitFirstChunk is set, window 0 is processed as two halves of
// the input whose partial results are added before being sent to the reduction.
func msmC22(p *PointProj, points []PointAffine, scalars []fr.Element, splitFirstChunk bool) *PointProj {
	const (
		c        = 22                // scalars partitioned into c-bit radixes
		nbChunks = fr.Limbs * 64 / c // number of full c-bit radixes in a scalar
		// c doesn't divide 256: the leftover top window is narrower, so fewer
		// buckets are allocated for it
		lastC = fr.Limbs*64 - c*nbChunks
	)

	// one buffered result channel per window (the narrower top window included);
	// each goroutine sends its result on its own channel
	var results [nbChunks + 1]chan PointProj
	for i := range results {
		results[i] = make(chan PointProj, 1)
	}

	// note that the bucket arrays below are fixed-size arrays allocated on the
	// stack (for most sizes of c) and this is critical for performance

	// top, narrower window
	go func(window uint64, out chan PointProj) {
		var buckets [1 << (lastC - 1)]PointProj
		msmProcessChunkPointAffine(window, out, buckets[:], c, points, scalars)
	}(uint64(nbChunks), results[nbChunks])

	// runWindow processes one full-width window over the given inputs
	runWindow := func(window int, pts []PointAffine, scs []fr.Element, out chan PointProj) {
		var buckets [1 << (c - 1)]PointProj
		msmProcessChunkPointAffine(uint64(window), out, buckets[:], c, pts, scs)
	}

	// windows nbChunks-1 down to 1; window 0 is dispatched separately below
	for w := int(nbChunks - 1); w >= 1; w-- {
		go runWindow(w, points, scalars, results[w])
	}

	if splitFirstChunk {
		// window 0 is computed on each half of the input concurrently, then
		// the two partial results are added and forwarded as window 0's result
		halves := make(chan PointProj, 2)
		mid := len(points) / 2
		go runWindow(0, points[:mid], scalars[:mid], halves)
		go runWindow(0, points[mid:], scalars[mid:], halves)
		go func() {
			sum := <-halves
			other := <-halves
			close(halves)
			sum.Add(&sum, &other)
			results[0] <- sum
		}()
	} else {
		go runWindow(0, points, scalars, results[0])
	}

	return msmReduceChunkPointAffine(p, c, results[:])
}