diff --git a/algo/uidlist.go b/algo/uidlist.go index 9c0fcd8d5dc..1465d4ff2fe 100644 --- a/algo/uidlist.go +++ b/algo/uidlist.go @@ -92,6 +92,7 @@ func IntersectCompressedWithLinJump(dec *codec.Decoder, v []uint64, o *[]uint64) // IntersectCompressedWithBin is based on the paper // "Fast Intersection Algorithms for Sorted Sequences" // https://link.springer.com/chapter/10.1007/978-3-642-12476-1_3 +// Call dec.Seek before calling this function. func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) { ld := dec.ApproxLen() lq := len(q) @@ -105,34 +106,45 @@ func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) { // Pick the shorter list and do binary search if ld < lq { - uids := dec.Uids() - for len(uids) > 0 { - for _, u := range uids { - qidx := sort.Search(len(q), func(idx int) bool { - return q[idx] >= u - }) - if qidx >= len(q) { - return - } - if q[qidx] == u { - *o = append(*o, u) - qidx++ - } - q = q[qidx:] + for { + blockUids := dec.Uids() + if len(blockUids) == 0 { + break } - uids = dec.Next() + IntersectWithBin(blockUids, q, o) + lastUid := blockUids[len(blockUids)-1] + qidx := sort.Search(len(q), func(idx int) bool { + return q[idx] >= lastUid + }) + if qidx >= len(q) { + return + } + q = q[qidx:] + dec.Next() } return } + var uids []uint64 for _, u := range q { - uids := dec.Seek(u, codec.SeekStart) - if len(uids) == 0 { - return + if len(uids) == 0 || u > uids[len(uids)-1] { + uids = dec.Seek(u, codec.SeekStart) + if len(uids) == 0 { + return + } + } + uidIdx := sort.Search(len(uids), func(idx int) bool { + return uids[idx] >= u + }) + if uidIdx >= len(uids) { + // We know that u <= max(uids). If we didn't find it here, it's not here.
+ continue } - if uids[0] == u { + if uids[uidIdx] == u { *o = append(*o, u) + uidIdx++ } + uids = uids[uidIdx:] } } diff --git a/algo/uidlist_test.go b/algo/uidlist_test.go index 015a265a787..05eafba68e3 100644 --- a/algo/uidlist_test.go +++ b/algo/uidlist_test.go @@ -367,6 +367,52 @@ func BenchmarkListIntersectRandom(b *testing.B) { randomTests(1024000, 0.01) } +func BenchmarkListIntersectCompressBin(b *testing.B) { + randomTests := func(sz int, overlap float64) { + rs := []float64{0.01, 0.1, 1, 10, 100} + for _, r := range rs { + sz1 := sz + sz2 := int(float64(sz) * r) + if sz2 > 1000000 || sz2 == 0 { + break + } + + u1, v1 := make([]uint64, sz1), make([]uint64, sz2) + limit := int64(float64(sz) / overlap) + for i := 0; i < sz1; i++ { + u1[i] = uint64(rand.Int63n(limit)) + } + for i := 0; i < sz2; i++ { + v1[i] = uint64(rand.Int63n(limit)) + } + sort.Slice(u1, func(i, j int) bool { return u1[i] < u1[j] }) + sort.Slice(v1, func(i, j int) bool { return v1[i] < v1[j] }) + + dst2 := &pb.List{} + compressedUids := codec.Encode(v1, 256) + + b.Run(fmt.Sprintf("compressed:IntersectWith:ratio=%v:size=%d:overlap=%.2f:", r, sz, overlap), + func(b *testing.B) { + for k := 0; k < b.N; k++ { + dec := codec.Decoder{Pack: compressedUids} + dec.Seek(0, codec.SeekStart) + IntersectCompressedWithBin(&dec, u1, &dst2.Uids) + } + }) + fmt.Println() + + codec.FreePack(compressedUids) + } + } + + randomTests(10, 0.01) + randomTests(100, 0.01) + randomTests(1000, 0.01) + randomTests(10000, 0.01) + randomTests(100000, 0.01) + randomTests(1000000, 0.01) +} + func BenchmarkListIntersectRatio(b *testing.B) { randomTests := func(sz int, overlap float64) { rs := []int{1, 10, 50, 100, 500, 1000, 10000, 100000, 1000000}