/
searching.js
276 lines (264 loc) · 8.96 KB
/
searching.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
const faunadb = require('faunadb')
const q = faunadb.query
const {
CreateIndex,
Collection,
Exists,
If,
Index,
Delete,
Lambda,
Var,
Query,
Length,
Select,
Subtract,
Let,
NGram,
LowerCase,
Filter,
GT,
Union,
Distinct
} = q
/** ********************** Searching ************************/
/* We'll create indexes for an autocompletion functionality.
* We gradually increase the complexity of the index to epxlain the logic.
* Only the final version is usd in the app.
***/
// --- Step 1 ---
// We can index an array!
// the index would work by 'unwrapping' the values in wordpart.
// meaning that we a match on one of the wordparts will find the original hashtag reference.
// Imagine we stored the wordparts of a tag alongside with the tag. It would look like:
// {
// "name": "lockdown",
// "wordparts": [
// "l",
// "lo",
// ....
// "lockdown",
// "o",
// "oc",
// ...
// ]
// }
const CreateHashtagsByWordparts = CreateIndex({
name: 'hashtags_by_wordparts',
source: Collection('hashtags'),
// wordparts is an array. Using it in an index will index
// each of the parts in this array. As a result any part of the word will match in this index.
terms: [
{
field: ['data', 'wordparts']
}
],
// this is for searching, it doesn't matter that our index is a few milliseconds/seconds behind.
// serialized false means that our reads will return faster which is important for searching.
// but you should not rely on it for logic as you can't expect to read immediately what you have written.
serialized: false
})
// --- Step 2 ---
// Sorting the results with a binding!
// We actually want to favor matches on shorter words (since a shorter word is closer to what we typed)
// The length of the original word is the determining factor. We do not have to store this length in the document,
// we can get it with a calculated field or in fauna terms: 'a binding'.
const CreateHashtagsByWordpartsWithBinding = CreateIndex({
name: 'hashtags_by_wordparts',
// we actually want to sort on the longest match, how do we do that?
// With a binding!
source: [
{
collection: Collection('hashtags'),
fields: {
length: Query(
Lambda(
'hashtag',
q.Map(Select(['data', 'wordparts'], Var('hashtag')), wordpart => Length(Var('wordpart')))
)
)
}
}
],
terms: [
{
field: ['data', 'wordparts']
}
],
// values are 'range indexed' and therefore sorted in the order you provide them
// we will also need the reference to the actual tag! We will place this second
// since else the elements will be sorted by reference first.
values: [
{
binding: 'length',
reverse: true
},
{ field: ['ref'] }
],
serialized: false
})
// --- Step 3 ---
// We can index multiple collections !
// We want to search for user aliases as well, not only for tagS!
// Since we can put multiple collections in one index, we can do that.
// In this case we decided to let users and accounts have the same property 'wordparts'.
// the users property has a different name, we can easily fix that with bindings as well.
const CreateHashtagsAndUsersByWordpartsWithBinding = CreateIndex({
name: 'hashtags_and_users_by_wordparts',
// we actually want to sort to get the shortest word that matches first
source: [
{
collection: Collection('hashtags'),
fields: {
// We can use bindings to make sure that fields of different collections
// have the same name in our index (e.g. in case wordparts in users and hashtags)
// would be stored slightly differently (user has an extra key in the path 'alias' in the below example)
length: Query(Lambda('hashtag', Length(Select(['data', 'name'], Var('hashtag'))))),
wordparts: Query(Lambda('hashtag', Select(['data', 'wordparts'], Var('hashtag'))))
}
},
{
collection: Collection('users'),
fields: {
length: Query(Lambda('user', Length(Select(['data', 'alias', 'name'], Var('user'))))),
wordparts: Query(Lambda('user', Select(['data', 'alias', 'wordparts'], Var('user'))))
}
}
],
terms: [
{
binding: 'wordparts'
}
],
values: [
{
binding: 'length'
},
{ field: ['ref'] }
],
serialized: false
})
// --- Step 4 ---
// We do not need to store these wordparts.. that can actually be hanlded
// by a binding as well. There is an NGram function available to generate those.
// At this point the NGram function is quite limited and therefore still undocumented since it will change.
// The limitation is that max - min can't be bigger than 1.
// We can easily combine a few NGgrams ourselves though. Since we
// want to 'limit' the amount of prefixes we can do that fairly easy.
// When a word is longer than 10 chars, for example 15 the shortest prefix will be min 5 chars long.
function WordPartGenerator(WordVar) {
return Let(
{
indexes: q.Map(
// Reduce this array if you want less ngrams per word.
// Setting it to [ 0 ] would only create the word itself, Setting it to [0, 1] would result in the word itself
// and all ngrams that are one character shorter, etc..
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
Lambda('index', Subtract(Length(WordVar), Var('index')))
),
indexesFiltered: Filter(
Var('indexes'),
// filter out the ones below 0
Lambda('l', GT(Var('l'), 0))
),
ngramsArray: q.Map(Var('indexesFiltered'), Lambda('l', NGram(LowerCase(WordVar), Var('l'), Var('l'))))
},
Var('ngramsArray')
)
}
const CreateHashtagsAndUsersByWordpartsWithBinding2 = CreateIndex({
name: 'hashtags_and_users_by_wordparts',
// we actually want to sort to get the shortest word that matches first
source: [
{
// If your collections have the same property tht you want to access you can pass a list to the collection
collection: [Collection('hashtags'), Collection('users')],
fields: {
length: Query(Lambda('hashtagOrUser', Length(Select(['data', 'name'], Var('hashtagOrUser'))))),
wordparts: Query(
Lambda('hashtagOrUser', Distinct(Union(WordPartGenerator(Select(['data', 'name'], Var('hashtagOrUser'))))))
)
}
}
],
terms: [
{
binding: 'wordparts'
}
],
// values are 'range indexed' and therefore sorted in the order you provide them
// we will also need the reference to the actual tag! We will place this second
// since else the elements will be sorted by reference first.
values: [
{
binding: 'length'
},
{ field: ['ref'] }
],
// serialized for an index that we will use for searching is not necessary.
// false is the fault.
serialized: false
})
/* Finally, if we need the binding to be different depending on the collection
* we can do that as well. In this case we are not using this example since this type of
* index does not build instantly (other indexes that are < 128 entities build instantly,
* but ranging over multiple collections like this is a special case)]
* (this delay might make people think the app is broken when they launch it themselves)
*/
const CreateHashtagsAndUsersByWordpartsWithBinding3 = CreateIndex({
name: 'hashtags_and_users_by_wordparts',
// we actually want to sort to get the shortest word that matches first
source: [
{
collection: Collection('hashtags'),
fields: {
length: Query(Lambda('hashtag', Length(Select(['data', 'name'], Var('hashtag'))))),
wordparts: Query(Lambda('hashtag', Union(WordPartGenerator(Select(['data', 'name'], Var('hashtag'))))))
}
},
{
collection: Collection('users'),
fields: {
length: Query(Lambda('user', Length(Select(['data', 'name'], Var('user'))))),
wordparts: Query(
Lambda(
'user',
Union(
// We'll search both on the name as the alias.
Union(WordPartGenerator(Select(['data', 'name'], Var('user')))),
Union(WordPartGenerator(Select(['data', 'alias'], Var('user'))))
)
)
)
}
}
],
terms: [
{
binding: 'wordparts'
}
],
// values are 'range indexed' and therefore sorted in the order you provide them
// we will also need the reference to the actual tag! We will place this second
// since else the elements will be sorted by reference first.
values: [
{
binding: 'length'
},
{ field: ['ref'] }
],
// serialized for an index that we will use for searching is not necessary.
// false is the fault.
serialized: false
})
async function createSearchIndexes(client) {
await client.query(
If(Exists(Index('hashtags_and_users_by_wordparts')), true, CreateHashtagsAndUsersByWordpartsWithBinding2)
)
}
async function deleteSearchIndexes(client) {
await client.query(
If(Exists(Index('hashtags_and_users_by_wordparts')), Delete(Index('hashtags_and_users_by_wordparts')), true)
)
}
export { createSearchIndexes, deleteSearchIndexes }