Commit
when caseSensitive:true tokenise in lower case when writing index
fergiemcdowall committed Sep 4, 2023
1 parent 699d169 commit 93041fe
Showing 6 changed files with 150 additions and 114 deletions.
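The user-visible intent of this change: with caseSensitive: false, field values are now lowercased when index keys are written, so they line up with the lowercased query tokens produced by the parser at read time and differently-cased queries still match. A minimal usage sketch of that intent follows; the InvertedIndex export name and the name option are assumptions for illustration, not something shown in this diff.

// sketch only: entrypoint export name and the 'name' option are assumed
import { InvertedIndex } from 'fergies-inverted-index'

const ii = new InvertedIndex({ name: 'case-folding-demo', caseSensitive: false })

// the value is stored against a lowercased index key ('hello world')
await ii.PUT([{ _id: 1, title: 'Hello World' }])

// the query token is lowercased by TokenParser, so the cases line up
const hits = await ii.GET('title:HELLO WORLD')
console.log(hits) // expected to include _id 1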
package.json: 2 changes (1 addition & 1 deletion)
@@ -1,6 +1,6 @@
{
"name": "fergies-inverted-index",
"version": "13.0.0-rc.3",
"version": "13.0.0-rc.4",
"description": "An inverted index that allows javascript objects to be easily serialised and retrieved using promises and map-reduce",
"browser": "src/entrypoints/browser.js",
"main": "src/entrypoints/node.js",
src/main.js: 9 changes (5 additions & 4 deletions)
@@ -17,12 +17,13 @@ export class Main {
keyEncoding: charwise,
valueEncoding: 'json'
}),
tokenParser: new TokenParser(),
...ops
}

const r = read(ops)
const w = write(ops)
const tokenParser = new TokenParser(ops.caseSensitive)

const r = read(ops, tokenParser)
const w = write(ops, tokenParser)

// timestamp with time of creation (if not created already)
// note: async, so this is "fire and forget"
@@ -58,7 +59,7 @@ export class Main {
this.SORT = r.SORT
this.STORE = ops.db
this.TIMESTAMP_LAST_UPDATED = w.TIMESTAMP_LAST_UPDATED
this.TOKEN_PARSER = ops.tokenParser
this.TOKEN_PARSER = tokenParser
}

flattenMatchArrayInResults (results) {
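Note that read() and write() now receive the same TokenParser instance, so the field names registered by the writer are the ones the reader falls back to when a query token carries no field. A rough sketch of that flow, where readFn and writeFn are hypothetical stand-ins for the real read()/write() factories:

import { TokenParser } from './src/parseToken.js' // path assumed from repo root

const ops = { caseSensitive: false }
const tokenParser = new TokenParser(ops.caseSensitive)

// stand-ins only: the real factories also wire the parser into the db layer
const writeFn = (ops, tp) => ({
  PUT: docs =>
    tp.setAvailableFields(Object.keys(docs[0]).filter(k => k !== '_id'))
})
const readFn = (ops, tp) => ({ GET: token => tp.parse(token) })

const w = writeFn(ops, tokenParser)
const r = readFn(ops, tokenParser)

w.PUT([{ _id: 1, title: 'Hello' }])
console.log(r.GET('Hello').FIELD) // -> ['title'] (fields registered by the writer)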
src/parseToken.js: 35 changes (25 additions & 10 deletions)
@@ -6,14 +6,29 @@ charwise.HI = undefined
// <fieldname>:<value>. Turn key into json object that is of the
// format {FIELD: ..., VALUE: {GTE: ..., LTE ...}}
export class TokenParser {
constructor (availableFields = []) {
this.setAvailableFields(availableFields)
availableFields = []
#caseSensitive

constructor (caseSensitive) {
this.#caseSensitive = caseSensitive
}

setAvailableFields (availableFields) {
setAvailableFields = availableFields => {
this.availableFields = availableFields
}

#setCaseSensitivity = token => {
const setCase = str =>
this.#caseSensitive || typeof str !== 'string' ? str : str.toLowerCase()
return {
FIELD: token.FIELD.map(setCase),
VALUE: {
GTE: setCase(token.VALUE.GTE),
LTE: setCase(token.VALUE.LTE)
}
}
}

parse (token) {
// case: <value>
// case: <FIELD>:<VALUE>
@@ -30,23 +45,23 @@
// a part of the value. This accounts for occasions where the value itself
// has a ':'.
if (token.indexOf(':') === -1) {
return {
return this.#setCaseSensitivity({
FIELD: this.availableFields,
VALUE: {
GTE: token,
LTE: token
}
}
})
}

const [field, ...value] = token.split(':')
return {
return this.#setCaseSensitivity({
FIELD: [field],
VALUE: {
GTE: value.join(':'),
LTE: value.join(':')
}
}
})
}

if (typeof token === 'number') {
@@ -103,14 +118,14 @@

// parse object FIELD
if (typeof token.FIELD === 'undefined') {
return {
return this.#setCaseSensitivity({
FIELD: this.availableFields,
...token
}
})
}
// Allow FIELD to be an array or a string
token.FIELD = [token.FIELD].flat()

return token
return this.#setCaseSensitivity(token)
}
}
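To make the new constructor argument concrete, here is a small sketch of how parse() folds case on the query side (import path assumed; the available fields would normally be registered by write.js):

import { TokenParser } from './src/parseToken.js'

const tokenParser = new TokenParser(false) // caseSensitive === false
tokenParser.setAvailableFields(['title', 'body'])

console.log(tokenParser.parse('Title:Hello'))
// -> { FIELD: ['title'], VALUE: { GTE: 'hello', LTE: 'hello' } }

console.log(tokenParser.parse('Hello'))
// -> { FIELD: ['title', 'body'], VALUE: { GTE: 'hello', LTE: 'hello' } }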
src/read.js: 27 changes (8 additions & 19 deletions)
@@ -4,7 +4,7 @@ import charwise from 'charwise'
charwise.LO = null
charwise.HI = undefined

export default function (ops) {
export default function (ops, tokenParser) {
const isString = s => typeof s === 'string'

const queryReplace = token => {
@@ -32,18 +32,6 @@ export default function (ops) {
return token
}

const setCaseSensitivity = token => {
const setCase = str =>
ops.caseSensitive || typeof str !== 'string' ? str : str.toLowerCase()
return {
FIELD: token.FIELD.map(setCase),
VALUE: {
GTE: setCase(token.VALUE.GTE),
LTE: setCase(token.VALUE.LTE)
}
}
}

// If this token is a stopword then return 'undefined'
const removeStopwords = token =>
token.VALUE.GTE === token.VALUE.LTE &&
@@ -67,9 +55,10 @@
try {
testForBreak(token)

token = ops.tokenParser.parse(token)
token = tokenParser.parse(token)

// testForBreak(token) // ?
token = await setCaseSensitivity(token)
// token = await setCaseSensitivity(token)
// testForBreak(token) // ?
token = await removeStopwords(token)
// testForBreak(token) // ?
@@ -216,7 +205,7 @@ export default function (ops) {
// return a bucket of IDs. Key is an object like this:
// {gte:..., lte:...} (gte/lte == greater/less than or equal)
const BUCKET = token => {
token = ops.tokenParser.parse(token)
token = tokenParser.parse(token)
return GET(token).then(result => ({
_id: [...result.reduce((acc, cur) => acc.add(cur._id), new Set())].sort(),
VALUE: token.VALUE,
@@ -245,7 +234,7 @@

const BOUNDING_VALUE = (token, reverse) =>
RANGE({
...ops.tokenParser.parse(token),
...tokenParser.parse(token),
LIMIT: 1,
REVERSE: reverse
}).then(max =>
@@ -265,7 +254,7 @@
)

const DIST = token => {
token = ops.tokenParser.parse(token)
token = tokenParser.parse(token)
return Promise.all(
token.FIELD.map(field => {
let lte = token.VALUE.LTE
@@ -313,7 +302,7 @@
)

const FACET = token => {
token = ops.tokenParser.parse(token)
token = tokenParser.parse(token)
return Promise.all(
token.FIELD.map(field =>
getRange({
src/write.js: 14 changes (10 additions & 4 deletions)
@@ -1,7 +1,7 @@
import trav from 'traverse'
import reader from './read.js'

export default function (ops) {
export default function (ops, tokenParser) {
// TODO: set reset this to the max value every time the DB is restarted
let incrementalId = 0

@@ -32,7 +32,13 @@ export default function (ops) {
if (!ops.stopwords.includes(this.node)) {
const key = JSON.stringify([
fieldName,
[this.node].flat(Infinity)
[this.node]
.flat(Infinity)
.map(item =>
typeof item === 'string' && !ops.caseSensitive
? item.toLowerCase()
: item
)
])
// bump to lower case if not case sensitive
keys.push(ops.caseSensitive ? key : key.toLowerCase())
@@ -207,7 +213,7 @@ export default function (ops) {
)
)
.then(() => reader(ops).FIELDS())
.then(fields => ops.tokenParser.setAvailableFields(fields))
.then(fields => tokenParser.setAvailableFields(fields))

const PUT = (docs, putOptions = {}) =>
writer(
@@ -223,7 +229,7 @@
.then(TIMESTAMP_LAST_UPDATED)
.then(async passThrough => {
// TODO: reader should not be inited here
ops.tokenParser.setAvailableFields(await reader(ops).FIELDS())
tokenParser.setAvailableFields(await reader(ops).FIELDS())
return passThrough
})

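A standalone sketch of the write-side key normalisation above, with hard-coded values standing in for the traversal context (fieldName, this.node and ops come from the surrounding writer code):

// assumed inputs: case sensitivity is off and the traversed node is a string
const ops = { caseSensitive: false }
const fieldName = 'Title'
const node = 'Hello World'

const key = JSON.stringify([
  fieldName,
  [node]
    .flat(Infinity)
    .map(item =>
      typeof item === 'string' && !ops.caseSensitive ? item.toLowerCase() : item
    )
])

// bump to lower case if not case sensitive (this also folds the field name)
console.log(ops.caseSensitive ? key : key.toLowerCase())
// -> '["title",["hello world"]]'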

0 comments on commit 93041fe
