diff --git a/package-lock.json b/package-lock.json index b6803a30983a..15c78ee56040 100644 --- a/package-lock.json +++ b/package-lock.json @@ -102,7 +102,7 @@ "@github/markdownlint-github": "^0.6.2", "@graphql-inspector/core": "^5.0.0", "@graphql-tools/load": "^8.0.0", - "@octokit/rest": "^20.0.2", + "@octokit/rest": "^20.1.0", "@playwright/test": "1.43.0", "@types/imurmurhash": "^0.1.4", "@types/js-cookie": "^3.0.6", @@ -2199,15 +2199,15 @@ } }, "node_modules/@octokit/core": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/@octokit/core/-/core-5.0.1.tgz", - "integrity": "sha512-lyeeeZyESFo+ffI801SaBKmCfsvarO+dgV8/0gD8u1d87clbEdWsP5yC+dSj3zLhb2eIf5SJrn6vDz9AheETHw==", + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/@octokit/core/-/core-5.2.0.tgz", + "integrity": "sha512-1LFfa/qnMQvEOAdzlQymH0ulepxbxnCYAKJZfMci/5XJyIHWgEYnDmgnKakbTh7CH2tFQ5O60oYDvns4i9RAIg==", "dependencies": { "@octokit/auth-token": "^4.0.0", - "@octokit/graphql": "^7.0.0", - "@octokit/request": "^8.0.2", - "@octokit/request-error": "^5.0.0", - "@octokit/types": "^12.0.0", + "@octokit/graphql": "^7.1.0", + "@octokit/request": "^8.3.1", + "@octokit/request-error": "^5.1.0", + "@octokit/types": "^13.0.0", "before-after-hook": "^2.2.0", "universal-user-agent": "^6.0.0" }, @@ -2233,7 +2233,7 @@ "node": ">= 18" } }, - "node_modules/@octokit/core/node_modules/@octokit/request-error/node_modules/@octokit/types": { + "node_modules/@octokit/core/node_modules/@octokit/types": { "version": "13.4.1", "resolved": "https://registry.npmjs.org/@octokit/types/-/types-13.4.1.tgz", "integrity": "sha512-Y73oOAzRBAUzR/iRAbGULzpNkX8vaxKCqEtg6K74Ff3w9f5apFnWtE/2nade7dMWWW3bS5Kkd6DJS4HF04xreg==", @@ -2242,64 +2242,119 @@ } }, "node_modules/@octokit/endpoint": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/@octokit/endpoint/-/endpoint-9.0.1.tgz", - "integrity": "sha512-hRlOKAovtINHQPYHZlfyFwaM8OyetxeoC81lAkBy34uLb8exrZB50SQdeW3EROqiY9G9yxQTpp5OHTV54QD+vA==", + "version": 
"9.0.5", + "resolved": "https://registry.npmjs.org/@octokit/endpoint/-/endpoint-9.0.5.tgz", + "integrity": "sha512-ekqR4/+PCLkEBF6qgj8WqJfvDq65RH85OAgrtnVp1mSxaXF03u2xW/hUdweGS5654IlC0wkNYC18Z50tSYTAFw==", "dependencies": { - "@octokit/types": "^12.0.0", - "is-plain-object": "^5.0.0", + "@octokit/types": "^13.1.0", "universal-user-agent": "^6.0.0" }, "engines": { "node": ">= 18" } }, + "node_modules/@octokit/endpoint/node_modules/@octokit/openapi-types": { + "version": "22.1.0", + "resolved": "https://registry.npmjs.org/@octokit/openapi-types/-/openapi-types-22.1.0.tgz", + "integrity": "sha512-pGUdSP+eEPfZiQHNkZI0U01HLipxncisdJQB4G//OAmfeO8sqTQ9KRa0KF03TUPCziNsoXUrTg4B2Q1EX++T0Q==" + }, + "node_modules/@octokit/endpoint/node_modules/@octokit/types": { + "version": "13.4.1", + "resolved": "https://registry.npmjs.org/@octokit/types/-/types-13.4.1.tgz", + "integrity": "sha512-Y73oOAzRBAUzR/iRAbGULzpNkX8vaxKCqEtg6K74Ff3w9f5apFnWtE/2nade7dMWWW3bS5Kkd6DJS4HF04xreg==", + "dependencies": { + "@octokit/openapi-types": "^22.1.0" + } + }, "node_modules/@octokit/graphql": { - "version": "7.0.2", - "resolved": "https://registry.npmjs.org/@octokit/graphql/-/graphql-7.0.2.tgz", - "integrity": "sha512-OJ2iGMtj5Tg3s6RaXH22cJcxXRi7Y3EBqbHTBRq+PQAqfaS8f/236fUrWhfSn8P4jovyzqucxme7/vWSSZBX2Q==", + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/@octokit/graphql/-/graphql-7.1.0.tgz", + "integrity": "sha512-r+oZUH7aMFui1ypZnAvZmn0KSqAUgE1/tUXIWaqUCa1758ts/Jio84GZuzsvUkme98kv0WFY8//n0J1Z+vsIsQ==", "dependencies": { - "@octokit/request": "^8.0.1", - "@octokit/types": "^12.0.0", + "@octokit/request": "^8.3.0", + "@octokit/types": "^13.0.0", "universal-user-agent": "^6.0.0" }, "engines": { "node": ">= 18" } }, + "node_modules/@octokit/graphql/node_modules/@octokit/openapi-types": { + "version": "22.1.0", + "resolved": "https://registry.npmjs.org/@octokit/openapi-types/-/openapi-types-22.1.0.tgz", + "integrity": 
"sha512-pGUdSP+eEPfZiQHNkZI0U01HLipxncisdJQB4G//OAmfeO8sqTQ9KRa0KF03TUPCziNsoXUrTg4B2Q1EX++T0Q==" + }, + "node_modules/@octokit/graphql/node_modules/@octokit/types": { + "version": "13.4.1", + "resolved": "https://registry.npmjs.org/@octokit/types/-/types-13.4.1.tgz", + "integrity": "sha512-Y73oOAzRBAUzR/iRAbGULzpNkX8vaxKCqEtg6K74Ff3w9f5apFnWtE/2nade7dMWWW3bS5Kkd6DJS4HF04xreg==", + "dependencies": { + "@octokit/openapi-types": "^22.1.0" + } + }, "node_modules/@octokit/openapi-types": { "version": "19.0.0", "resolved": "https://registry.npmjs.org/@octokit/openapi-types/-/openapi-types-19.0.0.tgz", "integrity": "sha512-PclQ6JGMTE9iUStpzMkwLCISFn/wDeRjkZFIKALpvJQNBGwDoYYi2fFvuHwssoQ1rXI5mfh6jgTgWuddeUzfWw==" }, "node_modules/@octokit/plugin-paginate-rest": { - "version": "9.0.0", - "resolved": "https://registry.npmjs.org/@octokit/plugin-paginate-rest/-/plugin-paginate-rest-9.0.0.tgz", - "integrity": "sha512-oIJzCpttmBTlEhBmRvb+b9rlnGpmFgDtZ0bB6nq39qIod6A5DP+7RkVLMOixIgRCYSHDTeayWqmiJ2SZ6xgfdw==", + "version": "9.2.1", + "resolved": "https://registry.npmjs.org/@octokit/plugin-paginate-rest/-/plugin-paginate-rest-9.2.1.tgz", + "integrity": "sha512-wfGhE/TAkXZRLjksFXuDZdmGnJQHvtU/joFQdweXUgzo1XwvBCD4o4+75NtFfjfLK5IwLf9vHTfSiU3sLRYpRw==", "dev": true, "dependencies": { - "@octokit/types": "^12.0.0" + "@octokit/types": "^12.6.0" }, "engines": { "node": ">= 18" }, "peerDependencies": { - "@octokit/core": ">=5" + "@octokit/core": "5" + } + }, + "node_modules/@octokit/plugin-paginate-rest/node_modules/@octokit/openapi-types": { + "version": "20.0.0", + "resolved": "https://registry.npmjs.org/@octokit/openapi-types/-/openapi-types-20.0.0.tgz", + "integrity": "sha512-EtqRBEjp1dL/15V7WiX5LJMIxxkdiGJnabzYx5Apx4FkQIFgAfKumXeYAqqJCj1s+BMX4cPFIFC4OLCR6stlnA==", + "dev": true + }, + "node_modules/@octokit/plugin-paginate-rest/node_modules/@octokit/types": { + "version": "12.6.0", + "resolved": "https://registry.npmjs.org/@octokit/types/-/types-12.6.0.tgz", + "integrity": 
"sha512-1rhSOfRa6H9w4YwK0yrf5faDaDTb+yLyBUKOCV4xtCDB5VmIPqd/v9yr9o6SAzOAlRxMiRiCic6JVM1/kunVkw==", + "dev": true, + "dependencies": { + "@octokit/openapi-types": "^20.0.0" } }, "node_modules/@octokit/plugin-rest-endpoint-methods": { - "version": "10.0.0", - "resolved": "https://registry.npmjs.org/@octokit/plugin-rest-endpoint-methods/-/plugin-rest-endpoint-methods-10.0.0.tgz", - "integrity": "sha512-16VkwE2v6rXU+/gBsYC62M8lKWOphY5Lg4wpjYnVE9Zbu0J6IwiT5kILoj1YOB53XLmcJR+Nqp8DmifOPY4H3g==", + "version": "10.4.1", + "resolved": "https://registry.npmjs.org/@octokit/plugin-rest-endpoint-methods/-/plugin-rest-endpoint-methods-10.4.1.tgz", + "integrity": "sha512-xV1b+ceKV9KytQe3zCVqjg+8GTGfDYwaT1ATU5isiUyVtlVAO3HNdzpS4sr4GBx4hxQ46s7ITtZrAsxG22+rVg==", "dev": true, "dependencies": { - "@octokit/types": "^12.0.0" + "@octokit/types": "^12.6.0" }, "engines": { "node": ">= 18" }, "peerDependencies": { - "@octokit/core": ">=5" + "@octokit/core": "5" + } + }, + "node_modules/@octokit/plugin-rest-endpoint-methods/node_modules/@octokit/openapi-types": { + "version": "20.0.0", + "resolved": "https://registry.npmjs.org/@octokit/openapi-types/-/openapi-types-20.0.0.tgz", + "integrity": "sha512-EtqRBEjp1dL/15V7WiX5LJMIxxkdiGJnabzYx5Apx4FkQIFgAfKumXeYAqqJCj1s+BMX4cPFIFC4OLCR6stlnA==", + "dev": true + }, + "node_modules/@octokit/plugin-rest-endpoint-methods/node_modules/@octokit/types": { + "version": "12.6.0", + "resolved": "https://registry.npmjs.org/@octokit/types/-/types-12.6.0.tgz", + "integrity": "sha512-1rhSOfRa6H9w4YwK0yrf5faDaDTb+yLyBUKOCV4xtCDB5VmIPqd/v9yr9o6SAzOAlRxMiRiCic6JVM1/kunVkw==", + "dev": true, + "dependencies": { + "@octokit/openapi-types": "^20.0.0" } }, "node_modules/@octokit/plugin-retry": { @@ -2345,14 +2400,13 @@ } }, "node_modules/@octokit/request": { - "version": "8.1.4", - "resolved": "https://registry.npmjs.org/@octokit/request/-/request-8.1.4.tgz", - "integrity": 
"sha512-M0aaFfpGPEKrg7XoA/gwgRvc9MSXHRO2Ioki1qrPDbl1e9YhjIwVoHE7HIKmv/m3idzldj//xBujcFNqGX6ENA==", + "version": "8.4.0", + "resolved": "https://registry.npmjs.org/@octokit/request/-/request-8.4.0.tgz", + "integrity": "sha512-9Bb014e+m2TgBeEJGEbdplMVWwPmL1FPtggHQRkV+WVsMggPtEkLKPlcVYm/o8xKLkpJ7B+6N8WfQMtDLX2Dpw==", "dependencies": { - "@octokit/endpoint": "^9.0.0", - "@octokit/request-error": "^5.0.0", - "@octokit/types": "^12.0.0", - "is-plain-object": "^5.0.0", + "@octokit/endpoint": "^9.0.1", + "@octokit/request-error": "^5.1.0", + "@octokit/types": "^13.1.0", "universal-user-agent": "^6.0.0" }, "engines": { @@ -2401,7 +2455,7 @@ "node": ">= 18" } }, - "node_modules/@octokit/request/node_modules/@octokit/request-error/node_modules/@octokit/types": { + "node_modules/@octokit/request/node_modules/@octokit/types": { "version": "13.4.1", "resolved": "https://registry.npmjs.org/@octokit/types/-/types-13.4.1.tgz", "integrity": "sha512-Y73oOAzRBAUzR/iRAbGULzpNkX8vaxKCqEtg6K74Ff3w9f5apFnWtE/2nade7dMWWW3bS5Kkd6DJS4HF04xreg==", @@ -2410,15 +2464,15 @@ } }, "node_modules/@octokit/rest": { - "version": "20.0.2", - "resolved": "https://registry.npmjs.org/@octokit/rest/-/rest-20.0.2.tgz", - "integrity": "sha512-Ux8NDgEraQ/DMAU1PlAohyfBBXDwhnX2j33Z1nJNziqAfHi70PuxkFYIcIt8aIAxtRE7KVuKp8lSR8pA0J5iOQ==", + "version": "20.1.0", + "resolved": "https://registry.npmjs.org/@octokit/rest/-/rest-20.1.0.tgz", + "integrity": "sha512-STVO3itHQLrp80lvcYB2UIKoeil5Ctsgd2s1AM+du3HqZIR35ZH7WE9HLwUOLXH0myA0y3AGNPo8gZtcgIbw0g==", "dev": true, "dependencies": { - "@octokit/core": "^5.0.0", - "@octokit/plugin-paginate-rest": "^9.0.0", + "@octokit/core": "^5.0.2", + "@octokit/plugin-paginate-rest": "^9.1.5", "@octokit/plugin-request-log": "^4.0.0", - "@octokit/plugin-rest-endpoint-methods": "^10.0.0" + "@octokit/plugin-rest-endpoint-methods": "^10.2.0" }, "engines": { "node": ">= 18" @@ -7925,13 +7979,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/is-plain-object": { - 
"version": "5.0.0", - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/is-regex": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz", @@ -13653,8 +13700,9 @@ } }, "node_modules/universal-user-agent": { - "version": "6.0.0", - "license": "ISC" + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/universal-user-agent/-/universal-user-agent-6.0.1.tgz", + "integrity": "sha512-yCzhz6FN2wU1NiiQRogkTQszlQSlpWaw8SvVegAc+bDxbzHgh1vX8uIe8OYyMH6DwH+sdTJsgMl36+mSMdRJIQ==" }, "node_modules/universalify": { "version": "2.0.0", diff --git a/package.json b/package.json index 6aeb93f943fb..a0ebcb6fb56e 100644 --- a/package.json +++ b/package.json @@ -33,6 +33,7 @@ "find-past-built-pr": "tsx src/workflows/find-past-built-pr.ts", "fixture-dev": "cross-env ROOT=src/fixtures/fixtures npm start", "fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests", + "index": "tsx src/search/scripts/index/index.ts", "index-elasticsearch": "node src/search/scripts/index-elasticsearch.js", "index-test-fixtures": "npm run index-elasticsearch -- -l en -l ja -V ghec -V dotcom --index-prefix tests -- src/search/tests/fixtures/search-indexes", "lint": "eslint '**/*.{js,mjs,ts,tsx}'", @@ -285,7 +286,7 @@ "@github/markdownlint-github": "^0.6.2", "@graphql-inspector/core": "^5.0.0", "@graphql-tools/load": "^8.0.0", - "@octokit/rest": "^20.0.2", + "@octokit/rest": "^20.1.0", "@playwright/test": "1.43.0", "@types/imurmurhash": "^0.1.4", "@types/js-cookie": "^3.0.6", diff --git a/src/search/scripts/index/index-autocomplete.ts b/src/search/scripts/index/index-autocomplete.ts new file mode 100644 index 000000000000..b32830560fef --- /dev/null +++ b/src/search/scripts/index/index-autocomplete.ts @@ -0,0 +1,128 @@ +import fs from 'node:fs' +import path from 'node:path' + +import { Client, estypes } from '@elastic/elasticsearch' + +import { getClient } from './lib/get-client' +import { utcTimestamp } from 
'./lib/utils' +import { populate } from './lib/populate' + +import { type Version, Records } from './types' + +export const shortVersionNames = { + 'enterprise-server': 'ghes', + 'enterprise-cloud': 'ghec', + 'free-pro-team': 'fpt', +} as const + +const DEFAULT_SLEEPTIME_SECONDS = 30 + +type Options = { + dataRepoRoot: string + languages: string[] + versions: Version[] + retries?: number + sleepTime?: number + verbose?: boolean +} + +export async function indexAutocomplete(options: Options) { + // The data repo has a predictable structure of + // `hydro/rollups/user-searches/$language/$version/rollup.json` + // But note that the "version" might be a prefix, like enterprise-server. + // const { verbose } = options + + const client = getClient() + + const { dataRepoRoot, versions, languages } = options + for (const language of languages) { + for (const version of versions) { + const records = loadRecords({ version, language, dataRepoRoot }) + const { alias, name } = await createIndex(client, language, version) + await populate(client, records, { + alias, + name, + retries: options.retries || 0, + sleepTime: options.sleepTime || DEFAULT_SLEEPTIME_SECONDS, + }) + } + } +} + +type LoadOptions = { + dataRepoRoot: string + language: string + version: string +} + +function loadRecords(options: LoadOptions): Records { + const filePath = path.join( + options.dataRepoRoot, + 'hydro/rollups/user-searches', + options.language, + options.version, + 'rollup.json', + ) + const terms: Records = JSON.parse(fs.readFileSync(filePath, 'utf8')) + return terms +} + +type IndexInfo = { + alias: string + name: string +} + +async function createIndex(client: Client, language: string, version: Version): Promise { + const settings: estypes.IndicesIndexSettings = { + analysis: { + // char_filter: { + // // This will turn `runs-on` into `runs_on` so that it can't be + // // tokenized to `runs` because `on` is a stop word. 
+ // // It also means that prose terms, in English, like `opt-in` + // // not be matched if someone searches for `opt in`. But this + // // is why we have multiple different analyzers. So it becomes + // // `opt_in` in the `text_analyzer_explicit` analyzer, but is + // // left as `opt` in the `text_analyzer` analyzer. + // hyphenation_filter: { + // type: 'mapping', + // mappings: ['- => _'], + // }, + // }, + analyzer: { + text_analyzer: { + filter: ['lowercase'], + tokenizer: 'standard', + type: 'custom', + }, + }, + }, + // filter: { + // // Will later, conditionally, put the snowball configuration here. + // }, + // XXX SNOWBALL? + } + + const indexName = `github-autocomplete-${language}-${shortVersionNames[version] || version}` + const thisAlias = `${indexName}__${utcTimestamp()}` + + const mappings: estypes.MappingTypeMapping = { + properties: { + term: { + type: 'text', + analyzer: 'text_analyzer', + // This is used for fast highlighting. Uses more space but makes + // the searches faster. 
+ term_vector: 'with_positions_offsets', + }, + popularity: { type: 'float' }, + }, + } + + await client.indices.create({ + index: thisAlias, + mappings, + settings, + }) + + return { alias: thisAlias, name: indexName } +} diff --git a/src/search/scripts/index/index.ts b/src/search/scripts/index/index.ts new file mode 100644 index 000000000000..b5b488301b70 --- /dev/null +++ b/src/search/scripts/index/index.ts @@ -0,0 +1,42 @@ +import { program, Option } from 'commander' + +import { languageKeys } from '@/languages/lib/languages.js' +import { indexAutocomplete } from './index-autocomplete' +import { type Version } from './types' + +const defaultVersions: Version[] = ['free-pro-team', 'enterprise-server', 'enterprise-cloud'] +const shortAlias = new Map() +shortAlias.set('ghes', 'enterprise-server') +shortAlias.set('fpt', 'free-pro-team') +shortAlias.set('ghec', 'enterprise-cloud') + +program.name('index').description('CLI scripts for indexing to Elasticsearch') + +program + .command('autocomplete') + .description('Index for autocomplete') + .addOption( + new Option('-l, --language ', 'Specific languages(s)').choices(languageKeys), + ) + .addOption( + new Option('-v, --version ', 'Specific version prefix(es)').choices([ + ...defaultVersions, + ...shortAlias.keys(), + ]), + ) + .option('--verbose', 'Verbose output') + .argument('', 'path to the docs-internal-data repo') + .action((root: string, options) => { + const languages = options.language ? options.language : languageKeys + const versions: Version[] = [] + for (const v of options.version || defaultVersions) { + if (shortAlias.has(v)) { + versions.push(shortAlias.get(v)!) 
+ } else { + versions.push(v) + } + } + return indexAutocomplete({ dataRepoRoot: root, languages, versions }) + }) + +program.parse(process.argv) diff --git a/src/search/scripts/index/lib/get-client.ts b/src/search/scripts/index/lib/get-client.ts new file mode 100644 index 000000000000..4f9b79034430 --- /dev/null +++ b/src/search/scripts/index/lib/get-client.ts @@ -0,0 +1,27 @@ +import { Client } from '@elastic/elasticsearch' + +export function getClient(): Client { + const node = getElasticsearchURL() + const client = new Client({ node }) + return client +} + +function getElasticsearchURL() { + if (!process.env.ELASTICSEARCH_URL) { + throw new Error( + 'Must passed the elasticsearch URL option or ' + + 'set the environment variable ELASTICSEARCH_URL', + ) + } + let node = process.env.ELASTICSEARCH_URL + + // Allow the user to lazily set it to `localhost:9200` for example. + if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) { + node = `http://${node}` + } + + const parsed = new URL(node) + if (!parsed.hostname) throw new Error('no valid hostname') + + return node +} diff --git a/src/search/scripts/index/lib/populate.ts b/src/search/scripts/index/lib/populate.ts new file mode 100644 index 000000000000..252b90cefbcd --- /dev/null +++ b/src/search/scripts/index/lib/populate.ts @@ -0,0 +1,107 @@ +import chalk from 'chalk' +import { Client, errors } from '@elastic/elasticsearch' + +import type { Records, RetryConfig } from '../types' +import { retryOnErrorTest } from './retry-on-error-test' +import { repointAlias } from './repoint-alias' +import { formatTime, sleep } from './utils' + +type PopulateOptions = RetryConfig & { + verbose?: boolean + alias: string + name: string +} + +export async function populate(client: Client, records: Records, options: PopulateOptions) { + const { alias, name } = options + + const allRecords = Object.entries(records).sort((a, b) => b[1] - a[1]) + const operations = allRecords.flatMap(([term, count]) 
=> { + const popularity = count / allRecords[0][1] // Normalize to 1.0 for the highest count + return [ + { index: { _index: alias } }, + { + term, + popularity, + }, + ] + }) + + const bulkOptions = { + // Default is 'false'. + // It means that the index is NOT refreshed as documents are inserted. + // Which makes sense in our case because we do not intend to search on + // this index until after we've pointed the alias to this new index. + refresh: false, + // Default is '1m' but we have no reason *not* to be patient. It's run + // by a bot on a schedeule (GitHub Actions). + timeout: '5m', + } + + const attempts = options.retries + const sleepTime = options.sleepTime * 1000 + + console.log(`About to bulk index ${allRecords.length.toLocaleString()} records with retry %O`, { + attempts, + sleepTime, + }) + const t0 = new Date() + const bulkResponse = await retryOnErrorTest( + (error: Error) => { + // Rate limiting can happen when you're indexing too much at + // same time. + return error instanceof errors.ResponseError && error.meta.statusCode === 429 + }, + () => client.bulk({ operations, ...bulkOptions }), + { + attempts, + sleepTime, + onError: (_, attempts, sleepTime) => { + console.warn( + chalk.yellow( + `Failed to bulk index ${name}. Will attempt ${attempts} more times (after ${ + sleepTime / 1000 + }s sleep).`, + ), + ) + }, + }, + ) + + if (bulkResponse.errors) { + // Some day, when we're more confident how and why this might happen + // we can rewrite this code to "massage" the errors better. + // For now, if it fails, it's "OK". It means we won't be proceeding, + // an error is thrown in Actions and we don't have to worry about + // an incompletion index. + console.error(`Bulk response errors: ${bulkResponse.errors}`) + throw new Error('Bulk errors happened.') + } + const t1 = new Date() + console.log(`Bulk indexed ${alias}. 
Took ${formatTime(t1.getTime() - t0.getTime())}`) + + // The counting of documents in the index is async and can take a while + // to reflect. So send count requests until we get the right number. + let documentsInIndex = 0 + let countAttempts = 3 + while (documentsInIndex < allRecords.length) { + const { count } = await client.count({ index: alias }) + documentsInIndex = count + if (documentsInIndex >= allRecords.length) break + countAttempts-- + if (!countAttempts) { + console.log(`After ${countAttempts} attempts still haven't matched the expected number.`) + break + } + await sleep(1000) + } + console.log( + `Documents now in ${chalk.bold(alias)}: ${chalk.bold(documentsInIndex.toLocaleString())}`, + ) + + await repointAlias(client, alias, name, { + attempts, + sleepTime, + verbose: Boolean(options.verbose), + }) +} diff --git a/src/search/scripts/index/lib/repoint-alias.ts b/src/search/scripts/index/lib/repoint-alias.ts new file mode 100644 index 000000000000..36af59d2609e --- /dev/null +++ b/src/search/scripts/index/lib/repoint-alias.ts @@ -0,0 +1,77 @@ +import chalk from 'chalk' +import { Client, errors } from '@elastic/elasticsearch' + +import { retryOnErrorTest } from './retry-on-error-test' +import { formatTime } from './utils' + +export async function repointAlias( + client: Client, + alias: string, + name: string, + options: { + attempts: number + sleepTime: number + verbose: boolean + }, +) { + const { attempts, sleepTime, verbose } = options + // To perform an atomic operation that creates the new alias and removes + // the old indexes, we can use the updateAliases API with a body that + // includes an "actions" array. The array includes the added alias + // and the removed indexes. If any of the actions fail, none of the operations + // are performed. 
+ // https://www.elastic.co/guide/en/elasticsearch/reference/master/indices-aliases.html + + type Update = + | { + add: { + index: string + alias: string + } + } + | { + remove_index: { + index: string + } + } + const aliasUpdates: Update[] = [ + { + add: { + index: alias, + alias: name, + }, + }, + ] + console.log(`Alias ${name} -> ${alias}`) + + console.log('About to get indices with retry %O', { attempts, sleepTime }) + const indices = await retryOnErrorTest( + (error: any) => { + // 404 can happen when you're trying to get an index that + // doesn't exist. ...yet! + return error instanceof errors.ResponseError && error.meta.statusCode === 404 + }, + () => client.cat.indices({ format: 'json' }), + { + attempts, + sleepTime, + onError: (error, attempts, sleepTime) => { + console.warn( + chalk.yellow( + `Failed to get index ${name} (${ + error.message || error.toString() + }). Will attempt ${attempts} more times (after ${formatTime(sleepTime)}s sleep).`, + ), + ) + }, + }, + ) + for (const index of indices) { + if (index.index !== alias && index.index.startsWith(name)) { + aliasUpdates.push({ remove_index: { index: index.index } }) + console.log('Deleting index', index.index) + } + } + if (verbose) console.log('Updating alias actions:', aliasUpdates) + await client.indices.updateAliases({ body: { actions: aliasUpdates } }) +} diff --git a/src/search/scripts/index/lib/retry-on-error-test.ts b/src/search/scripts/index/lib/retry-on-error-test.ts new file mode 100644 index 000000000000..b2c88420a4ef --- /dev/null +++ b/src/search/scripts/index/lib/retry-on-error-test.ts @@ -0,0 +1,82 @@ +// [start-readme] +// +// Return a function that you can use to run any code within and if it +// throws you get a chance to say whether to sleep + retry. 
// Example:
//
//   async function mainFunction() {
//     if (Math.random() > 0.9) throw new Error('too large')
//     return 'OK'
//   }
//
//   const errorTest = (err) => err instanceof Error && err.message.includes('too large')
//   const config = { // all optional
//     attempts: 3,
//     sleepTime: 800,
//     onError: (err, attempts) => console.warn(`Failed ${attempts} attempts`)
//   }
//   const ok = await retryOnErrorTest(errorTest, mainFunction, config)
//
// Note that, by default, the sleep time is "exponential" by a factor of
// 1.5. So the first sleep will, in the above example,
// be 800ms. Then 1,200ms, Then 1,800ms. etc.
// (Jitter is added on top of those numbers, see `addJitter`.)
//
// [end-readme]

// `sleep` is defined further down, in the utils.ts section.

/**
 * Call `callback` and return what it returns. If it throws an `Error`
 * that `errorTest` accepts, and there are attempts left, sleep and call
 * it again.
 *
 * `callback` is called at most `attempts + 1` times. An error that is
 * not an `Error` instance, that `errorTest` rejects, or that happens
 * when no attempts are left is rethrown as-is.
 *
 * @param errorTest - Returns true if the error is worth retrying.
 * @param callback - The code to run; may be async.
 * @param config - All optional:
 *   - `attempts`: number of retries after the first failure (default 4)
 *   - `sleepTime`: milliseconds to sleep before the first retry (default 1000)
 *   - `exponential`: factor the sleep time is multiplied by after every
 *     retry (default 1.5); a falsy value keeps the sleep time constant
 *   - `jitterPercent`: up to this percentage is randomly added to every
 *     sleep (default 25)
 *   - `onError`: called before every sleep with the error, the attempts
 *     left (before decrementing) and the un-jittered sleep time
 * @returns Whatever `callback` returns.
 */
export async function retryOnErrorTest(
  errorTest: (error: any) => boolean,
  callback: Function,
  {
    attempts = 4,
    sleepTime = 1000,
    exponential = 1.5,
    jitterPercent = 25,
    onError = () => {},
  }: {
    attempts?: number
    sleepTime?: number
    exponential?: number
    jitterPercent?: number
    onError?: (error: Error, attempts: number, sleepTime: number) => void
  } = {},
) {
  while (true) {
    try {
      return await callback()
    } catch (error) {
      if (error instanceof Error && attempts > 0 && errorTest(error)) {
        if (onError) onError(error, attempts, sleepTime)
        attempts--
        // The reason for the jitter is to avoid a thundering herd problem.
        // Suppose two independent processes/threads start at the same time.
        // They both fail, perhaps due to rate limiting. Now, if they both
        // sleep for 30 seconds in the first retry attempt, it'll just
        // clash again 30 seconds later. But if you add a bit of jitter, at
        // the next attempt these independent processes/threads will now
        // start at slightly different times.

        // According to the Oxford English dictionary, they define "jitter" as:
        //
        //     slight irregular movement, variation, or unsteadiness,
        //     especially in an electrical signal or electronic device.
        //
        await sleep(addJitter(sleepTime, jitterPercent))
        if (exponential) {
          // Grow by the configured factor (not a hard-coded 2).
          sleepTime *= exponential
        }
      } else {
        throw error
      }
    }
  }
}

function addJitter(num: number, percent: number) {
  // Return the number plus between 0 and $percent of that number.
  // For example, for 1,000 with a 20% jitter you might get 1,134
  // because you start with 1,000 and 13.4% is a random number between
  // 0 and 20%.
  return num + Math.random() * percent * 0.01 * num
}

// ----- src/search/scripts/index/lib/utils.ts -----

/** Resolve after (at least) `ms` milliseconds. */
export async function sleep(ms: number) {
  return new Promise((resolve) => setTimeout(resolve, ms))
}

/**
 * Format a duration given in milliseconds for humans:
 * `'500.0ms'` below one second, `'1.5s'` below one minute and
 * `'1m30s'` from one minute and up (rounded to whole seconds).
 */
export function formatTime(ms: number) {
  if (ms < 1000) {
    return `${ms.toFixed(1)}ms`
  }
  const seconds = ms / 1000
  if (seconds >= 60) {
    // Round once, then split. Rounding the minutes and the seconds
    // independently turns 90s into "2m30s" and 119.6s into "2m60s".
    const wholeSeconds = Math.round(seconds)
    return `${Math.floor(wholeSeconds / 60)}m${wholeSeconds % 60}s`
  }
  return `${seconds.toFixed(1)}s`
}

// Return '20220719012012' if the current date is
// 2022-07-19T01:20:12.172Z. Note how the 6th month (July) becomes
// '07'. All numbers become 2 character zero-padding strings individually.
export function utcTimestamp() {
  const d = new Date()
  const twoDigits = (x: number) => String(x).padStart(2, '0')

  return (
    `${d.getUTCFullYear()}` +
    [
      // getUTCMonth() is zero-based
      d.getUTCMonth() + 1,
      d.getUTCDate(),
      d.getUTCHours(),
      d.getUTCMinutes(),
      d.getUTCSeconds(),
    ]
      .map(twoDigits)
      .join('')
  )
}

// ----- src/search/scripts/index/types.ts -----

// The full version names; the CLI in index.ts also accepts short spellings.
export type Version = 'free-pro-team' | 'enterprise-server' | 'enterprise-cloud'

// Search term -> count, as read from a rollup.json file.
export type Records = {
  [key: string]: number
}

export type RetryConfig = {
  // How many times to retry after the first failure.
  retries: number
  // In seconds; `populate` multiplies it by 1000.
  sleepTime: number
}