From bd23217796e6ff442b14ef9d889eaef30e8d6826 Mon Sep 17 00:00:00 2001
From: Kevin Heis
Date: Wed, 16 Nov 2022 10:35:42 -0800
Subject: [PATCH 1/2] Create translation-health-report.yml (#32486)

---
 .../workflows/translation-health-report.yml   | 132 ++++++++++++++
 .../i18n/create-translation-health-report.js  | 163 ++++++++++++++++++
 2 files changed, 295 insertions(+)
 create mode 100644 .github/workflows/translation-health-report.yml
 create mode 100755 script/i18n/create-translation-health-report.js

diff --git a/.github/workflows/translation-health-report.yml b/.github/workflows/translation-health-report.yml
new file mode 100644
index 000000000000..4a1d1127f82d
--- /dev/null
+++ b/.github/workflows/translation-health-report.yml
@@ -0,0 +1,132 @@
+name: Translation health report
+
+# **What it does**: Provides errors and summary statistics on rendering translated content.
+# **Why we have it**: To improve our translations by having clearer visibility.
+# **Who does it impact**: Docs engineering, Microsoft translators.
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '20 16 * * *' # Run every day at 16:20 UTC / 8:20 PST
+
+permissions:
+  contents: read
+
+jobs:
+  create-translation-health-report:
+    name: Create translation health report
+    if: github.repository == 'github/docs-internal'
+    runs-on: ubuntu-latest
+    # This sets a maximum execution time of 300 minutes (5 hours)
+    # to prevent the workflow from running longer than necessary.
+    timeout-minutes: 300
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        include:
+          - language: es
+            language_dir: translations/es-ES
+            language_repo: github/docs-internal.es-es
+
+          - language: ja
+            language_dir: translations/ja-JP
+            language_repo: github/docs-internal.ja-jp
+
+          - language: pt
+            language_dir: translations/pt-BR
+            language_repo: github/docs-internal.pt-br
+
+          - language: cn
+            language_dir: translations/zh-CN
+            language_repo: github/docs-internal.zh-cn
+
+          # We'll be ready to add the following languages in a future effort.
+
+          # - language: ru
+          #   language_dir: translations/ru-RU
+          #   language_repo: github/docs-internal.ru-ru
+
+          # - language: ko
+          #   language_dir: translations/ko-KR
+          #   language_repo: github/docs-internal.ko-kr
+
+          # - language: fr
+          #   language_dir: translations/fr-FR
+          #   language_repo: github/docs-internal.fr-fr
+
+          # - language: de
+          #   language_dir: translations/de-DE
+          #   language_repo: github/docs-internal.de-de
+
+    steps:
+      - name: Checkout the docs-internal repo
+        uses: actions/checkout@dcd71f646680f2efd8db4afa5ad64fdcba30e748
+
+      - name: Remove all language translations
+        run: |
+          git rm -rf --quiet ${{ matrix.language_dir }}/content
+          git rm -rf --quiet ${{ matrix.language_dir }}/data
+
+      - name: Checkout the language-specific repo
+        uses: actions/checkout@dcd71f646680f2efd8db4afa5ad64fdcba30e748
+        with:
+          repository: ${{ matrix.language_repo }}
+          token: ${{ secrets.DOCUBOT_READORG_REPO_WORKFLOW_SCOPES }}
+          path: ${{ matrix.language_dir }}
+
+      - name: Get language SHA
+        run: |
+          gitref=$(cd ${{ matrix.language_dir }} && git rev-parse --short HEAD)
+          echo "gitref=$gitref" >> "$GITHUB_ENV"
+
+      - name: 'Setup node'
+        uses: actions/setup-node@17f8bd926464a1afa4c6a11669539e9c1ba77048
+        with:
+          node-version: '16.17.0'
+
+      - name: npm ci
+        run: npm ci
+
+      - name: Create translation health report
+        run: |
+          translation_health_report=$( \
+            node script/i18n/create-translation-health-report.js \
+              --language ${{ matrix.language }} \
+              --gitref ${{ env.gitref }} \
+              | jq -Rsa .
+          )
+          echo "translation_health_report=$translation_health_report" >> "$GITHUB_ENV"
+
+      - name: Log in to Azure
+        uses: azure/login@1f63701bf3e6892515f1b7ce2d2bf1708b46beaf
+        with:
+          creds: ${{ secrets.PROD_AZURE_CREDENTIALS }}
+
+      - name: Upload to Azure blob storage
+        uses: azure/CLI@61bb69d64d613b52663984bf12d6bac8fd7b3cc8
+        with:
+          inlineScript: |
+            az storage blob upload \
+              --name "${{ matrix.language }}-latest.json" \
+              --data "$translation_health_report" \
+              --container-name translation-health-reports
+            az storage blob upload \
+              --name "${{ matrix.language }}-$(date +%Y-%m-%d).json" \
+              --data "$translation_health_report" \
+              --container-name translation-health-reports
+
+      - name: Log out from Azure
+        if: always()
+        run: |
+          az logout
+
+      # Emit a notification for the first responder to triage if the workflow failed.
+      - name: Send Slack notification if workflow failed
+        uses: someimportantcompany/github-actions-slack-message@f8d28715e7b8a4717047d23f48c39827cacad340
+        if: failure()
+        with:
+          channel: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
+          bot-token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
+          color: failure
+          text: 'The health report for ${{ matrix.language }} failed.'
diff --git a/script/i18n/create-translation-health-report.js b/script/i18n/create-translation-health-report.js
new file mode 100755
index 000000000000..e9ef2c88bf77
--- /dev/null
+++ b/script/i18n/create-translation-health-report.js
@@ -0,0 +1,163 @@
+#!/usr/bin/env node
+
+// [start-readme]
+//
+// Create a list of errors and summary statistics for errors in a particular language.
+//
+// [end-readme]
+
+/* Nota bene:
+  If you are getting more errors all of a sudden, try running this:
+  $ script/i18n/create-translation-health-report.js -l en -r 000
+  If there are any errors, const context = { ... } probably needs more data.
+*/
+
+import { program } from 'commander'
+import fs from 'fs/promises'
+import { pick } from 'lodash-es'
+
+import { loadPages, loadPageMap } from '../../lib/page-data.js'
+import loadSiteData from '../../lib/site-data.js'
+import loadRedirects from '../../lib/redirects/precompile.js'
+import { allVersions, allVersionKeys } from '../../lib/all-versions.js'
+import { languageKeys } from '../../lib/languages.js'
+import { getProductStringFromPath } from '../../lib/path-utils.js'
+
+program
+  .description('Create a translation health report for one language.')
+  .requiredOption('-l, --language <language>', 'The language to health check')
+  .requiredOption('-r, --gitref <sha>', 'Language repo latest git commit short SHA')
+  .parse(process.argv)
+
+// Gather popularity data the search uses to prioritize errors.
+// Reads lib/search/popular-pages.json as one JSON object per line and returns
+// an object mapping each row's path_article to its path_count.
+async function fetchPopularityData() {
+  const output = {}
+  const popularPagesRaw = await fs.readFile('lib/search/popular-pages.json', 'utf8')
+  for (const line of popularPagesRaw.split('\n')) {
+    try {
+      const row = JSON.parse(line)
+      output[row.path_article] = row.path_count
+    } catch {
+      // Best effort: skip lines that are not valid JSON, such as a trailing blank line
+    }
+  }
+  return output
+}
+
+// Render one page in every version it applies to and collect the failures.
+// Resolves to { [version]: [{ name, message, token: { content } }] } holding only the
+// versions that had errors, or to undefined when every version rendered cleanly.
+async function collectPageErrors(page, { language, data, redirects, plainPath, pageMap }) {
+  // Go through each version...
+  const promises = allVersionKeys
+    .filter((version) => page.applicableVersions.includes(version))
+    .map(async (version) => {
+      // Errors handed back by page.render() plus anything it throws
+      const pageVersionErrors = []
+      try {
+        const path = `/${language}/${version}/${plainPath}`
+        // Reference middleware/context.js for data shape
+        const context = {
+          ...data, // needed for all pages
+          currentVersion: version, // needed for all pages
+          currentLanguage: language, // needed for all pages
+          currentPath: path, // needed for all pages
+          currentVersionObj: allVersions[version], // needed for ifversion tag
+          currentProduct: getProductStringFromPath(path), // needed for learning-track on guides pages
+          pages: pageMap, // needed for learning-track on guides pages
+          redirects, // needed for learning-track on guides pages
+        }
+        await page.render(context, pageVersionErrors)
+      } catch (err) {
+        pageVersionErrors.push(err)
+      }
+      if (pageVersionErrors.length) {
+        return [
+          version,
+          // Filter down properties to make it easier for
+          // translators to get the clearest information on the error
+          pageVersionErrors.map((err) => pick(err, ['name', 'message', 'token.content'])),
+        ]
+        // Other fields: Object.getOwnPropertyNames(err)
+      }
+    })
+  const arr = (await Promise.all(promises)).filter(Boolean)
+  return arr.length ? Object.fromEntries(arr) : undefined
+}
+
+// Count how often each error message occurs across all pages and versions.
+// Takes the pageErrors list built in createReport and returns { [message]: count }.
+function groupErrors(errors) {
+  return errors
+    .map((page) => Object.values(page.versions).flat())
+    .flat()
+    .map((version) => version.message)
+    .reduce((sum, val) => {
+      sum[val] = (sum[val] || 0) + 1
+      return sum
+    }, {})
+}
+
+// Build the report for the --language option; throws if that language is not in languageKeys.
+async function createReport() {
+  // Check that the language is valid
+  const { language, gitref } = program.opts()
+  if (!languageKeys.includes(language)) {
+    throw new Error(`Language ${language} is not in ${languageKeys.join()}.`)
+  }
+
+  // Load popularity data to sort errors
+  const popularity = await fetchPopularityData()
+
+  // Load all pages
+  const allPages = await loadPages()
+  const dataErrors = []
+  const data = loadSiteData(dataErrors)[language]
+  const pages = allPages
+    .filter((page) => page.languageCode === language)
+    // Early access pages log to the console, which would show in the report
+    .filter((page) => !page.relativePath.includes('early-access'))
+  const pageMap = await loadPageMap(pages)
+  const redirects = await loadRedirects(pages)
+  const shared = { language, data, redirects, pageMap } // render inputs common to every page
+
+  // Try to render each page
+  const pageErrors = (
+    await Promise.all(
+      pages.map(async (page) => {
+        const plainPath = page.relativePath.replace('/index.md', '').replace('.md', '')
+        const errorsByVersion = await collectPageErrors(page, { ...shared, plainPath })
+        if (errorsByVersion) {
+          return {
+            path: plainPath,
+            popularity: popularity[plainPath] || 0,
+            versions: errorsByVersion,
+          }
+        }
+      })
+    )
+  )
+    .filter(Boolean)
+    // Sort by popularity desc so the translators know what to focus on first
+    .sort((a, b) => b.popularity - a.popularity)
+
+  // Assemble the output report
+  return {
+    language,
+    gitref,
+    datetime: new Date().toJSON(),
+    totalPages: pages.length,
+    // totalErrorPages should be around en: 0, es: 1043, ja: 1004, pt: 995, cn: 1063
+    totalErrorPages: pageErrors.length,
+    pageErrors,
+    // To group errors by message instead
+    groupedPageErrors: groupErrors(pageErrors),
+    // Filter down properties to make it easier for
+    // translators to get the clearest information on the error
+    dataErrors: dataErrors.map((err) => pick(err, ['name', 'message', 'token.content'])),
+  }
+}
+
+console.log(JSON.stringify(await createReport(), null, 2))

From b1b220aff7e50c6c283ede137d099384402a3e70 Mon Sep 17 00:00:00 2001
From: Robert Sese <734194+rsese@users.noreply.github.com>
Date: Wed, 16 Nov 2022 13:09:06 -0600
Subject: [PATCH 2/2] Test workflow for WIP languages and the translation pipeline (#32532)

---
 .../wip-langs-create-translation-batch-pr.yml | 173 ++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644
.github/workflows/wip-langs-create-translation-batch-pr.yml

diff --git a/.github/workflows/wip-langs-create-translation-batch-pr.yml b/.github/workflows/wip-langs-create-translation-batch-pr.yml
new file mode 100644
index 000000000000..c4b9e158625a
--- /dev/null
+++ b/.github/workflows/wip-langs-create-translation-batch-pr.yml
@@ -0,0 +1,173 @@
+name: WIP Languages Create translation Batch Pull Request
+
+# **What it does**:
+# - Creates one pull request per WIP language after running a series of automated checks,
+#   removing translations that are broken in any known way
+# **Why we have it**:
+# - To test the translation pipeline for WIP languages
+# **Who does it impact**: Helps test how WIP languages will behave in CI
+
+on:
+  workflow_dispatch:
+
+permissions:
+  contents: write
+
+jobs:
+  create-translation-batch-for-wip-langs:
+    name: Create translation batch for WIP languages
+    if: github.repository == 'github/docs-internal'
+    runs-on: ubuntu-latest
+    # A sync's average run time is ~3.2 hours.
+    # This sets a maximum execution time of 300 minutes (5 hours) to prevent the workflow from running longer than necessary.
+    timeout-minutes: 300
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        include:
+          - language: ru
+            language_dir: translations/ru-RU
+            language_repo: github/docs-internal.ru-ru
+
+          - language: ko
+            language_dir: translations/ko-KR
+            language_repo: github/docs-internal.ko-kr
+
+          - language: fr
+            language_dir: translations/fr-FR
+            language_repo: github/docs-internal.fr-fr
+
+          - language: de
+            language_dir: translations/de-DE
+            language_repo: github/docs-internal.de-de
+
+    steps:
+      - name: Set branch name
+        id: set-branch
+        run: |
+          echo "BRANCH_NAME=msft-translation-batch-${{ matrix.language }}-$(date +%Y-%m-%d__%H-%M)" >> "$GITHUB_OUTPUT"
+
+      - run: git config --global user.name "docubot"
+      - run: git config --global user.email "67483024+docubot@users.noreply.github.com"
+
+      - name: Checkout the docs-internal repo
+        uses: actions/checkout@dcd71f646680f2efd8db4afa5ad64fdcba30e748
+        with:
+          fetch-depth: 0
+          lfs: true
+
+      - name: Create a branch for the current language
+        run: git checkout -b ${{ steps.set-branch.outputs.BRANCH_NAME }}
+
+      - name: Remove unwanted git hooks
+        run: rm .git/hooks/post-checkout
+
+      - name: Remove all language translations
+        run: |
+          git rm -rf --quiet ${{ matrix.language_dir }}/content
+          git rm -rf --quiet ${{ matrix.language_dir }}/data
+
+      - name: Checkout the language-specific repo
+        uses: actions/checkout@dcd71f646680f2efd8db4afa5ad64fdcba30e748
+        with:
+          repository: ${{ matrix.language_repo }}
+          token: ${{ secrets.DOCUBOT_READORG_REPO_WORKFLOW_SCOPES }}
+          path: ${{ matrix.language_dir }}
+
+      - name: Remove .git from the language-specific repo
+        run: rm -rf ${{ matrix.language_dir }}/.git
+
+      - name: Commit translated files
+        run: |
+          git add ${{ matrix.language_dir }}
+          git commit -m "Add translations" || echo "Nothing to commit"
+
+      - name: 'Setup node'
+        uses: actions/setup-node@17f8bd926464a1afa4c6a11669539e9c1ba77048
+        with:
+          node-version: '16.17.0'
+
+      - run: npm ci
+
+      - name: Homogenize frontmatter
+        run: |
+          node script/i18n/homogenize-frontmatter.js
+          git add ${{ matrix.language_dir }} && git commit -m "Run script/i18n/homogenize-frontmatter.js" || echo "Nothing to commit"
+
+      - name: Fix translation errors
+        run: |
+          node script/i18n/fix-translation-errors.js
+          git add ${{ matrix.language_dir }} && git commit -m "Run script/i18n/fix-translation-errors.js" || echo "Nothing to commit"
+
+      - name: Check rendering
+        run: |
+          node script/i18n/lint-translation-files.js --check rendering | tee -a /tmp/batch.log | cat
+          git add ${{ matrix.language_dir }} && git commit -m "Run script/i18n/lint-translation-files.js --check rendering" || echo "Nothing to commit"
+
+      - name: Reset files with broken liquid tags
+        run: |
+          node script/i18n/msft-reset-files-with-broken-liquid-tags.js --language=${{ matrix.language }} | tee -a /tmp/batch.log | cat
+          git add ${{ matrix.language_dir }} && git commit -m "run script/i18n/msft-reset-files-with-broken-liquid-tags.js --language=${{ matrix.language }}" || echo "Nothing to commit"
+
+      - name: Check in CSV report
+        run: |
+          mkdir -p translations/log
+          csvFile=translations/log/msft-${{ matrix.language }}-resets.csv
+          script/i18n/msft-report-reset-files.js --report-type=csv --language=${{ matrix.language }} --log-file=/tmp/batch.log > $csvFile
+          git add -f $csvFile && git commit -m "Check in ${{ matrix.language }} CSV report" || echo "Nothing to commit"
+
+      - name: Write the reported files that were reset to /tmp/pr-body.txt
+        run: script/i18n/msft-report-reset-files.js --report-type=pull-request-body --language=${{ matrix.language }} --log-file=/tmp/batch.log --csv-path=${{ steps.set-branch.outputs.BRANCH_NAME }}/translations/log/msft-${{ matrix.language }}-resets.csv > /tmp/pr-body.txt
+
+      - name: Push filtered translations
+        run: git push origin ${{ steps.set-branch.outputs.BRANCH_NAME }}
+
+      - name: Close existing stale batches
+        uses: lee-dohm/close-matching-issues@e9e43aad2fa6f06a058cedfd8fb975fd93b56d8f
+        with:
+          token: ${{ secrets.OCTOMERGER_PAT_WITH_REPO_AND_WORKFLOW_SCOPE }}
+          query: 'type:pr label:translation-batch-${{ matrix.language }}'
+
+      - name: Create translation batch pull request
+        env:
+          GITHUB_TOKEN: ${{ secrets.DOCUBOT_REPO_PAT }}
+          TITLE: '[DO NOT MERGE - WIP Language test]: New translation batch for ${{ matrix.language }}'
+          BASE: 'main'
+          HEAD: ${{ steps.set-branch.outputs.BRANCH_NAME }}
+          LANGUAGE: ${{ matrix.language }}
+          BODY_FILE: '/tmp/pr-body.txt'
+        run: .github/actions-scripts/msft-create-translation-batch-pr.js
+
+      # - name: Approve PR
+      #   if: github.ref_name == 'main'
+      #   env:
+      #     GITHUB_TOKEN: ${{ secrets.OCTOMERGER_PAT_WITH_REPO_AND_WORKFLOW_SCOPE }}
+      #   run: gh pr review --approve || echo "Nothing to approve"
+
+      # - name: Set auto-merge
+      #   if: github.ref_name == 'main'
+      #   env:
+      #     GITHUB_TOKEN: ${{ secrets.OCTOMERGER_PAT_WITH_REPO_AND_WORKFLOW_SCOPE }}
+      #   run: gh pr merge ${{ steps.set-branch.outputs.BRANCH_NAME }} --auto --squash || echo "Nothing to merge"
+
+      # When the maximum execution time is reached for this job, Actions cancels the workflow run.
+      # This emits a notification for the first responder to triage.
+      # - name: Send Slack notification if workflow is cancelled
+      #   uses: someimportantcompany/github-actions-slack-message@f8d28715e7b8a4717047d23f48c39827cacad340
+      #   if: cancelled()
+      #   with:
+      #     channel: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
+      #     bot-token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
+      #     color: failure
+      #     text: 'The new translation batch for ${{ matrix.language }} was cancelled.'
+
+      # Emit a notification for the first responder to triage if the workflow failed.
+      # - name: Send Slack notification if workflow failed
+      #   uses: someimportantcompany/github-actions-slack-message@f8d28715e7b8a4717047d23f48c39827cacad340
+      #   if: failure()
+      #   with:
+      #     channel: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
+      #     bot-token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
+      #     color: failure
+      #     text: 'The new translation batch for ${{ matrix.language }} failed.'