diff --git a/.github/workflows/link-checker.yml b/.github/workflows/link-checker.yml
new file mode 100644
index 00000000..34712b19
--- /dev/null
+++ b/.github/workflows/link-checker.yml
@@ -0,0 +1,257 @@
+name: Link Checker
+
+# Nightly and on-demand crawl of the published docs site with lychee.
+# Every run uploads the raw log; scheduled failures also open an issue.
+on:
+  schedule:
+    - cron: '0 2 * * *'
+  workflow_dispatch:
+    inputs:
+      check_type:
+        description: 'Type of link check to perform'
+        required: true
+        default: 'deep'
+        type: choice
+        options:
+          - 'quick'
+          - 'deep'
+      base_url:
+        description: 'Base URL to check'
+        required: false
+        default: 'https://developers.glean.com'
+        type: string
+
+# Least privilege: the job only reads the repository and files issues.
+permissions:
+  contents: read
+  issues: write
+
+jobs:
+  link-check:
+    name: Check Links
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: |
+            scripts/check-links.sh
+          sparse-checkout-cone-mode: false
+
+      - name: Cache lychee
+        id: cache-lychee
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/bin/lychee
+          key: lychee-${{ runner.os }}-v1
+
+      - name: Install lychee
+        if: steps.cache-lychee.outputs.cache-hit != 'true'
+        run: |
+          # -f: fail on HTTP errors instead of piping an error page into tar.
+          curl -fsSL https://github.com/lycheeverse/lychee/releases/latest/download/lychee-x86_64-unknown-linux-gnu.tar.gz | tar -xz
+          mkdir -p ~/.cargo/bin
+          mv lychee ~/.cargo/bin/
+
+      # Separate from the install step so the PATH entry is also added when the
+      # binary comes from the cache and installation is skipped.
+      - name: Add lychee to PATH
+        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+
+      - name: Verify lychee installation
+        run: lychee --version
+
+      # Inputs go through env rather than being interpolated into the script
+      # text, so a crafted base_url cannot inject shell commands. Scheduled runs
+      # have no inputs and fall back to the defaults.
+      - name: Set check parameters
+        id: params
+        env:
+          DISPATCH_CHECK_TYPE: ${{ github.event.inputs.check_type }}
+          DISPATCH_BASE_URL: ${{ github.event.inputs.base_url }}
+        run: |
+          {
+            echo "check_type=${DISPATCH_CHECK_TYPE:-deep}"
+            echo "base_url=${DISPATCH_BASE_URL:-https://developers.glean.com}"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Run link check
+        id: link-check
+        env:
+          CHECK_TYPE: ${{ steps.params.outputs.check_type }}
+          BASE_URL: ${{ steps.params.outputs.base_url }}
+        run: |
+          echo "🔍 Running $CHECK_TYPE link check on $BASE_URL"
+
+          if [ "$CHECK_TYPE" = "deep" ]; then
+            deep_check=true
+          else
+            deep_check=false
+          fi
+          echo "deep_check=$deep_check" >> "$GITHUB_OUTPUT"
+
+          # The default shell runs with -e, so a bare failing command would end
+          # the step before any status is recorded. Capture the exit code and
+          # let the last step fail the job after the results are reported.
+          exit_code=0
+          ./scripts/check-links.sh "$BASE_URL" "$deep_check" > link-check-results.txt 2>&1 || exit_code=$?
+          echo "exit_code=$exit_code" >> "$GITHUB_OUTPUT"
+
+          if [ "$exit_code" -eq 0 ]; then
+            echo "✅ Link check passed!"
+            echo "status=success" >> "$GITHUB_OUTPUT"
+          else
+            echo "❌ Link check failed with errors"
+            echo "status=failure" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Parse results
+        id: parse-results
+        run: |
+          # Print the last "<label>: <n>" count in the log with commas removed.
+          # Prints nothing when the label is absent; callers default that to 0.
+          # NOTE(review): confirm lychee's summary uses "Label: N" lines.
+          last_count() {
+            grep -o "$1: [0-9,]*" link-check-results.txt | tail -1 | grep -o "[0-9,]*" | tr -d ',' || true
+          }
+
+          if [ -f link-check-results.txt ]; then
+            total_links=$(last_count Total)
+            errors=$(last_count Errors)
+            excluded=$(last_count Excluded)
+
+            {
+              echo "total_links=${total_links:-0}"
+              echo "errors=${errors:-0}"
+              echo "excluded=${excluded:-0}"
+            } >> "$GITHUB_OUTPUT"
+
+            echo "Results Summary:"
+            echo "Total links checked: ${total_links:-0}"
+            echo "Errors found: ${errors:-0}"
+            echo "Links excluded: ${excluded:-0}"
+          fi
+
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: link-check-results-${{ steps.params.outputs.check_type }}
+          path: link-check-results.txt
+          retention-days: 30
+
+      - name: Create job summary
+        if: always()
+        env:
+          CHECK_TYPE: ${{ steps.params.outputs.check_type }}
+          BASE_URL: ${{ steps.params.outputs.base_url }}
+          STATUS: ${{ steps.link-check.outputs.status }}
+          TOTAL_LINKS: ${{ steps.parse-results.outputs.total_links }}
+          ERRORS: ${{ steps.parse-results.outputs.errors }}
+          EXCLUDED: ${{ steps.parse-results.outputs.excluded }}
+        run: |
+          {
+            echo "# Link Check Results"
+            echo ""
+            echo "**Check Type:** $CHECK_TYPE"
+            echo "**Base URL:** $BASE_URL"
+            echo "**Status:** $STATUS"
+            echo ""
+            echo "## Summary"
+            echo "- **Total Links:** $TOTAL_LINKS"
+            echo "- **Errors:** $ERRORS"
+            echo "- **Excluded:** $EXCLUDED"
+            echo ""
+
+            if [ "$STATUS" = "failure" ]; then
+              echo "## ❌ Broken Links Found"
+              echo ""
+              echo "The following errors were detected:"
+              echo ""
+              echo '```'
+              grep -E "(ERROR|FAILED)" link-check-results.txt | head -20 || true
+              echo '```'
+              echo ""
+              echo "📄 Full results are available in the uploaded artifact."
+            elif [ "$STATUS" = "success" ]; then
+              echo "## ✅ All Links Working"
+              echo ""
+              echo "No broken links were found! 🎉"
+            else
+              # An empty status means an earlier step failed before the check
+              # ran; do not report that as a clean result.
+              echo "## Link check did not run"
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      # always() is needed: a condition without a status function implies
+      # success(), and failure() is not true here because the check step
+      # records its result instead of failing.
+      - name: Create GitHub issue for broken links
+        if: always() && steps.link-check.outputs.status == 'failure' && github.event_name == 'schedule'
+        uses: actions/github-script@v7
+        env:
+          CHECK_TYPE: ${{ steps.params.outputs.check_type }}
+          BASE_URL: ${{ steps.params.outputs.base_url }}
+          TOTAL_LINKS: ${{ steps.parse-results.outputs.total_links }}
+          ERRORS: ${{ steps.parse-results.outputs.errors }}
+          EXCLUDED: ${{ steps.parse-results.outputs.excluded }}
+        with:
+          script: |
+            const fs = require('fs');
+            const { CHECK_TYPE, BASE_URL, TOTAL_LINKS, ERRORS, EXCLUDED } = process.env;
+            const { GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_ID } = process.env;
+            const runUrl = `${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}`;
+
+            const results = fs.readFileSync('link-check-results.txt', 'utf8');
+            const errors = results.split('\n').filter(line => line.includes('ERROR') || line.includes('FAILED'));
+            const shown = errors.slice(0, 30);
+            if (errors.length > 30) {
+              shown.push(`... and ${errors.length - 30} more errors`);
+            }
+
+            // Built line by line so no YAML or JS indentation leaks into the Markdown.
+            const issueBody = [
+              '# 🔗 Broken Links Detected',
+              '',
+              `The nightly link check found **${ERRORS}** broken links on the site.`,
+              '',
+              '## Summary',
+              `- **Total Links Checked:** ${TOTAL_LINKS}`,
+              `- **Errors Found:** ${ERRORS}`,
+              `- **Links Excluded:** ${EXCLUDED}`,
+              `- **Check Type:** ${CHECK_TYPE}`,
+              `- **Base URL:** ${BASE_URL}`,
+              '',
+              '## ❌ Broken Links',
+              '',
+              '```',
+              ...shown,
+              '```',
+              '',
+              '## Next Steps',
+              '',
+              `1. Review the full results in the [workflow run](${runUrl})`,
+              '2. Fix or exclude the broken links',
+              '3. Run `yarn links:check` locally to verify fixes',
+              '',
+              '---',
+              '',
+              '*This issue was automatically created by the nightly link checker.*',
+            ].join('\n');
+
+            await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: `Broken Links Detected (${ERRORS} errors)`,
+              body: issueBody,
+              labels: ['bug', 'documentation', 'automated'],
+            });
+
+      - name: Fail workflow if links are broken
+        if: steps.link-check.outputs.status == 'failure'
+        env:
+          ERRORS: ${{ steps.parse-results.outputs.errors }}
+        run: |
+          echo "❌ Link check failed with $ERRORS broken links"
+          exit 1
diff --git a/package.json b/package.json
index 671e8aca..2f468456 100644
--- a/package.json
+++ b/package.json
@@ -20,7 +20,8 @@
     "generate:redirects": "tsx scripts/generate-redirects.ts",
     "typecheck": "tsc",
     "format": "prettier --write .",
-    "format:check": "prettier --check ."
+    "format:check": "prettier --check .",
+    "links:check": "scripts/check-links.sh https://developers.glean.com true"
   },
   "dependencies": {
     "@docusaurus/core": "^3.8.1",
diff --git a/scripts/check-links.sh b/scripts/check-links.sh
index b2373f27..ca587c8c 100755
--- a/scripts/check-links.sh
+++ b/scripts/check-links.sh
@@ -2,11 +2,22 @@
 # check-links.sh – crawl every in a sitemap (or sitemap‑index) with lychee
 set -euo pipefail
 
-SITEMAP_URL="${1:-https://developers.glean.com/sitemap.xml}"
+# Usage: check-links.sh [BASE_URL] [DEEP_CHECK]
+#   BASE_URL    site root whose /sitemap.xml lists the pages to check
+#   DEEP_CHECK  "true" to also check the links found inside each page
+BASE_URL="${1:-https://developers.glean.com}"
+BASE_URL="${BASE_URL%/}" # a trailing slash would yield "//sitemap.xml"
+DEEP_CHECK="${2:-false}"
+SITEMAP_URL="${BASE_URL}/sitemap.xml"
 
 tmp_dir="$(mktemp -d)"
+# Clean up even when lychee fails and `set -e` aborts the script early.
+trap 'rm -rf "$tmp_dir"' EXIT
 url_list="${tmp_dir}/urls.txt"
 
+echo "Fetching sitemap from: $SITEMAP_URL"
+echo "Base URL: $BASE_URL"
+
 # ── Fetch sitemap and extract all elements, namespace‑agnostic ──────────
 if command -v xmllint >/dev/null 2>&1; then
   curl -sSL "$SITEMAP_URL" |
@@ -31,7 +42,84 @@ fi
 grep -vE '\.xml(\.gz)?$' "$url_list" >"${url_list}.pages"
 mv "${url_list}.pages" "$url_list"
 
-# ── Run lychee (file passed POSITIONALLY) ─────────────────────────────────────
-lychee --verbose "$url_list"
+# ── Build lychee arguments ────────────────────────────────────────────────────
+# An array keeps every argument intact: a string expanded unquoted is subject
+# to word splitting and globbing, and the patterns below contain * and ?.
+# NOTE(review): arrays need bash; assumed from "set -o pipefail" - confirm shebang.
+lychee_args=(--verbose --max-redirects 10 --max-concurrency 8)
+
+# Escape dots so the base URL is matched literally inside the regexes below.
+base_url_escaped=$(printf '%s' "$BASE_URL" | sed 's/\./\\./g')
+
+exclude_patterns=(
+  # Known problematic sites
+  '^https://community\.glean\.com/'
+  '^https://support\.glean\.com/'
+  '^https://app\.glean\.com/'
+  '^https://.*-be\.glean\.com/'
+  '^https://.*\.gleantest\.com/'
+  '^https://.*internal\.company\.com/'
+  '^https://codepen\.io/'
+  # Localhost and placeholder/example URLs
+  '^https?://127\.0\.0\.1'
+  '^https?://localhost'
+  '^https?://company\.com/'
+  '^https?://example\.com/'
+  '^https?://example\.net/'
+  '^https?://your-domain\.'
+  '^https?://your-org\.'
+  '^https?://domain-be\.'
+  '^https?://instance-be\.'
+  '^https://.*\.glean\.engineering\.co\.in/'
+  # External endpoints that require authentication
+  '^https://accounts\.google\.com/'
+  '^https://.*\.googleapis\.com/'
+  '^https://docs\.google\.com/document/d/'
+  '^https://api\.atlassian\.com/'
+  '^https://auth\.atlassian\.com/'
+  '^https://.*\.atlassian\.net/'
+  '^https://portal\.azure\.com/'
+  '^https://docs\.nvidia\.com/'
+  '^https://openid\.net/'
+  '^https://langchain-ai\.github\.io/'
+  '^https://picsum\.photos/'
+  '^https://docs\.lumapps\.com/'
+  # GitHub URLs (will be handled separately if needed)
+  '^https://github\.com/gleanwork/'
+  # OpenAPI fragment links that are generated by JavaScript at runtime
+  "^${base_url_escaped}/api/client-api/.*#"
+  "^${base_url_escaped}/api/indexing-api/.*#"
+)
+for pattern in "${exclude_patterns[@]}"; do
+  lychee_args+=(--exclude "$pattern")
+done
+
+if [ "$DEEP_CHECK" = "true" ]; then
+  echo "Running deep link check (crawls content of each page to find all links)..."
+  lychee_args+=(--include-fragments --include-verbatim)
+
+  echo "Restricting to base URL: $BASE_URL"
+  lychee_args+=(--base-url "$BASE_URL")
+
+  # Pass the URLs themselves (not the list file) so lychee fetches each page
+  # and checks the links inside it. A read loop is used instead of mapfile
+  # so the script also runs on bash 3.2.
+  page_urls=()
+  while IFS= read -r page_url; do
+    if [ -n "$page_url" ]; then
+      page_urls+=("$page_url")
+    fi
+  done <"$url_list"
+
+  if [ "${#page_urls[@]}" -eq 0 ]; then
+    echo "No pages found in sitemap: $SITEMAP_URL" >&2
+    exit 1
+  fi
+  echo "Found ${#page_urls[@]} pages to crawl..."
+  lychee "${lychee_args[@]}" "${page_urls[@]}"
+else
+  echo "Running sitemap-only link check..."
+  lychee "${lychee_args[@]}" "$url_list"
+fi
 
 rm -rf "$tmp_dir"