Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 203 additions & 0 deletions .github/workflows/link-checker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# Link Checker workflow.
#
# Runs scripts/check-links.sh (which drives lychee) against the published site,
# nightly and on demand. Step order:
#   1. check out only the script, install/cache lychee
#   2. resolve the check parameters
#   3. run the check, capturing its exit code instead of aborting
#   4. parse counters, upload the raw log, write the job summary
#   5. open an issue for scheduled runs that found broken links
#   6. fail the job when links are broken
name: Link Checker

on:
  schedule:
    # Daily at 02:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 2 * * *'
  workflow_dispatch:
    inputs:
      check_type:
        description: 'Type of link check to perform'
        required: true
        default: 'deep'
        type: choice
        options:
          - 'quick'
          - 'deep'
      base_url:
        description: 'Base URL to check'
        required: false
        default: 'https://developers.glean.com'
        type: string

# Least privilege: the job reads the repository and creates issues. No step
# touches pull requests, so that scope is not requested.
permissions:
  contents: read
  issues: write

jobs:
  link-check:
    name: Check Links
    runs-on: ubuntu-latest
    steps:
      # Only the checker script is needed, so skip the rest of the tree.
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          sparse-checkout: |
            scripts/check-links.sh
          sparse-checkout-cone-mode: false

      - name: Cache lychee
        id: cache-lychee
        uses: actions/cache@v4
        with:
          path: ~/.cargo/bin/lychee
          key: lychee-${{ runner.os }}-v1

      - name: Install lychee
        if: steps.cache-lychee.outputs.cache-hit != 'true'
        run: |
          # -f makes curl fail on an HTTP error instead of piping an error page to tar.
          curl -fsSL https://github.com/lycheeverse/lychee/releases/latest/download/lychee-x86_64-unknown-linux-gnu.tar.gz | tar -xz
          mkdir -p ~/.cargo/bin
          mv lychee ~/.cargo/bin/

      # Runs on cache hits too: a restored binary must be reachable even when
      # the install step above is skipped.
      - name: Add lychee to PATH
        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"

      - name: Verify lychee installation
        run: lychee --version

      # Inputs are passed through env rather than spliced into the script text,
      # so a crafted base_url cannot inject shell commands. Scheduled runs have
      # no inputs and fall back to the defaults.
      - name: Set check parameters
        id: params
        env:
          DISPATCH_CHECK_TYPE: ${{ github.event.inputs.check_type }}
          DISPATCH_BASE_URL: ${{ github.event.inputs.base_url }}
        run: |
          echo "check_type=${DISPATCH_CHECK_TYPE:-deep}" >> "$GITHUB_OUTPUT"
          echo "base_url=${DISPATCH_BASE_URL:-https://developers.glean.com}" >> "$GITHUB_OUTPUT"

      # Outputs: deep_check (true/false), exit_code, status (success/failure).
      # This step itself succeeds even when links are broken; the final step
      # of the job is what turns status=failure into a failed run.
      - name: Run link check
        id: link-check
        env:
          BASE_URL: ${{ steps.params.outputs.base_url }}
          CHECK_TYPE: ${{ steps.params.outputs.check_type }}
        run: |
          echo "🔍 Running $CHECK_TYPE link check on $BASE_URL"

          if [ "$CHECK_TYPE" = "deep" ]; then
            deep_check=true
          else
            deep_check=false
          fi
          echo "deep_check=$deep_check" >> "$GITHUB_OUTPUT"

          # The default shell is `bash -e`, so a bare failing command would end
          # the step before the exit code could be recorded. `|| exit_code=$?`
          # keeps the step alive and captures the script's status.
          exit_code=0
          ./scripts/check-links.sh "$BASE_URL" "$deep_check" > link-check-results.txt 2>&1 || exit_code=$?
          echo "exit_code=$exit_code" >> "$GITHUB_OUTPUT"

          if [ "$exit_code" -eq 0 ]; then
            echo "✅ Link check passed!"
            echo "status=success" >> "$GITHUB_OUTPUT"
          else
            echo "❌ Link check failed with errors"
            echo "status=failure" >> "$GITHUB_OUTPUT"
          fi

      # Outputs: total_links, errors, excluded (each defaults to 0).
      # NOTE(review): the patterns expect "Total: N" / "Errors: N" /
      # "Excluded: N" in the log — confirm this matches what the script prints.
      - name: Parse results
        id: parse-results
        if: always()
        run: |
          # Print the last "<label>: <number>" value found in the log, commas removed.
          extract_count() {
            grep -o "$1: [0-9,]*" link-check-results.txt | tail -1 | grep -o "[0-9,]*" | tr -d ','
          }

          total_links=0
          errors=0
          excluded=0
          if [ -f link-check-results.txt ]; then
            total_links=$(extract_count "Total")
            errors=$(extract_count "Errors")
            excluded=$(extract_count "Excluded")
          fi

          echo "total_links=${total_links:-0}" >> "$GITHUB_OUTPUT"
          echo "errors=${errors:-0}" >> "$GITHUB_OUTPUT"
          echo "excluded=${excluded:-0}" >> "$GITHUB_OUTPUT"

          echo "Results Summary:"
          echo "Total links checked: ${total_links:-0}"
          echo "Errors found: ${errors:-0}"
          echo "Links excluded: ${excluded:-0}"

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: link-check-results-${{ steps.params.outputs.check_type }}
          path: link-check-results.txt
          retention-days: 30

      - name: Create job summary
        if: always()
        env:
          CHECK_TYPE: ${{ steps.params.outputs.check_type }}
          BASE_URL: ${{ steps.params.outputs.base_url }}
          STATUS: ${{ steps.link-check.outputs.status }}
          TOTAL_LINKS: ${{ steps.parse-results.outputs.total_links }}
          ERRORS: ${{ steps.parse-results.outputs.errors }}
          EXCLUDED: ${{ steps.parse-results.outputs.excluded }}
        run: |
          {
            echo "# Link Check Results"
            echo ""
            echo "- **Check Type:** $CHECK_TYPE"
            echo "- **Base URL:** $BASE_URL"
            echo "- **Status:** ${STATUS:-not run}"
            echo ""
            echo "## Summary"
            echo "- **Total Links:** $TOTAL_LINKS"
            echo "- **Errors:** $ERRORS"
            echo "- **Excluded:** $EXCLUDED"
            echo ""

            case "$STATUS" in
              failure)
                echo "## ❌ Broken Links Found"
                echo ""
                echo "The following errors were detected:"
                echo ""
                echo '```'
                # No match (or a missing log) must not fail the summary step.
                grep -E "(ERROR|FAILED)" link-check-results.txt | head -20 || true
                echo '```'
                echo ""
                echo "📄 Full results are available in the uploaded artifact."
                ;;
              success)
                echo "## ✅ All Links Working"
                echo ""
                echo "No broken links were found! 🎉"
                ;;
              *)
                # An empty status means the check step never finished, so do
                # not report the site as healthy.
                echo "## ⚠️ Link Check Did Not Complete"
                echo ""
                echo "An earlier step failed before the link check produced a result."
                ;;
            esac
          } >> "$GITHUB_STEP_SUMMARY"

      # always() is required: without a status function the condition gets an
      # implicit success(), and failure() would never be true here because the
      # check step succeeds even when it reports status=failure.
      - name: Create GitHub issue for broken links
        if: always() && github.event_name == 'schedule' && steps.link-check.outputs.status == 'failure'
        uses: actions/github-script@v7
        env:
          CHECK_TYPE: ${{ steps.params.outputs.check_type }}
          BASE_URL: ${{ steps.params.outputs.base_url }}
          TOTAL_LINKS: ${{ steps.parse-results.outputs.total_links }}
          ERRORS: ${{ steps.parse-results.outputs.errors }}
          EXCLUDED: ${{ steps.parse-results.outputs.excluded }}
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        with:
          script: |
            const fs = require('fs');
            const env = process.env;

            // Tolerate a missing log so the issue is still filed.
            const resultsPath = 'link-check-results.txt';
            const results = fs.existsSync(resultsPath)
              ? fs.readFileSync(resultsPath, 'utf8')
              : '';
            const errorLines = results
              .split('\n')
              .filter(line => line.includes('ERROR') || line.includes('FAILED'));

            // Cap the listing at 30 lines to keep the issue readable.
            const shown = errorLines.slice(0, 30);
            if (errorLines.length > 30) {
              shown.push('', `... and ${errorLines.length - 30} more errors`);
            }

            // Built line by line so that YAML indentation never leaks into the
            // Markdown (indented lines would render as code blocks).
            const issueBody = [
              '# 🔗 Broken Links Detected',
              '',
              `The nightly link check found **${env.ERRORS}** broken links on the site.`,
              '',
              '## Summary',
              `- **Total Links Checked:** ${env.TOTAL_LINKS}`,
              `- **Errors Found:** ${env.ERRORS}`,
              `- **Links Excluded:** ${env.EXCLUDED}`,
              `- **Check Type:** ${env.CHECK_TYPE}`,
              `- **Base URL:** ${env.BASE_URL}`,
              '',
              '## ❌ Broken Links',
              '',
              '```',
              ...shown,
              '```',
              '',
              '## Next Steps',
              '',
              `1. Review the full results in the [workflow run](${env.RUN_URL})`,
              '2. Fix or exclude the broken links',
              '3. Run `yarn links:check` locally to verify fixes',
              '',
              '---',
              '',
              '*This issue was automatically created by the nightly link checker.*',
            ].join('\n');

            await github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: `Broken Links Detected (${env.ERRORS} errors)`,
              body: issueBody,
              labels: ['bug', 'documentation', 'automated']
            });

      # Turns the recorded status into the job result.
      - name: Fail workflow if links are broken
        if: steps.link-check.outputs.status == 'failure'
        env:
          ERRORS: ${{ steps.parse-results.outputs.errors }}
        run: |
          echo "❌ Link check failed with $ERRORS broken links"
          exit 1
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
"generate:redirects": "tsx scripts/generate-redirects.ts",
"typecheck": "tsc",
"format": "prettier --write .",
"format:check": "prettier --check ."
"format:check": "prettier --check .",
"links:check": "scripts/check-links.sh https://developers.glean.com true"
},
"dependencies": {
"@docusaurus/core": "^3.8.1",
Expand Down
70 changes: 67 additions & 3 deletions scripts/check-links.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@
# check-links.sh – crawl every <loc> in a sitemap (or sitemap‑index) with lychee
set -euo pipefail

# Positional arguments:
#   $1  base URL of the site to check (default: https://developers.glean.com)
#   $2  "true" selects the deep check; any other value runs the sitemap-only check
BASE_URL="${1:-https://developers.glean.com}"
# Drop one trailing slash so "${BASE_URL}/sitemap.xml" and the exclude patterns
# anchored on BASE_URL are not built with a double slash.
BASE_URL="${BASE_URL%/}"
DEEP_CHECK="${2:-false}"
# The sitemap location is always derived from the base URL.
SITEMAP_URL="${BASE_URL}/sitemap.xml"

tmp_dir="$(mktemp -d)"
url_list="${tmp_dir}/urls.txt"

echo "Fetching sitemap from: $SITEMAP_URL"
echo "Base URL: $BASE_URL"

# ── Fetch sitemap and extract all <loc> elements, namespace‑agnostic ──────────
if command -v xmllint >/dev/null 2>&1; then
curl -sSL "$SITEMAP_URL" |
Expand All @@ -31,7 +36,66 @@ fi
# Keep only page URLs: drop entries ending in .xml or .xml.gz, then replace the
# URL list with the filtered copy.
# NOTE(review): with `set -e` active, grep exiting 1 (no lines left after the
# filter) stops the script here without a message — confirm that is intended.
grep -vE '\.xml(\.gz)?$' "$url_list" >"${url_list}.pages"
mv "${url_list}.pages" "$url_list"

# ── Build the lychee argument list and run it ─────────────────────────────────
# Arguments are collected in an array so that every regex reaches lychee as one
# literal word; expanding a flat string unquoted would expose the `?` and `*`
# in the patterns to word splitting and filename globbing.
# NOTE(review): arrays require bash; the shebang is outside this view — confirm.

# Remove the temp dir on every exit path. Under `set -e` a failing lychee run
# ends the script at once, so a trailing `rm -rf` would never execute.
trap 'rm -rf "$tmp_dir"' EXIT

# Nothing to check means the sitemap fetch or parse went wrong; say so.
if [ ! -s "$url_list" ]; then
  echo "No page URLs found in sitemap: $SITEMAP_URL" >&2
  exit 1
fi

lychee_args=(--verbose --max-redirects 10 --max-concurrency 8)

exclude_patterns=(
  # Known problematic sites
  '^https://community\.glean\.com/'
  '^https://support\.glean\.com/'
  '^https://app\.glean\.com/'
  '^https://.*-be\.glean\.com/'
  '^https://.*\.gleantest\.com/'
  '^https://.*internal\.company\.com/'
  '^https://codepen\.io/'

  # Localhost and example/placeholder URLs
  '^https?://127\.0\.0\.1'
  '^https?://localhost'
  '^https?://company\.com/'
  '^https?://example\.com/'
  '^https?://example\.net/'
  '^https?://your-domain\.'
  '^https?://your-org\.'
  '^https?://domain-be\.'
  '^https?://instance-be\.'
  '^https://.*\.glean\.engineering\.co\.in/'

  # External endpoints that require authentication
  '^https://accounts\.google\.com/'
  '^https://.*\.googleapis\.com/'
  '^https://docs\.google\.com/document/d/'
  '^https://api\.atlassian\.com/'
  '^https://auth\.atlassian\.com/'
  '^https://.*\.atlassian\.net/'
  '^https://portal\.azure\.com/'
  '^https://docs\.nvidia\.com/'
  '^https://openid\.net/'
  '^https://langchain-ai\.github\.io/'
  '^https://picsum\.photos/'
  '^https://docs\.lumapps\.com/'

  # GitHub URLs (will be handled separately if needed)
  '^https://github\.com/gleanwork/'
)
for pattern in "${exclude_patterns[@]}"; do
  lychee_args+=(--exclude "$pattern")
done

# Dynamic OpenAPI fragment links are generated by JavaScript, so exclude them.
# Dots in the base URL are escaped so they match literally inside the regex.
base_url_escaped=$(printf '%s\n' "$BASE_URL" | sed 's/\./\\./g')
lychee_args+=(
  --exclude "^${base_url_escaped}/api/client-api/.*#"
  --exclude "^${base_url_escaped}/api/indexing-api/.*#"
)

if [ "$DEEP_CHECK" = "true" ]; then
  echo "Running deep link check (crawls content of each page to find all links)..."
  echo "Restricting to base URL: $BASE_URL"
  lychee_args+=(--include-fragments --include-verbatim --base-url "$BASE_URL")

  # Pass URLs directly to lychee (not as a file) so it crawls their content.
  # A read loop is used instead of `mapfile` so older bash versions also work;
  # the `|| [ -n "$url" ]` keeps a final line that lacks a trailing newline.
  page_urls=()
  while IFS= read -r url || [ -n "$url" ]; do
    if [ -n "$url" ]; then
      page_urls+=("$url")
    fi
  done <"$url_list"

  if [ "${#page_urls[@]}" -eq 0 ]; then
    echo "No page URLs found in sitemap: $SITEMAP_URL" >&2
    exit 1
  fi

  echo "Found ${#page_urls[@]} pages to crawl..."
  lychee "${lychee_args[@]}" "${page_urls[@]}"
else
  echo "Running sitemap-only link check..."
  lychee "${lychee_args[@]}" "$url_list"
fi