diff --git a/.github/workflows/search.yml b/.github/workflows/search.yml index ade8c38a93bd3..cdafa84f2ba93 100644 --- a/.github/workflows/search.yml +++ b/.github/workflows/search.yml @@ -68,8 +68,12 @@ jobs: run: | pnpm run codegen:examples pnpm run embeddings + pnpm run embeddings:nimbus - name: Refresh embeddings working-directory: ./apps/docs if: ${{ inputs.refresh }} - run: pnpm run embeddings:refresh + run: | + pnpm run codegen:examples + pnpm run embeddings:refresh + pnpm run embeddings:nimbus:refresh diff --git a/apps/docs/app/api/ai/docs/route.ts b/apps/docs/app/api/ai/docs/route.ts index b88d7837f82a7..8e3df4206a643 100644 --- a/apps/docs/app/api/ai/docs/route.ts +++ b/apps/docs/app/api/ai/docs/route.ts @@ -3,6 +3,8 @@ import { ApplicationError, UserError, clippy } from 'ai-commands/edge' import { NextRequest, NextResponse } from 'next/server' import OpenAI from 'openai' +import { isFeatureEnabled } from 'common/enabled-features' + export const runtime = 'edge' /* To avoid OpenAI errors, restrict to the Vercel Edge Function regions that overlap with the OpenAI API regions. @@ -54,7 +56,10 @@ export async function POST(req: NextRequest) { throw new UserError('Missing messages in request data') } - const response = await clippy(openai, supabaseClient, messages) + const useAltSearchIndex = !isFeatureEnabled('search:fullIndex') + const response = await clippy(openai, supabaseClient, messages, { + useAltSearchIndex, + }) // Proxy the streamed SSE response from OpenAI return new NextResponse(response.body, { diff --git a/apps/docs/next.config.mjs b/apps/docs/next.config.mjs index ad41748131f93..d1419a3e10455 100644 --- a/apps/docs/next.config.mjs +++ b/apps/docs/next.config.mjs @@ -156,9 +156,10 @@ const nextConfig = { ] }, typescript: { - // WARNING: production builds can successfully complete even there are type errors - // Typechecking is checked separately via .github/workflows/typecheck.yml - ignoreBuildErrors: true, + // On previews, typechecking is run via GitHub Action only for efficiency + // On production, we turn it on to prevent errors from conflicting PRs getting into + // prod + ignoreBuildErrors: process.env.NEXT_PUBLIC_VERCEL_ENV === 'production' ? 
false : true, }, eslint: { // We are already running linting via GH action, this will skip linting during production build on Vercel diff --git a/apps/docs/package.json b/apps/docs/package.json index fbf1faf4bab6f..5f9eb55ac14f2 100644 --- a/apps/docs/package.json +++ b/apps/docs/package.json @@ -17,7 +17,9 @@ "dev:secrets:pull": "AWS_PROFILE=supa-dev node ../../scripts/getSecrets.js -n local/docs", "dev:watch:troubleshooting": "node ./scripts/troubleshooting/watch.mjs", "embeddings": "tsx --conditions=react-server scripts/search/generate-embeddings.ts", + "embeddings:nimbus": "ENABLED_FEATURES_OVERRIDE_DISABLE_ALL=true pnpm run embeddings", "embeddings:refresh": "pnpm run embeddings --refresh", + "embeddings:nimbus:refresh": "ENABLED_FEATURES_OVERRIDE_DISABLE_ALL=true pnpm run embeddings:refresh", "last-changed": "tsx scripts/last-changed.ts", "last-changed:reset": "pnpm run last-changed -- --reset", "lint": "next lint", diff --git a/apps/docs/public/humans.txt b/apps/docs/public/humans.txt index 2aa31e79e6f06..eb85aba6dbd5e 100644 --- a/apps/docs/public/humans.txt +++ b/apps/docs/public/humans.txt @@ -139,6 +139,7 @@ Tyler Fontaine Tyler Shukert TzeYiing L Wen Bo Xie +Yuliya Marinova Yuri Santana ____________ diff --git a/apps/docs/resources/globalSearch/globalSearchModel.ts b/apps/docs/resources/globalSearch/globalSearchModel.ts index c6243d530c9a3..8b46f5d870721 100644 --- a/apps/docs/resources/globalSearch/globalSearchModel.ts +++ b/apps/docs/resources/globalSearch/globalSearchModel.ts @@ -3,6 +3,8 @@ import { convertPostgrestToApiError, type ApiErrorGeneric } from '~/app/api/util import { Result } from '~/features/helpers.fn' import { openAI } from '~/lib/openAi' import { supabase, type DatabaseCorrected } from '~/lib/supabase' + +import { isFeatureEnabled } from 'common/enabled-features' import { GuideModel } from '../guide/guideModel' import { DB_METADATA_TAG_PLATFORM_CLI, @@ -13,6 +15,9 @@ import { ReferenceSDKFunctionModel, SDKLanguageValues } from '../reference/refer import { TroubleshootingModel } from '../troubleshooting/troubleshootingModel' import { SearchResultInterface } from './globalSearchInterface' +type SearchFunction = 'search_content' | 'search_content_nimbus' +type SearchHybridFunction = 'search_content_hybrid' | 'search_content_hybrid_nimbus' + export abstract class SearchResultModel { static async search( args: RootQueryTypeSearchDocsArgs, @@ -22,9 +27,14 @@ export abstract class SearchResultModel { const includeFullContent = requestedFields.includes('content') const embeddingResult = await openAI().createContentEmbedding(query) + const useAltSearchIndex = !isFeatureEnabled('search:fullIndex') + const searchFunction: SearchFunction = useAltSearchIndex + ? 'search_content_nimbus' + : 'search_content' + return embeddingResult.flatMapAsync(async ({ embedding }) => { const matchResult = new Result( - await supabase().rpc('search_content', { + await supabase().rpc(searchFunction, { embedding, include_full_content: includeFullContent, max_result: args.limit ?? undefined, @@ -49,9 +59,14 @@ export abstract class SearchResultModel { const includeFullContent = requestedFields.includes('content') const embeddingResult = await openAI().createContentEmbedding(query) + const useAltSearchIndex = !isFeatureEnabled('search:fullIndex') + const searchFunction: SearchHybridFunction = useAltSearchIndex + ? 
'search_content_hybrid_nimbus' + : 'search_content_hybrid' + + return embeddingResult.flatMapAsync(async ({ embedding }) => { const matchResult = new Result( - await supabase().rpc('search_content_hybrid', { + await supabase().rpc(searchFunction, { query_text: query, query_embedding: embedding, include_full_content: includeFullContent, diff --git a/apps/docs/scripts/search/embeddings/utils.ts b/apps/docs/scripts/search/embeddings/utils.ts new file mode 100644 index 0000000000000..63f8578d2be08 --- /dev/null +++ b/apps/docs/scripts/search/embeddings/utils.ts @@ -0,0 +1,106 @@ +export interface PageInfo { + pageId: number + path: string + checksum: string + sectionsCount: number +} + +export interface PageSectionForEmbedding { + pageId: number + path: string + slug?: string + heading?: string + content: string + input: string + ragIgnore: boolean +} + +export interface PageSectionWithEmbedding extends PageSectionForEmbedding { + embedding: number[] +} + +export interface ProcessingResult { + successfulPages: Set<number> + failedPages: Set<number> + totalSectionsProcessed: number + totalSectionsInserted: number +} + +export function createBatches<T>(array: T[], batchSize: number): T[][] { + const batches: T[][] = [] + for (let i = 0; i < array.length; i += batchSize) { + batches.push(array.slice(i, i + batchSize)) + } + return batches } + +export function mapEmbeddingsToSections( + batch: PageSectionForEmbedding[], + data: Array<{ embedding?: number[] }>, + batchNumber: number +): { + sectionsWithEmbeddings: PageSectionWithEmbedding[] + failedSectionIndexes: Set<number> +} { + const sectionsWithEmbeddings: PageSectionWithEmbedding[] = [] + const failedSectionIndexes: Set<number> = new Set() + + if (batch.length !== data.length) { + console.error( + `Ignoring all embeddings returned from batch ${batchNumber} because the number of embeddings returned doesn't match the number of inputs` + ) + batch.forEach((_, index) => { + failedSectionIndexes.add(index) + }) + return { sectionsWithEmbeddings, failedSectionIndexes } + } + + for (let i = 0; i < batch.length; i++) { + if (data[i].embedding) { + sectionsWithEmbeddings.push({ ...batch[i], embedding: data[i].embedding!
}) + } else { + failedSectionIndexes.add(i) + } + } + + return { sectionsWithEmbeddings, failedSectionIndexes } +} + +export function updatePageInsertionCounts( + pageSectionsInserted: Map<number, number>, + sectionsWithEmbeddings: PageSectionWithEmbedding[] +) { + sectionsWithEmbeddings.forEach((section) => { + const current = pageSectionsInserted.get(section.pageId) || 0 + pageSectionsInserted.set(section.pageId, current + 1) + }) +} + +export function computePageResults( + pageInfoMap: Map<number, PageInfo>, + pageSectionsInserted: Map<number, number>, + result: ProcessingResult +) { + for (const [pageId, pageInfo] of pageInfoMap) { + const insertedCount = pageSectionsInserted.get(pageId) || 0 + if (insertedCount === pageInfo.sectionsCount && !result.failedPages.has(pageId)) { + result.successfulPages.add(pageId) + } else { + result.failedPages.add(pageId) + console.warn( + `Page ${pageInfo.path}: inserted ${insertedCount}/${pageInfo.sectionsCount} sections` + ) + } + } +} + +export function logFailedSections( + batch: PageSectionForEmbedding[], + inputs: string[], + failedSectionIndexes: Set<number> +) { + failedSectionIndexes.forEach((i) => { + console.error( + `Failed to process section: ${batch[i].path}#${batch[i].slug} (content: "${inputs[i]?.slice(0, 50)}...")` + ) + }) +} diff --git a/apps/docs/scripts/search/generate-embeddings.ts b/apps/docs/scripts/search/generate-embeddings.ts index d8a7bfdf90920..a5abac64019b8 100644 --- a/apps/docs/scripts/search/generate-embeddings.ts +++ b/apps/docs/scripts/search/generate-embeddings.ts @@ -1,24 +1,453 @@ import '../utils/dotenv.js' -import { createClient } from '@supabase/supabase-js' +import { createClient, type SupabaseClient } from '@supabase/supabase-js' import { parseArgs } from 'node:util' import { OpenAI } from 'openai' import { v4 as uuidv4 } from 'uuid' + import type { Section } from '../helpers.mdx.js' +import { + type PageInfo, + type PageSectionForEmbedding, + type PageSectionWithEmbedding, + type ProcessingResult, + createBatches, + mapEmbeddingsToSections, + updatePageInsertionCounts, + computePageResults, + logFailedSections, +} from './embeddings/utils.js' import { fetchAllSources } from './sources/index.js' +const CONFIG = { + // OpenAI settings + EMBEDDING_MODEL: 'text-embedding-ada-002' as const, + EMBEDDING_DIMENSION: 1536, // Keep in sync with EMBEDDING_MODEL + OPENAI_BATCH_SIZE: 128, + OPENAI_MAX_RETRIES: 3, + OPENAI_BASE_DELAY_MS: 500, + /** + * If context length is exceeded, truncate inputs over this character length + * and retry. This is a character-based heuristic, not token-exact.
+ */ + EMBEDDING_TRUNCATE_CHAR_LIMIT: 16_000, + + // Supabase settings + SUPABASE_MAX_RETRIES: 2, + SUPABASE_BASE_DELAY_MS: 100, + + // Processing settings + SOURCE_CONCURRENCY: 10, +} as const + +function delay(ms: number): Promise<void> { + return new Promise((resolve) => setTimeout(resolve, ms)) +} + +function exponentialBackoff(attempt: number, baseDelay: number, maxDelay: number = 30_000): number { + const exponentialDelay = baseDelay * Math.pow(2, attempt) + const jitter = (Math.random() - 0.5) * 0.1 * exponentialDelay + return Math.min(Math.max(0, exponentialDelay + jitter), maxDelay) +} + +async function withRetry<T>( + operation: () => Promise<T>, + maxRetries: number, + baseDelay: number, + operationName: string, + shouldRetryOnError: (error: unknown) => boolean = () => true +): Promise<T> { + let lastError: Error + + for (let attempt = 0; attempt <= maxRetries; attempt++) { + try { + return await operation() + } catch (error) { + lastError = error as Error + + // Allow caller to prevent redundant retries for specific errors + if (!shouldRetryOnError?.(error)) { + console.warn(`${operationName} encountered non-retryable error:`, lastError.message) + throw lastError + } + + if (attempt === maxRetries) { + console.error(`${operationName} failed after ${maxRetries + 1} attempts:`, lastError) + throw lastError + } + + const delayMs = exponentialBackoff(attempt, baseDelay) + console.warn( + `${operationName} attempt ${attempt + 1} failed, retrying in ${delayMs}ms:`, + lastError.message + ) + await delay(delayMs) + } + } + + throw lastError! +} + +function isNimbusMode(): boolean { + return process.env.ENABLED_FEATURES_OVERRIDE_DISABLE_ALL === 'true' +} + +function getPageTables() { + const nimbus = isNimbusMode() + return { + pageTable: nimbus ? 'page_nimbus' : 'page', + pageSectionTable: nimbus ?
'page_section_nimbus' : 'page_section', + } as const +} + +function requireEnvOrThrow(names: string[]): void { + const missing = names.filter((n) => !process.env[n]) + if (missing.length) { + throw new Error( + `Environment variables ${missing.join(', ')} are required: skipping embeddings generation` + ) + } +} + +function initSupabase(): SupabaseClient { + return createClient(process.env.NEXT_PUBLIC_SUPABASE_URL!, process.env.SUPABASE_SECRET_KEY!, { + auth: { persistSession: false, autoRefreshToken: false }, + }) +} + +type PreparedSections = { + allSectionsToProcess: PageSectionForEmbedding[] + pageInfoMap: Map<number, PageInfo> +} + +async function prepareSections( + supabaseClient: SupabaseClient, + pageTable: string, + pageSectionTable: string, + shouldRefresh: boolean, + refreshVersion: string, + refreshDate: Date, + fullIndex = true, + debug = false +): Promise<PreparedSections> { + const embeddingSources = await fetchAllSources(fullIndex) + console.log(`Discovered ${embeddingSources.length} sources`) + + const allSectionsToProcess: PageSectionForEmbedding[] = [] + const pageInfoMap = new Map() + + for (const sourceBatch of createBatches(embeddingSources, CONFIG.SOURCE_CONCURRENCY)) { + await Promise.all( + sourceBatch.map(async (embeddingSource) => { + const { type, source, path } = embeddingSource + + try { + const { + checksum, + sections, + meta = {}, + ragIgnore = false, + }: { + checksum: string + sections: Section[] + ragIgnore?: boolean + meta?: Record<string, unknown> + } = await embeddingSource.process() + + const { error: fetchPageError, data: existingPage } = await supabaseClient + .from(pageTable) + .select('id, path, checksum') + .filter('path', 'eq', path) + .limit(1) + .maybeSingle() + + if (fetchPageError) throw fetchPageError + + if (!shouldRefresh && existingPage?.checksum === checksum) { + const { error: updatePageError } = await supabaseClient + .from(pageTable) + .update({ + type, + source, + meta, + version: refreshVersion, + last_refresh: refreshDate, + }) + .filter('id', 'eq', existingPage.id) + if (updatePageError) throw updatePageError + return + } + + if (existingPage) { + if (debug) { + console.log( + !shouldRefresh + ?
`[${path}] Docs have changed, removing old page sections and their embeddings` + : `[${path}] Refresh flag set, removing old page sections and their embeddings` + ) + } + + const { error: deletePageSectionError } = await supabaseClient + .from(pageSectionTable) + .delete() + .filter('page_id', 'eq', existingPage.id) + if (deletePageSectionError) throw deletePageSectionError + } + + const { error: upsertPageError, data: page } = await supabaseClient + .from(pageTable) + .upsert( + { + checksum: null, + path, + type, + source, + meta, + content: embeddingSource.extractIndexedContent(), + version: refreshVersion, + last_refresh: refreshDate, + }, + { onConflict: 'path' } + ) + .select() + .limit(1) + .single() + if (upsertPageError) throw upsertPageError + + if (debug) { + console.log(`[${path}] Preparing ${sections.length} page sections for processing`) + } + + pageInfoMap.set(page.id, { + pageId: page.id, + path, + checksum, + sectionsCount: sections.length, + }) + + const sectionsForBatching = sections.map(({ slug, heading, content }) => ({ + pageId: page.id, + path, + slug, + heading, + content, + input: content.replace(/\n/g, ' '), + ragIgnore, + })) + allSectionsToProcess.push(...sectionsForBatching) + } catch (err) { + console.error(`Error preparing path '${path}' for processing.`) + console.error(err) + } + }) + ) + } + + console.log( + `Prepared ${allSectionsToProcess.length} sections for processing from ${pageInfoMap.size} pages` + ) + return { allSectionsToProcess, pageInfoMap } +} + +async function processAndInsertEmbeddings( + openai: OpenAI, + supabaseClient: SupabaseClient, + pageSectionTable: string, + allSections: PageSectionForEmbedding[], + pageInfoMap: Map<number, PageInfo> +): Promise<ProcessingResult> { + if (allSections.length === 0) { + return { + successfulPages: new Set(), + failedPages: new Set(), + totalSectionsProcessed: 0, + totalSectionsInserted: 0, + } + } + + console.log(`Processing ${allSections.length} sections with embeddings + insertion`) + + const embeddingBatches = createBatches(allSections, CONFIG.OPENAI_BATCH_SIZE) + const result: ProcessingResult = { + successfulPages: new Set(), + failedPages: new Set(), + totalSectionsProcessed: 0, + totalSectionsInserted: 0, + } + + // Track sections inserted per page + const pageSectionsInserted = new Map() + + for (let batchIndex = 0; batchIndex < embeddingBatches.length; batchIndex++) { + const batch = embeddingBatches[batchIndex] + try { + const batchResult = await processEmbeddingBatch( + openai, + batch, + batchIndex, + embeddingBatches.length + ) + + result.totalSectionsProcessed += batchResult.processedCount + + if (batchResult.sectionsWithEmbeddings.length > 0) { + const insertedCount = await insertSectionBatch( + supabaseClient, + pageSectionTable, + batchResult.sectionsWithEmbeddings + ) + result.totalSectionsInserted += insertedCount + updatePageInsertionCounts(pageSectionsInserted, batchResult.sectionsWithEmbeddings) + } + + // Mark failed section pages + batchResult.failedSectionIndexes.forEach((i) => { + result.failedPages.add(batch[i].pageId) + }) + } catch (error) { + console.error(`Batch ${batchIndex + 1} completely failed:`, error) + batch.forEach((section) => result.failedPages.add(section.pageId)) + } + + if (batchIndex < embeddingBatches.length - 1) { + await delay(CONFIG.OPENAI_BASE_DELAY_MS) + } + } + + computePageResults(pageInfoMap, pageSectionsInserted, result) + + return result +} + +type BatchEmbeddingResult = { + sectionsWithEmbeddings: PageSectionWithEmbedding[] + failedSectionIndexes: Set<number> + processedCount: number +} +
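+/** + * Embeds one batch of section inputs via OpenAI: transient failures are retried with + * exponential backoff, and a context-length error triggers a single retry with inputs + * truncated to CONFIG.EMBEDDING_TRUNCATE_CHAR_LIMIT characters. + */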
+async function processEmbeddingBatch( + openai: OpenAI, + batch: PageSectionForEmbedding[], + batchIndex: number, + totalBatches: number +): Promise<BatchEmbeddingResult> { + const inputs = batch.map((section) => section.input) + console.log( + `Processing embedding batch ${batchIndex + 1}/${totalBatches} (${inputs.length} sections)` + ) + + // Helper to identify context length exceeded errors from OpenAI + const isContextLengthError = (err: unknown) => { + if (!(err instanceof OpenAI.APIError)) return false + + const message = err.error?.message as string + const status = err.status + return status === 400 && message.toLowerCase().includes('context') + } + + let embeddingResponse: OpenAI.Embeddings.CreateEmbeddingResponse + try { + embeddingResponse = await withRetry( + () => + openai.embeddings.create({ + model: CONFIG.EMBEDDING_MODEL, + input: inputs, + }), + CONFIG.OPENAI_MAX_RETRIES, + CONFIG.OPENAI_BASE_DELAY_MS, + `OpenAI embedding batch ${batchIndex + 1}`, + (err) => !isContextLengthError(err) + ) + } catch (err) { + if (!isContextLengthError(err)) { + throw err + } + + // Context length exceeded: truncate problematic sections and try once more + const limit = CONFIG.EMBEDDING_TRUNCATE_CHAR_LIMIT + const truncatedInputs = inputs.map((s) => (s.length > limit ? s.slice(0, limit) : s)) + const truncatedCount = truncatedInputs.filter((s, i) => s !== inputs[i]).length + console.warn( + `OpenAI embedding batch ${batchIndex + 1}: context length exceeded. ` + + `Truncating ${truncatedCount} overly long section(s) to ${limit} chars and retrying once.` + ) + + embeddingResponse = await openai.embeddings.create({ + model: CONFIG.EMBEDDING_MODEL, + input: truncatedInputs, + }) + + // Replace inputs with truncated inputs for downstream bookkeeping + for (let i = 0; i < inputs.length; i++) inputs[i] = truncatedInputs[i] + } + + const { sectionsWithEmbeddings, failedSectionIndexes } = mapEmbeddingsToSections( + batch, + embeddingResponse.data, + batchIndex + ) + logFailedSections(batch, inputs, failedSectionIndexes) + + return { + sectionsWithEmbeddings, + failedSectionIndexes, + processedCount: inputs.length, + } +} + +async function insertSectionBatch( + supabaseClient: SupabaseClient, + pageSectionTable: string, + sectionsWithEmbeddings: PageSectionWithEmbedding[] +): Promise<number> { + if (sectionsWithEmbeddings.length === 0) { + return 0 + } + + const pageSectionsToInsert = sectionsWithEmbeddings.map((section) => ({ + page_id: section.pageId, + slug: section.slug, + heading: section.heading, + content: section.content, + embedding: section.embedding, + rag_ignore: section.ragIgnore, + })) + + await withRetry( + async () => { + const { error } = await supabaseClient.from(pageSectionTable).insert(pageSectionsToInsert) + + if (error) { + throw new Error(`Supabase insert error: ${error.message}`) + } + }, + CONFIG.SUPABASE_MAX_RETRIES, + CONFIG.SUPABASE_BASE_DELAY_MS, + `Insert batch of ${sectionsWithEmbeddings.length} sections` + ) + + return sectionsWithEmbeddings.length +} + const args = parseArgs({ options: { refresh: { type: 'boolean', }, + debug: { + type: 'boolean', + }, }, }) async function generateEmbeddings() { const shouldRefresh = Boolean(args.values.refresh) + const debug = Boolean(args.values.debug) + + const nimbus = isNimbusMode() + if (nimbus) { + console.log('Running in Nimbus mode - will filter content based on disabled feature flags') + } - const requiredEnvVars = [ + requireEnvOrThrow([ 'DOCS_GITHUB_APP_ID', 'DOCS_GITHUB_APP_INSTALLATION_ID', 'DOCS_GITHUB_APP_PRIVATE_KEY', @@ -27,215 +456,134 @@ async
function generateEmbeddings() { 'NEXT_PUBLIC_SUPABASE_URL', 'OPENAI_API_KEY', 'SUPABASE_SECRET_KEY', - ] - - const missingEnvVars = requiredEnvVars.filter((name) => !process.env[name]) - if (missingEnvVars.length > 0) { - throw new Error( - `Environment variables ${missingEnvVars.join( - ', ' - )} are required: skipping embeddings generation` - ) - } + ]) - const supabaseClient = createClient( - process.env.NEXT_PUBLIC_SUPABASE_URL!, - process.env.SUPABASE_SECRET_KEY!, - { - auth: { - persistSession: false, - autoRefreshToken: false, - }, - } - ) + const supabaseClient = initSupabase() - // Use this version to track which pages to purge - // after the refresh const refreshVersion = uuidv4() - const refreshDate = new Date() - const embeddingSources = await fetchAllSources() - - console.log(`Discovered ${embeddingSources.length} pages`) - - if (!shouldRefresh) { - console.log('Checking which pages are new or have changed') - } else { - console.log('Refresh flag set, re-generating all pages') - } + const { pageTable, pageSectionTable } = getPageTables() + const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }) - for (const embeddingSource of embeddingSources) { - const { type, source, path } = embeddingSource + console.log( + shouldRefresh + ? 'Refresh flag set, re-generating all pages' + : 'Checking which pages are new or have changed' + ) - try { - const { - checksum, - sections, - meta = {}, - ragIgnore = false, - }: { - checksum: string - sections: Section[] - ragIgnore?: boolean - meta?: Record - } = await embeddingSource.process() - - // Check for existing page in DB and compare checksums - const { error: fetchPageError, data: existingPage } = await supabaseClient - .from('page') - .select('id, path, checksum') - .filter('path', 'eq', path) - .limit(1) - .maybeSingle() - - if (fetchPageError) { - throw fetchPageError - } + const { allSectionsToProcess, pageInfoMap } = await prepareSections( + supabaseClient, + pageTable, + pageSectionTable, + shouldRefresh, + refreshVersion, + refreshDate, + !nimbus, + debug + ) - // We use checksum to determine if this page & its sections need to be regenerated - if (!shouldRefresh && existingPage?.checksum === checksum) { - // No content/embedding update required on this page - // Update other meta info - const { error: updatePageError } = await supabaseClient - .from('page') - .update({ - type, - source, - meta, - version: refreshVersion, - last_refresh: refreshDate, - }) - .filter('id', 'eq', existingPage.id) + let processingResult: ProcessingResult + try { + processingResult = await processAndInsertEmbeddings( + openai, + supabaseClient, + pageSectionTable, + allSectionsToProcess, + pageInfoMap + ) + console.log( + `Processing complete: ${processingResult.totalSectionsInserted}/${processingResult.totalSectionsProcessed} sections inserted successfully` + ) + console.log( + `Page summary: ${processingResult.successfulPages.size} successful, ${processingResult.failedPages.size} failed` + ) + } catch (error) { + console.error('Critical error during embedding processing:', error) + console.log('Exiting due to complete processing failure') + return + } - if (updatePageError) { - throw updatePageError - } + console.log(`\nUpdating checksums for ${processingResult.successfulPages.size} successful pages`) + const successfulChecksumUpdates = await updateSuccessfulChecksums( + supabaseClient, + pageTable, + pageInfoMap, + processingResult + ) + console.log( + `Successfully updated checksums for 
${successfulChecksumUpdates}/${processingResult.successfulPages.size} successful pages` + ) - continue - } + logFailedPages(pageInfoMap, processingResult) - if (existingPage) { - if (!shouldRefresh) { - console.log( - `[${path}] Docs have changed, removing old page sections and their embeddings` - ) - } else { - console.log(`[${path}] Refresh flag set, removing old page sections and their embeddings`) - } + await purgeOldPages(supabaseClient, pageTable, refreshVersion) - const { error: deletePageSectionError } = await supabaseClient - .from('page_section') - .delete() - .filter('page_id', 'eq', existingPage.id) + console.log('Embedding generation complete') +} - if (deletePageSectionError) { - throw deletePageSectionError +async function updateSuccessfulChecksums( + supabaseClient: SupabaseClient, + pageTable: string, + pageInfoMap: Map, + processingResult: ProcessingResult +): Promise { + let successfulChecksumUpdates = 0 + const pageIds = Array.from(processingResult.successfulPages) + const batches = createBatches(pageIds, CONFIG.SOURCE_CONCURRENCY) + + for (const batch of batches) { + const results = await Promise.all( + batch.map(async (pageId) => { + const pageInfo = pageInfoMap.get(pageId) + if (!pageInfo) { + console.error(`Missing page info for pageId ${pageId}`) + return 0 } - } - - // Create/update page record. Intentionally clear checksum until we - // have successfully generated all page sections. - const { error: upsertPageError, data: page } = await supabaseClient - .from('page') - .upsert( - { - checksum: null, - path, - type, - source, - meta, - content: embeddingSource.extractIndexedContent(), - version: refreshVersion, - last_refresh: refreshDate, - }, - { onConflict: 'path' } - ) - .select() - .limit(1) - .single() - - if (upsertPageError) { - throw upsertPageError - } - - console.log(`[${path}] Adding ${sections.length} page sections (with embeddings)`) - for (const { slug, heading, content } of sections) { - // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings) - // force a redeploy - const input = content.replace(/\n/g, ' ') try { - const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }) - - const embeddingResponse = await openai.embeddings.create({ - model: 'text-embedding-ada-002', - input, - }) - - const [responseData] = embeddingResponse.data - - const { error: insertPageSectionError } = await supabaseClient - .from('page_section') - .insert({ - page_id: page.id, - slug, - heading, - content, - token_count: embeddingResponse.usage.total_tokens, - embedding: responseData.embedding, - rag_ignore: ragIgnore, - }) - .select() - .limit(1) - .single() - - if (insertPageSectionError) { - throw insertPageSectionError + const { error: updatePageError } = await supabaseClient + .from(pageTable) + .update({ checksum: pageInfo.checksum }) + .eq('id', pageId) + if (updatePageError) { + console.error(`Failed to update checksum for page ${pageInfo.path}:`, updatePageError) + return 0 } - } catch (err) { - // TODO: decide how to better handle failed embeddings - console.error( - `Failed to generate embeddings for '${path}' page section starting with '${input.slice( - 0, - 40 - )}...'` - ) - - throw err + return 1 + } catch (error) { + console.error(`Error updating checksum for page ${pageInfo.path}:`, error) + return 0 } - } + }) + ) - // Set page checksum so that we know this page was stored successfully - const { error: updatePageError } = await supabaseClient - .from('page') - .update({ checksum }) - .filter('id', 'eq', 
page.id) + successfulChecksumUpdates += results.reduce((sum, x) => sum + x, 0) + } - if (updatePageError) { - throw updatePageError - } - } catch (err) { - console.error( - `Page '${path}' or one/multiple of its page sections failed to store properly. Page has been marked with null checksum to indicate that it needs to be re-generated.` - ) - console.error(err) - } + return successfulChecksumUpdates +} + +function logFailedPages(pageInfoMap: Map, processingResult: ProcessingResult) { + if (processingResult.failedPages.size === 0) return + console.log(`\nFailed pages:`) + for (const pageId of processingResult.failedPages) { + const pageInfo = pageInfoMap.get(pageId) + if (pageInfo) console.log(` - ${pageInfo.path}`) } +} +async function purgeOldPages( + supabaseClient: SupabaseClient, + pageTable: string, + refreshVersion: string +) { console.log(`Removing old pages and their sections`) - - // Delete pages that have been removed (and their sections via cascade) const { error: deletePageError } = await supabaseClient - .from('page') + .from(pageTable) .delete() .filter('version', 'neq', refreshVersion) - - if (deletePageError) { - throw deletePageError - } - - console.log('Embedding generation complete') + if (deletePageError) throw deletePageError } async function main() { diff --git a/apps/docs/scripts/search/sources/index.ts b/apps/docs/scripts/search/sources/index.ts index 9dc5541b27495..88a1de69c2f79 100644 --- a/apps/docs/scripts/search/sources/index.ts +++ b/apps/docs/scripts/search/sources/index.ts @@ -1,3 +1,4 @@ +import { type GuideModel } from '../../../resources/guide/guideModel.js' import { GuideModelLoader } from '../../../resources/guide/guideModelLoader.js' import { GitHubDiscussionLoader, @@ -28,7 +29,7 @@ export type SearchSource = export async function fetchGuideSources() { const guides = (await GuideModelLoader.allFromFs()).unwrapLeft() - return guides.map((guide) => MarkdownLoader.fromGuideModel('guide', guide)) + return guides.map((guide: GuideModel) => MarkdownLoader.fromGuideModel('guide', guide)) } export async function fetchOpenApiReferenceSource() { @@ -125,27 +126,29 @@ export async function fetchLintWarningsGuideSources() { /** * Fetches all the sources we want to index for search */ -export async function fetchAllSources() { +export async function fetchAllSources(fullIndex: boolean) { const guideSources = fetchGuideSources() const lintWarningsGuideSources = fetchLintWarningsGuideSources() const openApiReferenceSource = fetchOpenApiReferenceSource() const jsLibReferenceSource = fetchJsLibReferenceSource() - const dartLibReferenceSource = fetchDartLibReferenceSource() - const pythonLibReferenceSource = fetchPythonLibReferenceSource() - const cSharpLibReferenceSource = fetchCSharpLibReferenceSource() - const swiftLibReferenceSource = fetchSwiftLibReferenceSource() - const ktLibReferenceSource = fetchKtLibReferenceSource() - const cliReferenceSource = fetchCliLibReferenceSource() - - const partnerIntegrationSources = fetchPartners() - .then((partners) => - partners - ? Promise.all( - partners.map((partner) => new IntegrationLoader(partner.slug, partner).load()) - ) - : [] - ) - .then((data) => data.flat()) + const dartLibReferenceSource = fullIndex ? fetchDartLibReferenceSource() : [] + const pythonLibReferenceSource = fullIndex ? fetchPythonLibReferenceSource() : [] + const cSharpLibReferenceSource = fullIndex ? fetchCSharpLibReferenceSource() : [] + const swiftLibReferenceSource = fullIndex ? 
fetchSwiftLibReferenceSource() : [] + const ktLibReferenceSource = fullIndex ? fetchKtLibReferenceSource() : [] + const cliReferenceSource = fullIndex ? fetchCliLibReferenceSource() : [] + + const partnerIntegrationSources = fullIndex + ? fetchPartners() + .then((partners) => + partners + ? Promise.all( + partners.map((partner) => new IntegrationLoader(partner.slug, partner).load()) + ) + : [] + ) + .then((data) => data.flat()) + : [] const githubDiscussionSources = fetchDiscussions( 'supabase', diff --git a/apps/docs/spec/common-cli-sections.json b/apps/docs/spec/common-cli-sections.json index 1f658681d3d6a..b6cd3b5c097c9 100644 --- a/apps/docs/spec/common-cli-sections.json +++ b/apps/docs/spec/common-cli-sections.json @@ -820,7 +820,7 @@ { "id": "supabase-postgres-config", "title": "Manage Postgres configurations", - "slug": "supabase-ssl-enforcement", + "slug": "supabase-postgres-config", "type": "cli-command" }, { diff --git a/apps/docs/spec/supabase_js_v2.yml b/apps/docs/spec/supabase_js_v2.yml index 49a7909b4d69c..e676ae78ec957 100644 --- a/apps/docs/spec/supabase_js_v2.yml +++ b/apps/docs/spec/supabase_js_v2.yml @@ -1356,7 +1356,7 @@ functions: - If the session's access token is expired or is about to expire, this method will use the refresh token to refresh the session. - When using in a browser, or you've called `startAutoRefresh()` in your environment (React Native, etc.) this function always returns a valid access token without refreshing the session itself, as this is done in the background. This function returns very fast. - **IMPORTANT SECURITY NOTICE:** If using an insecure storage medium, such as cookies or request headers, the user object returned by this function **must not be trusted**. Always verify the JWT using `getClaims()` or your own JWT verification library to securely establish the user's identity and access. You can also use `getUser()` to fetch the user object directly from the Auth server for this purpose. - - When using in a browser, this function is synchronized accross all tabs using the [LockManager](https://developer.mozilla.org/en-US/docs/Web/API/LockManager) API. In other environments make sure you've defined a proper `lock` property, if necessary, to make sure there are no race conditions while the session is being refreshed. + - When using in a browser, this function is synchronized across all tabs using the [LockManager](https://developer.mozilla.org/en-US/docs/Web/API/LockManager) API. In other environments make sure you've defined a proper `lock` property, if necessary, to make sure there are no race conditions while the session is being refreshed. 
examples: - id: get-the-session-data name: Get the session data diff --git a/apps/docs/turbo.json b/apps/docs/turbo.json index 1cf047c8b07b6..d980120ebdc86 100644 --- a/apps/docs/turbo.json +++ b/apps/docs/turbo.json @@ -46,6 +46,7 @@ "DOCS_GITHUB_APP_PRIVATE_KEY", "DOCS_REVALIDATION_KEYS", "DOCS_REVALIDATION_OVERRIDE_KEYS", + "ENABLED_FEATURES_OVERRIDE_DISABLE_ALL", "GITHUB_ACTIONS", "FORCE_ASSET_CDN", "LOGFLARE_INGESTION_API_KEY", diff --git a/apps/docs/vitest.config.ts b/apps/docs/vitest.config.ts index bbadb9591ec53..25dac66db1868 100644 --- a/apps/docs/vitest.config.ts +++ b/apps/docs/vitest.config.ts @@ -1,10 +1,19 @@ import tsconfigPaths from 'vite-tsconfig-paths' import { defineConfig } from 'vitest/config' +// eslint-disable-next-line no-restricted-exports export default defineConfig({ test: { + // Exclude examples from test discovery (does not affect tsconfig scanning) exclude: ['examples/**/*', '**/node_modules/**'], setupFiles: ['vitest.setup.ts'], }, - plugins: [tsconfigPaths({ root: import.meta.dirname })], + // Restrict tsconfig-paths to only use this app's tsconfig + plugins: [ + tsconfigPaths({ + root: import.meta.dirname, + // Prevent scanning tsconfig files in subfolders like examples/** + projects: ['tsconfig.json'], + }), + ], }) diff --git a/apps/studio/components/interfaces/Auth/SessionsAuthSettingsForm/SessionsAuthSettingsForm.tsx b/apps/studio/components/interfaces/Auth/SessionsAuthSettingsForm/SessionsAuthSettingsForm.tsx index 7ae7a7cc95825..3cea989af221f 100644 --- a/apps/studio/components/interfaces/Auth/SessionsAuthSettingsForm/SessionsAuthSettingsForm.tsx +++ b/apps/studio/components/interfaces/Auth/SessionsAuthSettingsForm/SessionsAuthSettingsForm.tsx @@ -44,10 +44,7 @@ function HoursOrNeverText({ value }: { value: number }) { const RefreshTokenSchema = z.object({ REFRESH_TOKEN_ROTATION_ENABLED: z.boolean(), - SECURITY_REFRESH_TOKEN_REUSE_INTERVAL: z.coerce - .number() - .positive() - .min(0, 'Must be a value more than 0'), + SECURITY_REFRESH_TOKEN_REUSE_INTERVAL: z.coerce.number().min(0, 'Must be a value more than 0'), }) const UserSessionsSchema = z.object({ @@ -202,32 +199,30 @@ export const SessionsAuthSettingsForm = () => { )} /> - {refreshTokenForm.watch('REFRESH_TOKEN_ROTATION_ENABLED') && ( - - ( - - - - - - - - )} - /> - - )} + + ( + + + + + + + + )} + /> + {refreshTokenForm.formState.isDirty && (