Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEV: Move to single table per embeddings type #561

Merged
merged 11 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions db/migrate/20240611170904_upgrade_pgvector_070.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# frozen_string_literal: true

# Upgrades the installed `vector` (pgvector) PostgreSQL extension to 0.7.0
# when the currently installed version is older than that.
class UpgradePgvector070 < ActiveRecord::Migration[7.0]
  # Reads the installed extension version from `pg_extension` and issues
  # `ALTER EXTENSION ... UPDATE` only if it is below 0.7.0; otherwise no-op.
  def up
    required = Gem::Version.new("0.7.0")
    current =
      Gem::Version.new(
        DB.query_single("SELECT extversion FROM pg_extension WHERE extname = 'vector';").first,
      )

    # Already at (or past) the required version: nothing to upgrade.
    return if current >= required

    DB.exec("ALTER EXTENSION vector UPDATE TO '0.7.0';")
  end

  # The extension upgrade is never rolled back.
  def down
    raise ActiveRecord::IrreversibleMigration
  end
end
158 changes: 158 additions & 0 deletions db/migrate/20240611170905_move_embeddings_to_single_table_per_type.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# frozen_string_literal: true

# Consolidates the legacy per-model embeddings tables
# (`<table>_<model_id>_<strategy_id>`) into a single table per embedded
# record type (topics, posts, RAG document fragments). The model and strategy
# become regular columns and the vectors are stored in a `halfvec` column.
class MoveEmbeddingsToSingleTablePerType < ActiveRecord::Migration[7.0]
  # Model ids for which a legacy `<table>_<model_id>_<strategy_id>` table is copied.
  LEGACY_MODEL_IDS = (1..8).to_a.freeze

  # The only strategy id that appears in the legacy table names.
  LEGACY_STRATEGY_ID = 1

  # One entry per consolidated table: the table name, the column pointing at
  # the embedded record, and the explicit name of its unique index.
  EMBEDDING_TABLES = [
    {
      table: :ai_topic_embeddings,
      key: :topic_id,
      index: "index_ai_topic_embeddings_on_model_strategy_topic",
    },
    {
      table: :ai_post_embeddings,
      key: :post_id,
      index: "index_ai_post_embeddings_on_model_strategy_post",
    },
    {
      table: :ai_document_fragment_embeddings,
      key: :rag_document_fragment_id,
      index: "index_ai_fragment_embeddings_on_model_strategy_fragment",
    },
  ].freeze

  # Creates all consolidated tables, copies every legacy row into them
  # (topics, then posts, then document fragments; model ids ascending), and
  # finally makes a best-effort attempt to trigger indexing.
  def up
    EMBEDDING_TABLES.each { |spec| create_embeddings_table(**spec) }
    EMBEDDING_TABLES.each { |spec| copy_legacy_embeddings(spec[:table], spec[:key]) }
    consider_indexing_current_representation
  end

  # Rolling back would require splitting rows back into the legacy tables,
  # which this migration does not implement; fail loudly instead of silently
  # doing nothing.
  def down
    raise ActiveRecord::IrreversibleMigration
  end

  private

  # Creates one consolidated embeddings table (no surrogate primary key) with
  # a unique index on (model_id, strategy_id, <key>).
  def create_embeddings_table(table:, key:, index:)
    create_table(table, id: false) do |t|
      t.integer key, null: false
      t.integer :model_id, null: false
      t.integer :model_version, null: false
      t.integer :strategy_id, null: false
      t.integer :strategy_version, null: false
      t.text :digest, null: false
      t.column :embeddings, "halfvec", null: false
      t.timestamps

      t.index([:model_id, :strategy_id, key], unique: true, name: index)
    end
  end

  # Copies the rows of every legacy `<table>_<model_id>_<strategy_id>` table
  # into `table`, filling `model_id` / `strategy_id` from the legacy table name.
  def copy_legacy_embeddings(table, key)
    LEGACY_MODEL_IDS.each do |model_id|
      execute <<~SQL
        INSERT INTO #{table} (#{key}, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
        SELECT #{key}, #{model_id}, model_version, #{LEGACY_STRATEGY_ID}, strategy_version, digest, embeddings, created_at, updated_at
        FROM #{table}_#{model_id}_#{LEGACY_STRATEGY_ID};
      SQL
    end
  end

  # Asks the current vector representation to consider indexing. This is
  # deliberately best-effort: any StandardError is logged and swallowed so it
  # cannot abort the data migration.
  def consider_indexing_current_representation
    strategy = DiscourseAi::Embeddings::Strategies::Truncation.new
    vector_rep =
      DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy)
    vector_rep.consider_indexing
  rescue StandardError => e
    Rails.logger.error("Failed to index embeddings: #{e}")
  end
end
30 changes: 30 additions & 0 deletions db/post_migrate/20240611170906_drop_old_embeddings_tables.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# frozen_string_literal: true

# Drops the legacy per-model embeddings tables
# (`<prefix>_<model_id>_<strategy_id>`) for topics, posts and RAG document
# fragments.
class DropOldEmbeddingsTables < ActiveRecord::Migration[7.0]
  # Name prefixes of the legacy tables, in the order they are dropped.
  LEGACY_TABLE_PREFIXES = %w[
    ai_topic_embeddings
    ai_post_embeddings
    ai_document_fragment_embeddings
  ].freeze

  # Model ids that appear in the legacy table names.
  LEGACY_MODEL_IDS = (1..8).to_a.freeze

  # The only strategy id that appears in the legacy table names.
  LEGACY_STRATEGY_ID = 1

  # Drops every `<prefix>_<model_id>_<strategy_id>` table: all topic tables,
  # then all post tables, then all document-fragment tables, each with model
  # ids ascending.
  def up
    LEGACY_TABLE_PREFIXES.each do |prefix|
      LEGACY_MODEL_IDS.each do |model_id|
        drop_table :"#{prefix}_#{model_id}_#{LEGACY_STRATEGY_ID}"
      end
    end
  end

  # The dropped tables (and their rows) cannot be recreated from here, so a
  # rollback fails explicitly instead of silently doing nothing.
  def down
    raise ActiveRecord::IrreversibleMigration
  end
end
2 changes: 1 addition & 1 deletion lib/embeddings/vector_representations/all_mpnet_base_v2.rb
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def pg_function
end

# Returns the name of the index operator class for this representation.
# NOTE(review): presumably pgvector's inner-product operator class for
# `halfvec` columns — confirm against the pgvector documentation.
def pg_index_type
  "halfvec_ip_ops"
end

def tokenizer
Expand Down
Loading