Permalink
Browse files

dropping full_text ... yikes.

  • Loading branch information...
1 parent 62ac381 commit b84126e87a3d1759564f94772982c085fa6f2568 @jashkenas jashkenas committed with knowtheory May 12, 2011
@@ -49,7 +49,6 @@ def process_images
def process_text
@pages = []
# Destroy existing text and pages to make way for the new.
- document.full_text.destroy if document.full_text
document.pages.destroy_all if document.pages.count > 0
begin
opts = {:pages => 'all', :output => 'text'}
@@ -68,14 +67,14 @@ def process_text
queue_page_text(text, page_number)
end
save_page_text!
+ text = @pages.map{|p| p[:text] }.join('')
document.page_count = @pages.length
- document.full_text = FullText.create!(:text => @pages.map{|p| p[:text] }.join(''), :document => document)
Page.refresh_page_map(document)
EntityDate.refresh(document)
document.save!
pages = document.reload.pages
Sunspot.index pages
- DC::Import::EntityExtractor.new.extract(document) unless options['secure']
+ DC::Import::EntityExtractor.new.extract(document, text) unless options['secure']
document.upload_text_assets(pages)
document.id
end
@@ -4,7 +4,7 @@ class ReprocessEntities < CloudCrowd::Action
def process
puts "Reprocessing Entities: #{document.title}"
- DC::Import::EntityExtractor.new.extract document
+ DC::Import::EntityExtractor.new.extract document, document.combined_page_text
true
end
@@ -164,7 +164,7 @@ def reprocess_text
def send_pdf
return not_found unless current_document(true)
- redirect_to(current_document.pdf_url(:direct))
+ redirect_to current_document.pdf_url(:direct)
end
def send_page_image
@@ -176,7 +176,7 @@ def send_page_image
def send_full_text
return not_found unless current_document(true)
- send_data(current_document.text, :disposition => 'inline', :type => :txt)
+ # Redirect to the stored full-text asset rather than sending the text inline.
+ # Use current_document (as the guard above and send_pdf do), not a bare `document`.
+ redirect_to current_document.full_text_url(:direct)
end
def send_page_text
@@ -28,7 +28,7 @@ def send_pdfs
def send_text
package("#{package_name}.zip") do |zip|
@documents.each do |doc|
- zip.get_output_stream("#{doc.slug}.txt") {|f| f.write(doc.text) }
+ zip.get_output_stream("#{doc.slug}.txt") {|f| f.write(doc.combined_page_text) }
end
end
end
View
@@ -32,7 +32,6 @@ class Document < ActiveRecord::Base
belongs_to :account
belongs_to :organization
- has_one :full_text, :dependent => :destroy
has_one :docdata, :dependent => :destroy
has_many :pages, :dependent => :destroy
has_many :entities, :dependent => :destroy
@@ -57,7 +56,6 @@ class Document < ActiveRecord::Base
text_attr :source, :related_article, :remote_url
html_attr :description
- delegate :text, :to => :full_text, :allow_nil => true
delegate :slug, :to => :organization, :allow_nil => true, :prefix => true
delegate :slug, :to => :account, :allow_nil => true, :prefix => true
@@ -115,7 +113,7 @@ class Document < ActiveRecord::Base
text :source
text :description
text :full_text, {:more_like_this => true} do
- self.text
+ self.combined_page_text
end
# Attributes...
@@ -436,8 +434,10 @@ def private_full_text_url
File.join(DC.server_root, full_text_path)
end
- def full_text_url
- public? ? public_full_text_url : private_full_text_url
+ def full_text_url(direct=false)
+ return public_full_text_url if public? || Rails.env.development?
+ return private_full_text_url unless direct
+ DC::Store::AssetStore.new.authorized_url(full_text_path)
end
def document_viewer_url(opts={})
@@ -535,7 +535,6 @@ def reprocess_text(force_ocr = false)
end
def reindex_all!(access=nil)
- full_text.refresh
Page.refresh_page_map(self)
EntityDate.refresh(self)
pages = self.reload.pages
View
@@ -12,8 +12,6 @@ class Entity < ActiveRecord::Base
belongs_to :document
- has_one :full_text, :through => :document
-
validates_inclusion_of :kind, :in => DC::VALID_KINDS
text_attr :value
@@ -5,14 +5,13 @@ class EntityDate < ActiveRecord::Base
belongs_to :document
- has_one :full_text, :through => :document
-
# Destroy and recreate all of a document's dates, from the text. Save the
# document after running this method in order to save the dates.
def self.refresh(document)
- return false unless document.text
+ text = document.combined_page_text
+ return false unless text
document.entity_dates.destroy_all
- DC::Import::DateExtractor.new.extract_dates(document.text).each do |hash|
+ DC::Import::DateExtractor.new.extract_dates(text).each do |hash|
model = self.new(:document => document, :date => hash[:date], :occurrences => Occurrence.to_csv(hash[:occurrences]))
document.entity_dates << model
end
View
@@ -1,22 +0,0 @@
-# The Full Text table keeps the full text column off of the document, keeping
-# fast document lookups fast.
-class FullText < ActiveRecord::Base
-
- include DC::Store::DocumentResource
-
- set_table_name "full_text"
-
- belongs_to :document
-
- # The first 255 characters of the text.
- def summarize
- text[0...1000].gsub(/\s+/, ' ').mb_chars[0...255]
- end
-
- # Refresh the full text index from the contents of the document's pages.
- def refresh
- update_attribute :text, document.combined_page_text
- DC::Store::AssetStore.new.save_full_text(document, access)
- end
-
-end
View
@@ -21,8 +21,6 @@ class Page < ActiveRecord::Base
before_update :track_text_changes
- after_update :refresh_full_text_index
-
searchable do
text :text
integer :document_id
@@ -111,11 +109,4 @@ def track_text_changes
@text_changed = true
end
- # When page text changes, we need to update the document's full text index.
- def refresh_full_text_index
- return true unless @text_changed
- document.full_text.refresh
- @text_changed = false
- end
-
end
@@ -0,0 +1,10 @@
+class DropFullText < ActiveRecord::Migration
+ def self.up
+ remove_index "full_text", :name => "index_full_text_on_document_id"
+ drop_table :full_text
+ end
+
+ def self.down
+ # This migration is irreversible.
+ end
+end
@@ -12,9 +12,9 @@ class EntityExtractor
# Public API: Pass in a document along with its text (or a document with rdf
# already attached).
- def extract(document)
+ def extract(document, text)
@entities = {}
- chunks = CalaisFetcher.new.fetch_rdf(document.text)
+ chunks = CalaisFetcher.new.fetch_rdf(text)
chunks.compact.each_with_index do |chunk, i|
extract_information(document, chunk, i) if i == 0
extract_entities(document, chunk, i)
@@ -51,7 +51,7 @@ def delete_insert_pdfs(document)
def save_full_text(document, access=nil)
ensure_directory(document.path)
- File.open(local(document.full_text_path), 'w+') {|f| f.write(document.text) }
+ File.open(local(document.full_text_path), 'w+') {|f| f.write(document.combined_page_text) }
end
def save_rdf(document, rdf, access=nil)
View
@@ -64,7 +64,7 @@ def delete_insert_pdfs(document)
end
def save_full_text(document, access=DEFAULT_ACCESS)
- save_file(document.text, document.full_text_path, access, :string => true)
+ save_file(document.combined_page_text, document.full_text_path, access, :string => true)
end
def save_rdf(document, rdf, access=DEFAULT_ACCESS)
View
@@ -63,7 +63,6 @@ def import_document(client, record)
end
puts "#{ref} -- #{pages.length} pages..."
doc.page_count = pages.length
- doc.full_text = FullText.create!(:text => pages.map{|p| p[:text] }.join(''), :document => doc, :access => access)
if doc.page_count <= 0
puts "#{ref} -- zero pages, aborting..."
@@ -80,7 +79,7 @@ def import_document(client, record)
Sunspot.index pages
puts "#{ref} -- extracting entities from Calais, uploading text to S3..."
- DC::Import::EntityExtractor.new.extract(doc)
+ DC::Import::EntityExtractor.new.extract(doc, doc.combined_page_text)
doc.upload_text_assets(pages)
sql = ["access = #{access}", "document_id = #{doc.id}"]
Entity.update_all(*sql)
@@ -4,7 +4,6 @@ class DocumentTest < ActiveSupport::TestCase
context "A Document" do
- should_have_one :full_text
should_have_many :pages
should_have_many :entities

0 comments on commit b84126e

Please sign in to comment.