Skip to content

Commit

Permalink
Fix ingest script and migrations
Browse files Browse the repository at this point in the history
I'm a little wary of hard-coding the order of migrations like this, but
it was easier than redoing them by hand, I guess.
  • Loading branch information
pletcher committed Jul 31, 2017
1 parent eaffa2d commit 5d0f8a9
Show file tree
Hide file tree
Showing 8 changed files with 188 additions and 49 deletions.
12 changes: 10 additions & 2 deletions app/models/text_node.rb
@@ -1,12 +1,20 @@
class TextNode < ApplicationRecord
MULTIPLIER = 2**24

before_validation :ensure_key!

validates :key, presence: true, uniqueness: true

def ensure_key!
return unless key.nil?

update(key: TextNode.gen_key)
end

private

def gen_key
self.key = loop do
def self.gen_key
loop do
random_key = rand(MULTIPLIER).to_s(32)
break random_key unless TextNode.exists?(key: random_key)
end
Expand Down
File renamed without changes.
@@ -1,14 +1,13 @@
# Migration that creates the `authors` table, a unique index on its slug,
# and a foreign-key reference to `languages`.
class CreateAuthors < ActiveRecord::Migration[5.1]
def change
create_table :authors do |t|
# NOTE(review): this listing declares a `language` string column here and
# also adds a `language` reference (language_id) below — confirm which of
# the two is meant to remain.
t.string :language, null: false
t.string :englishname
t.string :originalname, null: false
# references language, see language migration
t.string :name, null: false
t.string :slug, null: false

t.timestamps
end

# Slugs must be unique across all authors.
add_index :authors, :slug, unique: true
# Adds authors.language_id with a foreign-key constraint.
add_reference :authors, :language, foreign_key: true
end
end
@@ -1,14 +1,13 @@
# Migration that creates the `corpora` table, a unique index on its slug,
# and a foreign-key reference to `languages`.
class CreateCorpora < ActiveRecord::Migration[5.1]
def change
create_table :corpora do |t|
# NOTE(review): a `language` string column is declared here while a
# `language` reference (language_id) is added below — confirm which of the
# two is meant to remain.
t.string :language, null: false
t.string :link
t.string :slug, null: false
t.string :title, null: false

t.timestamps
end

# Slugs must be unique across all corpora.
add_index :corpora, :slug, unique: true
# Adds corpora.language_id with a foreign-key constraint.
add_reference :corpora, :language, foreign_key: true
end
end
Expand Up @@ -2,22 +2,23 @@ class CreateWorks < ActiveRecord::Migration[5.1]
def change
create_table :works do |t|
t.string :edition
t.string :englishtitle
t.string :english_title
t.string :filename
t.string :form
t.string :hash
t.string :originaltitle, null: false
t.string :md5_hash
t.string :original_title, null: false
t.string :slug, null: false
t.string :structure
t.string :urn

t.timestamps
end

add_reference :text_nodes, :work, foreign_key: true

add_reference :works, :author, foreign_key: true
add_reference :works, :corpus, foreign_key: true
add_reference :works, :language, foreign_key: true

add_index :text_nodes, [:key, :work_id], unique: true
add_index :works, :urn, unique: true
end
end
Expand Up @@ -6,17 +6,18 @@ def change
t.json :data
t.json :entity_ranges, array: true, default: [], null: false
t.json :inline_style_ranges, array: true, default: [], null: false
t.string :type, default: 'unstyled', null: false
t.string :text_node_type, default: 'unstyled', null: false
t.string :key, null: false
t.text :text

t.timestamps
end

add_index :text_nodes, :key, unique: true

add_reference :text_nodes, :author, foreign_key: true
add_reference :text_nodes, :corpus, foreign_key: true
add_reference :text_nodes, :language, foreign_key: true
add_reference :text_nodes, :work, foreign_key: true

add_index :text_nodes, [:key, :work_id], unique: true
end
end
44 changes: 38 additions & 6 deletions db/schema.rb
Expand Up @@ -10,34 +10,36 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 20170728024652) do
ActiveRecord::Schema.define(version: 5) do

# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"

create_table "authors", force: :cascade do |t|
t.string "language", null: false
t.string "englishname"
t.string "originalname", null: false
t.string "name", null: false
t.string "slug", null: false
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.bigint "language_id"
t.index ["language_id"], name: "index_authors_on_language_id"
end

create_table "corpora", force: :cascade do |t|
t.string "language", null: false
t.string "link"
t.string "slug", null: false
t.string "title", null: false
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.bigint "language_id"
t.index ["language_id"], name: "index_corpora_on_language_id"
end

create_table "languages", force: :cascade do |t|
t.string "slug", null: false
t.string "title", null: false
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["slug"], name: "index_languages_on_slug", unique: true
end

create_table "text_nodes", force: :cascade do |t|
Expand All @@ -46,20 +48,50 @@
t.json "data"
t.json "entity_ranges", default: [], null: false, array: true
t.json "inline_style_ranges", default: [], null: false, array: true
t.string "type", default: "unstyled", null: false
t.string "text_node_type", default: "unstyled", null: false
t.string "key", null: false
t.text "text"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.bigint "author_id"
t.bigint "corpus_id"
t.bigint "language_id"
t.bigint "work_id"
t.index ["author_id"], name: "index_text_nodes_on_author_id"
t.index ["corpus_id"], name: "index_text_nodes_on_corpus_id"
t.index ["key", "work_id"], name: "index_text_nodes_on_key_and_work_id", unique: true
t.index ["language_id"], name: "index_text_nodes_on_language_id"
t.index ["work_id"], name: "index_text_nodes_on_work_id"
end

create_table "works", force: :cascade do |t|
t.string "edition"
t.string "english_title"
t.string "filename"
t.string "form"
t.string "md5_hash"
t.string "original_title", null: false
t.string "slug", null: false
t.string "structure"
t.string "urn"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.bigint "author_id"
t.bigint "corpus_id"
t.bigint "language_id"
t.index ["author_id"], name: "index_works_on_author_id"
t.index ["corpus_id"], name: "index_works_on_corpus_id"
t.index ["language_id"], name: "index_works_on_language_id"
t.index ["urn"], name: "index_works_on_urn", unique: true
end

add_foreign_key "authors", "languages"
add_foreign_key "corpora", "languages"
add_foreign_key "text_nodes", "authors"
add_foreign_key "text_nodes", "corpora"
add_foreign_key "text_nodes", "languages"
add_foreign_key "text_nodes", "works"
add_foreign_key "works", "authors"
add_foreign_key "works", "corpora"
add_foreign_key "works", "languages"
end
153 changes: 126 additions & 27 deletions lib/tasks/text_nodes.rake
@@ -1,70 +1,169 @@
namespace :text_nodes do
require 'json'
require 'tmpdir'
require 'uri'

def make_corpus_link(corpus)
if corpus.downcase == "open greek and latin"
"https://github.com/OpenGreekAndLatin"
elsif corpus.downcase == "the first 1k years of greek"
"http://opengreekandlatin.github.io/First1KGreek/"
elsif corpus.downcase == "perseus digital library"
"http://www.perseus.tufts.edu/hopper/"
else
puts "----- No link for #{corpus}. -----"
""
# Flattens a nested hash of text into a list of leaf nodes.
#
# text     - Hash whose keys are section numbers (strings or symbols such as
#            "1" / :"1") and whose values are either nested Hashes or leaf
#            text values.
# nodes    - Array that collected nodes are appended to (accumulator used by
#            the recursion; callers normally omit it).
# location - Array of Integer keys leading to `text` (used by the recursion).
#
# Returns `nodes`, where every entry is { location: [Integer, ...], text: leaf }
# in traversal order.
def format_text_nodes(text, nodes = [], location = [])
  text.each do |key, value|
    path = location + [key.to_s.to_i]

    if value.is_a?(Hash)
      # Recurse and then continue with the next sibling. A `return` here would
      # leave the method after the first nested hash and drop every sibling
      # that follows it.
      format_text_nodes(value, nodes, path)
      next
    end

    puts "location: #{path}"

    nodes << {
      location: path,
      text: value,
    }
  end

  nodes
end

# Reads one CLTK JSON file, normalizes its metadata and hands it to `ingest`.
#
# filename - path of the JSON file to read.
# repo     - name of the repository the file came from; used to derive the
#            corpus title when the file has no :source entry.
#
# Returns nil (after printing a message) when the file cannot be read or
# parsed, or when it carries no usable title; otherwise returns whatever
# `ingest` returns.
def import_cltk_json(filename, repo)
  data = begin
    # File.read closes the handle; parse errors are reported like read errors.
    JSON.parse(File.read(filename), symbolize_names: true)
  rescue => exception
    puts "----- Failed to read #{filename}. ------"
    puts "#{exception}"
    return nil
  end

  # A file without a top-level object or without any title cannot be ingested.
  work = if data.is_a?(Hash)
    data[:work] ||
      data[:english_title] ||
      data[:englishTitle] ||
      data[:title]
  end

  unless work
    puts "------ broken file #{filename} ------"
    return nil
  end

  original_title = data[:originalTitle] ||
    data[:original_title] ||
    work

  edition = data[:edition] || ""
  author = data[:author]
  # :meta may be absent; treat that as an empty structure description.
  structure = data[:meta].to_s.downcase
  # Only the repository name is cleaned up; an explicit :source is used as is.
  # String#gsub substitutes substrings (String#replace swaps the whole string
  # and accepts a single argument).
  corpus = data[:source] ||
    repo.gsub("texts", "").gsub("text", "")
      .gsub(".git", "").gsub("_", " ").strip.titleize
  corpus_link = data[:sourceLink] || make_corpus_link(corpus)
  language = data[:language].downcase
  language = "greek" if language == "grc"
  language = "german" if language == "ger"
  form = structure.include?("line") ? "poetry" : "prose"
  # Leave a missing URN as nil: works.urn has a unique index, so defaulting to
  # "" would make the second work without a URN fail to save.
  urn = data[:urn]

  # ingest
  ingest({
    author: author,
    corpus: corpus,
    corpus_link: corpus_link,
    edition: edition,
    english_title: work,
    filename: filename,
    form: form,
    # NOTE(review): this digests the file *path*, not the file contents —
    # confirm that is what md5_hash is meant to hold.
    md5_hash: Digest::MD5.hexdigest(filename),
    language: language,
    original_title: original_title,
    structure: structure,
    urn: urn,
  }, data)
end

task :ingest, [:repo] => :environment do |_, args|
require 'tmpdir'
# Persists one work and its text nodes.
#
# meta - Hash of normalized metadata. Keys read here: :language, :author,
#        :corpus, :corpus_link, :english_title, :edition, :filename, :form,
#        :md5_hash, :original_title, :structure, :urn.
# data - Hash parsed from the source JSON. Keys read here: :language (used as
#        the language title) and :text (the nested text hash).
#
# Returns nil (after printing a message) when the text cannot be extracted;
# otherwise returns the array of formatted text nodes.
def ingest(meta, data)
  # Look the language up by slug only. The title comes from the raw file
  # value (e.g. "grc") while the slug comes from the normalized one
  # ("greek"); matching on both would try to insert a second row with the
  # same slug, which the unique index on languages.slug rejects.
  language = Language.find_or_create_by(slug: slugify(meta[:language])) do |l|
    l.title = data[:language]
  end
  author = Author.find_or_create_by(
    language_id: language.id,
    name: meta[:author].titleize,
    slug: slugify(meta[:author])
  )
  corpus = Corpus.find_or_create_by(
    language_id: language.id,
    link: meta[:corpus_link],
    slug: slugify(meta[:corpus]),
    title: meta[:corpus].titleize
  )
  # The block only runs for a newly initialized work.
  work = Work.find_or_initialize_by(
    author_id: author.id,
    corpus_id: corpus.id,
    language_id: language.id,
    slug: slugify(meta[:english_title])
  ) do |w|
    w.edition = meta[:edition]
    w.english_title = meta[:english_title]
    w.filename = meta[:filename]
    w.form = meta[:form]
    w.md5_hash = meta[:md5_hash]
    w.original_title = meta[:original_title]
    # The works table has a structure column and meta carries the value.
    w.structure = meta[:structure]
    w.urn = meta[:urn]
    w.save!
  end

  text_nodes = begin
    format_text_nodes(data.fetch(:text))
  rescue => exception
    puts "----- No `text` field in #{meta[:english_title]} -----"
    puts "#{exception}"

    return nil
  end

  text_nodes.each_with_index do |node, i|
    puts "node: #{node}"
    TextNode.find_or_create_by!(
      author_id: author.id,
      corpus_id: corpus.id,
      index: i,
      language_id: language.id,
      location: node[:location],
      text: node[:text],
      work_id: work.id
    )
  end
end

# Maps a corpus title (compared case-insensitively) to its home page URL.
# For an unrecognized corpus a notice is printed and "" is returned.
def make_corpus_link(corpus)
  case corpus.downcase
  when "open greek and latin"
    "https://github.com/OpenGreekAndLatin"
  when "the first 1k years of greek"
    "http://opengreekandlatin.github.io/First1KGreek/"
  when "perseus digital library"
    "http://www.perseus.tufts.edu/hopper/"
  else
    puts "----- No link for #{corpus}. -----"
    ""
  end
end

# Builds a URL slug from a string, following Django's slugify:
# https://docs.djangoproject.com/en/1.11/_modules/django/utils/text/#slugify
def slugify(s)
  # Drop everything except word characters, whitespace and hyphens.
  kept = s.gsub(/[^\w\s-]/, '')
  normalized = kept.downcase.strip
  # Collapse each run of hyphens/whitespace into a single hyphen.
  normalized.gsub(/[-\s]+/, '-')
end

# text_nodes:ingest[repo]
#
# Clones `repo` into a temporary directory and imports every *.json file
# found in its cltk_json/ directory through import_cltk_json. The temporary
# directory is removed when the Dir.mktmpdir block ends.
task :ingest, [:repo] => :environment do |_, args|
  repo = args[:repo]

  puts "----- Cloning from #{repo} -----"

  Dir.mktmpdir do |dir|
    dest = "#{dir}/texts"

    # Pass the arguments as a list so the URL is never interpreted by a shell,
    # and stop when the clone did not succeed.
    unless system("git", "clone", repo, dest)
      puts "----- Failed to clone #{repo} -----"
      next
    end

    json_dir = "#{dest}/cltk_json"
    next unless Dir.exist?(json_dir)

    # "/owner/name.git" => "name"
    repo_name = URI.parse(repo).path.split("/")[2].sub(".git", "")

    Dir.foreach(json_dir) do |f|
      next unless File.extname(f) == ".json"

      puts "----- Reading #{json_dir}/#{f} -----"

      import_cltk_json("#{json_dir}/#{f}", repo_name)
    end
  end
end
Expand Down

0 comments on commit 5d0f8a9

Please sign in to comment.