/
epwing_conv.rb
executable file
·67 lines (60 loc) · 2.59 KB
/
epwing_conv.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/ruby
#coding:utf-8
#Converts the output of ebd2html into XML for the
#Mac OS X v10.5 "Leopard" dictionary application (Dictionary.app)
#by Tats_y (http://www.binword.com/blog/)
#v0.01
#2008/02/11
require 'nkf' # to convert Zenkaku-alphabet to Hankaku
#Select the input encoding that matches the ebd2html output being converted.
#Shift_JIS (Windows-31J) input, transcoded to UTF-8; invalid/undefined bytes are replaced:
ARGF.set_encoding("Windows-31J", "UTF-8", :invalid => :replace, :undef => :replace)
#ISO-2022-JP (JIS) input:
#ARGF.set_encoding("ISO-2022-JP","UTF-8", :invalid => :replace, :undef => :replace)

# Makes +text+ safe to embed in a double-quoted XML attribute value.
#
# Only `"` and `<` are escaped: both are never legal raw inside such an
# attribute. `&` is deliberately left alone -- presumably the ebd2html text
# already carries HTML entity references; verify against real input.
#
# @param text [String, nil] raw attribute text (nil is treated as "")
# @return [String] the text with `"` and `<` replaced by entities
def xml_attr_value(text)
  text.to_s.gsub('"', '&quot;').gsub('<', '&lt;')
end

# Builds the complete <d:entry> element for one dictionary item.
#
# @param id [String] entry id taken from the <dt id="..."> line
# @param title [String] headword taken from the <dt> line
# @param title2 [String] first body line, appended to the headword
# @param keys [Array<String>] additional index keys from <key> lines
# @param body [String] accumulated body lines, starting with "\t<p>\n"
# @return [String] the entry XML, terminated by a newline
def build_entry_xml(id, title, title2, keys, body)
  # Strip the unwanted strings that appear in each body line.
  body = body.gsub(/Ἁ�|<nobr>|<\/nobr>/, "")
  # NOTE(review): this pattern equals the first alternative above, so it is
  # effectively a no-op; the intended characters may have been lost to
  # mis-encoding -- confirm against the original file.
  body = body.gsub(/Ἁ�/, " ")
  body = body.gsub(/<br>/, "<br/>")
  # Rewrite in-dictionary anchors to Dictionary.app cross-reference links.
  body = body.gsub(/<a href="#([0-9A-Z]+)">/) { "<a href=\"x-dictionary:r:#{Regexp.last_match(1)}\">" }
  body += "\t</p>" # close the paragraph opened at the start of the body

  # <sub> tags are dropped from the title used in attributes.
  attr_title = xml_attr_value((title + title2.gsub(/<sub>|<\/sub>/, "")).strip)

  parts = ["<d:entry id=\"#{xml_attr_value(id)}\" d:title=\"#{attr_title}\">"]
  ([title] + keys).each do |value|
    parts << "\t<d:index d:value=\"#{xml_attr_value(value)}\" d:title=\"#{attr_title}\"/>"
  end
  parts << "\t<h1>#{(title + title2).strip}</h1>"
  parts << body
  parts << "</d:entry>"
  parts.join("\n") + "\n"
end

# Reads ebd2html output line by line from +input+ and writes the
# Dictionary.app XML document to +output+.
#
# An entry is emitted when its body is terminated by a line starting with
# "</p></dd>" or "</dl>" (the last item in ebd2html output is not closed
# with </dd>). If the input ends before a terminator is seen, the unfinished
# entry is dropped, but the document is still closed properly.
#
# @param input [#gets] line source (ARGF, IO, StringIO, ...)
# @param output [#print] destination for the generated XML
# @return [void]
def convert_ebd2html(input, output)
  id = ""
  title = ""
  keys = []

  output.print '<?xml version="1.0" encoding="UTF-8"?>' + "\n"
  output.print '<d:dictionary xmlns="http://www.w3.org/1999/xhtml" xmlns:d="http://www.apple.com/DTDs/DictionaryService-1.0.rng">' + "\n"

  while (line = input.gets)
    next if line.strip.empty? # skip blank lines

    if line =~ /^<dt id=/
      # Fall back to "" when the line does not fit the stricter patterns
      # (e.g. an empty title); previously nil leaked through and crashed later.
      id = line.slice(/<dt id="([A-Z\d]+)">/, 1).to_s       # entry id
      title = line.slice(/<dt id=".+">(.+)<\/dt>/, 1).to_s  # headword
    end

    if line =~ /^<key title/
      # Store the key with full-width alphanumerics converted to half-width.
      keys.push(NKF.nkf('-wZ1', line.slice(/<key title=".+" type=".+">(.+)<\/key>/, 1).to_s))
    end

    next unless line =~ /^<dd>/

    first_line = input.gets
    break if first_line.nil? # input ended right after <dd>: nothing to emit

    # The first body line is used as part of the title.
    title2 = first_line.gsub(/<br>|<nobr>|<\/nobr>/, "").chomp
    body_lines = ["\t<p>\n"]
    while (line2 = input.gets)
      if line2 =~ /^<\/p><\/dd>|^<\/dl>/
        output.print build_entry_xml(id, title, title2, keys, body_lines.join)
        id = ""
        title = ""
        keys = []
        break
      end
      body_lines << "\t" + line2
    end
  end

  output.print "</d:dictionary>\n"
end

convert_ebd2html(ARGF, $stdout)