Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Tree: d9cebdb02a
Fetching contributors…

Cannot retrieve contributors at this time

379 lines (360 sloc) 10.298 kB
require 'rubygems'
require 'rjb'
# -- JAVA --
# Starts a JVM through Rjb and binds the Java classes used by the helpers in
# this file to Ruby constants. Everything in this section runs at load time.
# Copy JDK_HOME into JAVA_HOME (presumably so Rjb can locate the JDK -- confirm).
ENV['JAVA_HOME'] = ENV['JDK_HOME']
# The classpath is built from $JAVANLP_HOME; '-Xmx3000m' is passed as a JVM
# argument. The inline `classpath =` also assigns a top-level local variable.
Rjb::load(classpath = "#{ENV["JAVANLP_HOME"]}/projects/core/classes", ['-Xmx3000m'])
# Java classes imported as Ruby constants.
Runtime = Rjb::import('java.lang.Runtime')
MaxentTagger = Rjb::import('edu.stanford.nlp.tagger.maxent.MaxentTagger')
DocumentPreprocessor = Rjb::import('edu.stanford.nlp.process.DocumentPreprocessor')
Word = Rjb::import('edu.stanford.nlp.ling.Word')
HasWord = Rjb::import('edu.stanford.nlp.ling.HasWord')
List = Rjb::import('java.util.List')
ArrayList = Rjb::import('java.util.ArrayList')
# Shared tagger instance used by tagSentence.
# NOTE(review): the model path is hard-coded and machine-specific.
TAGGER = MaxentTagger.new('/home/gabor/lib/data/bidirectional-distsim-wsj-0-18.tagger')
# Parses a TIMEX-style value string into a tagged array.
#
# Result shapes:
#   [:INSTANT, DateTime]           a point in time ("Thhmm", or anything
#                                  DateTime.parse accepts)
#   [:INSTANT, :NOW]               "PRESENT_REF"
#   [:RANGE, start, end]           start/end are DateTime values, or the
#                                  symbols :NOW / :x for PAST_REF / FUTURE_REF
#   [:PERIOD, year, month, week, day, hr, min, sec]
#                                  each slot is an Integer, or "x" when the
#                                  amount was written as X
#   [:UNK, str]                    contains an X but matched no other pattern
#
# str    - the expression to parse; a trailing newline is chomped.
# ground - reference date; only its year/month/day are read, and only for
#          "Thhmm" values.
#
# Raises RuntimeError ("UNKNOWN TIME EXPRESSION ...") when nothing matches,
# RuntimeError ("Unexpected error") for an invalid calendar date combined with
# MO/AF/EV, and ArgumentError for other invalid calendar dates.
def parse(str, ground)
  str = str.chomp
  # -- TIME --
  if str.match(/^T[0-9]{4}$/)
    # "Thhmm": the digits start after the leading 'T'
    hr = str[1..2].to_i
    min = str[3..4].to_i
    [:INSTANT, DateTime.civil(ground.year, ground.month, ground.day, hr, min)]
  # -- REAL TIMES --
  #(year)
  elsif str.match(/^[0-9]{2,4}$/)
    # two digits are a year in the 1900s; three digits are a decade
    str = "19" + str if str.length == 2
    if str.length == 3
      decade = str.to_i
      [:RANGE, DateTime.civil(decade * 10), DateTime.civil((decade + 1) * 10)]
    else
      year = str.to_i
      [:RANGE, DateTime.civil(year), DateTime.civil(year + 1)]
    end
  #(year-?month)
  elsif str.match(/^[0-9]{4}-?[0-9]{2}$/)
    year = str[0..3].to_i
    month = str[-2..-1].to_i
    [:RANGE,
     DateTime.civil(year, month),
     DateTime.civil((month == 12 ? year + 1 : year), (month % 12) + 1)]
  #(year-?month-?day-time?)
  elsif (m = str.match(/^([0-9]{4})-?([0-9]{2})-?([0-9]{2})T?(MO|AF|EV|NI)?$/))
    year = m[1].to_i
    month = m[2].to_i
    day = m[3].to_i
    #(process time): start hour, end hour, and whether the end is on the next day
    hr_start, hr_end, incr_day =
      case m[4]
      when "MO" then [8, 13, false]
      when "AF" then [12, 17, false]
      when "EV" then [16, 21, false]
      when "NI" then [20, 0, true]
      else [0, 0, true]
      end
    #(set range)
    begin
      range_start = DateTime.civil(year, month, day, hr_start)
      range_end = DateTime.civil(year, month, day, hr_end)
    rescue ArgumentError
      # the calendar date itself is invalid
      raise "Unexpected error" unless incr_day
      raise
    end
    # date arithmetic rolls over month and year ends, so the start hour is
    # kept even on the last day of a month
    range_end += 1 if incr_day
    [:RANGE, range_start, range_end]
  #(year-?week-we?)
  elsif (m = str.match(/^([0-9]{4})-?W([0-9]{2})-?(WE)?$/))
    year = m[1].to_i
    week = m[2].to_i
    monday = DateTime.commercial(year, week, 1)
    # a weekend starts on Saturday (commercial day 6); both forms end at the
    # following Monday, which also handles the last week of the year
    first_day = m[3] ? DateTime.commercial(year, week, 6) : monday
    [:RANGE, first_day, monday + 7]
  #(year-quarter)
  elsif str.match(/^[0-9]{4}-Q[0-9]$/)
    yr = str[0..3].to_i
    quarter = str[-1, 1].to_i
    qstart = (quarter - 1) * 3 + 1
    qend = ((quarter * 3) % 12) + 1
    [:RANGE,
     DateTime.civil(yr, qstart),
     DateTime.civil(qend < qstart ? yr + 1 : yr, qend)]
  #(year-half)
  elsif str.match(/^[0-9]{4}-H[0-9]$/)
    yr = str[0..3].to_i
    half = str[-1, 1].to_i
    hstart = (half - 1) * 6 + 1
    hend = ((half * 6) % 12) + 1
    [:RANGE,
     DateTime.civil(yr, hstart),
     DateTime.civil(hend < hstart ? yr + 1 : yr, hend)]
  #(year-season): seasons are mapped onto calendar quarters
  elsif str.match(/^[0-9]{4}-?((SP)|(SU)|(FA)|(WI))$/)
    yr = str[0..3]
    season = str[-2..-1]
    case season
    when "WI" then parse("#{yr}-Q1", ground)
    when "SP" then parse("#{yr}-Q2", ground)
    when "SU" then parse("#{yr}-Q3", ground)
    when "FA" then parse("#{yr}-Q4", ground)
    else raise "UNKNOWN SEASON #{season} : #{str}"
    end
  # -- RELATIVE TIMES --
  #(past)
  elsif str == "PAST_REF"
    [:RANGE, :x, :NOW]
  #(future)
  elsif str == "FUTURE_REF"
    [:RANGE, :NOW, :x]
  #(present)
  elsif str == "PRESENT_REF"
    [:INSTANT, :NOW]
  # -- PERIOD --
  #(grounded period)
  elsif str.match(/^P(([0-9]*|X)(D|W|M|Q|Y|E|C|L|H|S|T))+$/)
    # after a 'T' designator, 'M' means minutes instead of months
    time = false
    period = [:PERIOD, 0, 0, 0, 0, 0, 0, 0] #tag|year|month|week|day|hr|min|sec
    str.scan(/([0-9]*|X)([DWMQYECLHST])/).each do |v, d|
      unknown = (v == "X")
      case d
      when "L" then period[1] = unknown ? "x" : v.to_i * 1000
      when "C" then period[1] = unknown ? "x" : v.to_i * 100
      when "E" then period[1] = unknown ? "x" : v.to_i * 10
      when "Y" then period[1] = unknown ? "x" : v.to_i
      when "Q" then period[2] = unknown ? "x" : v.to_i * 3
      when "M" then period[time ? 6 : 2] = unknown ? "x" : v.to_i
      when "W" then period[3] = unknown ? "x" : v.to_i
      when "D" then period[4] = unknown ? "x" : v.to_i
      when "H" then period[5] = unknown ? "x" : v.to_i
      when "S" then period[7] = unknown ? "x" : v.to_i
      when "T" then time = true
      else raise "UNKNOWN INTERVAL #{d}"
      end
    end
    period
  elsif str.include? "X"
    [:UNK, str]
  else
    begin
      #(ruby's parse time)
      [:INSTANT, DateTime.parse(str)]
    rescue StandardError
      raise "UNKNOWN TIME EXPRESSION #{str}"
    end
  end
end
# Converts a serialized tuple such as "(RANGE,a,b)" back into an array.
# The first and last characters (the brackets) are dropped, the rest is split
# on commas, and the leading field becomes a Symbol; other fields stay Strings.
def parseStr(str)
  tag, *fields = str[1...-1].split(',')
  [tag.to_sym, *fields]
end
# Replaces symbolic endpoints in a parsed time with concrete DateTime values.
# Mutates and returns +time+.
#   [:INSTANT, NOW]  -> [:INSTANT, grounding]
#   [:RANGE, NOW, x] -> [:RANGE, grounding, year 10000]
#   [:RANGE, x, NOW] -> [:RANGE, year -10000, grounding]
# NOW / x may be Symbols or Strings. Anything else is left untouched.
def ground(time, grounding)
  if time[0] == :INSTANT && !time[1].is_a?(DateTime)
    time[1] = grounding if time[1].to_sym == :NOW
  elsif time[0] == :RANGE && !time[1].is_a?(DateTime) && !time[2].is_a?(DateTime)
    if time[1].to_sym == :NOW && time[2].to_sym == :x
      time[1] = grounding
      time[2] = DateTime.civil(10000)
    elsif time[1].to_sym == :x && time[2].to_sym == :NOW
      time[1] = DateTime.civil(-10000)
      time[2] = grounding
    end
  end
  time
end

# Decides whether two parsed times (as produced by parse / parseStr) denote
# the same time.
#
# time1, time2 - tagged arrays; symbolic NOW / x endpoints are grounded in
#                place before comparing.
# grounding    - the DateTime substituted for NOW.
#
# Instants are treated as zero-length ranges. Two ranges match when the
# distance between their starts plus the distance between their ends is at
# most one day. Two periods match when year, month, week and day agree.
# Any other combination returns false.
def sameTime(time1, time2, grounding)
  time1 = ground(time1, grounding)
  time2 = ground(time2, grounding)
  time1 = [:RANGE, time1[1], time1[1]] if time1[0] == :INSTANT
  time2 = [:RANGE, time2[1], time2[1]] if time2[0] == :INSTANT
  if time1[0] == :RANGE && time2[0] == :RANGE
    #(range: max day total offset match)
    # DateTime endpoints are used directly; other values (e.g. Strings coming
    # from parseStr) are parsed from their string form
    as_datetime = lambda { |v| v.is_a?(DateTime) ? v : DateTime.parse(v.to_s) }
    start1 = as_datetime.call(time1[1])
    start2 = as_datetime.call(time2[1])
    end1 = as_datetime.call(time1[2])
    end2 = as_datetime.call(time2[2])
    start_gap = (start2 - start1).to_f.abs
    end_gap = (end2 - end1).to_f.abs
    (start_gap + end_gap) <= 1.0
  elsif time1[0] == :PERIOD && time2[0] == :PERIOD
    #(duration: day match) -- hours, minutes and seconds are ignored
    year1, month1, week1, day1 = time1[1..4]
    year2, month2, week2, day2 = time2[1..4]
    year1 == year2 && month1 == month2 && week1 == week2 && day1 == day2
  else
    #(times and durations are not convertable)
    false
  end
end
# Splits text into sentences of tokens using the Java DocumentPreprocessor.
#
# text - a String, or an Array of sentences where each sentence is either a
#        String or an Array of words (words are joined with spaces, sentences
#        with newlines).
#
# The text is written to a temporary file because DocumentPreprocessor is
# constructed here from a file path; the file is removed afterwards.
#
# Returns an Array of sentences, each an Array of token strings.
def tokenize(text)
  #(process)
  if text.is_a? Array
    # keep sentences that are already plain strings instead of dropping them
    text = text.map { |sent| sent.is_a?(Array) ? sent.join(' ') : sent }.join("\n")
  end
  #(write to file)
  # mktemp prints the path followed by a newline, which must not end up in the
  # file name
  tmp = `mktemp`.chomp
  begin
    File.open(tmp, 'w') { |f| f.write(text) }
    #(tokenize)
    sents = []
    iter = DocumentPreprocessor.new(tmp).iterator
    while iter.hasNext
      word_iter = iter.next.iterator
      sent = []
      sent << word_iter.next.word while word_iter.hasNext
      sents << sent
    end
    sents
  ensure
    File.delete(tmp) if File.exist?(tmp)
  end
end
# Tokenizes a single sentence and returns one flat Array of tokens.
# An Array of words is joined with spaces first; the per-sentence grouping
# returned by tokenize is flattened away.
def tokenizeSentence(sent)
  text = sent.is_a?(Array) ? sent.join(' ') : sent
  tokenize(text).flatten
end
# Runs the shared TAGGER over one sentence.
#
# words - an Array of tokens, or a String that is split on whitespace.
#
# Returns an Array holding the tag of each tagged word, in order.
def tagSentence(words)
  tokens = words.is_a?(String) ? words.split(/\s+/) : words
  #(input) copy the tokens into a Java list of Word objects
  java_sentence = ArrayList.new
  tokens.each { |token| java_sentence.add(Word.new(token)) }
  #(tag)
  tags = []
  cursor = TAGGER.tagSentence(java_sentence).iterator
  tags << cursor.next.tag while cursor.hasNext
  tags
end
# Character cursor over an Array of words.
#
# info is a two-element [word index, char offset] array that is advanced in
# place. Returns the character at the position info pointed to BEFORE the
# call, or nil once the words are exhausted. After the call, info already
# points at the following character, so info[0] is the word index of the
# next character, not of the one returned.
def nextChar(array, info)
  w, offset = info
  info[1] += 1
  if info[0] < array.length && info[1] >= array[info[0]].length
    info[0] += 1
    info[1] = 0
  end
  w >= array.length ? nil : array[w][offset].chr
end

# Aligns a retokenized sentence with its original tokenization.
#
# orig    - Array of original tokens.
# retoken - Array of tokens after retokenization (PTB-style bracket escapes
#           such as -LRB- are accepted).
# greedy  - when true (default) later characters may overwrite an entry that
#           was already set; when false the first assignment wins.
#
# Both sequences are walked character by character. A retokenized character
# with no counterpart in orig (e.g. an inserted ".") marks its token as -1.
#
# Returns an Array with exactly one entry per retoken token: the index of the
# corresponding orig token, or -1 for inserted tokens.
# Raises RuntimeError ("not same ...") when the two character streams cannot
# be aligned; both token sequences are printed first.
def offsetMap(orig, retoken, greedy = true)
  #(simple maps) brackets and quotes are normalized to a shared spelling
  orig_norm = {
    '"' => "''",
    '(' => "-LXB-", ')' => "-RXB-",
    '{' => "-LXB-", '}' => "-RXB-",
    '[' => "-LXB-", ']' => "-RXB-",
    #(bloody brits)
    'theatre' => 'theater'
  }
  retok_norm = {
    '-LRB-' => "-LXB-", '-RRB-' => "-RXB-",
    '-LCB-' => "-LXB-", '-RCB-' => "-RXB-",
    '-LSB-' => "-LXB-", '-RSB-' => "-RXB-"
  }
  orig = orig.map { |word| orig_norm.fetch(word, word) }
  retoken = retoken.map { |word| retok_norm.fetch(word, word) }
  #--Util
  infoOrig = [0, 0]
  infoRetoken = [0, 0]
  #--Setup: -42 marks "not assigned yet"
  mapping = Array.new(retoken.length, -42)
  #--Map
  chOrig = nextChar(orig, infoOrig)
  chRetok = nextChar(retoken, infoRetoken)
  while chOrig && chRetok
    #(fixes) treat a retokenized 'z' as the original 's' #bloody brits
    differs = chRetok.to_s.strip != chOrig.to_s.strip
    chRetok = 's' if differs && chRetok == 'z' && chOrig == 's'
    #(align)
    if chRetok.to_s.strip != chOrig.to_s.strip
      # extra character in retoken: mark the token before the cursor as
      # unaligned, then require the next retoken character to match
      # NOTE(review): assumes the extra token is a single character; when the
      # cursor is still inside word 0 the index is -1 (the last entry)
      sav = chRetok
      extra = infoRetoken[0] - 1
      mapping[extra] = -1 if greedy || mapping[extra] == -42
      chRetok = nextChar(retoken, infoRetoken)
      unless chOrig == chRetok
        puts orig.join(' ')
        puts retoken.join(' ')
        raise "not same #{chOrig} #{chRetok} [ #{sav} ]"
      end
    else
      #(save) record the word the NEXT characters belong to; never write past
      # the end of either token list
      target = infoRetoken[0]
      if infoOrig[0] < orig.length && target < retoken.length &&
         (greedy || mapping[target] == -42)
        mapping[target] = infoOrig[0]
      end
    end
    #(iter)
    chOrig = nextChar(orig, infoOrig)
    chRetok = nextChar(retoken, infoRetoken)
  end
  #--Clean Up
  #(end) leftover retoken characters map to the last original token
  while nextChar(retoken, infoRetoken)
    mapping[infoRetoken[0]] = orig.length - 1 if infoRetoken[0] < retoken.length
  end
  #(skipped terms) unassigned tokens inherit the entry of their left neighbour
  mapping.each_index do |i|
    next unless mapping[i] == -42
    mapping[i] = i.zero? ? 0 : mapping[i - 1]
  end
  #--Return
  mapping
end
#orig = 'It\'s a sentence Inc. New sentence for $50'.split /\s+/
#retok = 'It \'s a sentence Inc. . New sentence for $ 50'.split /\s+/
#offsetMap(orig,retok).each_with_index do |o,r|
# puts "#{r} #{o}: #{retok[r]} -> #{orig[o]}"
#end
Jump to Line
Something went wrong with that request. Please try again.