-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentence_chunk.rb
219 lines (180 loc) · 7.18 KB
/
sentence_chunk.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# frozen_string_literal: true
# Much of this code is very similar to that in WordChunk. It may be worth
# refactoring into a Chunk parent class eventually.
#
# For a SentenceChunk, a 'chunk' is an ordered collection of words, spaces and
# punctuation (all called tokens), held as an array of token IDs in
# +token_ids+. +size+ is the number of tokens in the chunk and +count+ is how
# many times that chunk was found in the owning text sample.
class SentenceChunk < ApplicationRecord # rubocop:disable Metrics/ClassLength
# each chunk row is tied to the text sample it was counted from
belongs_to :text_sample
validates :token_ids, presence: true # may normalise this later
validates :size, presence: true
validates :count, presence: true
# serialize(:token_ids, Array)
# Chunk sizes (in tokens) that analysis builds, and that generation walks
# through when asked for 'all' sizes.
CHUNK_SIZE_RANGE = (2..8).freeze
# Convert a text sample into token IDs and store chunk counts for every chunk
# size in CHUNK_SIZE_RANGE that the sample is long enough to contain.
#
# @param text_sample [#text, #id] the sample to analyse
def self.analyse(text_sample)
  # TODO: it might be better to get the upper limit from a setting, or
  # according to how many unique chunks we got for the previous chunk_size
  token_ids = Token.id_ise(text_sample.text, :sentence)
  available = token_ids.size
  CHUNK_SIZE_RANGE.each do |chunk_size|
    # a sample with fewer tokens than chunk_size cannot hold such a chunk
    count_chunks(token_ids, text_sample.id, chunk_size) if available >= chunk_size
  end
end
# Count every chunk of one size in a token ID list and persist the counts.
#
# @param token_ids [Array] token IDs for the whole text sample, in order
# @param text_sample_id [Integer] ID of the text sample being analysed
# @param chunk_size [Integer] number of tokens per chunk
def self.count_chunks(token_ids, text_sample_id, chunk_size)
  save_chunks(
    build_chunks_hash(token_ids, chunk_size),
    text_sample_id,
    chunk_size,
    :insert_all
  )
end
# Tally how often each run of +chunk_size+ consecutive token IDs occurs.
#
# @param token_ids [Array] token IDs in text order
# @param chunk_size [Integer] number of tokens per chunk
# @return [Hash] chunk (array of token IDs) => occurrence count; the hash
#   answers 0 for chunks that were never seen
def self.build_chunks_hash(token_ids, chunk_size)
  last_start = token_ids.size - chunk_size
  # slide a window of chunk_size tokens along the list, one token at a time
  windows = (0..last_start).map { |start| token_ids.slice(start, chunk_size) }
  # arrays are looked up by content when used as hash keys, e.g.
  #   h = { [1] => 2 }
  #   h.fetch([1]) # => 2
  windows.each_with_object(Hash.new(0)) { |window, tally| tally[window] += 1 }
end
# Persist counted chunks using the requested strategy.
#
# @param chunks_hash [Hash] chunk (array of token IDs) => occurrence count
# @param text_sample_id [Integer] ID of the text sample the chunks belong to
# @param chunk_size [Integer] number of tokens per chunk
# @param save_strategy [Symbol] only :insert_all is supported
# @raise [RuntimeError] for any other strategy
def self.save_chunks(chunks_hash, text_sample_id, chunk_size, save_strategy = :insert_all)
  # when :create! (disabled) -- VEEERY slow and uses lots of DB rows:
  #   save_chunks_by_create(chunks_hash, text_sample_id, chunk_size)
  raise "Unknown save_strategy: #{save_strategy}" unless save_strategy == :insert_all

  # uses lots of database rows
  save_chunks_by_insert_all(chunks_hash, text_sample_id, chunk_size)
end
# Bulk-insert one row per distinct chunk with a single insert_all! call.
#
# @param chunks_hash [Hash] chunk (array of token IDs) => occurrence count
# @param text_sample_id [Integer] ID of the text sample the chunks belong to
# @param chunk_size [Integer] number of tokens per chunk, stored as +size+
# @return [nil] when there is nothing to insert; otherwise whatever
#   insert_all! returns
def self.save_chunks_by_insert_all(chunks_hash, text_sample_id, chunk_size)
  # nothing to write; some Rails versions raise ArgumentError when insert_all!
  # is handed an empty list, so skip the call altogether
  return if chunks_hash.empty?

  # insert_all! does not fill in timestamps itself, so every row gets the same
  # explicit value
  timestamp = Time.current
  rows = chunks_hash.map do |token_ids, count|
    {
      token_ids: token_ids, size: chunk_size, count: count,
      text_sample_id: text_sample_id,
      created_at: timestamp, updated_at: timestamp
    }
  end
  SentenceChunk.insert_all!(rows)
end
# Throw away the chunks stored for a text sample and analyse it again.
#
# @param text_sample [#text, #id] the sample to re-analyse
def self.reanalyse(text_sample)
  # clear out anything from previous analysis
  stale_chunks = SentenceChunk.where('text_sample_id = ?', text_sample.id)
  stale_chunks.delete_all
  SentenceChunk.analyse(text_sample)
end
# Entry point for generating text using the sentence chunk strategy.
#
# @param [Hash] params parameters to generate with
# @option params [Integer, String] :chunk_size chunk size to use for generation
# @option params [Integer, String] :output_size number of tokens to generate
# @option params [Integer] :text_sample_id TextSample to use as the model
# @return [Hash] +{ message: String }+ when no chunks are stored for the text
#   sample, otherwise +{ output: Array }+ holding one generate_text result
#   per chunk size used
def self.generate(params = {}) # rubocop:disable Metrics/MethodLength
  unless chunks_built_for? params[:text_sample_id]
    return { message: 'Sentence chunks have not been built for this text sample' }
  end

  chunk_size, token_size, text_sample_id = extract_generate_params(params)
  sample_text = TextSample.find(text_sample_id).text
  sample_length = Token.split_into_tokens(sample_text).length
  sizes =
    if chunk_size == 'all'
      # handle edge case where text sample has fewer tokens than the chunk size
      CHUNK_SIZE_RANGE.select { |candidate| candidate <= sample_length }
    else
      [chunk_size]
    end
  { output: sizes.map { |size| generate_text(size, token_size, text_sample_id) } }
end
# Helper method that pulls individual parameters out of params or sets
# reasonable defaults.
#
# @param [Hash] params parameters to generate with
# @option params [Integer, String] :chunk_size chunk size, or 'all'; blank,
#   zero or non-numeric values fall back to Setting.chunk_size
# @option params [Integer, String] :output_size number of tokens to generate;
#   blank, zero or non-numeric values fall back to Setting.token_size
# @option params [Integer] :text_sample_id passed through untouched
# @return [Array] +[chunk_size, token_size, text_sample_id]+
def self.extract_generate_params(params = {})
  requested_chunk_size = params[:chunk_size]
  chunk_size =
    if requested_chunk_size.to_s == 'all'
      # 'all'.to_i is 0, so without this branch an explicit request for every
      # chunk size would be silently swapped for the default
      'all'
    elsif requested_chunk_size.to_i.zero?
      Setting.chunk_size
    else
      requested_chunk_size.to_i
    end

  requested_token_size = params[:output_size].to_i
  token_size = requested_token_size.zero? ? Setting.token_size : requested_token_size

  [chunk_size, token_size, params[:text_sample_id]]
end
# Whether at least one sentence chunk is stored for the given text sample.
#
# @param text_sample_id [Integer] ID of the text sample to check
# @return [Boolean]
def self.chunks_built_for?(text_sample_id)
  # exists? asks the database for presence only, instead of loading and
  # instantiating a whole record just to test it for nil
  SentenceChunk.exists?(text_sample_id: text_sample_id)
end
# Actually generate the text for the given chunk_size and text sample.
#
# Starts from a randomly chosen chunk and keeps appending the last token of
# each following chunk until +token_size+ tokens are collected or no following
# chunk can be found.
#
# @param chunk_size [Integer] chunk size to use for generation
# @param token_size [Integer] number of tokens to generate
# @param text_sample_id [Integer] TextSample to use as the model
#
# @return [Hash] +{ text:, chunk_size: }+; +text+ is an empty string when the
#   text sample has no chunks of the requested size
def self.generate_text(chunk_size, token_size, text_sample_id)
  chunk = choose_starting_chunk(text_sample_id, chunk_size)
  # no chunk of this size is stored for the sample, so there is nothing to
  # start from; report empty text rather than failing on nil
  return { text: '', chunk_size: chunk_size } unless chunk

  # copy, so appending below never changes the starting chunk's own array
  output_token_ids = chunk.token_ids.dup
  while output_token_ids.size < token_size
    chunk = chunk.choose_next_chunk
    # if we couldn't get a next chunk, then just leave it there
    break unless chunk

    # consecutive chunks overlap in all but their last token, so only that
    # one is new
    output_token_ids << chunk.token_ids.last
  end
  output = Token.replace_token_ids_with_tokens(output_token_ids).join
  { text: output, chunk_size: chunk_size }
end
# Pick, uniformly at random, one stored chunk of the given size for a text
# sample.
#
# @param text_sample_id [Integer] ID of the text sample to draw from
# @param chunk_size [Integer] required chunk size
# @return [SentenceChunk, nil] nil when no such chunk is stored
def self.choose_starting_chunk(text_sample_id, chunk_size)
  pool = SentenceChunk.where({ text_sample_id: text_sample_id, size: chunk_size }).limit(nil)
  random_index = (rand * pool.size).to_i
  pool[random_index]
end
# Choose the next chunk after this one: a chunk of the same size, from the
# same text sample, whose leading tokens equal this chunk's tokens minus the
# first. Candidates are weighted by how often they occurred.
#
# @return [SentenceChunk, nil] nil when no chunk continues this one
def choose_next_chunk
  # grab all but the first token in the chunk
  overlap = token_ids[1..]
  # the foreign key is read directly, so the text sample is not loaded just to
  # obtain its id on every generation step
  candidates = SentenceChunk.where(text_sample_id: text_sample_id, size: size)
  unless overlap.empty?
    # every overlapping token must match, position by position.
    # Note: PostgreSQL arrays are 1-indexed and not 0-indexed
    overlap_sql = overlap.each_index.map { |i| "token_ids[#{i + 1}] = ?" }.join(' AND ')
    # values are bound through placeholders rather than interpolated into SQL
    candidates = candidates.where(overlap_sql, *overlap)
  end
  SentenceChunk.choose_chunk_from_candidates(candidates.limit(nil))
end
# Pick one candidate at random, weighted by each candidate's +count+.
#
# Selection walks the candidates while subtracting their counts from a random
# offset, which chooses the same element as indexing into an array holding
# every candidate +count+ times, without building that array.
#
# @param candidates [Enumerable] objects responding to +count+
# @return [Object, nil] the chosen candidate; nil when there are no
#   candidates or every count is zero
def self.choose_chunk_from_candidates(candidates)
  chunks = candidates.to_a
  total = chunks.sum(&:count)
  # random position in 0...total (0 when total is 0, so nothing is found)
  remaining = (rand * total).to_i
  chunks.find { |chunk| (remaining -= chunk.count).negative? }
end
# Expand candidates into a flat array in which each chunk appears +count+
# times, keeping the candidates' order.
#
# @param candidates [Enumerable] objects responding to +count+
# @return [Array] the candidates, each repeated +count+ times
def self.build_counts_array(candidates)
  candidates.each_with_object([]) do |chunk, weighted|
    chunk.count.times { weighted << chunk }
  end
end
# Helper method for converting this chunk's array of token_ids back to an
# array of readable text.
#
# @return whatever Token.replace_token_ids_with_tokens gives for +token_ids+
#   (presumably an Array of token strings -- confirm against Token)
def to_tokens
Token.replace_token_ids_with_tokens(token_ids)
end
end