-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.rb
291 lines (258 loc) · 12.1 KB
/
parser.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
require_relative "rows/blank"
require_relative "rows/regular"
require_relative "rows/compact_planned"
require_relative "rows/custom_config"
require_relative "rows/comment"
module Reading
module Parsing
#
# Parses a string containing a row of a CSV reading log, into a hash
# mirroring the structure of the row. This hash is an intermediate form and
# not the final item data. It's the raw material for Parsing::Transformer to
# generate the final item data.
#
# Below is an example intermediate hash parsed from this row, which has a Rating
# column, then a Head column containing an author, title, series, and extra info:
#
# 3|📕Thomas More - Utopia -- trans. Robert Adams -- ed. George Logan -- in Cambridge History of Political Thought
#
# {
# rating: { number: "3" },
# head: [{
# author: "Thomas More",
# title: "Utopia",
# series_names: ["Cambridge History of Political Thought"],
# series_volumes: [nil],
# extra_info: ["trans. Robert Adams", "ed. George Logan"],
# format: :print,
# }]
# }
#
# The hash's top-level keys are column names. The nested keys come from
# regex capture group names in each column (for this example, see ::regexes
# in rating.rb and head.rb in parsing/rows/regular_columns).
#
# All the rest is just details of how the parts of a column are joined:
#
# - The :head value is an array because Head.split_by_format? is
# true (because a Head column can potentially contain multiple items).
# That's also where { format: :print } comes from.
#
# - The :series_names and :series_volumes values are arrays because these
# keys are in Head.flatten_into_arrays, which causes the column's segments
# (separated by " -- ") to be merged into one hash.
#
class Parser
using Util::HashArrayDeepFetch
# Turns one row of a CSV reading log into its intermediate hash, in which
# each key is a column name and each value is that column's parsed contents.
# @param string [String] a string containing a row of a CSV reading log.
# @return [Hash] the parsed columns; empty when the row has no columns, or
#   when the :skip_compact_planned config is set and the row contains a
#   compact planned Head column.
def parse_row_to_intermediate_hash(string)
  columns = extract_columns(string)

  skip_row = Config.hash.fetch(:skip_compact_planned) &&
    columns.key?(Rows::CompactPlanned::Head)
  return {} if skip_row

  # #parse_column returns a [name, contents] pair for each column.
  columns.to_h { |column_class, column_string|
    parse_column(column_class, column_string)
  }
end
private
# Cuts the row string into column strings and pairs each one with the column
# class that holds the information needed to parse it.
# @param string [String] a string containing a row of a CSV reading log.
# @return [Hash{Class => String}] a hash whose keys are classes inheriting
#   Parsing::Rows::Column; column classes with no corresponding column string
#   in the row are left out.
# @raise [TooManyColumnsError] if the row has more column strings than
#   enabled column classes (only checked when at least one class is enabled).
def extract_columns(string)
  # Work on a copy so that the caller's string is not re-encoded in place.
  row_string = string.dup.force_encoding(Encoding::UTF_8)
  cells = row_string.split(Config.hash.fetch(:column_separator))

  candidate_row_types = [Rows::Blank, Rows::Regular, Rows::CompactPlanned, Rows::CustomConfig, Rows::Comment]
  row_type = candidate_row_types.find { |candidate| candidate.match?(row_string) }

  # This must happen before the enabled columns are read below (presumably
  # because merging a custom config row can change them).
  row_type.merge_custom_config!(row_string) if row_type == Rows::CustomConfig

  enabled_classes = row_type.column_classes.select { |column_class|
    Config.hash.fetch(:enabled_columns).include?(column_class.to_sym)
  }

  if !enabled_classes.empty? && cells.size > enabled_classes.size
    raise TooManyColumnsError, "Too many columns"
  end

  enabled_classes
    .zip(cells)
    .reject { |_column_class, cell| cell.nil? }
    .to_h
end
# Parses a column into an array of two elements (a key for the column name
# and a value of its contents).
#
# Multiple format emojis are possible in some columns:
#   - Head column, for multiple items.
#   - Sources column, for multiple variants of an item.
#   - Compact planned head column, for multiple items.
# For such columns the contents are always an Array, whether or not this
# particular row contains any format emoji. For every other column the
# contents are whatever #parse_segments returns.
# @param column_class [Class] a class inheriting Parsing::Rows::Column.
# @param column_string [String] a string containing a column from a row.
# @return [Array(Symbol, Hash), Array(Symbol, Array)]
def parse_column(column_class, column_string)
  # Simplest case: the column is never split by format, so return the column
  # name and the parsed segment(s) as they are.
  unless column_class.split_by_format?
    contents = parse_segments(column_class, column_string)
    return [column_class.to_sym, contents]
  end

  formats_regex = Config.hash.deep_fetch(:regex, :formats)

  # Also simple: the column *can* be split by format but this row has no
  # format emoji. A non-empty result is wrapped in an array (and an empty one
  # becomes an empty array) so that the extra level of nesting can be
  # consistently expected for columns that can be split by format.
  unless column_string.match?(formats_regex)
    contents = parse_segments(column_class, column_string)
    return [column_class.to_sym, [contents.presence].compact]
  end

  # The complex case: the column can be and is split by format. Each chunk
  # is a format emoji plus the string after it, except possibly the first.
  chunks = column_string.split(Config.hash.deep_fetch(:regex, :formats_split))

  # A string before the first format (e.g. "DNF" in the Head column) is
  # parsed on its own with the before-formats regexes.
  prefix =
    if chunks.first.match?(formats_regex)
      nil
    else
      parse_segment(column_class, chunks.shift, before_formats: true)
    end

  items = chunks.map { |chunk|
    emoji = chunk[formats_regex]
    remainder = chunk.sub(emoji, "")
    format = Config.hash.fetch(:formats).key(emoji)

    parse_segments(column_class, remainder).merge(format: format)
  }

  # Fold the before-formats data into every item. Values of conflicting keys
  # are combined so that in a compact planned Head column, sources from
  # before the formats are not ignored.
  if prefix
    items.each { |item|
      item.merge!(prefix) { |_key, item_value, prefix_value|
        (prefix_value + item_value).uniq
      }
    }
  end

  [column_class.to_sym, items]
end
# Parses a string of segments, e.g. "Utopia -- trans. Robert Adams -- ed. George Logan"
# @param column_class [Class] a class inheriting Parsing::Rows::Column.
# @param string [String] a string containing segments, which is either an
#   entire column or (for columns that are split by format emoji) a string
#   following a format emoji.
# @return [Array<Hash>, Array<Array<Hash>>, Hash] a single hash if the string
#   is blank, if the column can't be split by segment, or if the segments are
#   flattened into one hash; otherwise an array of parsed segments (hashes),
#   nested one level deeper (one inner array per segment group) if the column
#   can have segment groups.
def parse_segments(column_class, string)
  return {} if string.blank?

  # If the column can't be split by segment, parse as a single segment.
  return parse_segment(column_class, string) unless column_class.split_by_segment?

  segments =
    if column_class.split_by_segment_group?
      # Add an extra level of nesting if the column can have segment groups,
      # as in "2021/1/28..2/1 x4 -- ..2/3 x5 ---- 11/1 -- 11/2"
      string
        .split(column_class.segment_group_separator)
        .map { |segment_group| parse_each_segment(column_class, segment_group) }
    else
      parse_each_segment(column_class, string)
    end

  return segments unless column_class.flatten_into_arrays.any?

  flattened = segments.reduce { |merged, segment|
    merged.merge!(segment) { |_key, old_v, new_v|
      # old_v is already an array by this point, since its key should be
      # in Column.flatten_into_arrays
      old_v + new_v
    }
  }

  # A string made up only of separators splits into no segments at all, in
  # which case reduce returns nil. Return an empty hash instead, the same as
  # for a blank string, so that callers always get a Hash here.
  flattened || {}
end

# Splits a string by the column's segment separator and parses each segment,
# passing along the segment's position within the string.
# @param column_class [Class] a class inheriting Parsing::Rows::Column.
# @param string [String] a string containing one or more segments.
# @return [Array<Hash>] the parsed segments.
def parse_each_segment(column_class, string)
  string
    .split(column_class.segment_separator)
    .map.with_index { |segment, i| parse_segment(column_class, segment, i) }
end
# Parses a segment with the first of the column class's regular expressions
# that matches it.
# @param column_class [Class] a class inheriting Parsing::Rows::Column.
# @param segment [String] a segment, e.g. "Bram Stoker - Dracula".
# @param segment_index [Integer] the position of the segment when it's in
#   part of a series of segments; this can change which regular expressions
#   are applicable to it. Not used when before_formats is true.
# @param before_formats [Boolean] whether to use the before-formats regexes.
# @return [Hash{Symbol => Object}] the parsed segment, whose values are Strings
#   unless changed via column_class.tweaks or column_class.flatten_into_arrays.
#   Example: { author: "Bram Stoker", title: "Dracula"}
# @raise [ParsingError] if none of the regular expressions match the segment.
def parse_segment(column_class, segment, segment_index = 0, before_formats: false)
  candidate_regexes =
    before_formats ? column_class.regexes_before_formats : column_class.regexes(segment_index)

  # Stop at the first regex that yields a result; the result is kept in
  # `captures` because `find` itself would return the regex.
  captures = nil
  candidate_regexes.find { |regex|
    captures = parse_segment_with_regex(segment, regex)
  }

  if captures.nil?
    raise ParsingError, "Could not parse \"#{segment}\" in the #{column_class.column_name} column"
  end

  tweak_and_arrayify_parsed_segment(captures, column_class)
end
# Matches a segment against one regular expression and turns the named
# captures into a hash.
# @param segment [String] a segment, e.g. "Bram Stoker - Dracula".
# @param regex [Regexp] the regular expression with which to parse the segment.
# @return [Hash{Symbol => String, nil}, nil] e.g. { author: "Bram Stoker", title: "Dracula"};
#   capture groups that didn't participate in the match are left out, captures
#   that are empty after stripping become nil, and the result is nil if the
#   regex doesn't match at all.
def parse_segment_with_regex(segment, regex)
  # Ignored characters are removed and surrounding whitespace is trimmed
  # before matching.
  cleaned = segment.delete(Config.hash.fetch(:ignored_characters)).strip

  match = cleaned.match(regex)
  return nil if match.nil?

  match
    .named_captures
    .compact
    .to_h { |name, captured| [name.to_sym, captured.strip.presence] }
end
# Applies column_class.tweaks to the values of the parsed segment, then wraps
# in an array each value whose key is in column_class.flatten_into_arrays.
# The given hash is modified in place and returned.
# @param parsed_segment [Hash] e.g. { author: "Bram Stoker", title: "Dracula"}
# @param column_class [Class] a class inheriting Parsing::Rows::Column.
# @return [Hash{Symbol => Object}] the same hash that was passed in.
def tweak_and_arrayify_parsed_segment(parsed_segment, column_class)
  # Only keys actually present in the segment are tweaked.
  column_class.tweaks.each do |key, tweak|
    next unless parsed_segment.key?(key)

    parsed_segment[key] = tweak.call(parsed_segment[key])
  end

  column_class.flatten_into_arrays.each do |key|
    next unless parsed_segment.key?(key)

    value = parsed_segment[key]
    # Array(value) is deliberately avoided: it would turn nil into an empty
    # array, but the nil must be kept so that series name and volume arrays
    # line up with an equal number of elements (the volume may be nil).
    parsed_segment[key] = [value] unless value.is_a?(Array)
  end

  parsed_segment
end
end
end
end