forked from bridgedb/bridgedbjs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
jsonld-matcher.js
291 lines (268 loc) · 9.11 KB
/
jsonld-matcher.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
/* JSON-LD @set and @list intersection
 *
 * Inputs:
 *   - Tabular data, such as data sources (a set or list of objects).
 *   - Search criteria, such as a db name and an identifier, sorted by
 *     matching preference.
 *
 * Given the tabular data, we want to find one row that matches
 * a provided object.
 *
 * First, we pull out the keys from the provided object that match the column headers
 * in the tabular data.
 *
 * Then we try matching any of the values for each of those keys.
 */
var _ = require('lodash');
var highland = require('highland');
var internalContext = require('./context.json');
var RxNode = require('rx-node');
var Utils = require('./utils.js');
var JsonldMatcher = function(jsonldRx) {
'use strict';
/**
 * Normalize a document with jsonldRx and expose the result as a Highland stream.
 *
 * @param {Object} doc value passed to jsonldRx.defaultNormalize
 * @return {Stream} the Highland stream that the resulting observable is written to
 */
var createJsonldNormalizerStream = function(doc) {
  var sink = highland();
  // writeToStream is called without an encoding argument.
  RxNode.writeToStream(jsonldRx.defaultNormalize(doc), sink);
  return sink;
};
// Key prefixes for the derived properties used for comparison.
// Both specific prefixes start with the shared base, which is what
// _removeNormalizedProperties checks for when stripping them again.
var normalizationNSBase = 'jsonldMatcher';
var jsonldNormalizationNS = normalizationNSBase.concat('JsonldNormalized');
var textNormalizationNS = normalizationNSBase.concat('TextNormalized');
/**
 * Copy an object, leaving out every property whose key starts with the
 * normalization namespace base.
 *
 * @param {Object} args object that may carry normalized properties
 * @return {Object} new object holding only the remaining properties
 */
function _removeNormalizedProperties(args) {
  var prefixLength = normalizationNSBase.length;

  var copyUnlessNormalized = function(cleaned, propertyValue, propertyName) {
    var isNormalized =
      propertyName.slice(0, prefixLength) === normalizationNSBase;
    if (!isNormalized) {
      cleaned[propertyName] = propertyValue;
    }
    return cleaned;
  };

  return _.reduce(args, copyUnlessNormalized, {});
}
/**
 * Add JSON-LD-normalized and text-normalized counterparts of the selected
 * properties of `input` directly onto `input`.
 *
 * @param {Object} input object whose properties are normalized; it is mutated
 * @param {String[]} [selectedKeys] names of the properties to normalize; when
 *                                  falsy, every property is normalized
 * @return {Stream} Highland stream emitting `input` with the added properties
 */
function _addNormalizedProperties(input, selectedKeys) {
  function isSelected(pair) {
    if (!selectedKeys) {
      return true;
    }
    return selectedKeys.indexOf(pair[0]) > -1;
  }

  // Emits the pair itself followed by its JSON-LD-normalized and
  // text-normalized counterparts.
  function expandPair(pair) {
    return _jsonldNormalizePair(pair).flatMap(function(jsonldPair) {
      return _textNormalizePair(jsonldPair).flatMap(function(textPair) {
        return [pair, jsonldPair, textPair];
      });
    });
  }

  function assignPair(target, pair) {
    target[pair[0]] = pair[1];
    return target;
  }

  return highland.pairs(input)
    .filter(isSelected)
    .flatMap(expandPair)
    // `input` is the accumulator, so the new properties land on it.
    .reduce(input, assignPair);
}
/**
 * Produce the data set with normalized properties added to every item.
 *
 * The work is handed to Utils._runOnceGlobal under `name`
 * (presumably so it runs only once per name; confirm in utils.js).
 *
 * @param {Stream} dataStream stream of data items
 * @param {String} name key passed to Utils._runOnceGlobal
 * @param {String[]} [selectedKeys] property names to normalize on each item
 * @return {Stream} result of calling collect() on what _runOnceGlobal returns
 */
function getFormattedForComparison(dataStream, name, selectedKeys) {
  var normalizeEveryItem = function() {
    return dataStream.flatMap(function(item) {
      return _addNormalizedProperties(item, selectedKeys);
    });
  };

  var formatted = Utils._runOnceGlobal(name, normalizeEveryItem);
  return formatted.collect();
}
/**
 * Produce the JSON-LD-normalized counterpart of a key/value pair.
 *
 * The pair is placed in a document together with the internal context and
 * run through createJsonldNormalizerStream.
 *
 * @param {Array} pair [key, value]
 * @return {Stream} stream emitting [jsonldNormalizationNS + key, elements],
 *                  where elements is the normalized output split on ' .\n'
 */
function _jsonldNormalizePair(pair) {
  var originalKey = pair[0];
  var normalizedKey = jsonldNormalizationNS + originalKey;

  var doc = {'@context': internalContext};
  doc[originalKey] = pair[1];

  return createJsonldNormalizerStream(doc).map(function(normalized) {
    // The split leaves a trailing '' (empty string) after the final
    // delimiter; slice drops that last element.
    var elements = normalized.split(' .\n').slice(0, -1);
    return [normalizedKey, elements];
  });
}
/**
 * @private
 *
 * Normalize text for comparison purposes: stringify the input, strip every
 * character that is not an ASCII letter or digit, and upper-case the rest.
 *
 * Strings are used as-is, plain objects are serialized with JSON.stringify,
 * and numbers, booleans, dates, regular expressions, null and undefined are
 * converted with String(). Any other kind of value is also converted with
 * String(), after logging a warning.
 *
 * @param {undefined|null|string|number|object|boolean|date} inputText
 * @return {string} normalizedText; an empty string when the input contains no
 *                  letters or digits (e.g. '-..-')
 */
function _normalizeText(inputText) {
  var stringifiedInput;
  if (_.isString(inputText)) {
    stringifiedInput = inputText;
  } else if (_.isNumber(inputText) || _.isRegExp(inputText) ||
      _.isDate(inputText) || _.isBoolean(inputText) ||
      _.isUndefined(inputText) || _.isNull(inputText)) {
    // String() gives the same text as toString() for these types and also
    // covers null and undefined, which have no toString method.
    stringifiedInput = String(inputText);
  } else if (_.isPlainObject(inputText)) {
    stringifiedInput = JSON.stringify(inputText);
  } else {
    // JSON.stringify throws on BigInt values and circular structures;
    // building the log message must not abort the normalization itself.
    var description;
    try {
      description = JSON.stringify(inputText);
    } catch (err) {
      description = String(inputText);
    }
    console.warn('Cannot normalize provided value "' +
      description + '".');
    console.warn('Using toString on input.');
    stringifiedInput = String(inputText);
  }

  // not using \w because we don't want to include the underscore
  var nonAlphanumericPattern = /[^A-Za-z0-9]/g;
  // replace always returns a string (possibly empty), so no null check is needed.
  return stringifiedInput.replace(nonAlphanumericPattern, '').toUpperCase();
}
/**
 * Produce the text-normalized counterpart of a key/value pair.
 *
 * A pair whose key does not contain the JSON-LD normalization prefix is
 * JSON-LD-normalized first; a pair that already carries it is used directly.
 *
 * @param {Array} pair [key, value]
 * @return {Stream} stream emitting [textNormalizationNS + original key, value],
 *                  where value is run through _normalizeText (element by
 *                  element when it is an array)
 */
function _textNormalizePair(pair) {
  var isJsonldNormalized = pair[0].indexOf(jsonldNormalizationNS) !== -1;
  var jsonldNormalizedPairs = isJsonldNormalized ?
    highland([pair]) :
    _jsonldNormalizePair(pair);

  return jsonldNormalizedPairs.map(function(jsonldNormalizedPair) {
    var originalKey = jsonldNormalizedPair[0].replace(jsonldNormalizationNS, '');
    var rawValue = jsonldNormalizedPair[1];
    var normalizedValue = _.isArray(rawValue) ?
      rawValue.map(_normalizeText) :
      _normalizeText(rawValue);
    return [textNormalizationNS + originalKey, normalizedValue];
  });
}
/**
 * Find the entries of a data set that match the provided criteria, trying
 * progressively looser comparisons.
 *
 * Sources of candidate matches are lined up in order of preference: the
 * selected pairs as provided, the JSON-LD-normalized pairs, the
 * text-normalized pairs, then any `alternateFilters`, and finally an Error
 * saying nothing matched. The pipeline is intended to pass on the results of
 * the first source that yields anything.
 *
 * Neither `args` nor `selectedKeys` is modified; copies are used internally.
 *
 * @param {Object} args search criteria. When it has a truthy '@id', that
 *                      value is also searched for under 'owl:sameAs'
 *                      (which may be a single value or an array).
 * @param {Stream} dataStream data to search, handed to getFormattedForComparison
 * @param {String} name name of the data set, handed to
 *                      getFormattedForComparison and used in the error message
 * @param {String[]} [selectedKeys] keys of `args` to match on; when falsy,
 *                                  every key of `args` is used
 * @param {Function[]} [alternateFilters] predicates applied to the data set
 *                                        when the built-in attempts find nothing
 * @return {Stream} Highland stream of matching entries with the normalized
 *                  properties removed
 */
function tieredFind(args, dataStream, name, selectedKeys, alternateFilters) {
  // Work on copies so repeated calls with the same objects behave the same
  // and the caller's data is left untouched.
  var criteria = _.assign({}, args);
  var keysToMatch = selectedKeys ? selectedKeys.slice() : selectedKeys;

  // if an @id is provided, we will use it. We will search for a matching
  // @id and for a match in owl:sameAs.
  if (!!criteria['@id']) {
    // concat accepts both a single value and an array, and returns a new array.
    var sameAs = [].concat(criteria['owl:sameAs'] || []);
    sameAs.push(criteria['@id']);
    criteria['owl:sameAs'] = sameAs;

    if (keysToMatch) {
      if (keysToMatch.indexOf('@id') === -1) {
        keysToMatch.push('@id');
      }
      if (keysToMatch.indexOf('owl:sameAs') === -1) {
        keysToMatch.push('owl:sameAs');
      }
    }
  }

  var filters = alternateFilters || [];

  // Same rule as _addNormalizedProperties: no key list means every key.
  var isSelected = function(pair) {
    return !keysToMatch ? true : keysToMatch.indexOf(pair[0]) > -1;
  };

  var getPairStream = function() {
    return highland.pairs(criteria).filter(isSelected);
  };

  var isEmpty = true;
  return highland(getFormattedForComparison(
      dataStream, name, keysToMatch))
    .flatMap(function(dataSet) {
      // First we try the built-in, preferred filters, one stream per
      // attempt index (see pairByAttemptIndex).
      var preferredAttempts = [0, 1, 2].map(function(attemptIndex) {
        return getPairStream().flatMap(function(pair) {
          return tieredFindAttempt(pair, dataSet, attemptIndex);
        });
      });

      var notFoundError = new Error('Could not find a match for ' + name +
        ' for the provided args "' + JSON.stringify(criteria) + '"');

      return highland(preferredAttempts)
        // If the preferred filters don't find anything, we try
        // any provided alternate filters.
        .concat(
          filters.map(function(alternateFilter) {
            return highland(dataSet).filter(alternateFilter);
          })
        )
        // If we still don't find anything, we return an error.
        .concat(highland([notFoundError]))
        // TODO why is this not throwing an error when the codeblock
        // above returns one?
        .errors(function(err, push) {
          if (isEmpty) {
            return push(err);
          }
        })
        // TODO The chunk of code below seems like a kludge.
        // 1) It is trying to detect errors, which should be
        //    taken care of above.
        // 2) It is returning the first non-empty stream, but
        //    to do so requires using this "isEmpty" variable,
        //    which seems wrong.
        .flatMap(function(inputStream) {
          if (highland.isStream(inputStream) && isEmpty) {
            return inputStream.map(function(data) {
              isEmpty = false;
              return data;
            });
          } else if (!isEmpty) {
            return highland([]);
          } else {
            // Not a stream and nothing found so far: this is the Error
            // appended above.
            throw inputStream;
          }
        });
    })
    .map(_removeNormalizedProperties);
}
// How the desired pair is prepared for each attempt of tieredFindAttempt.
// The position in this array is the attempt index; every entry takes a
// [key, value] pair and returns a stream of pairs to compare against.
var pairByAttemptIndex = [
  // first attempt. the pair is compared exactly as it was provided.
  function usePairAsProvided(pair) {
    return highland([pair]);
  },
  // second attempt. if previous failed, we normalize it with a JSON-LD context.
  _jsonldNormalizePair,
  // third attempt. if previous failed, we get a little looser about the match
  // here on this attempt.
  function useTextNormalizedPair(pair) {
    var singlePairStream = highland([pair]);
    return singlePairStream.flatMap(_textNormalizePair);
  }
];
/**
 * Run one matching attempt for one desired pair against the candidates.
 *
 * @param {Array} pair desired [key, value]
 * @param {Object[]} candidates entries to search
 * @param {Number} attemptIndex index into pairByAttemptIndex, selecting how the
 *                              desired pair is prepared before comparing
 * @return {Stream} stream of the candidates whose value under the prepared key
 *                  is strictly equal to the prepared value, or shares at least
 *                  one element with it according to _.intersection
 */
function tieredFindAttempt(pair, candidates, attemptIndex) {
  return pairByAttemptIndex[attemptIndex](pair).flatMap(function(desiredPair) {
    var desiredKey = desiredPair[0];
    var desiredValue = desiredPair[1];

    function matchesDesiredValue(candidate) {
      var candidateValue = candidate[desiredKey];
      if (candidateValue === desiredValue) {
        return true;
      }
      var sharedValues = _.intersection(candidateValue, desiredValue);
      return !_.isEmpty(sharedValues);
    }

    return highland(candidates).filter(matchesDesiredValue);
  });
}
return {
_addNormalizedProperties:_addNormalizedProperties,
tieredFind:tieredFind,
_jsonldNormalizePair:_jsonldNormalizePair,
_normalizeText:_normalizeText,
_removeNormalizedProperties:_removeNormalizedProperties,
_textNormalizePair:_textNormalizePair,
};
};
// CommonJS export: requiring this module yields the JsonldMatcher factory.
exports = module.exports = JsonldMatcher;