-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
438 lines (365 loc) · 14.7 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
"""Kafka utils modules"""
import json
import re
from typing import Dict, Any, Iterator, Optional, Sequence
from urllib.parse import urlparse
from xml.sax.handler import ContentHandler # nosec B406
from xml.sax.saxutils import escape # nosec B406
from cmem.cmempy.config import get_cmem_base_uri
from cmem.cmempy.workspace.projects.resources.resource import get_resource_response
from cmem.cmempy.workspace.tasks import get_task
from cmem_plugin_base.dataintegration.context import (
ExecutionContext,
ExecutionReport,
UserContext,
)
from cmem_plugin_base.dataintegration.entity import (
Entities,
Entity,
EntityPath,
EntitySchema,
)
from cmem_plugin_base.dataintegration.plugins import PluginLogger
from cmem_plugin_base.dataintegration.utils import (
setup_cmempy_user_access,
split_task_id,
)
from confluent_kafka import Producer, Consumer, KafkaException, KafkaError
from confluent_kafka.admin import AdminClient, TopicMetadata, ClusterMetadata
from defusedxml import ElementTree
from cmem_plugin_kafka.constants import KAFKA_TIMEOUT
# pylint: disable-msg=too-few-public-methods
class KafkaMessage:
    """
    Value object holding a single Kafka message's key and payload.
    ...
    Attributes
    ----------
    key : Optional[str]
        Kafka message key (None when the message carries no key)
    value : str
        Kafka message payload
    """

    def __init__(self, key: Optional[str] = None, value: str = ""):
        self.key: Optional[str] = key
        self.value: str = value
class KafkaProducer:
    """Kafka producer wrapper over confluent producer."""

    def __init__(self, config: dict, topic: str):
        """Create Producer instance.

        :param config: librdkafka producer configuration dict
        :param topic: topic every message is produced to
        """
        self._producer = Producer(config)
        self._topic = topic
        self._no_of_success_messages: int = 0

    def process(self, message: KafkaMessage):
        """Produce message to topic.

        :raises BufferError/KafkaException: propagated from produce().
        """
        self._producer.produce(self._topic, value=message.value, key=message.key)
        # Count only after produce() accepted the message so a raised
        # BufferError/KafkaException does not inflate the success counter
        # (the original incremented before producing).
        self._no_of_success_messages += 1

    def poll(self, timeout):
        """Polls the producer for events and calls the corresponding callbacks"""
        self._producer.poll(timeout)

    def flush(self, timeout=KAFKA_TIMEOUT):
        """Wait for all messages in the Producer queue to be delivered.

        Keeps flushing while the queue keeps shrinking; stops as soon as two
        consecutive flushes report the same queue length, so a persistently
        stuck queue does not spin forever.
        """
        prev = 0
        while True:
            messages_in_queue = self._producer.flush(timeout=timeout)
            if prev == messages_in_queue:
                break
            prev = messages_in_queue

    def get_success_messages_count(self) -> int:
        """Return count of the successfully produced messages"""
        return self._no_of_success_messages
class KafkaConsumer:
    """Kafka consumer wrapper over confluent consumer.

    Reads messages from one topic and exposes them either as an XML byte
    stream (context-manager protocol) or as DataIntegration entities.
    """

    def __init__(
        self, config: dict, topic: str, log: PluginLogger, context: ExecutionContext
    ):
        """Create consumer instance.

        :param config: librdkafka consumer configuration dict
        :param topic: topic to subscribe to
        :param log: plugin logger
        :param context: execution context used for progress reporting
        """
        self._consumer = Consumer(config)
        self._context = context
        self._topic = topic
        self._log = log
        self._no_of_success_messages = 0
        self._first_message: Optional[KafkaMessage] = None
        # Derived lazily from the first message by get_schema(); None until then.
        self._schema: Optional[EntitySchema] = None

    def __enter__(self):
        return self.get_xml_payload()

    def get_schema(self):
        """Return kafka message schema paths.

        Derives an EntitySchema from the first message's JSON payload.
        Returns None when the topic delivered no message.
        """
        message = self.get_first_message()
        if not message:
            return None
        json_payload = json.loads(message.value)
        schema_paths = []
        self._log.info(f'values : {json_payload["entity"]["values"]}')
        for path in self._get_paths(json_payload["entity"]["values"]):
            path_uri = f"{path}"
            schema_paths.append(EntityPath(path=path_uri))
        self._schema = EntitySchema(
            type_uri=json_payload["schema"]["type_uri"],
            paths=schema_paths,
        )
        return self._schema

    def _get_paths(self, values: dict):
        """Return the value keys of one message, used as schema paths."""
        self._log.info(f"_get_paths: Values dict {values}")
        return list(values.keys())

    def get_entities(self):
        """Generate the entities from kafka messages.

        Re-yields the cached first message (consumed by get_schema) before
        polling for the rest.
        """
        if self._first_message:
            yield self._get_entity(self._first_message)
        for message in self.poll():
            yield self._get_entity(message)

    def _get_entity(self, message: KafkaMessage):
        """Build an Entity from one JSON kafka message.

        :raises ValueError: when the payload is not valid JSON.
        """
        try:
            json_payload = json.loads(message.value)
        except json.decoder.JSONDecodeError as exc:
            # fixed typo: "in not" -> "is not"
            raise ValueError("Kafka message is not in valid JSON format") from exc
        entity_uri = json_payload["entity"]["uri"]
        # Missing paths yield None values, keeping positions aligned with schema.
        values = [
            json_payload["entity"]["values"].get(path.path)
            for path in self._schema.paths
        ]
        return Entity(uri=entity_uri, values=values)

    def get_xml_payload(self) -> Iterator[bytes]:
        """generate xml file with kafka messages

        NOTE(review): a first message already consumed via get_first_message()
        is not included here — confirm callers use either the XML path or the
        schema/entities path, never both.
        """
        yield '<?xml version="1.0" encoding="UTF-8"?>\n'.encode()
        yield "<KafkaMessages>".encode()
        for message in self.poll():
            yield get_message_with_wrapper(message).encode()
        yield "</KafkaMessages>".encode()

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Always release the consumer, also on error.
        self._consumer.close()

    def get_success_messages_count(self) -> int:
        """Return count of the successful messages"""
        return self._no_of_success_messages

    def subscribe(self):
        """Subscribes to a topic to consume messages"""
        self._consumer.subscribe(topics=[self._topic])

    def get_first_message(self):
        """Get (and cache) the first message from the kafka subscribed topic.

        Polls up to four times before giving up; returns None when nothing
        was delivered in that window.

        :raises KafkaException: when the broker reports an error event.
        """
        if self._first_message:
            return self._first_message
        count = 0
        while True:
            msg = self._consumer.poll(timeout=KAFKA_TIMEOUT)
            count += 1
            if msg or count > 3:
                break
        if msg is None:
            self._log.info("get_first_message: Messages are empty")
        elif msg.error():
            # Error events carry no payload: raise (consistent with poll())
            # instead of crashing while decoding msg.value().
            self._log.error(f"Consumer poll Error:{msg.error()}")
            raise KafkaException(msg.error())
        else:
            # Count the consumed message so the report is not off by one.
            self._no_of_success_messages += 1
            self._first_message = KafkaMessage(
                key=msg.key().decode("utf-8") if msg.key() else "",
                value=msg.value().decode("utf-8"),
            )
        return self._first_message

    def poll(self) -> Iterator[KafkaMessage]:
        """Polls the consumer and yields messages until the topic is drained.

        :raises KafkaException: when the broker reports an error event.
        """
        while True:
            msg = self._consumer.poll(timeout=KAFKA_TIMEOUT)
            if msg is None:
                self._log.info("Messages are empty")
                break
            if msg.error():
                self._log.error(f"Consumer poll Error:{msg.error()}")
                raise KafkaException(msg.error())
            self._no_of_success_messages += 1
            kafka_message = KafkaMessage(
                key=msg.key().decode("utf-8") if msg.key() else "",
                value=msg.value().decode("utf-8"),
            )
            if not self._first_message:
                self._first_message = kafka_message
            # Refresh the execution report every 10 messages.
            if not self._no_of_success_messages % 10:
                self._context.report.update(
                    ExecutionReport(
                        entity_count=self._no_of_success_messages,
                        operation="read",
                        operation_desc="messages received",
                    )
                )
            yield kafka_message

    def close(self):
        """Closes the consumer once all messages were received."""
        self._consumer.close()
class KafkaXMLHandler(ContentHandler):
    """SAX content handler that forwards each <Message> element to kafka."""

    def __init__(
        self, kafka_producer: KafkaProducer, context: ExecutionContext, plugin_logger
    ):
        super().__init__()
        self._level: int = 0
        self._no_of_children: int = 0
        self._kafka_producer = kafka_producer
        self._context: ExecutionContext = context
        self._log: PluginLogger = plugin_logger
        self._message: KafkaMessage = KafkaMessage()

    @staticmethod
    def attrs_s(attrs):
        """Serialize an element attribute list, each attribute prefixed by a space."""
        return "".join(f' {name}="{escape(value)}"' for name, value in attrs.items())

    @staticmethod
    def get_key(attrs):
        """Return the escaped `key` attribute, or None when it is absent."""
        return next(
            (escape(value) for name, value in attrs.items() if name == "key"), None
        )

    def startElement(self, name, attrs):
        """Call when an element starts"""
        self._level += 1
        if name == "Message" and self._level == 2:
            self.rest_for_next_message(attrs)
        else:
            self._message.value += f"<{name}{self.attrs_s(attrs)}>"
        # Level 3 elements are the direct children of a Message element.
        if self._level == 3:
            self._no_of_children += 1

    def endElement(self, name):
        """Call when an element ends; a closing Message emits one kafka message."""
        if name != "Message" or self._level != 2:
            self._message.value += f"</{name}>"
        elif self._no_of_children == 1:
            # Collapse inter-tag whitespace, then strip trailing whitespace.
            compact = re.sub(r">[ \n]+<", "><", self._message.value)
            self._message.value = re.sub(r"[\n ]+$", "", compact)
            self._kafka_producer.process(self._message)
            if self._kafka_producer.get_success_messages_count() % 10 == 0:
                self._kafka_producer.poll(0)
                self.update_report()
        else:
            # Anything but exactly one child cannot become a valid kafka
            # xml message, so it is only logged.
            self._log.error(
                "Not able to process this message. "
                "Reason: Identified more than one children."
            )
        self._level -= 1

    def characters(self, content: str):
        """Append character data to the current message payload."""
        self._message.value += content

    def endDocument(self):
        """Flush all pending messages once the input document ends."""
        self._kafka_producer.flush()

    def rest_for_next_message(self, attrs):
        """Start a fresh message for the next <Message> element."""
        self._message = KafkaMessage(
            self.get_key(attrs), '<?xml version="1.0" encoding="UTF-8"?>'
        )
        self._no_of_children = 0

    def update_report(self):
        """Update the plugin report with current status"""
        self._context.report.update(
            ExecutionReport(
                entity_count=self._kafka_producer.get_success_messages_count(),
                operation="wait",
                operation_desc="messages sent",
            )
        )
class KafkaEntitiesHandler:
    """Sends entities to a kafka topic as JSON messages.

    (Docstring fixed: this class handles entities, not XML content.)
    """

    def __init__(
        self, kafka_producer: KafkaProducer, context: ExecutionContext, plugin_logger
    ):
        self._kafka_producer = kafka_producer
        self._context: ExecutionContext = context
        self._log: PluginLogger = plugin_logger

    def process(self, entities: Entities):
        """Process entities: serialize each to JSON and produce it."""
        for message_dict in self.get_dict(entities):
            kafka_payload = json.dumps(message_dict, indent=4)
            self._kafka_producer.process(KafkaMessage(key=None, value=kafka_payload))
            # Serve delivery callbacks and refresh the report every 10 messages.
            if self._kafka_producer.get_success_messages_count() % 10 == 0:
                self._kafka_producer.poll(0)
                self.update_report()
        self._kafka_producer.flush()

    def update_report(self):
        """Update the plugin report with current status"""
        self._context.report.update(
            ExecutionReport(
                entity_count=self._kafka_producer.get_success_messages_count(),
                operation="wait",
                operation_desc="messages sent",
            )
        )

    def get_dict(self, entities: Entities) -> Iterator[Dict[str, str]]:
        """Yield one message dict per entity.

        A fresh dict is built for every entity: the previous implementation
        reused and mutated a single dict across yields, so consumers keeping
        references to earlier results saw them clobbered by later entities.
        """
        self._log.info("Generate dict from entities")
        paths = entities.schema.paths
        type_uri = entities.schema.type_uri
        for entity in entities.entities:
            values: dict[str, Sequence[str]] = {
                path.path: list(entity.values[i]) for i, path in enumerate(paths)
            }
            yield {
                "schema": {"type_uri": type_uri},
                "entity": {"uri": entity.uri, "values": values},
            }
def get_default_client_id(project_id: str, task_id: str):
    """Build the fallback client id "<host>:<project_id>:<task_id>".

    The host part is the network location of the configured CMEM base URI.
    Used when the user leaves the client id parameter empty.
    """
    host = urlparse(get_cmem_base_uri()).netloc
    return f"{host}:{project_id}:{task_id}"
def validate_kafka_config(config: Dict[str, Any], topic: str, log: PluginLogger):
    """Validate kafka configuration by fetching metadata for the topic.

    :param config: librdkafka admin-client configuration dict
    :param topic: topic name to look up
    :param log: plugin logger
    :raises ValueError: when the topic was auto-created by this lookup
        (LEADER_NOT_AVAILABLE on first access).
    :raises KafkaException: for any other topic-level error.
    """
    admin_client = AdminClient(config)
    cluster_metadata: ClusterMetadata = admin_client.list_topics(
        topic=topic, timeout=KAFKA_TIMEOUT
    )
    topic_meta: TopicMetadata = cluster_metadata.topics[topic]
    kafka_error: KafkaError = topic_meta.error
    if kafka_error and kafka_error.code() == KafkaError.LEADER_NOT_AVAILABLE:
        raise ValueError(
            "The topic you configured, was just created. Save again if this ok for you."
            " Otherwise, change the topic name."
        )
    if kafka_error:
        # KafkaError does not derive from BaseException, so a bare
        # `raise kafka_error` fails with a TypeError; wrap it.
        raise KafkaException(kafka_error)
    log.info("Connection details are valid")
def get_resource_from_dataset(dataset_id: str, context: UserContext):
    """Fetch the file resource behind a dataset task.

    Resolves the task, reads its `file` parameter and returns the
    streaming resource response for that file.
    """
    setup_cmempy_user_access(context=context)
    project_id, task_id = split_task_id(dataset_id)
    task_meta_data = get_task(project=project_id, task=task_id)
    file_name = str(task_meta_data["data"]["parameters"]["file"]["value"])
    return get_resource_response(project_id, file_name)
def get_message_with_wrapper(message: KafkaMessage) -> str:
    """Wrap a kafka XML message payload in a <Message> element.

    :param message: the kafka message; its value must be valid XML.
    :raises ValueError: when the payload is not valid XML (via is_xml).
    """
    is_xml(message.value)
    # XML-escape the key so quotes/ampersands cannot break the attribute or
    # produce invalid XML (the key was previously interpolated verbatim).
    # f"{...}" keeps the legacy rendering of a None key as "None".
    safe_key = escape(f"{message.key}", {'"': "&quot;"})
    # strip xml metadata (the declaration) so the payload nests inside
    # the surrounding <KafkaMessages> document.
    regex_pattern = "<\\?xml.*\\?>"
    body = re.sub(regex_pattern, "", message.value)
    return f'<Message key="{safe_key}">{body}</Message>\n'
def get_kafka_statistics(json_data: str) -> dict:
    """Extract selected librdkafka statistics fields from a stats JSON blob.

    Dict-valued fields (e.g. "topics") are flattened to a comma-separated
    list of their keys; all other values are stringified.
    """
    stats = json.loads(json_data)
    summary: dict = {}
    for field in ("name", "client_id", "type", "time", "msg_cnt", "msg_size", "topics"):
        value = stats[field]
        summary[field] = ",".join(value.keys()) if isinstance(value, dict) else f"{value}"
    return summary
def is_xml(value: str) -> None:
    """Validate that *value* is a well-formed XML document.

    Despite the name, this returns nothing: it parses the string with
    defusedxml's ElementTree and raises on failure.

    Raises:
        ValueError: when the payload cannot be parsed as XML.
    """
    try:
        ElementTree.fromstring(value)
    except ElementTree.ParseError as exc:
        raise ValueError("Kafka message is not in Valid XML format") from exc