In [None]:
import json
from urllib.parse import unquote, unquote_plus

import boto3

In [None]:
# Placeholders: fill in before running. Both values are interpolated into the
# SQS loader queue URL, and the account id is also part of the S3 bucket name.
aws_region = ''
aws_account_id = ''

In [None]:
# Placeholder: replace with the event payload before running. Later cells read
# event['Records'][i]['s3']['object']['key'], so this is presumably an S3 event
# notification dict -- confirm against the actual trigger.
event = ''

In [None]:
# Key prefixes inside the bucket: input objects are read from under input_path
# and converted output objects are written under output_path.
input_path = "wikimedia-events/raw-events"
output_path = "wikimedia-events/gremlin-csv"

# Bucket name and loader queue URL are derived from the account/region values.
s3_bucket = "nept-mlops-dev-{}".format(aws_account_id)
loader_queue_url = "https://sqs.{}.amazonaws.com/{}/nept-mlops-dev-gremlin-csv-loader".format(
    aws_region,
    aws_account_id,
)

In [None]:
# AWS service clients shared by the functions below. No region or credentials
# are passed explicitly, so boto3 falls back to its default configuration.
s3_client = boto3.client('s3')
sqs_client = boto3.client('sqs')

In [None]:
# Collect the object keys referenced by the event, keeping only those under the
# input prefix. S3 event notifications URL-encode object keys and write spaces
# as '+', so decode with unquote_plus: plain unquote would leave the '+' in
# place and the later get_object call would look up a key that does not exist.
keys = [
    decoded_key
    for decoded_key in (
        unquote_plus(record['s3']['object']['key']) for record in event['Records']
    )
    if decoded_key.startswith(input_path)
]

In [None]:
# Only the first matching key is processed; this raises IndexError when no
# record in the event matched the input prefix.
key = keys[0]

In [None]:
class UserVertexConverter(object):
    """Converts an event into a CSV row describing the event's user vertex."""

    def header(self):
        """Return the CSV header line matching the rows built by ``convert``."""
        return "~id, ~label"

    def convert(self, event):
        """Build the vertex row for ``event``.

        Args:
            event: mapping parsed from one input line.

        Returns:
            A string of the form ``"user:<name>",user``, or None when the
            event has no 'user' key.
        """
        if 'user' not in event:
            return None
        # The id is emitted as a quoted CSV field, so a double quote inside the
        # user value must be doubled; otherwise it terminates the field early
        # and corrupts the row.
        user = str(event['user']).replace('"', '""')
        return "\"user:{}\",user".format(user)
    
class EdgeConverter(object):
    """Stub edge converter that emits fixed header text and a formatted event."""

    def header(self):
        """Return the header line for the edge output."""
        return "edge header 1"

    def convert(self, event):
        """Return the edge line for ``event``.

        The original body formatted an undefined name ``line`` and raised
        NameError on every call; the parameter ``event`` is what is meant.
        """
        return "edge 1: {}".format(event)

class Converters(object):
    """Holds the converter instances and exposes them as (name, converter) pairs."""

    # Class-level instances shared by every Converters object.
    edge_converter = EdgeConverter()
    user_vertex_converter = UserVertexConverter()

    def to_edges(self):
        """Return the edge converters as (name, converter) pairs; currently none."""
        return list()

    def to_vertexes(self):
        """Return the vertex converters as (name, converter) pairs."""
        user_entry = ('user', self.user_vertex_converter)
        return [user_entry]

In [None]:
def process_content_with_converter(content_json, converter_type, converter_name, converter, original_key_suffix):
    """Run one converter over the parsed events and upload the result to S3.

    The output starts with the converter's header line, followed by one line
    per event for which ``converter.convert`` returned a truthy value. The text
    is printed, then stored UTF-8 encoded in ``s3_bucket``.

    Args:
        content_json: iterable of parsed events.
        converter_type: label used in the output key (e.g. 'vertexes').
        converter_name: converter label used in the output key.
        converter: object providing ``header()`` and ``convert(event)``.
        original_key_suffix: input key with the input prefix removed.

    Returns:
        The S3 key the converted content was written to.
    """
    target_key = "{}{}-{}-{}".format(output_path, original_key_suffix, converter_type, converter_name)

    rows = [converter.header()]
    # filter(None, ...) drops falsy results, i.e. events the converter skipped.
    rows.extend(filter(None, (converter.convert(record) for record in content_json)))

    body_text = "\n".join(rows)
    print(body_text)

    s3_client.put_object(
        Bucket=s3_bucket,
        Key=target_key,
        Body=body_text.encode('utf-8'),
    )
    return target_key

In [None]:
def process_file(key):
    """Convert one input S3 object and notify the loader queue.

    Downloads the object at ``key`` from ``s3_bucket``, parses it as
    newline-delimited JSON, and runs every vertex and edge converter exposed by
    ``Converters`` over the parsed events. Each run uploads one output object
    through ``process_content_with_converter``. Finally a JSON message listing
    the resulting ``s3://`` URIs is sent to ``loader_queue_url``.

    Args:
        key: S3 object key; expected to start with ``input_path``.

    Returns:
        None.
    """
    # Keep the whole remainder after the input prefix. The previous slice ended
    # at -1 and silently dropped the key's last character, so inputs differing
    # only in that character would have overwritten each other's output.
    original_key_suffix = key[len(input_path):]

    content = s3_client.get_object(Bucket=s3_bucket, Key=key)['Body'].read().decode('utf-8')
    # Materialise the events as a list: they are iterated once per converter,
    # and a lazy map() would be exhausted after the first one. Blank lines are
    # skipped because they are not valid JSON documents.
    content_json = [json.loads(line) for line in content.splitlines() if line.strip()]

    converters = Converters()
    vertex_keys = [
        process_content_with_converter(content_json, 'vertexes', converter_name, converter, original_key_suffix)
        for (converter_name, converter) in converters.to_vertexes()
    ]
    # Edge outputs go in their own list; they used to be appended to
    # vertex_keys, which left "edge_files" permanently empty.
    edge_keys = [
        process_content_with_converter(content_json, 'edges', converter_name, converter, original_key_suffix)
        for (converter_name, converter) in converters.to_edges()
    ]

    loader_message = {
        "vertex_files": ["s3://{}/{}".format(s3_bucket, output_key) for output_key in vertex_keys],
        "edge_files": ["s3://{}/{}".format(s3_bucket, output_key) for output_key in edge_keys],
    }
    sqs_client.send_message(
        QueueUrl=loader_queue_url,
        MessageBody=json.dumps(loader_message)
    )

In [None]:
# Run the conversion for the selected key: reads the object from S3, writes the
# converted output back to S3 and sends a message to the loader queue.
process_file(key)