# Testing Writing of a Mongo Pipeline

## First set of basic file functions

Basic reads and writes, and something to fetch the ETL config file (if any)

In [88]:
import os

def get_etl_config(file_name):
    return os.path.join(os.getcwd(), file_name)


def write_pipeline(vqb_pipeline, file_name):
    etl_config_path = get_etl_config(file_name)
    # This functionn either appends a MONGO_PIPELINE to an existing ETL_config file or it will
    # create a new ETL_config file and put a MONGO_PIPELINE in it
    with open(etl_config_path, 'a+') as f:
        pipeline_to_write = f'MONGO_PIPELINE: {vqb_pipeline}'
        f.write(pipeline_to_write)
        f.close()


def read_file():
    with open(get_etl_config('etl_config.cson'), 'r') as f:
        print('file contents ', f.read())

## Check that a simple build is added to ETL_config file

This will make sure that we can write a pipeline not built yet by the server into the file
* We first build a MONGO_PIPELINE with three stages (this is the same kind of example used in other parts of this notebook)
* We then will feed this pipeline to the file to see how it works

In [23]:
def build_pipeline_step(output_col, mongo_query):
    return {
        'output_domain': output_col,
        'query' : [mongo_query]
    }

def build_appendy_list(col):
    return [
    { '$match' : { 'domain' : 'purple', 'color' : 'purple' } },
    *append_col_stages(col)
]

PURPLE_LIST = [{ '$match' : { 'domain' : 'test_data', 'color' : 'purple', 'number' : { '$lte' : 50 } } }]
APPENDY_LIST = build_appendy_list('orange')

def append_col_stages(col):
    return [
        {
            "$group": {
              "_id": None,
              "_vqbPipelineInline": {
                "$push": "$$ROOT"
              }
            }
        },
        {
            "$lookup": {
              "from": col,
              "pipeline": [
                {
                  "$match": {}
                },
                {
                  "$project": {
                    "_id": 0
                  }
                }
              ],
              "as": "_vqbPipelineToAppend_0"
            }
        },
        {
            "$project": {
              "_vqbPipelinesUnion": {
                "$concatArrays": [
                  "$_vqbPipelineInline",
                  "$_vqbPipelineToAppend_0"
                ]
              }
            }
        },
        {
            "$unwind": "$_vqbPipelinesUnion"
        },
        {
            "$replaceRoot": {
              "newRoot": "$_vqbPipelinesUnion"
            }
        },
    ]

TEST_VQB_PIPELINE = [
    {
        'output_domain' : 'purple',
        'query' : PURPLE_LIST
    },
    {
        'output_domain' : 'orange',
        'query' : [
            { '$match' : { 'domain' : 'test_data', 'color' : 'orange', 'number' : { '$gt' : 50 } } }
        ]
    },
    {
        'output_domain' : 'result',
        'query' : APPENDY_LIST
    }
]

TEST_VQB_PIPELINE

[{'output_domain': 'purple',
  'query': [{'$match': {'domain': 'test_data',
     'color': 'purple',
     'number': {'$lte': 50}}}]},
 {'output_domain': 'orange',
  'query': [{'$match': {'domain': 'test_data',
     'color': 'orange',
     'number': {'$gt': 50}}}]},
 {'output_domain': 'result',
  'query': [{'$match': {'domain': 'purple', 'color': 'purple'}},
   {'$group': {'_id': None, '_vqbPipelineInline': {'$push': '$$ROOT'}}},
   {'$lookup': {'from': 'orange',
     'pipeline': [{'$match': {}}, {'$project': {'_id': 0}}],
     'as': '_vqbPipelineToAppend_0'}},
   {'$project': {'_vqbPipelinesUnion': {'$concatArrays': ['$_vqbPipelineInline',
       '$_vqbPipelineToAppend_0']}}},
   {'$unwind': '$_vqbPipelinesUnion'},
   {'$replaceRoot': {'newRoot': '$_vqbPipelinesUnion'}}]}]

In [90]:
write_pipeline(TEST_VQB_PIPELINE, 'etl_config.cson')
read_file()

file contents  MONGO_PIPELINE: [{'output_domain': 'purple', 'query': [{'$match': {'domain': 'test_data', 'color': 'purple', 'number': {'$lte': 50}}}]}, {'output_domain': 'orange', 'query': [{'$match': {'domain': 'test_data', 'color': 'orange', 'number': {'$gt': 50}}}]}, {'output_domain': 'result', 'query': [{'$match': {'domain': 'purple', 'color': 'purple'}}, {'$group': {'_id': None, '_vqbPipelineInline': {'$push': '$$ROOT'}}}, {'$lookup': {'from': 'orange', 'pipeline': [{'$match': {}}, {'$project': {'_id': 0}}], 'as': '_vqbPipelineToAppend_0'}}, {'$project': {'_vqbPipelinesUnion': {'$concatArrays': ['$_vqbPipelineInline', '$_vqbPipelineToAppend_0']}}}, {'$unwind': '$_vqbPipelinesUnion'}, {'$replaceRoot': {'newRoot': '$_vqbPipelinesUnion'}}]}]


# What happens if a concepteur wants to update their ETL_config?

We will emulate this by using a factory function to build the steps of a VQB pipeline and replace that pipeline in the ETL_config
NOTE: it's more than likely that this following write function will be used instead of the simple one tested above since the file will most likely exist already

In [24]:
def build_pipeline_step(domain_col, agg_array):
    return {
        'output_domain' : domain_col,
        'query' : agg_array
    }

def build_pipeline(arr_of_outputs, arr_of_queries):
    return [build_pipeline_step(arr_of_outputs[i], arr_of_queries[i]) for i in range(len(arr_of_outputs))]

# NOTE: we're changing the old pipeline from looking for orange stuff to looking for red stuff
arr_of_outputs = ['purple', 'red', 'result']
arr_of_queries = [
    PURPLE_LIST,
    { '$match' : { 'domain' : 'test_data', 'color' : 'red', 'number' : { '$gt' : 50 } } },
    build_appendy_list('red')
]

new_vqb_pipeline = build_pipeline(arr_of_outputs, arr_of_queries)
print(new_vqb_pipeline)

[{'output_domain': 'purple', 'query': [{'$match': {'domain': 'test_data', 'color': 'purple', 'number': {'$lte': 50}}}]}, {'output_domain': 'red', 'query': {'$match': {'domain': 'test_data', 'color': 'red', 'number': {'$gt': 50}}}}, {'output_domain': 'result', 'query': [{'$match': {'domain': 'purple', 'color': 'purple'}}, {'$group': {'_id': None, '_vqbPipelineInline': {'$push': '$$ROOT'}}}, {'$lookup': {'from': 'red', 'pipeline': [{'$match': {}}, {'$project': {'_id': 0}}], 'as': '_vqbPipelineToAppend_0'}}, {'$project': {'_vqbPipelinesUnion': {'$concatArrays': ['$_vqbPipelineInline', '$_vqbPipelineToAppend_0']}}}, {'$unwind': '$_vqbPipelinesUnion'}, {'$replaceRoot': {'newRoot': '$_vqbPipelinesUnion'}}]}]


Now for the fun part : replacing the old Mongo pipeline with the new one
The problem is that we will have to always have the MONGO_PIPELINE at the end of the file for this current thing to work

We can't assume that the file already exists so there are three possibilities:
* We have to create a new file and put a MONGO_PIPELINE in it
* The file exists but there is no MONGO_PIPELINE in it
* The file exists but there is already a MONGO_PIPELINE in it

We want to replace any old MONGO_PIPELINE arrays with the new one, otherwise we just add the new MONGO_PIPELINE to the end of any existing contents inside the etl_config

In [86]:
import re

def update_pipeline_in_file(vqb_pipeline, file_name):
    etl_config_path = get_etl_config(file_name)
    try:
        old_etl_config = open(etl_config_path, 'r')
        replacement_string = f'MONGO_PIPELINE: {vqb_pipeline}'
        contents = old_etl_config.read()
        if 'MONGO_PIPELINE' in contents:
            new_contents = re.sub(r'(MONGO_PIPELINE:)([\W\w]+)', replacement_string, contents, re.M)
        else:
            new_contents = f'{contents}{replacement_string}'
        etl_config_to_update = open(etl_config_path, 'w+')
        etl_config_to_update.write(new_contents)
    except FileNotFoundError:
        write_pipeline(vqb_pipeline, file_name)


update_pipeline_in_file(new_vqb_pipeline, 'etl_config.cson')

## Making sure that our update_pipeline_in_file function works for an etl_config with other contents

In [91]:
other_file_name = 'another_etl_config.cson'

update_pipeline_in_file(new_vqb_pipeline, other_file_name)