In [1]:
############################################################################################
# Author: G. Wong (G.Wong@csiro.au)
# 
# Purpose: Read a directory and its parset file, then import the information into MongoDB
# 
# Date: 04.10.19
# 
# Execution: run via jupyter notebook
# 
# Note/s:
# This script is a "first run", so this script would be executed shortly after the 
# gridded dumps (exported from the ASKAP pipeline) have been transferred into 
# the storage before being imported into the DINGO pipelines.
# 
# After the files have been transferred, you can specify the filepath of the new directory.
# The script will list the files and save each one as an entry (key), and it will read the
# parset file (described here as parset.last; the code currently looks for 'parset.in')
# and save its information into the database.
# 
# The format of the data entry into MongoDB is similar to how CASDA structures their entries.
#
#
#
# This script assumes that MongoDB is already running (in two terminals you will need to 
# run "mongod" and "mongo" on osx). MongoDB can also run on cloud systems such as AWS 
# and Azure.
# 
# Future development:
# Create a metadata file (within ASKAPsoft), read and save into the database.
# 
# 04/02/20 - modified the inputs to allow additional information to be passed and 
#            cleaned up the code to include documentation
############################################################################################

In [2]:
# import packages
import pymongo  # Assumption that you already have mongoDB installed and running
import os       # Used for os.walk, extract the filepath

In [3]:
############################################################################################
def dictionary_creation(directory, sbid = '1234'):
    """
    Build the dictionary (document) describing one data directory so it can
    be imported into MongoDB.

    Every entry found in ``directory`` is recorded under
    ``['dataset']['dumps']`` keyed by its filename with '.' replaced by '_'
    (MongoDB field names should not contain dots).  The one exception is a
    file named exactly ``parset.in``: it is read line by line and each
    ``key = value`` pair is stored under ``['dataset']['parset']`` together
    with the parset's own filepath.

    Keyword arguments:
    directory: filepath of the data and parset file
    sbid: schedule block ID, used as the document ``_id`` (default is 1234
          but this will not work if an existing sbid is within the database).

    Returns:
    dict of the form
        {'_id': sbid,
         'dataset': {'dumps': {<name>: {'filepath': ...}, ...},
                     'parset': {'filepath': ..., <key>: <value>, ...}}}
    The 'parset' entry is only present when ``parset.in`` exists.

    Raises:
    OSError (e.g. FileNotFoundError) if ``directory`` cannot be listed or the
    parset file cannot be opened.
    """
    # Create dictionary and the nested levels up front
    dumps = {}
    dataset = {'dumps': dumps}
    parameter_dict = {"_id": sbid, 'dataset': dataset}

    for filename in os.listdir(directory):
        filepath = f'{directory}/{filename}'

        # Everything except the parset is recorded as a dump entry (filename only)
        if filename != 'parset.in':
            dumps[filename.replace('.', '_')] = {'filepath': filepath}
            continue

        # The parset file is read and its parameters saved
        parset_entries = {'filepath': filepath}
        dataset['parset'] = parset_entries
        # 'with' guarantees the handle is closed even if parsing fails
        with open(filepath) as parset:
            for line in parset:
                stripped = line.strip()
                # Skip blank lines and comments (including indented ones)
                if not stripped or stripped.startswith('#'):
                    continue
                # Split on the FIRST '=' only so values containing '=' survive intact
                raw_key, separator, raw_value = line.partition('=')
                if not separator:
                    # Not a "key = value" line; nothing to record
                    continue
                para_key = raw_key.rstrip().replace('.', '_')
                parset_entries[para_key] = raw_value.rstrip('\n').lstrip(' ')

    # return dictionary
    return parameter_dict

In [4]:
############################################################################################
def database_start_import(import_dict,client="mongodb://localhost:27017/",dbname = "protoData",collection="observations"):
    """
    MongoDB startup and import.  Take the output from dictionary_creation 
    and then save the information into MongoDB.

    The mongoDB backend should be running, if you are stuck run the 
    command 'mongo' and 'mongod' in separate terminals

    Keyword arguments:
    import_dict: data to be imported into MongoDB. The structure is in a database format.
    client: MongoDB server link.
    dbname: Name of the database to save the information into (default "protoData").
    collection: name of the collection to save the information into.

    Returns:
    None.  Progress is reported via print().

    Raises:
    Any exception raised by pymongo is propagated, e.g. when the server is
    unreachable or when a document with the same ``_id`` already exists in
    the collection.
    """
    myclient = pymongo.MongoClient(client)
    try:
        # Report whether the target database is already present on the server
        if dbname in myclient.list_database_names():
            print("Database exists: %s" % dbname)
        else:
            print("New database created: %s" % dbname)

        mydb = myclient[dbname]         # In MongoDB, a database is not created until it gets content!
        mycol = mydb[collection]        # Collection table entry
        mycol.insert_one(import_dict)   # Data entry
        print('Data entered')
    finally:
        # Release the connection pool; this function is called once per
        # directory, so leaving clients open would accumulate connections.
        myclient.close()

In [7]:
#####################################################
# Single data entry
#####################################################
# directory = '/Users/won10d/Documents/DINGO/development/askapSoft_protoData/cubes-cycle-2'  # Specify the directory
# import_data = dictionary_creation(directory, sbid = '121558040220')   # create the dictionary based on directory
# database_start_import(import_dict=import_data)  # Take the import data and then start database and save data
# print(import_data)
# 
# 
#####################################################
# multiple data entries
#####################################################
# Directories to import, paired position-by-position with their schedule block IDs
listOfFiles = [
    '/Users/won10d/Documents/DINGO/development/askapSoft_protoData/cubes-cycle-1',
    '/Users/won10d/Documents/DINGO/development/askapSoft_protoData/cubes-cycle-2',
]  # enter the multiple directories
listOfSbid = ['095144060220', '095202060220']

# Build one document per directory and insert it into the database
for data_dir, sbid in zip(listOfFiles, listOfSbid):
    import_data = dictionary_creation(data_dir, sbid=sbid)
    database_start_import(import_dict=import_data)


Database exists: protoData
Data entered
