In [250]:
import os
import re
import shutil #directory controls
import json
import yaml
import shutil
import fnmatch
import errno
import itertools

# Snowflake DBT to Dataform Converter

### Edit the variables in the cell below to match your project

[Instructions](https://github.com/datalakehouse/dbt-to-dataform/blob/DLHX-789-dbt-to-dataform/README.md)

[Roadmap conversion spreadsheet](https://docs.google.com/spreadsheets/d/1q96HottHJaEC9vZ0NPrwLsoxVpnRE7fLGz5zI-y0mM0/edit#gid=0).



In [251]:
# Path of the dbt project to convert.
dbt_source_project_path = '../../dbt/dlh-bill-dot-com-analytics-dbt'
# Example for another project: dbt_source_project_path = "../../dbt/stripe_dbt"

# Path of the Dataform project to be created.
dataform_root_path = '../../dataform/test_bill_dot_com'
# Example for another project: dataform_root_path = "../../dataform/test_stripe"

# Target schema on Snowflake.
target_schema = "DATALAKEHOUSE_DATAFORM_BILL_DOT_COM"

# Type of conversion to run: 'js' or 'sqlx'.
conversion_type = "js"

# Timestamp (updated_at) field used by the snapshot conversion.
# It must be the name of the column holding the record's last-updated date.
dlh_timestamp_field = '"MD_ELT_UPDATED_DTS"'

In [252]:
# Derived locations, all built from the two root paths set in the settings cell.

# Working directory at the time this cell runs.
current_directory = os.getcwd()

# dbt side: where models, snapshots and macros are read from.
dbt_models_file_path = f"{dbt_source_project_path}/models/"
dbt_snapshots_file_path = f"{dbt_source_project_path}/snapshots/"
dbt_macros_file_path = f"{dbt_source_project_path}/macros"

# Dataform side: where the converted files are written.
dataform_output_models_path = f"{dataform_root_path}/definitions"
dataform_output_sources_path = f"{dataform_root_path}/definitions/sources"
dataform_output_includes_path = f"{dataform_root_path}/includes"

# JSON file with the Dataform connection to Snowflake.
# https://docs.dataform.co/dataform-cli#create-a-credentials-file
dataform_credentials_file_path = "../.df-credentials.json"

In [253]:
#find files on directories and subdirectories
def find_files(directory, pattern):
    """Lazily yield the path of every file below ``directory`` matching ``pattern``.

    ``directory`` is walked recursively with ``os.walk``; ``pattern`` is a
    shell-style wildcard (``fnmatch``) applied to the file's base name only.
    Each yielded path is the walked folder joined with the file name.
    """
    for folder, _subfolders, names in os.walk(directory):
        for name in fnmatch.filter(names, pattern):
            yield os.path.join(folder, name)

In [254]:
#YAML to dictionary
def read_yaml_file(filename):
    """Parse the YAML file ``filename`` and return the loaded document.

    When the file cannot be parsed, the ``yaml.YAMLError`` is printed and
    ``None`` is returned instead of raising.
    """
    with open(filename, 'r') as stream:
        try:
            # NOTE(review): FullLoader is meant for trusted input only; consider
            # yaml.safe_load if these project files can come from untrusted sources.
            return yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print(exc)
    return None

In [333]:
#Function used to convert source (YML) to (JS) file
def create_js_source_file(dbt_models_file_path,dataform_output_sources_path,conversion_type):
    """Convert the dbt ``sources`` declared in YAML files into Dataform ``declare`` calls.

    Every ``*.yml`` file below ``dbt_models_file_path`` is read. For each table
    of each source a ``declare({database, schema, name})`` statement is built;
    ``{{ var('name', 'default') }}`` values for database/schema are reduced to
    the variable name.

    * ``conversion_type == 'sqlx'``: each statement is appended to
      ``<dataform_output_sources_path>/sources.js`` (the directory is created
      when missing) and the function returns ``None``.
    * ``conversion_type == 'js'``: statements take the form
      ``const <table> = declare({...});`` and are returned as one string,
      followed by a ``return {`` listing of the table names. That listing is
      intentionally left open; closing it is up to the caller.

    Returns:
        The generated JS text when ``conversion_type == 'js'`` and at least one
        YAML file declares ``sources``; ``None`` in every other case (including
        a missing source directory, which is only reported with ``print``).

    Raises:
        KeyError: when a source lacks ``tables``, ``database`` or ``schema``.
    """
    if not os.path.exists(dbt_models_file_path):
        print('Directory does not exists:' + dbt_models_file_path)
        return None

    write_sqlx = conversion_type == 'sqlx'
    if write_sqlx and not os.path.exists(dataform_output_sources_path):
        os.makedirs(dataform_output_sources_path)

    # Matches "{{ var('name', 'default') }}" so that only the variable name is kept.
    var_with_default = r"({{[ ]{0,10}var[ ]{0,9}\([ ]{0,9}')(.*)('[ ]{0,9},[ ]{0,9}')(.*)"

    footer = '  return {\n'
    js_parsed = ''
    found_sources = False

    for yaml_path in find_files(dbt_models_file_path, '*.yml'):
        yaml_doc = read_yaml_file(yaml_path)
        # Empty or unparsable YAML files come back as None; skip anything that
        # is not a mapping with a 'sources' entry.
        if not isinstance(yaml_doc, dict) or 'sources' not in yaml_doc:
            continue
        found_sources = True

        for source in yaml_doc['sources'] or []:
            source_tables = source["tables"]
            database = re.sub(var_with_default, r'\2', source["database"])
            schema = re.sub(var_with_default, r'\2', source["schema"])

            for table in source_tables:
                table_name = table['name']

                if write_sqlx:
                    jmodel = "declare({\n"
                else:
                    jmodel = f'const {table_name} = declare('+'{\n'
                    footer += f'    {table_name}'+',\n'
                jmodel += f'  "database": {database},\n'
                jmodel += f'  "schema": {schema},\n'
                jmodel += f'  "name": "{table_name}"'
                jmodel += "\n});\n"

                # unquote the object keys and turn quoted "{{ var('x')}}" values into constants.x
                parsed = jmodel.replace('"database":','database:').replace('"schema":','schema:').replace('"name":','name:')
                parsed = re.sub(r'(\"{{)[ ]{0,9}(var)([\(][\'](.*)(\'\)}}\"))', r'constants.\4', parsed)

                if write_sqlx:
                    # append mode: all declarations accumulate in a single sources.js
                    with open(f'{dataform_output_sources_path}/sources.js', "a") as sources_file:
                        print(f'Creating source : {database}_{schema}_{table_name}')
                        sources_file.write(parsed)
                else:
                    js_parsed += parsed.replace('constants.','')

    # Return only after ALL YAML files were processed, so that sources spread
    # over several files are all part of the result.
    if conversion_type == 'js' and found_sources:
        return js_parsed + footer
    return None

In [334]:
#Function used to convert DBT SQL model files to SQLX model files on dataform

def create_sqlx_models_files(dbt_models_file_path,dataform_output_models_path):
    """Convert every dbt ``*.sql`` model into a Dataform ``.sqlx`` file.

    A ``H_INVOCATION_ID.sqlx`` helper table is written to
    ``dataform_output_models_path`` first. Then each ``*.sql`` file found below
    ``dbt_models_file_path`` is read, its ``{{ config(...) }}`` header is turned
    into a ``config { ... }`` block (a ``view`` header is added when the model
    has none), the jinja syntax is rewritten through a fixed sequence of regex
    substitutions, a ``pre_operations`` block setting the Snowflake query tag
    is appended, and the result is written to the same relative sub-directory
    under ``dataform_output_models_path`` with the ``.sqlx`` extension.

    A missing source directory is reported with ``print`` and nothing is done.
    Write errors (``IOError``) are printed rather than raised. Returns ``None``.
    """
    if not os.path.exists(dbt_models_file_path):
        print('Directory does not exists:' + dbt_models_file_path)
        return

    os.makedirs(dataform_output_models_path, exist_ok=True)
    with open(dataform_output_models_path+'/H_INVOCATION_ID.sqlx', 'w+') as f:
        f.write('config {\n  type: "table" \n} \n SELECT uuid_string() as invocation_id from dual')

    # simple token replacements applied to the config header, in this order
    header_replace_dict = {"{{":"", "}}":"}", "materialized":"type","=":":","(":"{",")":"","'":'"',"unique_key":'uniqueKey'}

    for filename in find_files(dbt_models_file_path, '*.sql'):
        # Mirror the model's sub-directory, computed relative to the models root
        # so that "models" appearing elsewhere in the path cannot confuse it.
        relative_dir = os.path.relpath(os.path.dirname(filename), dbt_models_file_path)
        destination_path = os.path.normpath(os.path.join(dataform_output_models_path, relative_dir))
        # Swap only the extension; "sql" inside the model name must be kept.
        model_name = os.path.splitext(os.path.basename(filename))[0]
        destination_full_path = os.path.join(destination_path, model_name + '.sqlx')

        with open(filename, 'r') as file:
            filedata = file.read()

        #######replacing model syntax differences########
        match = re.search(r'{{[ ]{0,20}config[ ]{0,20}\(', filedata)
        if match:
            # take the header from the matched config block up to its closing "}}"
            header_end = filedata.find("}}", match.start())
            header_old = filedata[match.start():header_end + 2]
        else:
            # no config block: prepend a default "view" header
            header_old = "{{ config (\n  materialized= 'view',\n)\n}}"
            filedata = header_old + filedata

        header = header_old
        for key, value in header_replace_dict.items():
            header = header.replace(key, value)

        new_file = filedata.replace(header_old, header)

        #converting transient tables to snowflake specific block
        new_file = re.sub(r'(transient)[ ]{0,9}[\:][ ]{0,9}(true|false|True|False|TRUE|FALSE)[,]{0,1}',r'snowflake: { \n     \1: \2 \n  }, \n',new_file)
        #converting disabled models syntax
        new_file = re.sub(r'(enabled)[ ]{0,9}[\:][ ]{0,9}(true|True|TRUE)','',new_file)
        new_file = re.sub(r'(enabled)[ ]{0,9}[\:][ ]{0,9}(false|False|FALSE)','disabled: true',new_file)
        new_file = re.sub(r'(enabled)[ ]{0,9}[\:][ ]{0,9}(var)[\{][\"](.*)(\")',r'disabled: !constants.\3',new_file)
        #converting schema variables
        new_file = re.sub(r'(schema)[ ]{0,9}[\:][ ]{0,9}(var)[\{][\"](.*)(\")',r'schema: constants.\3',new_file)
        #removing configurations that do not exist on dataform
        new_file = re.sub(r'(pre_hook|post_hook|alias|meta|persist_docs|merge_update_columns|on_schema_change)[ ]{0,9}[\:][ ]{0,9}.*[,}\")]','',new_file)
        #changing {{ref to ${ref
        new_file = re.sub(r"[{][{][ ]{0,6}(ref|REF|Ref|SOURCE|source|Source)[ ]{0,9}[(][ ]{0,9}[']?",'${ref("', new_file)
        #including just 1 space before config header
        new_file = re.sub(r'(config)[ ]{0,40}[{]{0,40}','config {',new_file)
        #closing ) with ") after sources and refs
        new_file = re.sub(r"(['][)][ ]{0,9}[}])",'")',new_file)
        #removing lone commas at the start of the config block
        new_file = re.sub(r"({)\n[ ]{0,100}[,]",'{',new_file)
        #changing syntax reference to macros from {{ to ${
        new_file = re.sub(r"(?<!\')({{)[ ]{0,20}[.]{0,20}",'${common.',new_file)
        new_file = re.sub(r"([\$].+)[ ]{0,9}(?<=\()(')",r'\1"',new_file)
        #changing ' to " on sources references
        new_file = re.sub(r"([\$].+)((?<=[0-9A-Za-z])[ ]{0,9}[_'][.]{0,10}[ ]{0,20}[,][.]{0,20}[ ]{0,20}[.]{0,20}['])",r'\1","',new_file)
        #changing invocation_id to snowflake uuid_string
        new_file = re.sub(r"('{{invocation_id}}'|{{invocation_id}})",'(SELECT invocation_id FROM ${ref("H_INVOCATION_ID")})',new_file)
        #changing incremental loads macro
        new_file = re.sub(r"({[ ]{0,10}%[ ]{0,10}if[ ]{0,10}is_incremental.*)((\s*)(.*)\s*)({%[ ]{0,10}endif..*})",r'${ when(incremental(), \n`\4`) }',new_file)
        new_file = re.sub(r"((?:\${[ ]{0,10}when.*(\s.*)))(\${this\}\})",r'\1${self()}',new_file)
        #variables replacing
        new_file = re.sub(r"({[ ]{0,10}%[ ]{0,10}if[ ]{0,10}var[ ]{0,9}(\(')(.*)(\'\)).*)",r'${ when(constants.\3,\n`',new_file)
        new_file = re.sub(r"([ ]{0,10}{%[ ]{0,10}endif..*})",r'`)}',new_file)
        new_file = re.sub(r"\${ref[ ]{0,9}[\(]\"var(\(\"(.*?)'\))[ ]{0,9}[,](['\"]{0,9}(.*)(\"))",r'${ref(constants.\2,"\4"',new_file)
        # APPEND the query-tag block; assigning here would throw away the converted model
        new_file += "\npre_operations {\n alter session set query_tag = 'dataform|${dataform.projectConfig.defaultSchema}|${name()}'\n}"
        #######end replacing model syntax differences########

        os.makedirs(destination_path, exist_ok=True)
        with open(destination_full_path, 'w') as file:
            try:
                file.write(new_file)
                print('Generated file: '+destination_full_path)
            except IOError as e:
                print ("I/O error({0}): {1}".format(e.errno, e.strerror))



In [335]:
#Function used to convert DBT SQL model files to JS model files on dataform
def create_js_model_files(dbt_models_file_path,dataform_output_includes_path):
    """Convert every dbt ``*.sql`` model into a Dataform JavaScript model file.

    A ``H_INVOCATION_ID.js`` helper is written to ``dataform_output_includes_path``
    first. Then each ``*.sql`` file found below ``dbt_models_file_path`` is read,
    its ``{{ config(...) }}`` header is turned into the opening of a
    ``module.exports = (params) => { return publish("<model>", {...}).query(ctx => `...`)``
    wrapper (a ``view`` header is added when the model has none), the jinja
    syntax is rewritten through a fixed sequence of regex substitutions, a
    ``.preOps`` call setting the Snowflake query tag closes the file, and the
    result is written to the same relative sub-directory under
    ``dataform_output_includes_path`` with the ``.js`` extension.

    A missing source directory is reported with ``print`` and nothing is done.
    Write errors (``IOError``) are printed rather than raised. Returns ``None``.
    """
    if not os.path.exists(dbt_models_file_path):
        print('Directory does not exists:' + dbt_models_file_path)
        return

    os.makedirs(dataform_output_includes_path, exist_ok=True)
    with open(dataform_output_includes_path+'/H_INVOCATION_ID.js', 'w+') as f:
        f.write('module.exports = (params) => {\n  return publish("H_INVOCATION_ID",\n {  type: "table",\n...params.defaultConfig\n}).query(ctx => ` SELECT uuid_string() as invocation_id from dual \n`)\n}')

    # simple token replacements applied to the config header, in this order
    header_replace_dict1 = {"{{":"", "materialized":"type","=":":","(":"{",")":"","'":'"',"unique_key":'uniqueKey'}
    # the closing "}}" becomes the end of the config object and the start of the query
    header_replace_dict2 = {"}}":"    ...params.defaultConfig\n}).query(ctx => `"}

    for filename in find_files(dbt_models_file_path, '*.sql'):
        # Mirror the model's sub-directory, computed relative to the models root
        # so that "models" appearing elsewhere in the path cannot confuse it.
        relative_dir = os.path.relpath(os.path.dirname(filename), dbt_models_file_path)
        destination_path = os.path.normpath(os.path.join(dataform_output_includes_path, relative_dir))
        # Swap only the extension; "sql" inside the model name must be kept.
        model_name = os.path.splitext(os.path.basename(filename))[0]
        destination_full_path = os.path.join(destination_path, model_name + '.js')

        with open(filename, 'r') as file:
            filedata = file.read()

        #######replacing model syntax differences########
        match = re.search(r'{{[ ]{0,20}config[ ]{0,20}\(', filedata)
        if match:
            # take the header from the matched config block up to its closing "}}"
            header_end = filedata.find("}}", match.start())
            header_old = filedata[match.start():header_end + 2]
        else:
            # no config block: prepend a default "view" header
            header_old = "{{ config (\n  materialized= 'view',\n)\n}}"
            filedata = header_old + filedata

        header = header_old
        for key, value in header_replace_dict1.items():
            header = header.replace(key, value)
        for key, value in header_replace_dict2.items():
            header = header.replace(key, value)

        # add the trailing comma after config lines ending in `"]` or `"`
        header = re.sub(r'(\"]{1})$','"],',header,flags=re.MULTILINE)
        header = re.sub(r'(\"]{0})$','",',header,flags=re.MULTILINE)

        new_file = filedata.replace(header_old, header)

        #converting transient tables to snowflake specific block
        new_file = re.sub(r'(transient)[ ]{0,9}[\:][ ]{0,9}(true|false|True|False|TRUE|FALSE)[,]{0,1}',r'snowflake: { \n     \1: \2 \n  }, \n',new_file)
        #converting disabled models syntax
        new_file = re.sub(r'(enabled)[ ]{0,9}[\:][ ]{0,9}(true|True|TRUE)','',new_file)
        new_file = re.sub(r'(enabled)[ ]{0,9}[\:][ ]{0,9}(false|False|FALSE)','disabled: true',new_file)
        new_file = re.sub(r'(enabled)[ ]{0,9}[\:][ ]{0,9}(var)[\{][\"](.*)(\")',r'disabled: !params.\3',new_file)
        #converting schema variables
        new_file = re.sub(r'(schema)[ ]{0,9}[\:][ ]{0,9}(var)[\{][\"](.*)(\")',r'schema: params.\3',new_file)
        new_file = re.sub(r'((schema):[ ].*)(\"\,)(.*)',r'\1,',new_file)
        #removing configurations that do not exist on dataform
        new_file = re.sub(r'(pre_hook|post_hook|alias|meta|persist_docs|merge_update_columns|on_schema_change)[ ]{0,9}[\:][ ]{0,9}.*[,}\")]','',new_file)
        #changing {{ref to ${ctx.ref
        new_file = re.sub(r"[{][{][ ]{0,6}(ref|REF|Ref|SOURCE|source|Source)[ ]{0,9}[(][ ]{0,9}[']?",'${ctx.ref("', new_file)
        new_file = re.sub(r"(\${ctx.ref)\((\"var\((\'(.*)\'))\,.*\),[ ]{0,9}(\'(.*)\').*",r'\1(params.\4,"\6")}', new_file)
        #replacing the config header with the module/publish wrapper
        new_file = re.sub(r'(config)[ ]{0,40}[{]{0,40}','module.exports = (params) => {\n  return publish("{name_of_the_model}", {',new_file)
        #closing ) with ") after sources and refs
        new_file = re.sub(r"(['][)][ ]{0,9}[}])",'")',new_file)
        #removing lone commas at the start of the config block
        new_file = re.sub(r"({)\n[ ]{0,100}[,]",'{',new_file)
        #changing syntax reference to macros from {{ to ${
        new_file = re.sub(r"(?<!\')({{)[ ]{0,20}[.]{0,20}",'${common.',new_file)
        new_file = re.sub(r"([\$].+)[ ]{0,9}(?<=\()(')",r'\1"',new_file)
        #changing ' to " on sources references
        new_file = re.sub(r"([\$].+)((?<=[0-9A-Za-z])[ ]{0,9}[_'][.]{0,10}[ ]{0,20}[,][.]{0,20}[ ]{0,20}[.]{0,20}['])",r'\1","',new_file)
        #changing invocation_id to snowflake uuid_string
        new_file = re.sub(r"('{{invocation_id}}'|{{invocation_id}})",'(SELECT invocation_id FROM ${ctx.ref("H_INVOCATION_ID")})',new_file)
        #changing incremental loads macro
        new_file = re.sub(r"({[ ]{0,10}%[ ]{0,10}if[ ]{0,10}is_incremental.*)((\s*)(.*)\s*)({%[ ]{0,10}endif..*})",r'${ ctx.when(incremental(), \n`\4`) }',new_file)
        # NOTE(review): this pattern looks for "${ when" while the substitution
        # above emits "${ ctx.when" -- confirm whether it is still expected to match.
        new_file = re.sub(r"((?:\${[ ]{0,10}when.*(\s.*)))(\${this\}\})",r'\1${self()}',new_file)
        #variables replacing
        new_file = re.sub(r"({[ ]{0,10}%[ ]{0,10}if[ ]{0,10}var[ ]{0,9}(\(')(.*)(\'\)).*)",r'${ ctx.when(params.\3,\n`',new_file)
        new_file = re.sub(r"([ ]{0,10}{%[ ]{0,10}endif..*})",r'`)}',new_file)

        new_file = new_file.replace('{name_of_the_model}',model_name)
        new_file = re.sub(r"\${ctx.ref[ ]{0,9}[\(]\"var(\(\"(.*?)'\))[ ]{0,9}[,](['\"]{0,9}(.*)(\"))",r"${ctx.ref(params.\2,'\4'",new_file)
        #end of file: close the query template and add the query-tag pre-operation
        new_file = re.sub(r"\Z",r"\n`).preOps(ctx => `\n alter session set query_tag = 'dataform|${dataform.projectConfig.defaultSchema}|${ctx.name()}'`\n )\n}",new_file)
        #######end replacing model syntax differences########

        os.makedirs(destination_path, exist_ok=True)
        with open(destination_full_path, 'w') as file:
            try:
                file.write(new_file)
                print('Generated file: '+destination_full_path)
            except IOError as e:
                print ("I/O error({0}): {1}".format(e.errno, e.strerror))
        #######end reading file and replacing model syntax differences########



In [336]:
def create_sqlx_snapshot_files(dbt_snapshots_file_path,dataform_output_models_path):
    """Convert every dbt snapshot ``*.sql`` file into a ``dataform-scd`` JavaScript file.

    For each ``*.sql`` file found below ``dbt_snapshots_file_path`` the table
    named in its ``{{ ref('...') }}`` is extracted, the ``config(...)`` header
    is rewritten into a ``scd("<file name>", {...})`` call whose ``source``
    points at that table in the module-level ``target_schema``, unsupported
    options are removed, and the result is written (prefixed with the
    ``require("dataform-scd")`` line) to
    ``<dataform_output_models_path>/snapshots/<relative sub-directory>/<name>.js``.
    A ``check_cols`` option is replaced by ``timestamp: <dlh_timestamp_field>``
    (module-level setting).

    A missing source directory is reported with ``print`` and nothing is done.
    Snapshots that contain no ``ref('...')`` are reported and skipped.
    Write errors (``IOError``) are printed rather than raised. Returns ``None``.
    """
    if not os.path.exists(dbt_snapshots_file_path):
        print('Directory does not exists:' + dbt_snapshots_file_path)
        return

    #syntax differences applied to the header, in this order
    replace_dict = {"updated_at":"timestamp","unique_key":"  uniqueKey", "config":'scd("{{source_data_scd}}", ',"'":'"',"{{":"","}}":"","=":": "}

    for filename in find_files(dbt_snapshots_file_path, '*.sql'):
        # name of the snapshot = file name without its extension
        file_name = os.path.splitext(os.path.basename(filename))[0]
        # Mirror the sub-directory relative to the snapshots root, so that
        # "snapshots" appearing elsewhere in the path cannot confuse it.
        relative_dir = os.path.relpath(os.path.dirname(filename), dbt_snapshots_file_path)
        destination_path = os.path.normpath(os.path.join(dataform_output_models_path, 'snapshots', relative_dir))
        destination_full_path = os.path.join(destination_path, file_name + '.js')

        with open(filename, 'r') as file:
            filedata = file.read()

        #getting the name of the snapshot source table
        ref_match = re.search(r"({{)[ ]{0,9}(ref).*((?<=').*(?='))",filedata)
        if ref_match is None:
            # without a ref('...') there is no source table to point the scd at
            print('Skipping snapshot without a ref() source table: ' + filename)
            continue
        table = ref_match.group(3)

        #removing macro references
        filedata = re.sub(r'({%)[ ]{0,10}(endsnapshot|snapshot).*(})','',filedata)
        #check_cols is not supported by dataform: replace it with the timestamp field
        filedata = re.sub(r"(check_cols.*[\]]{1,1}|check_cols.*[\"]{1,1})",'timestamp: '+dlh_timestamp_field,filedata)
        #getting header block
        filedata=filedata[filedata.find("{"):filedata.find("}")+2]
        #replacing fields on header block
        filedata = filedata.replace(')', 'source: {\n schema: "{{schema}}",\n name: "{{table}}",\n}, \n});')
        filedata = filedata.replace('{{schema}}',target_schema).replace('{{table}}',table)

        for key, value in replace_dict.items():
            filedata = filedata.replace(key, value).replace('{{source_data_scd}}',file_name)
            # the "(" that followed config becomes the "{" opening the options object
            filedata = re.sub(r'(scd\(.*)(\()',r'\1{',filedata)

        new_file = filedata
        new_file = re.sub(r'(strategy)[ ]{0,9}[\:][ ]{0,9}.*(,)','',new_file)
        #removing unsupported configurations
        new_file = re.sub(r'(target_database|target_schema|strategy|invalidate_hard_deletes|check_cols)[ ]{0,9}[\:][ ]{0,9}.*[,}\"\')]','',new_file)
        new_file = re.sub(r"[ ]{2,999}",'',new_file)
        # flags must be passed by keyword: the 4th positional argument of re.sub is `count`
        new_file = re.sub(r"^\s*$",'',new_file,flags=re.MULTILINE)
        new_file = re.sub(r'\n\s*\n','\n',new_file,flags=re.MULTILINE)
        #insert spaces to indent
        new_file = new_file.replace('uniqueKey:','  uniqueKey:').replace('timestamp:','  timestamp:').replace('source:','  source:').replace('schema:','   schema:').replace('name:','   name:').replace('},','  },')

        #create .JS snapshot file
        os.makedirs(destination_path, exist_ok=True)
        with open(destination_full_path, 'w') as file:
            try:
                file.write('const scd = require("dataform-scd");\n\n')
                file.write(new_file)
                print('Generated file: '+destination_full_path)
            except IOError as e:
                print ("I/O error({0}): {1}".format(e.errno, e.strerror))

        #######end reading file and replacing model syntax differences########


In [337]:
def dataform_install_configuration(dataform_root_path,dataform_credentials_file_path,current_directory):
    """(Re)create the Dataform project and install its packages.

    Steps, in order:
      1. delete ``dataform_root_path`` (an ``OSError`` is printed, not raised);
      2. run ``dataform init snowflake <dataform_root_path>``;
      3. copy the credentials file into the project;
      4. set ``defaultSchema`` and ``assertionSchema`` in ``dataform.json`` to
         the module-level ``target_schema``;
      5. rewrite the ``@dataform`` line of ``package.json`` to pin
         ``@dataform/core`` 1.19.0 and add the ``dataform-scd`` package;
      6. run ``dataform install`` from inside the project, then change back to
         ``current_directory``.
    """
    # remove any previous project; a missing directory is only reported
    try:
        shutil.rmtree(dataform_root_path)
    except OSError as err:
        print(err)

    # create a fresh dataform project and drop the snowflake credentials into it
    os.system("dataform init snowflake "+dataform_root_path)
    shutil.copy(dataform_credentials_file_path,dataform_root_path)

    dataform_json_file = dataform_root_path+ '/' +'dataform.json'
    packages_file = dataform_root_path+ '/' +'package.json'

    # dataform.json: point both schemas at the target schema
    with open(dataform_json_file, 'r') as config_in:
        project_config = config_in.read()
    project_config = re.sub(r'(\"defaultSchema\"[:]{0,9}[ ]{0,9})(.*)\,',r'\1 "'+target_schema+'",', project_config)
    project_config = re.sub(r'(\"assertionSchema\"[:]{0,9}[ ]{0,9})(.*)\,',r'\1 "'+target_schema+'",', project_config)
    with open(dataform_json_file, "w") as config_out:
        config_out.write(project_config)

    # package.json: pin the dataform core version and include the scd package
    with open(packages_file, 'r') as packages_in:
        packages_config = packages_in.read()
    with open(packages_file, "w") as packages_out:
        packages_config = re.sub(r'(\"@dataform).*',r'"@dataform/core": "1.19.0",\n        "dataform-scd": "git+https://github.com/dataform-co/dataform-scd.git#0.1"', packages_config)
        packages_out.write(packages_config)

    # install the packages from inside the project, then go back
    os.chdir(dataform_root_path)
    os.system("dataform install")
    os.chdir(current_directory)

In [338]:
def dataform_assertions_documentation(dbt_models_file_path,dataform_output_models_path,dataform_output_includes_path,conversion_type):

     #check if source directory exists        
    isExistSource = os.path.exists(dbt_models_file_path)
    if not isExistSource:
        print('Directory does not exists:' + dbt_models_file_path)
    else:
        schema_files_path = find_files(dbt_models_file_path, '*.yml')

        for file in schema_files_path:
            #reading all YAML files on source directory
            dic = read_yaml_file(file)
            #getting only sources entity from yaml file
            if 'models' in dic.keys():

                models_dir = dic["models"]
                dictionary={}
                #print(models_dir)
                unique_list=[]
                not_null_list=[]
                not_null = {} 
                table_descriptions = {} 
                description={}
                unique = {}  
                tables_list=[]
                description_list=[]
                table_description_list=[]
                jmodel_tables_description=''
                jmodel_description=''
                jmodel=''
                for model in models_dir:
                    #declaring variables for each element of dictionary that will be used. tables = list, schema and database = str        
                    #print(model["name"])
                    if 'columns' in model:
                        columns = (model["columns"]) 
                    if 'name' in model:
                        tables = (model["name"])


                    tables_list.append(tables)
                    #getting tables descriptions
                    if 'description' in model: 
                        descriptions = (model["description"])
                        table_descriptions = {descriptions:tables}
                        table_description_list.append(table_descriptions.copy())


                    #getting column tests and descriptions
                    for column in columns:
                        column_get = column.get('name')


                        if 'tests' in column:
                            tests= (column.get('tests'))



                            if 'unique' in tests:
                                unique = {column_get:tables}                                                        
                                unique_list.append(unique.copy())


                            if 'not_null' in tests:
                                not_null = {column_get:tables}
                                not_null_list.append(not_null.copy())



                        if 'description'in column:
                            descriptions= (column.get('description'))
                            #description = {'|    '+column_get+': '+"'"+descriptions+"'":tables}
                            description = {'|    '+column_get+': '+'"'+descriptions+'",': tables}
                            description_list.append(description.copy())


                #print(not_null_list)

                keyfunc = lambda d: next(iter(d.values()))   

                #print('Found table description' +str(table_description_list)+'\n')
                #print('Found uniqueKey test' +str(unique_list)+'\n')
                #print('Found Not_Null test' +str(not_null_list)+'\n')
                #print('Found column description' +str(description_list)+'\n')

                #create dictionary for data tests and tables/columns descriptions
                not_null_dict={k: [x for d in g for x in d] 
                    for k, g in itertools.groupby(sorted(not_null_list, key=keyfunc), key=keyfunc)}

                unique_dict={k: [x for d in g for x in d] 
                    for k, g in itertools.groupby(sorted(unique_list, key=keyfunc), key=keyfunc)}

                description_dict={k: [x for d in g for x in d] 
                    for k, g in itertools.groupby(sorted(description_list, key=keyfunc), key=keyfunc)}

                table_description_dict={k: [x for d in g for x in d] 
                        for k, g in itertools.groupby(sorted(table_description_list, key=keyfunc), key=keyfunc)}


                # list of all .sql files in a directory
                if conversion_type == 'sqlx':
                    dataform_files = find_files(dataform_output_models_path, '*.sqlx')
                else:
                    dataform_files = find_files(dataform_output_includes_path, '*.js')

                files_list =[]
                file_name_list =[]

                #getting all model files on dataform path
                for file in dataform_files:
                    #only tables that has descriptions or tests
                    if os.path.basename(file).replace('.sqlx','').replace('.js','') in tables_list:
                        file_name = os.path.basename(file)
                        destination_path = os.path.dirname(file)
                        full_path = destination_path+'/'+file_name     
                        files_list.append(file_name.replace('.sqlx','').replace('.js',''))


                        #read files
                        with open(full_path, 'r') as file :

                            file_name = file_name.replace('.sqlx','').replace('.js','')
                            filedata = file.read()


                            #add description to table (setting variable)
                            if file_name in table_description_dict.keys():
                                jmodel_tables_description='  description: '+str(table_description_dict[file_name]).replace("'",'"').replace('[','').replace(']','')+',\n'

                            #add assertions { on variable

                            if file_name in unique_dict.keys() or file_name in unique_dict.keys():
                                jmodel='  assertions: {\n'


                                #when exists uniqueKey test on dbt, create on dataform
                                if file_name in unique_dict.keys():                    
                                    jmodel+='    uniqueKey: '+str(unique_dict[file_name]).replace("'",'"')+',\n'

                                #when exists NotNull test on dbt, create on dataform
                                if file_name in not_null_dict.keys():
                                    jmodel+='    nonNull: '+str(not_null_dict[file_name]).replace("'",'"')+'\n'


                                jmodel+='\n  },'

                            #add column description on dataform
                            if file_name in description_dict.keys():         
                                jmodel_description = '  columns: {\n'
                                jmodel_description += str(description_dict[file_name]).replace("\\n",'').replace("\\",'').replace('[','').replace("']",'').replace("'|",' \n').replace("',",'')
                                jmodel_description += '\n  },\n'

                            if conversion_type == 'sqlx':
                                #setting variable with all tests and description to be added on model
                                header=filedata[filedata.find("config {"):filedata.find("}")+2]                
                                header_old=header
                                header = header+jmodel_tables_description+jmodel_description+jmodel

                                with open(f'{full_path}', "w") as jmodel_file:
                                    #replace old header to new header with assertions and column or table documentations
                                    match = re.search(r'config[ ]{0,20}\{', filedata)
                                    if match:
                                        jmodel_file.write(filedata.replace(header_old,header))
                                    else:
                                        jmodel_file.write('config {\n'+header+'\n}\n'+filedata)

                            else:
                                  #setting variable with all tests and description to be added on model
                                header=filedata[filedata.find("module.exports = (params) => {"):filedata.find("...params.defaultConfig")]                
                                header_old=header
                                header = header+jmodel_tables_description+jmodel_description+jmodel+'\n'
                            
                                
                                with open(f'{full_path}', "w") as jmodel_file:
                                        #replace old header to new header with assertions and column or table documentations
                                        #match = re.search(r'config[ ]{0,20}\{', filedata)
                                        #if match:
                                    jmodel_file.write(filedata.replace(header_old,header))
                                        #else:
                                         #   jmodel_file.write('config {\n'+header+'\n}\n'+filed
                            

In [339]:
#Function used to convert the dbt project variables (dbt_project.yml "vars") to JavaScript
def create_project_variables(dbt_source_project_path,dataform_root_path):
    """Translate the ``vars`` section of each ``dbt_project.yml`` into JavaScript.

    The output depends on the module-level ``conversion_type``:

    * ``'sqlx'``: one ``const`` declaration per variable followed by a
      ``module.exports`` listing is written to
      ``<dataform_root_path>/includes/constants.js``. Nothing is returned.
      If several project files declare variables, each one overwrites the
      file written for the previous one.
    * any other value: a ``params = {...}`` / ``const {...} = params;``
      snippet is returned for the first project file that declares variables.

    Args:
        dbt_source_project_path: directory searched (via ``find_files``) for
            ``dbt_project.yml`` files.
        dataform_root_path: root of the Dataform project; only used to build
            the ``constants.js`` path.

    Returns:
        The JavaScript snippet (``str``) in the non-sqlx mode, otherwise
        ``None`` (also when the directory is missing or no variables exist).
    """
    #check if source directory exists
    if not os.path.exists(dbt_source_project_path):
        print('Directory does not exists:' + dbt_source_project_path)
        return None

    variables_file = find_files(dbt_source_project_path, 'dbt_project.yml')
    dataform_file = dataform_root_path+'/includes/constants.js'

    #iterate through each dbt_project.yml found
    for file in variables_file:
        dic = read_yaml_file(file)

        #a missing or empty "vars" entry has nothing to convert; skipping it
        #avoids emitting a truncated (invalid) JavaScript fragment
        variables = dic.get("vars") if dic else None
        if not variables:
            continue

        if conversion_type == 'sqlx':
            jmodel = ''
            for key, value in variables.items():
                jmodel+='const '+str(key)+' = '+'"'+str(value).replace("'",'"')+'";'+'\n'

            jmodel+='module.exports = {\n'
            for key in variables:
                jmodel+='  '+str(key)+',\n'

            #replacing list_fields: = "[...]" becomes = [...]
            jmodel = re.sub(r'(= \"\[)(.*)(\]\")',r'= [\2]',jmodel)
            #replacing date_fields: "YYYY-MM-DD" becomes "'YYYY-MM-DD'"
            jmodel = re.sub(r"[\"]([0-9]{4}[-][0-9]{2}[-][0-9]{2})[\"]",'"'+r"'\1'"+'"',jmodel)
            #replacing booleans
            jmodel = re.sub(r'[\=][ ]{0,9}("TRUE"|"true"|"True")',r"= true",jmodel)
            jmodel = re.sub(r'[\=][ ]{0,9}("FALSE"|"false"|"False")',r"= false",jmodel)
            #drop the trailing ",\n" left by the last exported name
            jmodel = jmodel[:-2]
            jmodel+='\n}'

            with open(dataform_file, "w") as jmodel_file:
                jmodel_file.write(jmodel)

        else:
            jmodel = '  params = {\n'
            const = ''
            for key, value in variables.items():
                jmodel+='    '+str(key)+': '+"'"+str(value)+"',"+'\n'
                const += '    '+str(key)+',\n'
            jmodel+='    ...params\n  };\n'
            jmodel+='  const {\n'
            jmodel+=const

            #replacing list_fields: this branch emits key: '[...]', so the
            #pattern has to look for ": '[" (the sqlx pattern never matched here)
            jmodel = re.sub(r"(: \'\[)(.*)(\]\')",r': [\2]',jmodel)
            #replacing booleans
            jmodel = re.sub(r"[\:][ ]{0,9}('TRUE'|'true'|'True')",r": true",jmodel)
            jmodel = re.sub(r"[\:][ ]{0,9}('FALSE'|'false'|'False')",r": false",jmodel)
            #drop the trailing ",\n" left by the last destructured name
            jmodel = jmodel[:-2]
            jmodel+='\n} = params;'

            params_sources = '\n'
            return (jmodel+'\n'+params_sources)

    return None

In [340]:
def create_index_file(dataform_root_path):
    """Generate ``index.js`` and ``definitions/example.js`` for the JS package layout.

    ``index.js`` is assembled from, in order:

    1. one ``require`` line per ``*.js`` file found under
       ``<dataform_root_path>/includes/`` (``sources.js`` and ``constants.js``
       are excluded),
    2. the ``module.exports = (params) => {`` opener,
    3. the snippet returned by ``create_project_variables``,
    4. the value returned by ``create_js_source_file``,
    5. one ``<name>: <name>(params),`` entry per required file,
    6. the closing ``}`` lines.

    Besides its argument, the function reads the module-level
    ``target_schema``, ``dbt_source_project_path``, ``dbt_models_file_path``,
    ``dataform_output_sources_path`` and ``conversion_type``.

    Args:
        dataform_root_path: root directory of the Dataform project.

    Returns:
        None. Prints a message and does nothing when the directory is missing.
    """
    #check if source directory exists
    if not os.path.exists(dataform_root_path):
        print('Directory does not exists:' + dataform_root_path)
        return None

    js_files = find_files(dataform_root_path+'/includes/', '*.js')
    dataform_file = dataform_root_path+'/index.js'
    example_file = dataform_root_path+'/definitions/example.js'

    #content of definitions/example.js, which calls the package with empty params
    example = f'const {target_schema} = require("../");'
    example += '\n'
    example += f'const models = {target_schema}('
    example += '{\n});'

    requires = ''
    exports = ''
    for filename in js_files:
        single_file_name = os.path.basename(filename)
        if single_file_name in ('sources.js', 'constants.js'):
            continue
        file_name = single_file_name.replace('.js','')
        #path relative to index.js; relpath avoids splitting on the substring
        #"includes", which breaks when the project path itself contains it
        require_path = './'+os.path.relpath(filename, dataform_root_path).replace(os.sep, '/')
        requires += 'const '+file_name+' = require("'+require_path+'");\n'
        exports += f'    {file_name}: {file_name}(params)'+',\n'

    header = requires+'\n\nmodule.exports = (params) => {\n'

    #both helpers may return None; never let the text "None" reach the JS file
    variables = create_project_variables(dbt_source_project_path,dataform_root_path)
    variables = '' if variables is None else str(variables)

    sources = create_js_source_file(dbt_models_file_path,dataform_output_sources_path,conversion_type)
    sources = '' if sources is None else str(sources)

    index_file = header+variables+sources+exports+'\n  }\n}'

    with open(dataform_file, "w") as jmodel_file:
        jmodel_file.write(index_file)
        print(index_file)
        print ('Created index.js file')

    with open(example_file, "w") as jmodel_file:
        jmodel_file.write(example)
        print ('Created example.js file')



In [341]:
#Function used to convert dbt macros (SQL/Jinja) to a Dataform includes file (common.js)
def create_macro_files(dbt_macros_file_path,dbt_models_file_path,dataform_output_includes_path,conversion_type):
    """Convert every dbt macro file into JavaScript functions inside ``common.js``.

    Each ``*.sql`` file under ``dbt_macros_file_path`` is rewritten with a
    series of regex substitutions (``{% macro`` -> ``function``, ``%}`` ->
    ``{ return ` ``, ``{% endmacro %}`` -> `` `; } ``, ``{{x}}`` -> ``${x}``,
    ``--`` -> ``//``). The converted bodies of all files are concatenated,
    followed by a ``module.exports`` block naming every macro found, and the
    result is appended to ``<dataform_output_includes_path>/common.js``.

    When ``conversion_type`` is ``'js'``, every ``*.js`` file under the
    includes directory that contains ``${common.`` gets a
    ``const common = require(...)`` line prepended, with the path computed
    relative to that file.

    Args:
        dbt_macros_file_path: directory holding the dbt macro ``*.sql`` files.
        dbt_models_file_path: unused; kept for interface compatibility.
        dataform_output_includes_path: Dataform ``includes`` directory.
        conversion_type: ``'js'`` enables the require-header step.

    Returns:
        None. Prints a message and does nothing when the macros directory
        is missing.
    """
    #check if source directory exists
    if not os.path.exists(dbt_macros_file_path):
        print('Directory does not exists:' + dbt_macros_file_path)
        return None

    macro_files = find_files(dbt_macros_file_path, '*.sql')
    destination_full_path = dataform_output_includes_path+'/common.js'
    macro_names = re.compile(r'({%[ ]{0,9}macro)(([! ])(.*)([ ]{0,10}\())')

    footer_macro_names = ''
    converted_files = []
    #iterate on each SQL file on the dbt macros directory
    for filename in macro_files:
        with open(filename, 'r') as macro_file:
            filedata = macro_file.read()

        # find all macros (group 4 is the text between "macro " and "(")
        for match in macro_names.finditer(filedata):
            footer_macro_names += '    '+match.group(4)+',\n'

        filecontent = re.sub(r'({%[ ]{0,9}macro)','function ',filedata)
        filecontent = re.sub(r'{[ ]{0,9}%[ ]{0,9}endmacro[ ]{0,9}%[ ]{0,9}}','`;\n}',filecontent)
        filecontent = re.sub(r'%}','{\n   return `',filecontent)
        filecontent = re.sub(r'`[ ]{0,20}\n+','`',filecontent)
        filecontent = re.sub(r'({{)([^ ]+)(}})',r'${\2}',filecontent)
        filecontent = re.sub(r'--','//',filecontent)
        #keep every file's functions; previously each iteration overwrote the
        #last one, so exported names could refer to functions never written
        converted_files.append(filecontent)

    footer = '  module.exports = {\n'+footer_macro_names[:-2]+'\n  };'
    new_macro_file = '\n'.join(converted_files)+footer

    #NOTE(review): append mode duplicates the content when the converter is
    #re-run against an existing common.js - confirm this is intended
    with open(destination_full_path, 'a') as common_file:
        common_file.write(new_macro_file)

    if conversion_type == 'js':
        model_files = find_files(dataform_output_includes_path, '*.js')

        #if the file uses macros, add the require header
        for filename in model_files:
            with open(filename, 'r') as model_file:
                filedata = model_file.read()

            if not re.search(r'(\${[ ]{0,9}common.)', filedata):
                continue

            #require path relative to the model file, so models at any
            #directory depth resolve common.js (two levels deep gives
            #the former hard-coded "../../common")
            common_module = os.path.relpath(
                dataform_output_includes_path+'/common',
                os.path.dirname(filename),
            ).replace(os.sep, '/')
            if not common_module.startswith('.'):
                common_module = './'+common_module

            header = f'const common = require("{common_module}");\n'
            with open(filename, 'w') as model_file:
                model_file.write(header+filedata)



In [342]:
def dbt_dataform_converter(dataform_root_path,dbt_models_file_path,dbt_snapshots_file_path,dataform_credentials_file_path,dataform_output_models_path,dataform_output_includes_path,current_directory,conversion_type):
    """Run the whole dbt -> Dataform conversion.

    Steps, in order:

    1. ``dataform_install_configuration`` sets up the Dataform project.
    2. ``conversion_type == 'js'``: models are generated under the includes
       directory and ``index.js`` is created. Otherwise: sources, SQLX models
       and project variables are generated.
    3. Snapshots are converted.
    4. Tests and descriptions are added to the generated models.
    5. dbt macros are converted.

    Besides its arguments, the function reads the module-level
    ``dataform_output_sources_path``, ``dbt_source_project_path`` and
    ``dbt_macros_file_path``.

    Args:
        dataform_root_path: root directory of the Dataform project.
        dbt_models_file_path: dbt ``models`` directory; must exist.
        dbt_snapshots_file_path: dbt ``snapshots`` directory.
        dataform_credentials_file_path: Dataform credentials JSON file.
        dataform_output_models_path: Dataform ``definitions`` directory.
        dataform_output_includes_path: Dataform ``includes`` directory.
        current_directory: forwarded to ``dataform_install_configuration``.
        conversion_type: ``'js'`` or any other value for the SQLX flow.

    Returns:
        None. Prints a message and does nothing when the models directory
        is missing.
    """
    #check if source directory exists
    if not os.path.exists(dbt_models_file_path):
        print('Source directory does not exists:' + dbt_models_file_path)
        return None

    #installing dataform
    dataform_install_configuration(dataform_root_path,dataform_credentials_file_path,current_directory)

    if conversion_type == 'js':
        #creating models and the package entry point
        create_js_model_files(dbt_models_file_path,dataform_output_includes_path)
        create_index_file(dataform_root_path)
    else:
        #creating sources and models
        create_js_source_file(dbt_models_file_path,dataform_output_sources_path,conversion_type)
        create_sqlx_models_files(dbt_models_file_path,dataform_output_models_path)
        #converting variables
        create_project_variables(dbt_source_project_path,dataform_root_path)

    #creating snapshot files
    create_sqlx_snapshot_files(dbt_snapshots_file_path,dataform_output_models_path)
    #adding tests to models
    dataform_assertions_documentation(dbt_models_file_path,dataform_output_models_path,dataform_output_includes_path,conversion_type)
    #converting macros; the second argument is the dbt models path (the
    #includes path used to be passed twice)
    create_macro_files(dbt_macros_file_path,dbt_models_file_path,dataform_output_includes_path,conversion_type)
        

In [344]:
# Run the conversion with the paths and conversion type configured in the cells above.
dbt_dataform_converter(
    dataform_root_path=dataform_root_path,
    dbt_models_file_path=dbt_models_file_path,
    dbt_snapshots_file_path=dbt_snapshots_file_path,
    dataform_credentials_file_path=dataform_credentials_file_path,
    dataform_output_models_path=dataform_output_models_path,
    dataform_output_includes_path=dataform_output_includes_path,
    current_directory=current_directory,
    conversion_type=conversion_type,
)

Writing project files...

[32mDirectories successfully created:[0m
  /Users/guilhermealcantara/OneDrive/brf consulting/dataform/test_bill_dot_com
  /Users/guilhermealcantara/OneDrive/brf consulting/dataform/test_bill_dot_com/definitions
  /Users/guilhermealcantara/OneDrive/brf consulting/dataform/test_bill_dot_com/includes
[32mFiles successfully written:[0m
  /Users/guilhermealcantara/OneDrive/brf consulting/dataform/test_bill_dot_com/dataform.json
  /Users/guilhermealcantara/OneDrive/brf consulting/dataform/test_bill_dot_com/package.json
  /Users/guilhermealcantara/OneDrive/brf consulting/dataform/test_bill_dot_com/.gitignore
[32mNPM packages successfully installed.[0m
Installing NPM dependencies...

[32mProject dependencies successfully installed.[0m
Generated file: ../../dataform/test_bill_dot_com/includes/staging/CUSTOMERS/V_BDC_CUSTOMERS_STG.js
Generated file: ../../dataform/test_bill_dot_com/includes/staging/CUSTOMERS/V_BDC_CUSTOMERS_BANK_ACCOUNT_STG.js
Generated file: ..

In [None]:
#dbt_dataform_converter(dataform_root_path,dbt_models_file_path,dbt_snapshots_file_path,dataform_credentials_file_path,dataform_output_models_path,dataform_output_includes_path,current_directory)