# Librerías



In [1]:
from pathlib import Path
import sqlglot


# Variables globales



In [2]:
# --- Input (T-SQL side) ---
# Directory scanned recursively for the source CSV samples; it also holds the
# full T-SQL installation script.
READ_DATA_PATH = '../data/database/tsql'
FULL_SCRIPT_PATH = f'{READ_DATA_PATH}/instawdbdw.sql'
# Encoding used to open the full T-SQL script.
SQL_ENCODING = 'UTF-16'

# Value substituted for the `$(SqlSamplesSourceDataPath)` placeholder of the script.
SAMPLES_ABS_PATH = ''
# Encoding used to read the source CSV files before re-encoding them.
SAMPLES_ENCODING = 'UTF16'
# Field delimiter written into the generated COPY statements.
SAMPLES_DELIMITER = '|'

# --- sqlglot dialects ---
READ_DIALECT = 'tsql'
WRITE_DIALECT = 'postgres'

# --- Target database ---
# DB_NAME replaces `$(DatabaseName)`; SCHEMA replaces the `[dbo]` schema.
DB_NAME = 'adventure_works_dw'
SCHEMA = 'adventure_works'

# --- Transpiling strategies referenced by each entry of `sections_params` ---
TRANSPILING_MANUAL = 'MANUAL'
TRANSPILING_SQLGLOT = 'SQLGLOT'

# --- Output (Postgres side) ---
# Destination for the re-encoded CSV files and for the transpiled script.
WRITE_DATA_PATH = '../data/database/postgres'
TRANSPILED_SCRIPT_PATH = f'{WRITE_DATA_PATH}/create_adventure_works_postgres.sql'
# Encoding used when writing CSV files and the transpiled script; it is also
# emitted (without the dash) in the ENCODING option of the COPY statements.
OUTPUT_ENCODING = 'UTF-8'

# Recodificar ficheros CSV de UTF-16 a UTF-8

Procedemos a obtener todos los ficheros CSV almacenados en el directorio de lectura:

In [3]:
# Recursively collect every CSV under the read directory as POSIX-style paths.
csv_files_list = [
    found_csv.as_posix()
    for found_csv in Path(READ_DATA_PATH).rglob('*.csv')
]

csv_files_list

['../data/database/tsql/DatabaseLog.csv',
 '../data/database/tsql/DimAccount.csv',
 '../data/database/tsql/DimCurrency.csv',
 '../data/database/tsql/DimCustomer.csv',
 '../data/database/tsql/DimDate.csv',
 '../data/database/tsql/DimDepartmentGroup.csv',
 '../data/database/tsql/DimEmployee.csv',
 '../data/database/tsql/DimGeography.csv',
 '../data/database/tsql/DimOrganization.csv',
 '../data/database/tsql/DimProduct.csv',
 '../data/database/tsql/DimProductCategory.csv',
 '../data/database/tsql/DimProductSubcategory.csv',
 '../data/database/tsql/DimPromotion.csv',
 '../data/database/tsql/DimReseller.csv',
 '../data/database/tsql/DimSalesReason.csv',
 '../data/database/tsql/DimSalesTerritory.csv',
 '../data/database/tsql/DimScenario.csv',
 '../data/database/tsql/FactAdditionalInternationalProductDescription.csv',
 '../data/database/tsql/FactCallCenter.csv',
 '../data/database/tsql/FactCurrencyRate.csv',
 '../data/database/tsql/FactFinance.csv',
 '../data/database/tsql/FactInternetSales.c

Ahora, para cada fichero en el listado, procederemos a abrirlo con el encoding de lectura, y a guardarlo con el encoding de escritura:

In [4]:
# Maps each CSV file name to the absolute POSIX path of its re-encoded copy.
csv_files_output_dict = {}

# Create the destination directory up front so the cell also works on a fresh
# checkout where the output folder does not exist yet.
csv_output_dir = Path(WRITE_DATA_PATH)
csv_output_dir.mkdir(parents=True, exist_ok=True)

for csv_file_path in csv_files_list:
    csv_file_name = Path(csv_file_path).name
    csv_output_path = csv_output_dir / csv_file_name

    # Read with the source encoding ...
    with open(csv_file_path, 'r', encoding=SAMPLES_ENCODING) as f:
        csv_file = f.read()

    # ... and write the same text back with the output encoding.
    with open(csv_output_path, 'w', encoding=OUTPUT_ENCODING) as f:
        f.write(csv_file)

    csv_files_output_dict[csv_file_name] = csv_output_path.resolve().as_posix()

csv_files_output_dict

{'DatabaseLog.csv': 'C:/python/ucm-tfm-grupo-4/data/database/postgres/DatabaseLog.csv',
 'DimAccount.csv': 'C:/python/ucm-tfm-grupo-4/data/database/postgres/DimAccount.csv',
 'DimCurrency.csv': 'C:/python/ucm-tfm-grupo-4/data/database/postgres/DimCurrency.csv',
 'DimCustomer.csv': 'C:/python/ucm-tfm-grupo-4/data/database/postgres/DimCustomer.csv',
 'DimDate.csv': 'C:/python/ucm-tfm-grupo-4/data/database/postgres/DimDate.csv',
 'DimDepartmentGroup.csv': 'C:/python/ucm-tfm-grupo-4/data/database/postgres/DimDepartmentGroup.csv',
 'DimEmployee.csv': 'C:/python/ucm-tfm-grupo-4/data/database/postgres/DimEmployee.csv',
 'DimGeography.csv': 'C:/python/ucm-tfm-grupo-4/data/database/postgres/DimGeography.csv',
 'DimOrganization.csv': 'C:/python/ucm-tfm-grupo-4/data/database/postgres/DimOrganization.csv',
 'DimProduct.csv': 'C:/python/ucm-tfm-grupo-4/data/database/postgres/DimProduct.csv',
 'DimProductCategory.csv': 'C:/python/ucm-tfm-grupo-4/data/database/postgres/DimProductCategory.csv',
 'DimP

# Traducción de Script T-SQL a Postgres

Cargamos todo el script:

In [19]:
# Read the whole T-SQL installation script into memory as text.
full_script = Path(FULL_SCRIPT_PATH).read_text(encoding=SQL_ENCODING)

Creamos funciones auxiliares y algunas definiciones adicionales a utilizar en el proceso:

In [24]:
# Literal substitutions applied, in this order, by `replace_values`.
# Each entry is stored as a dict with 'old' and 'new' keys.
REPLACEMENT_DICTS = [
    {'old': old_text, 'new': new_text}
    for old_text, new_text in (
        ('$(DatabaseName)', DB_NAME),
        ('$(SqlSamplesSourceDataPath)', SAMPLES_ABS_PATH),
        ('[dbo]', f'[{SCHEMA}]'),
        ('ON [PRIMARY]', ''),
        ('WITH CHECK ADD', 'ADD'),
        ('KEY NONCLUSTERED', 'KEY'),
        ('KEY CLUSTERED', 'KEY'),
        (' NONCLUSTERED INDEX', ' INDEX'),
        (' CLUSTERED INDEX', ' INDEX'),
    )
]


# Description of every section of the T-SQL script that gets transpiled.
# Keys of each entry:
#   - 'section':            short name of the section; also passed to the
#                           cleaning and manual-transpiling helpers.
#   - 'start_marker':       literal text where the section begins; it is also
#                           written as the section header in the output script.
#   - 'end_marker':         literal text where the section ends (not included).
#   - 'ignore_sub_section': optional list of (start, end) literal pairs; the
#                           text from `start` up to (not including) `end` is
#                           removed from the section.
#   - 'transpiling':        TRANSPILING_SQLGLOT or TRANSPILING_MANUAL.
# The processing loop adds further keys to these dicts (source, cleaned and
# transpiled text of each section).
sections_params = [
    {
        'section': 'db',
        'start_marker': '-- ****************************************\n-- Create Database\n-- ****************************************',
        'end_marker': "*** Checking for $(DatabaseName) Database';\n",
        'transpiling': TRANSPILING_MANUAL
    },
    {
        'section': 'tables',
        'start_marker': '-- ******************************************************\n-- Create tables\n-- ******************************************************',
        'end_marker': 'CREATE TABLE [dbo].[sysdiagrams](',
        'ignore_sub_section': [
            ('CREATE TABLE [dbo].[AdventureWorksDWBuildVersion](\n', 'CREATE TABLE [dbo].[DimAccount](\n')
        ],
        'transpiling': TRANSPILING_SQLGLOT
    },
    {
        'section': 'load',
        'start_marker': '-- ******************************************************\n-- Load data\n-- ******************************************************',
        'end_marker': '-- ******************************************************\n-- Add Primary Keys\n-- ******************************************************',
        'ignore_sub_section': [
            ("PRINT 'Loading [dbo].[AdventureWorksDWBuildVersion]';\n", "PRINT 'Loading [dbo].[DimAccount]';\n")
        ],
        'transpiling': TRANSPILING_MANUAL
    },
    {
        'section': 'primary_keys',
        'start_marker': '-- ******************************************************\n-- Add Primary Keys\n-- ******************************************************',
        'end_marker': '-- ******************************************************\n-- Add Indexes\n-- ******************************************************',
        'ignore_sub_section': [
            ("PRINT '*** Adding Primary Keys';\n", "ALTER TABLE [dbo].[DimAccount] WITH CHECK ADD\n")
        ],
        'transpiling': TRANSPILING_SQLGLOT
    },
    {
        'section': 'indexes',
        'start_marker': '-- ******************************************************\n-- Add Indexes\n-- ******************************************************',
        'end_marker': '-- ****************************************\n-- Create Foreign key constraints\n-- ****************************************',
        'transpiling': TRANSPILING_SQLGLOT
    },
    {
        'section': 'foreign_key',
        'start_marker': '-- ****************************************\n-- Create Foreign key constraints\n-- ****************************************',
        'end_marker': '-- ******************************************************\n-- Add database views.\n-- ******************************************************',
        'transpiling': TRANSPILING_SQLGLOT
    }
]



def trf_strip_upper(line: str) -> str:
    """
    Normalizes a line for case-insensitive prefix checks.

    Args:
        line: The text to normalize.

    Returns:
        The text without surrounding whitespace, converted to uppercase.
    """

    trimmed = line.strip()
    return trimmed.upper()



def replace_values(script: str, replacement_dicts: list = REPLACEMENT_DICTS) -> str:
    """
    Applies a sequence of literal text substitutions to a script.

    The substitutions are applied one after another, in list order, so a
    later entry operates on the result of the earlier ones.

    Args:
        script: The script text to transform.
        replacement_dicts: List of dictionaries holding the text to search
                           under 'old' and its substitute under 'new'.
                           Defaults to REPLACEMENT_DICTS.

    Returns:
        The script text after every substitution has been applied.
    """

    result = script

    for mapping in replacement_dicts:
        old_text, new_text = mapping['old'], mapping['new']
        result = result.replace(old_text, new_text)

    return result



def clean_lines(script: str, section: str = None) -> str:
    """
    Cleans up a script string by removing PRINT statements, GO batch
    separators and comment lines, and by applying section-specific rules.

    Lines are matched case-insensitively and ignoring surrounding whitespace:

    * ``PRINT`` lines are dropped only when ``PRINT`` is a whole keyword, so
      a line starting with an identifier such as ``PrinterName`` is kept.
    * ``GO`` lines are dropped only when the line is a batch separator
      (``GO``, ``GO <count>``, optionally followed by ``;`` or a ``--``
      comment), so a line starting with e.g. ``GoodsKey`` is kept.
    * Any line starting with ``--`` is dropped, with or without a space
      after the dashes.

    Args:
        script: The input script string.
        section: The name of the script section (e.g., 'indexes') to apply
                 section-specific rules. Defaults to None.

    Returns:
        The cleaned script string, with the kept lines joined by '\\n'.
    """

    cleaned_lines = []

    for line in script.splitlines():
        # Normalize once per line instead of once per check.
        normalized = line.strip().upper()

        # PRINT must end at a word boundary to count as the statement.
        if normalized.startswith('PRINT'):
            following = normalized[5:6]
            if not (following.isalnum() or following == '_'):
                continue

        # GO is a batch separator only when it is alone on the line
        # (an optional repeat count, ';' or trailing comment is tolerated).
        batch_tokens = normalized.split('--', 1)[0].rstrip('; \t').split()
        if batch_tokens and batch_tokens[0] == 'GO' and (
            len(batch_tokens) == 1
            or (len(batch_tokens) == 2 and batch_tokens[1].isdigit())
        ):
            continue

        if normalized.startswith('--'):
            continue

        # In the indexes section the trailing `)WITH (...)` options clause
        # is replaced by a plain statement terminator.
        if section == 'indexes' and normalized.startswith(')WITH'):
            line = ');'

        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)



def manual_transpiling(script: str, section: str) -> str:
    """
    Hand-written T-SQL to Postgres conversion for the sections that are not
    sent through sqlglot.

    * 'db': every line is kept; the line starting with CREATE DATABASE is
      additionally preceded by a DROP DATABASE IF EXISTS statement and
      followed by a `\\c` connect command, a CREATE SCHEMA statement and a
      DROP SCHEMA for `public`.
    * 'load': only lines starting with BULK INSERT are used. Each one becomes
      a COPY statement whose quoted file name is replaced by the matching
      path from `csv_files_output_dict`, followed by a WITH (...) options
      block. All other lines are discarded.

    Any other section name yields an empty string.

    Args:
        script: The text of the section to convert.
        section: The section name ('db' or 'load').

    Returns:
        The converted section text, lines joined by '\\n'.
    """

    output_lines = []

    if section == 'db':
        for raw_line in script.strip().splitlines():
            if trf_strip_upper(raw_line).startswith('CREATE DATABASE'):
                output_lines.extend([
                    f'\nDROP DATABASE IF EXISTS {DB_NAME};\n\n',
                    raw_line,
                    f'\n\\c {DB_NAME};',
                    f'\nCREATE SCHEMA {SCHEMA};',
                    '\nDROP SCHEMA IF EXISTS public CASCADE;',
                ])
            else:
                output_lines.append(raw_line)

    elif section == 'load':
        for raw_line in script.splitlines():
            if not trf_strip_upper(raw_line).startswith('BULK INSERT'):
                continue

            # Turn the statement head into COPY with double-quoted identifiers.
            copy_line = (
                raw_line
                .replace('BULK INSERT', 'COPY')
                .replace('[', '"')
                .replace(']', '"')
            )

            # The file name is the first single-quoted token of the line.
            source_name = copy_line.split("'")[1]
            copy_line = copy_line.replace(
                source_name,
                csv_files_output_dict[source_name]
            )

            encoding_name = OUTPUT_ENCODING.replace('-', '')
            output_lines += [
                copy_line + ' WITH (',
                f"    DELIMITER '{SAMPLES_DELIMITER}',",
                "    FORMAT CSV,",
                "    HEADER FALSE,",
                f"    ENCODING '{encoding_name}'",
                ");\n",
            ]

    return '\n'.join(output_lines)



def final_post_processing(script: str) -> str:
    """
    Applies the last literal fix-ups to an already transpiled script.

    The substitutions rewrite a few data types and identity clauses, strip
    the (NON)CLUSTERED keywords and add the missing ';' between consecutive
    CREATE TABLE statements. They are applied in the order listed below.

    Args:
        script: The transpiled script string.

    Returns:
        The script string after all substitutions.
    """

    substitutions = (
        (' GENERATED AS ', ' GENERATED BY DEFAULT AS '),
        (' bit ', ' BOOLEAN '),
        (' BIT ', ' BOOLEAN '),
        (' money ', ' NUMERIC(19,4) '),
        (' MONEY ', ' NUMERIC(19,4) '),
        (' BYTEA(MAX) ', ' BYTEA '),
        (' VARCHAR(MAX) ', ' TEXT '),
        (' NONCLUSTERED', ''),
        (' CLUSTERED', ''),
        (')\n\nCREATE TABLE', ');\n\nCREATE TABLE'),
    )

    processed = script
    for pattern, substitute in substitutions:
        processed = processed.replace(pattern, substitute)

    return processed



A continuación, implementamos el script que permitirá:

1.   Capturar las secciones de interés del script, excluyendo de ellas los fragmentos no deseados.
2.   Aplicar transformaciones de limpieza sobre la estructura de las secciones de interés.
3.  Traducir del dialecto de entrada (T-SQL) al de salida (Postgres) las secciones deseadas.
4.  Finalmente, unificar todas las secciones traducidas y exportarlas a un fichero `.sql`.




In [25]:
# Transpiled text of every section, in processing order; joined at the end.
transpiled_sections = []

for section_data in sections_params:
    section = section_data['section']
    print(f'Transpiling section `{section}` ...')

    start_marker = section_data['start_marker']
    end_marker = section_data['end_marker']

    # The end marker is searched only after the start marker, so an earlier
    # occurrence of the same text cannot produce an empty or wrong slice.
    start_index = full_script.find(start_marker)
    end_index = (
        full_script.find(end_marker, start_index + len(start_marker))
        if start_index != -1
        else -1
    )

    print(f'> Start index: {start_index}')
    print(f'> End index: {end_index}')

    # Fail fast: continuing would either raise a NameError (first section) or
    # silently reuse the text of the previous section.
    if start_index == -1 or end_index == -1:
        raise ValueError(
            f'Markers for section `{section}` not found in `{FULL_SCRIPT_PATH}` '
            f'(start index: {start_index}, end index: {end_index})'
        )

    section_script = full_script[start_index:end_index]
    section_data[f'{READ_DIALECT}_script'] = section_script
    print(f'> {READ_DIALECT} section obtained')


    # Remove the configured sub-sections, keeping the text around them.
    cleaned_script = section_script
    if section_data.get('ignore_sub_section'):
        kept_fragments = []
        init_index = 0

        for ignore_start, ignore_end in section_data['ignore_sub_section']:
            ignore_start_index = section_script.find(ignore_start, init_index)
            ignore_end_index = (
                section_script.find(ignore_end, ignore_start_index)
                if ignore_start_index != -1
                else -1
            )

            # A -1 index used in a slice would silently corrupt the section.
            if ignore_start_index == -1 or ignore_end_index == -1:
                raise ValueError(
                    f'Sub-section to ignore not found in section `{section}` '
                    f'(start index: {ignore_start_index}, end index: {ignore_end_index})'
                )

            kept_fragments.append(section_script[init_index:ignore_start_index])
            init_index = ignore_end_index

        kept_fragments.append(section_script[init_index:])
        cleaned_script = ''.join(kept_fragments)
        print('> Sub-sections ignored')

    cleaned_script = replace_values(cleaned_script)
    cleaned_script = clean_lines(cleaned_script, section)
    section_data['cleaned_script'] = cleaned_script
    print(f'> {READ_DIALECT} section cleaned')


    # Exactly one strategy must apply; an unknown value is a configuration error.
    transpiling = section_data['transpiling']

    if transpiling == TRANSPILING_SQLGLOT:
        transpiled_script = sqlglot.transpile(
            cleaned_script,
            read= READ_DIALECT,
            write= WRITE_DIALECT,
            pretty= True,
            indent= 4,
            pad= 4,
            normalize_functions= 'upper'
        )
        section_data[f'{WRITE_DIALECT}_script'] = '\n\n'.join(transpiled_script)

    elif transpiling == TRANSPILING_MANUAL:
        section_data[f'{WRITE_DIALECT}_script'] = manual_transpiling(cleaned_script, section)

    else:
        raise ValueError(
            f'Unknown transpiling mode `{transpiling}` for section `{section}`'
        )

    print(f"> Section transpiled ({transpiling}) {READ_DIALECT} --> {WRITE_DIALECT}")

    # Each section is emitted as: header, blank line, body, three blank lines.
    transpiled_sections.append(
        section_data['start_marker']
        + '\n' * 2
        + final_post_processing(section_data[f'{WRITE_DIALECT}_script'])
        + '\n' * 4
    )

    print()


output_transpiled_script = ''.join(transpiled_sections)

# Make sure the destination directory exists before writing the script.
Path(TRANSPILED_SCRIPT_PATH).parent.mkdir(parents=True, exist_ok=True)

with open(TRANSPILED_SCRIPT_PATH, 'w', encoding= OUTPUT_ENCODING) as f:
    f.write(output_transpiled_script)

print(f'--> Transpiled script saved in `{TRANSPILED_SCRIPT_PATH}` <--')

Transpiling section `db` ...
> Start index: 2603
> End index: 2809
> tsql section obtained
> tsql section cleaned
> Section transpiled (MANUAL) tsql --> postgres

Transpiling section `tables` ...
> Start index: 7004
> End index: 21633
> tsql section obtained
> Sub-sections ignored
> tsql section cleaned
> Section transpiled (SQLGLOT) tsql --> postgres

Transpiling section `load` ...
> Start index: 22079
> End index: 31133
> tsql section obtained
> Sub-sections ignored
> tsql section cleaned
> Section transpiled (MANUAL) tsql --> postgres

Transpiling section `primary_keys` ...
> Start index: 31133
> End index: 36504
> tsql section obtained
> Sub-sections ignored
> tsql section cleaned
> Section transpiled (SQLGLOT) tsql --> postgres

Transpiling section `indexes` ...
> Start index: 36504
> End index: 40192
> tsql section obtained
> tsql section cleaned
> Section transpiled (SQLGLOT) tsql --> postgres

Transpiling section `foreign_key` ...
> Start index: 40192
> End index: 47319
> tsql 