In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

def parse_informatica_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    # Data structures to hold the lineage information
    lineage = []
    
    # Extract sources
    sources = {}
    for source in root.findall(".//SOURCE"):
        source_name = source.get('NAME')
        if source_name not in sources:
            sources[source_name] = []
        for field in source.findall(".//SOURCEFIELD"):
            field_name = field.get('NAME')
            sources[source_name].append(field_name)
    
    # Extract transformations
            transformations = {}
            for transformation in root.findall(".//TRANSFORMATION"):
                trans_name = transformation.get('NAME')
                trans_type = transformation.get('TYPE')
                transformations[trans_name] = {
                    'type': trans_type,
                    'fields': []
                    }
                for field in transformation.findall(".//TRANSFORMFIELD"):
                    field_name = field.get('NAME')
                    transformations[trans_name]['fields'].append(field_name)
    
    # Extract instances
                    instances = {}
                    for instance in root.findall(".//INSTANCE"):
                        inst_name = instance.get('NAME')
                        trans_name = instance.get('TRANSFORMATION_NAME')
                        instances[inst_name] = trans_name
    
    # Extract connectors and build lineage
                        for connector in root.findall(".//CONNECTOR"):
                            from_field = connector.get('FROMFIELD')
                            to_field = connector.get('TOFIELD')
                            from_instance = connector.get('FROMINSTANCE')
                            to_instance = connector.get('TOINSTANCE')
                            from_trans = instances[from_instance]
                            to_trans = instances[to_instance]
                            lineage.append([from_trans, from_field, to_trans, to_field])
    
    return sources, transformations, lineage

def build_lineage_table(sources, transformations, lineage):
    data = []
    
    # Add sources to data
    for source_name, fields in sources.items():
        for field in fields:
            data.append([source_name, field, '', '', '', '', ''])
    
    # Add transformations and lineage to data
    for from_trans, from_field, to_trans, to_field in lineage:
        trans_type = transformations[to_trans]['type'] if to_trans in transformations else 'Target Definition'
        data.append([from_trans, from_field, '->', to_trans, to_field, trans_type, ''])
    
    df = pd.DataFrame(data, columns=['Source/Transformation', 'Field', '', 'Target Transformation', 'Target Field', 'Transformation Type', ''])
    
    return df

def save_lineage_to_excel(df, output_file):
    df.to_excel(output_file, index=False)

if __name__ == "__main__":
    xml_file = 'SDE_WC_AR_BRXACT_FS'  # Replace with your XML file path
    sources, transformations, lineage = parse_informatica_xml(xml_file)
    lineage_df = build_lineage_table(sources, transformations, lineage)
    save_lineage_to_excel(lineage_df, 'data_lineage1.xlsx')

print("Data lineage has been saved to data_lineage.xlsx")

FileNotFoundError: [Errno 2] No such file or directory: 'SDE_WC_AR_BRXACT_FS'

## using deepseek r1

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

def parse_informatica_mapping(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    ns = {'pm': 'http://www.informatica.com/Repository/Mapping'}  # Namespace

    # Extract Sources
    sources = {}
    for source in root.findall('.//pm:SOURCE', ns):
        source_name = source.get('NAME')
        fields = []
        for field in source.findall('pm:SOURCEFIELD', ns):
            field_info = {
                'NAME': field.get('NAME'),
                'DATATYPE': field.get('DATATYPE'),
                'PRECISION': field.get('PRECISION'),
                'SCALE': field.get('SCALE')
            }
            fields.append(field_info)
        sources[source_name] = fields

    # Extract Target
    target = {}
    target_fields = []
    for tgt in root.findall('.//pm:TARGET', ns):
        target_name = tgt.get('NAME')
        for field in tgt.findall('pm:TARGETFIELD', ns):
            target_info = {
                'NAME': field.get('NAME'),
                'DATATYPE': field.get('DATATYPE'),
                'PRECISION': field.get('PRECISION'),
                'SCALE': field.get('SCALE')
            }
            target_fields.append(target_info)
    target[target_name] = target_fields

    # Extract Transformations
    transformations = []
    for trans in root.findall('.//pm:TRANSFORMATION', ns):
        trans_info = {
            'NAME': trans.get('NAME'),
            'TYPE': trans.get('TYPE'),
            'DESCRIPTION': trans.get('DESCRIPTION'),
            'FIELDS': []
        }
        
        for field in trans.findall('pm:TRANSFORMFIELD', ns):
            field_info = {
                'NAME': field.get('NAME'),
                'DATATYPE': field.get('DATATYPE'),
                'EXPRESSION': field.get('EXPRESSION'),
                'EXPRESSIONTYPE': field.get('EXPRESSIONTYPE'),
                'PORTTYPE': field.get('PORTTYPE')
            }
            trans_info['FIELDS'].append(field_info)
        
        transformations.append(trans_info)

    # Create Data Lineage DataFrame
    lineage_data = []
    
    # Map Target to Sources
    for tgt_field in target[target_name]:
        target_col = tgt_field['NAME']
        found = False
        
        # Check direct source mappings
        for src_name, src_fields in sources.items():
            for src_field in src_fields:
                if src_field['NAME'] == target_col:
                    lineage_data.append({
                        'Target Column': target_col,
                        'Source Table': src_name,
                        'Source Column': src_field['NAME'],
                        'Transformation Logic': 'Direct mapping'
                    })
                    found = True
                    break
            if found: break
        
        # Check transformation logic if not found in direct mapping
        if not found:
            logic = []
            for trans in transformations:
                for field in trans['FIELDS']:
                    if field['NAME'] == target_col and field['PORTTYPE'] == 'OUTPUT':
                        if field['EXPRESSION']:
                            logic.append(f"{trans['NAME']} ({trans['TYPE']}): {field['EXPRESSION']}")
                        else:
                            logic.append(f"{trans['NAME']} ({trans['TYPE']})")
            
            lineage_data.append({
                'Target Column': target_col,
                'Source Table': 'Multiple',
                'Source Column': 'Transformed',
                'Transformation Logic': ' | '.join(logic) if logic else 'Unknown'
            })

    return pd.DataFrame(lineage_data)

# Usage
xml_file_path = 'Python_tutorials_mapping.XML'
output_excel = 'Data_Lineage_Report.xlsx'

# Generate DataFrame
df = parse_informatica_mapping(xml_file_path)

# Save to Excel
df.to_excel(output_excel, index=False)
print(f"Data lineage report generated: {output_excel}")

UnboundLocalError: cannot access local variable 'target_name' where it is not associated with a value