# RPG Legacy Dependency Analyzer

This tool parses legacy RPG/SQL source code, extracts dependencies (Program-to-Program calls and Program-to-Table accesses), loads them into a Neo4j Graph Database, and runs clustering algorithms to find "islands" of isolated code.

In [1]:
# --- CELL 1: SETUP & CONFIGURATION ---
import os
import re
import pandas as pd
from neo4j import GraphDatabase

# Configuration
REPO_PATH = "./src"             # Folder containing your .rpgle, .sqlrpgle files
NEO4J_URI = "bolt://localhost:7687"  # Localhost works because of 'network_mode: service:neo4j'
NEO4J_AUTH = ("neo4j", "password")

# Verify Environment
print(f"‚úÖ Configuration Loaded.")
print(f"üìÇ Scanning Directory: {os.path.abspath(REPO_PATH)}")
if not os.path.exists(REPO_PATH):
    os.makedirs(REPO_PATH)
    print(f"‚ö†Ô∏è  Directory '{REPO_PATH}' was missing, so I created it. Please add files there!")

‚úÖ Configuration Loaded.
üìÇ Scanning Directory: /workspace/src


In [2]:
# --- CELL 2: UNIVERSAL PARSER (FIXED, FREE, & MIXED MODE) ---
import re
import os
import pandas as pd

def parse_rpg_file(filepath):
    dependencies = []
    full_filename = os.path.basename(filepath)
    filename = full_filename.split('.')[0].upper()
    extension = full_filename.split('.')[-1].upper() if '.' in full_filename else ''
    
    clean_path = os.path.relpath(filepath, start=".")
    if clean_path.startswith("./"): clean_path = clean_path[2:]
    
    try:
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            lines = f.readlines()
    except Exception as e:
        print(f"‚ùå Error reading {filepath}: {e}")
        return []
        
    # --- GLOBAL FLAGS ---
    is_fully_free = False
    if lines and len(lines) > 0:
        # Check for **FREE directive at the very top
        if lines[0].strip().upper().startswith('**FREE'):
            is_fully_free = True

    # State Variable for Mixed Mode
    in_free_block = False 
    is_sql_block = False
    
    for line_num, raw_line in enumerate(lines, 1):
        line = raw_line.strip()
        if not line: continue
        
        upper_line = line.upper()

        # --- STEP 0: HANDLE STATE SWITCHING (/FREE & /END-FREE) ---
        # Directives often start with '/' in column 7, but we check liberally
        if upper_line.startswith('/FREE') or upper_line.startswith('//FREE'):
            in_free_block = True
            continue # Skip the directive line itself
        
        if upper_line.startswith('/END-FREE') or upper_line.startswith('//END-FREE'):
            in_free_block = False
            continue

        # "Effective" Free Mode: True if file is **FREE OR we are inside a /FREE block
        is_free_context = is_fully_free or in_free_block

        # --- STEP 1: SMART COMMENT STRIPPING ---
        # If we are NOT in a free context, we MUST respect Column 7 comments
        if not is_free_context:
            if len(raw_line) > 6 and raw_line[6] == '*': 
                continue
        
        # Universal: Strip // and -- comments (Valid in both modes usually, but critical for Free)
        if '//' in line: line = line.split('//')[0].strip()
        if '--' in line: line = line.split('--')[0].strip()
        if not line: continue

        # Metadata
        source_meta = {
            'source': filename,
            'source_path': clean_path,
            'source_ext': extension,
            'line': line_num,
            'statement': line
        }

        # --- A. Modern "DCL-F" (Free Format File Declaration) ---
        # STRICTLY for Free Contexts
        if is_free_context:
            dcl_match = re.search(r'\bDCL-F\s+(\w+)', line, re.IGNORECASE)
            if dcl_match:
                target = dcl_match.group(1).upper()
                action = 'READ' 
                if 'USAGE(*OUTPUT' in upper_line: action = 'WRITE'
                elif 'USAGE(*UPDATE' in upper_line: action = 'UPDATE'
                elif 'USAGE(*INPUT' in upper_line: action = 'READ'
                
                item = source_meta.copy()
                item.update({'target': target, 'type': 'ACCESSES', 'action': action})
                dependencies.append(item)

        # --- B. Legacy "F-Specs" (Fixed Format) ---
        # STRICTLY for Fixed Contexts (cannot write F-Specs inside /FREE)
        if not is_free_context and upper_line.startswith('F') and len(line) >= 17:
            file_type = line[16].upper()
            if file_type in ['I', 'O', 'U', 'C']:
                target = line[6:16].strip().upper()
                # Anti-False-Positive check
                if '=' not in target and '(' not in target:
                    action = 'WRITE' if file_type == 'O' else 'READ'
                    if file_type == 'U': action = 'UPDATE'
                    if file_type == 'C': action = 'READ/WRITE'
                    
                    item = source_meta.copy()
                    item.update({'target': target, 'type': 'ACCESSES', 'action': action})
                    dependencies.append(item)

        # --- C. Universal OpCodes (CHAIN, READ, WRITE) ---
        opcode_match = re.search(r'\b(CHAIN|READ|READE|READP|WRITE|UPDAT|DELETE)\s+(\w+)', line, re.IGNORECASE)
        if opcode_match:
            op = opcode_match.group(1).upper()
            target = opcode_match.group(2).upper()
            action = 'WRITE' if op in ['WRITE', 'UPDAT', 'DELETE'] else 'READ'
            
            item = source_meta.copy()
            item.update({'target': target, 'type': 'ACCESSES', 'action': action})
            dependencies.append(item)

        # --- D. Universal Calls (CALL, CALLP) ---
        call_match = re.search(r'\b(CALL|CALLB|CALLP)\s+[\' "]?(\w+)[\' "]?', line, re.IGNORECASE)
        if call_match:
            target = call_match.group(2).upper()
            item = source_meta.copy()
            item.update({'target': target, 'type': 'CALLS', 'action': 'EXECUTE'})
            dependencies.append(item)

        # --- E. Embedded SQL ---
        if 'EXEC SQL' in upper_line: is_sql_block = True
        if ';' in line: is_sql_block = False
        
        if is_sql_block or 'EXEC SQL' in upper_line:
            sql_match = re.search(r'\b(FROM|JOIN|INTO|UPDATE|INSERT INTO)\s+(\w+)', line, re.IGNORECASE)
            if sql_match:
                raw_target = sql_match.group(2).upper()
                target = raw_target.split('.')[-1]
                
                item = source_meta.copy()
                item.update({'target': target, 'type': 'ACCESSES', 'action': 'SQL'})
                dependencies.append(item)

    return dependencies

# --- Execution ---
all_deps = []
files_scanned = 0
ALLOWED_EXTENSIONS = ('.rpgle', '.sqlrpgle', '.rpg', '.clp', '.clle')

for root, dirs, files in os.walk(REPO_PATH):
    for file in files:
        if file.lower().endswith(ALLOWED_EXTENSIONS):
            files_scanned += 1
            all_deps.extend(parse_rpg_file(os.path.join(root, file)))

print(f"‚úÖ Universal Scan Complete. Processed {files_scanned} files.")
df_deps = pd.DataFrame(all_deps)
if not df_deps.empty:
    display(df_deps.head())
else:
    print("‚ö†Ô∏è No RPG files found.")

‚úÖ Universal Scan Complete. Processed 364 files.


Unnamed: 0,source,source_path,source_ext,line,statement,target,type,action
0,LOADCUST2,src/IBM-i-RPG-Free-CLP-Code/5250_Subfile/LOADC...,CLLE,19,SUBMIT: SBMJOB CMD(CALL PGM(LOADCUSTR)...,PGM,CALLS,EXECUTE
1,LOADCUSTR,src/IBM-i-RPG-Free-CLP-Code/5250_Subfile/LOADC...,SQLRPGLE,102,from lennons1.csz,LENNONS1,ACCESSES,SQL
2,LOADCUSTR,src/IBM-i-RPG-Free-CLP-Code/5250_Subfile/LOADC...,SQLRPGLE,112,from lennons1.csz,LENNONS1,ACCESSES,SQL
3,LOADCUSTR,src/IBM-i-RPG-Free-CLP-Code/5250_Subfile/LOADC...,SQLRPGLE,120,exec sql fetch from csz_cur into :csz;,CSZ_CUR,ACCESSES,SQL
4,LOADCUSTR,src/IBM-i-RPG-Free-CLP-Code/5250_Subfile/LOADC...,SQLRPGLE,203,exec sql insert into lennons1.custmast values(...,LENNONS1,ACCESSES,SQL


In [3]:
# --- CELL 3: NEO4J LOADER (FILES + LINE NUMBERS) ---

def create_constraints(driver):
    with driver.session() as session:
        # 1. Unique constraints for all node types
        session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (p:Program) REQUIRE p.name IS UNIQUE")
        session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (t:Table) REQUIRE t.name IS UNIQUE")
        session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (f:File) REQUIRE f.path IS UNIQUE")
        print("üîí Constraints Verified.")

def load_data(tx, dataframe):
    query = """
    UNWIND $batch AS row
    
    // 1. Create the Physical File Node & Logical Program Node
    MERGE (f:File {path: row.source_path})
    SET f.name = row.source + '.' + row.source_ext,
        f.extension = row.source_ext
    
    MERGE (p:Program {name: row.source})
    MERGE (p)-[:DEFINED_IN]->(f)
    
    // 2. Handle Relationships (CALLS or ACCESSES)
    // We use apoc.do.when to switch logic based on dependency type
    WITH p, row
    CALL apoc.do.when(
        row.type = 'CALLS',
        
        // CASE A: Program calls Program
        'MERGE (t:Program {name: row.target}) 
         MERGE (p)-[r:CALLS]->(t)
         // Accumulate line numbers: If new, create list. If exists, append to list.
         ON CREATE SET r.lines = [row.line]
         ON MATCH SET r.lines = r.lines + row.line',
         
        // CASE B: Program accesses Table
        // We include {action: row.action} in the relationship key so READs and WRITEs are distinct
        'MERGE (t:Table {name: row.target}) 
         MERGE (p)-[r:ACCESSES {action: row.action}]->(t)
         ON CREATE SET r.lines = [row.line]
         ON MATCH SET r.lines = r.lines + row.line',
         
        {p:p, row:row}
    ) YIELD value
    RETURN count(*)
    """
    tx.run(query, batch=dataframe.to_dict('records'))

if not df_deps.empty:
    try:
        with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
            # Step A: Wipe Database
            with driver.session() as session:
                session.run("MATCH (n) DETACH DELETE n")
                print("üóëÔ∏è  Old Graph Wiped.")

            # Step B: Setup Schema
            create_constraints(driver)
            
            # Step C: Load Data
            with driver.session() as session:
                session.execute_write(load_data, df_deps)
                
                # Validation
                count = session.run("MATCH (n) RETURN count(n) AS c").single()["c"]
                print(f"üöÄ Success! Loaded {count} nodes.")
                
    except Exception as e:
        print(f"‚ùå Database Error: {e}")
else:
    print("‚è≠Ô∏è  Skipping load (No data).")

üóëÔ∏è  Old Graph Wiped.
üîí Constraints Verified.
üöÄ Success! Loaded 172 nodes.


In [4]:
# --- CELL 4: ANALYSIS (GDS) ---

try:
    with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
        with driver.session() as session:
            # 0. Check if GDS is installed
            check = session.run("RETURN gds.version() AS v").single()
            if not check:
                raise Exception("GDS Plugin not found on Neo4j server!")

            # 1. Clean up old projections (Silence Warning by YIELDing specific field)
            session.run("CALL gds.graph.drop('rpgSystem', false) YIELD graphName")

            # 2. Project Graph (In-Memory)
            session.run("""
                CALL gds.graph.project(
                    'rpgSystem',
                    ['Program', 'Table'],
                    ['ACCESSES', 'CALLS']
                )
            """)
            print("üìä Graph Projected to Memory.")

            # 3. Algorithm: Weakly Connected Components (WCC)
            # This finds 'Islands' -> groups of nodes disconnected from the rest
            result = session.run("""
                CALL gds.wcc.stream('rpgSystem')
                YIELD nodeId, componentId
                RETURN gds.util.asNode(nodeId).name AS Name, 
                       labels(gds.util.asNode(nodeId))[0] AS Type, 
                       componentId
                ORDER BY componentId
            """)
            
            df_wcc = pd.DataFrame([r.data() for r in result])
            
            # 4. Clean Memory (Silence Warning)
            session.run("CALL gds.graph.drop('rpgSystem', false) YIELD graphName")

    # --- Reporting ---
    if not df_wcc.empty:
        island_counts = df_wcc['componentId'].value_counts()
        print(f"\nüèùÔ∏è  Found {len(island_counts)} distinct 'Islands' (Isolated Systems).")
        
        print("\n--- Top 5 Largest Systems ---")
        print(island_counts.head(5))
        
        largest_id = island_counts.index[0]
        print(f"\nüîç Components in the Largest System (ID: {largest_id}):")
        display(df_wcc[df_wcc['componentId'] == largest_id].head(10))
    else:
        print("‚ö†Ô∏è  No analysis results generated. (Did you add files to ./src/rpgleparser?)")

except Exception as e:
    print(f"‚ùå Analysis Failed: {e}")

üìä Graph Projected to Memory.

üèùÔ∏è  Found 29 distinct 'Islands' (Isolated Systems).

--- Top 5 Largest Systems ---
componentId
5      17
0      15
21      7
103     7
25      5
Name: count, dtype: int64

üîç Components in the Largest System (ID: 5):


Unnamed: 0,Name,Type,componentId
19,MTNCUSTR,Program,5
20,MTNCUSTD,Table,5
21,SH_HDR,Table,5
22,CUSTMAST,Table,5
23,STATES,Table,5
24,SFT_FKEY,Table,5
25,MSGCTL,Table,5
26,PMTCUSTR,Program,5
27,PMTCUSTD,Table,5
28,SFL,Table,5


In [5]:
# --- CELL 5: MARK ISLANDS (INCLUDING FILES & TABLES) ---

try:
    with GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH) as driver:
        with driver.session() as session:
            print("üîÑ Updating Graph with Island Data...")

            # 1. Project Graph (Include FILES and DEFINED_IN relation)
            session.run("CALL gds.graph.drop('rpgSystem', false) YIELD graphName")
            session.run("""
                CALL gds.graph.project(
                    'rpgSystem',
                    ['Program', 'Table', 'File'],
                    ['ACCESSES', 'CALLS', 'DEFINED_IN']
                )
            """)

            # 2. Run WCC Algorithm
            # This identifies connected clusters regardless of node type
            write_result = session.run("""
                CALL gds.wcc.write('rpgSystem', { 
                    writeProperty: 'componentId' 
                })
                YIELD nodePropertiesWritten
                RETURN nodePropertiesWritten
            """).single()
            
            print(f"   -> Tagged {write_result['nodePropertiesWritten']} nodes (Programs, Tables, Files) with IDs.")

            # 3. Create Island Nodes and Relationships
            # Group ALL nodes by their new componentId
            summary = session.run("""
                MATCH (n) WHERE n.componentId IS NOT NULL
                
                // Create the central Island Node
                MERGE (i:Island {id: n.componentId})
                
                // Link everything (Files, Programs, Tables) to it
                MERGE (n)-[:PART_OF]->(i)
            """).consume()

            # 4. Cleanup Memory
            session.run("CALL gds.graph.drop('rpgSystem', false) YIELD graphName")

            print(f"‚úÖ Success! Created relationships for {summary.counters.relationships_created} links.")
            print("   Visual Check: MATCH (i:Island)<-[:PART_OF]-(n) RETURN i, n LIMIT 50")

except Exception as e:
    print(f"‚ùå Error marking islands: {e}")

üîÑ Updating Graph with Island Data...
   -> Tagged 172 nodes (Programs, Tables, Files) with IDs.
‚úÖ Success! Created relationships for 172 links.
   Visual Check: MATCH (i:Island)<-[:PART_OF]-(n) RETURN i, n LIMIT 50
