In [1]:
import os
import re

def find_function_patterns(directory):
    matches = []

    # Regex pattern for matching function and return statements
    pattern = re.compile(r"(?P<func>\b\w+\b\s*=\s*function\([^)]*\)\s*)"  # Match 'fname = function(...)'
                         r"(?P<return>return\s*\([^)]*\))",                # Match 'return (...)'
                         re.DOTALL)  # Enable dotall to match across lines

    # Walk through the directory
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                    # Find all matches in the file
                    for match in pattern.finditer(content):
                        function_pattern = match.group()
                        matches.append({
                            "file_name": file_path,
                            "function_pattern": function_pattern
                        })

            except (UnicodeDecodeError, IOError):
                print(f"Could not read file: {file_path}")

    return matches

# Directory to search
directory_to_search = "/Users/eliasstrauss/Desktop/TU/systemds-fork/scripts/builtin"

# Get the matches
results = find_function_patterns(directory_to_search)

# Print results
for result in results:
    print(f"File: {result['file_name']}")
    print(f"Function Pattern: {result['function_pattern']}")
    print("-" * 40)

File: /Users/eliasstrauss/Desktop/TU/systemds-fork/scripts/builtin/executePipeline.dml
Function Pattern: matrixToList = function(Matrix[Double] X,  Matrix[Double] Y, Matrix[Double] mask, Matrix[Double] FD,
  Matrix[Double] p, Integer flagsCount, String op)
  return (List[Unknown] l, Integer dataFlag, Integer yFlag, Integer executeFlag)
----------------------------------------
File: /Users/eliasstrauss/Desktop/TU/systemds-fork/scripts/builtin/executePipeline.dml
Function Pattern: applyDataFlag = function(Matrix[Double] X, Matrix[Double] mask, Integer dataFlag)
return(Matrix[Double] X,Integer executeFlag)
----------------------------------------
File: /Users/eliasstrauss/Desktop/TU/systemds-fork/scripts/builtin/executePipeline.dml
Function Pattern: confirmData = function(Matrix[Double] nX, Matrix[Double] originalX, Matrix[Double] mask, Integer dataFlag)
return (Matrix[Double] X)
----------------------------------------
File: /Users/eliasstrauss/Desktop/TU/systemds-fork/scripts/builtin/ex

In [2]:
len(results)

349

In [4]:
parameter_names = set()
param_pattern = re.compile(r"function\s*\(([^)]*)\)")  # Captures content within 'function(...)'

for result in results:
    function_pattern = result['function_pattern']
    match = param_pattern.search(function_pattern)
    if match:
        # Extract the parameters
        params = match.group(1)
        
        # Split parameters by commas and process each
        for param in params.split(','):
            param = param.strip()  # Remove leading/trailing whitespace
            
            # Extract the last word (assumes it's the parameter name)
            if param:
                parts = param.split()  # Split by spaces to handle 'DATATYPE NAME'
                name = parts[-1]      # Get the last part, which is the name
                parameter_names.add(name)

In [5]:
parameter_names

{'"DS"',
 '"L2"',
 '"VVV"',
 '"gini"',
 '"greenwood"',
 '"kmeans"',
 '"locf"',
 '"log"',
 '"none"',
 '-1',
 '-1.0',
 '0',
 '0.0',
 '0.000001',
 '0.00001',
 '0.0001',
 '0.001',
 '0.02',
 '0.05',
 '0.1',
 '0.25',
 '0.3',
 '0.5',
 '0.7',
 '0.8',
 '0.9',
 '0.95',
 '0.99',
 '1',
 '1.0',
 '10',
 '100',
 '1000',
 '128',
 '132521',
 '1342516',
 '16',
 '1e-15',
 '1e-5',
 '1e-6',
 '1e-7',
 '1e-8',
 '2',
 '20',
 '200',
 '3',
 '30',
 '300.',
 '32',
 '321452',
 '4',
 '5',
 '50',
 '81',
 '=1.5',
 '=10',
 'A',
 'AIC_best',
 'AIC_cur',
 'B',
 'C',
 'Centering',
 'D',
 'D=0',
 'E',
 'F',
 'F1',
 'FALSE',
 'FD',
 'G',
 'GI',
 'Graph',
 'H',
 'H1',
 'H1_prime',
 'H2',
 'H2_prime',
 'H3',
 'H3_prime',
 'Hin',
 'I',
 'ID',
 'IQR',
 'K',
 'K=2',
 'L',
 'LHS_adj',
 'LHSfeatures',
 'LHSthreshold',
 'M',
 'Mask',
 'Matrix[Double]X',
 'P',
 'P=0',
 'Phi',
 'Q',
 'Q1',
 'Q3',
 'Q=0',
 'R',
 'R=50',
 'RHS_adj',
 'RHSfeatures',
 'RHSthreshold',
 'S',
 'S1',
 'S2',
 'SI',
 'ScaleFactor',
 'Selected',
 'T',
 'T1',
 

In [6]:
len(parameter_names)

549

In [7]:
p2 = parameter_names

In [8]:
unique_parameters = set()
param_pattern = re.compile(r"function\s*\(([^)]*)\)")

for result in results:
    function_pattern = result["function_pattern"]

    # Search for the parameters inside "function(...)"
    param_match = param_pattern.search(function_pattern)
    if param_match:
        param_str = param_match.group(1)  # The content inside the parentheses

        # Split the parameters by commas and process each one
        params = [p.strip() for p in param_str.split(",") if p.strip()]
        for param in params:
            # Extract the parameter name from possible formats:
            # NAME, DATATYPE NAME, DATATYPE NAME=DEFAULT, DATATYPE NAME = DEFAULT
            match = re.match(r"(?:\w+\s+)?(\w+)(?:\s*=\s*[^,]*)?", param)
            if match:
                param_name = match.group(1)
                unique_parameters.add(param_name)

In [9]:
len(unique_parameters)

270

In [10]:
unique_parameters

{'0',
 'AIC_best',
 'AIC_cur',
 'C',
 'D',
 'Frame',
 'Hin',
 'ID',
 'K',
 'List',
 'M',
 'Matrix',
 'P',
 'Q',
 'R',
 'Scalar',
 'Win',
 'a',
 'a0',
 'activation',
 'alpha',
 'always_shuffle',
 'arch_coef',
 'avg_sample_size_per_centroid',
 'b',
 'baseLineScore',
 'batch_size',
 'beta',
 'bins',
 'bits',
 'bound',
 'c',
 'center',
 'changes',
 'channel_max',
 'check',
 'classify',
 'code',
 'col',
 'columnId',
 'conf_type',
 'cont',
 'correctTypos',
 'cv',
 'cvk',
 'd',
 'dataFlag',
 'decay',
 'default',
 'df',
 'dfam',
 'differentOffsets',
 'disp',
 'distance_threshold',
 'dth',
 'e',
 'eAvg',
 'eAvgNew',
 'eAvgOld',
 'enableIncApproxPruning',
 'enableIncMaxScorePruning',
 'enableIncScorePruning',
 'enablePruning',
 'encodeLat',
 'end',
 'end_stepsize',
 'end_vicinity',
 'epochs',
 'eps',
 'epsilon',
 'err_type',
 'evaluationFunc',
 'f',
 'feature_frac',
 'fill_value',
 'flagsCount',
 'frame',
 'freq',
 'frequency_threshold',
 'gamma',
 'h',
 'height',
 'horizontal_axis',
 'i',
 'icp