In [1]:
import os
import re

In [5]:
# Directory to search
directory_to_search = "/home/elias/IdeaProjects/systemds/scripts/builtin"
#directory_to_search = "/Users/eliasstrauss/Desktop/TU/systemds-fork/scripts/builtin"

matches = []

# Regex pattern for matching function and return statements.
# `[^)]*` already spans newlines, so multi-line headers match; re.DOTALL is
# kept for parity with the original pattern but has no effect (there is no `.`).
pattern = re.compile(r"(?P<func>\b\w+\b\s*=\s*function\s*\([^)]*\)\s*)"  # Match 'fname = function (...)'
                     r"(?:(?P<return>return\s*\([^)]*\)))?",             # Match 'return (...)', optional
                     re.DOTALL)

# Calls that can appear inside a default value, e.g. matrix(0, nrow(X), ncol(X)).
# They are collapsed to their bare name so that `function\s*\([^)]*\)` does not
# stop at the first closing bracket of a nested call. Order matters: the inner
# calls (nrow/ncol) must be collapsed before the enclosing matrix(...) call.
NESTED_CALL_PATTERNS = (
    (re.compile(r"nrow\([^)]*\)"), "nrow"),
    (re.compile(r"ncol\([^)]*\)"), "ncol"),
    (re.compile(r"matrix\([^)]*\)"), "matrix"),
)


def strip_nested_calls(content):
    """Return `content` with nrow(...), ncol(...) and matrix(...) calls collapsed to their names."""
    for call_pattern, replacement in NESTED_CALL_PATTERNS:
        content = call_pattern.sub(replacement, content)
    return content


def find_function_headers(content):
    """Return every 'name = function(...) [return (...)]' header found in `content`."""
    return [m.group() for m in pattern.finditer(strip_nested_calls(content))]


# Walk through the directory
file_count = 0
for root, _, files in os.walk(directory_to_search):
    for file in files:
        file_count += 1
        file_path = os.path.join(root, file)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (UnicodeDecodeError, IOError):
            print(f"Could not read file: {file_path}")
            continue

        # Find all matches in the file; list files without any header
        function_headers = find_function_headers(content)
        if not function_headers:
            print(file_path)
        for function_header in function_headers:
            matches.append({
                "file_name": file_path,
                "function_header": function_header
            })

results = matches
# Number of headers found vs. number of files visited (counted during the walk,
# so it is correct for nested directories and for an empty/missing directory).
print(len(results), file_count)

432 190


In [6]:
# Split the headers by naming convention: headers whose second character is an
# underscore (e.g. m_pnmf, f_mdedup) go to `results`, everything else to
# `results_inner`.
# Both lists are derived from `matches` (the unfiltered scan result) instead of
# from `results` itself, so re-running this cell does not empty `results_inner`.
results_inner = [r for r in matches if r["function_header"][1] != "_"]
results = [r for r in matches if r["function_header"][1] == "_"]
len(results)

201

In [7]:
# Print results
for result in results:
    print(f"File: {result['file_name']}")
    print(f"Function Header: {result['function_header']}")
    print("-" * 40)

File: /home/elias/IdeaProjects/systemds/scripts/builtin/mdedup.dml
Function Header: f_mdedup = function(Frame[String] X, Matrix[Double] LHSfeatures, Matrix[Double] LHSthreshold,
    Matrix[Double] RHSfeatures, Matrix[Double] RHSthreshold, Boolean verbose)
  return(Matrix[Double] MD)
----------------------------------------
File: /home/elias/IdeaProjects/systemds/scripts/builtin/pnmf.dml
Function Header: m_pnmf = function(Matrix[Double] X, Integer rank, Double tol = 1e-8, Integer maxIter = 10, Boolean verbose=TRUE)
  return (Matrix[Double] W, Matrix[Double] H)
----------------------------------------
File: /home/elias/IdeaProjects/systemds/scripts/builtin/km.dml
Function Header: m_km = function(Matrix[Double] X, Matrix[Double] TE, Matrix[Double] GI, Matrix[Double] SI,
    Double alpha = 0.05, String err_type = "greenwood", String conf_type = "log", String test_type = "none")
  return (Matrix[Double] O, Matrix[Double] M, Matrix[Double] T, Matrix[Double] T_GROUPS_OE)
---------------------

In [8]:
# Count how often each parameter name occurs across all collected headers.
unique_parameters = {}
intput_parameter_filter = re.compile(r"function\s*\(([^)]*)\)")
param_with_default_pattern = re.compile(r"(\w+)\s+(\w+)\s*=\s*(.+)")

for entry in results:
    # Grab the text between the parentheses of "function(...)"
    signature = intput_parameter_filter.search(entry["function_header"])
    if signature is None:
        continue

    for raw_param in signature.group(1).split(","):
        declaration = raw_param.strip()
        if not declaration:
            continue

        if "=" in declaration:
            # DATATYPE NAME=DEFAULT / DATATYPE NAME = DEFAULT:
            # drop the brackets of e.g. Matrix[Double] so the type is a single word
            flattened = declaration.replace("[", "").replace("]", "")
            with_default = param_with_default_pattern.match(flattened)
            if with_default is None:
                print("Warning no match for: " + flattened)
                continue
            param_name = with_default.group(2)
        else:
            # NAME / DATATYPE NAME: the name is the last whitespace-separated token
            param_name = declaration.split()[-1]

        unique_parameters[param_name] = unique_parameters.get(param_name, 0) + 1

print(len(unique_parameters))

364


In [9]:
# Parameter names in ascending (case-sensitive) order
names = sorted(unique_parameters)

In [10]:
# Length of the longest parameter name
ml = max(map(len, names))

In [11]:
# Show the length of the longest parameter name computed in the previous cell
ml

24

In [12]:
# One line per parameter name (alphabetical), name left-aligned in a 28-character column
for name in names:
    print(f"{name: <28} : {unique_parameters[name]}")

A                            : 3
B                            : 4
C                            : 6
CL                           : 1
CL_T                         : 1
Centering                    : 3
Clusters                     : 2
D                            : 4
F                            : 2
F1                           : 1
G                            : 3
GI                           : 1
Graph                        : 1
H                            : 2
Hin                          : 2
I                            : 2
IQR                          : 1
K                            : 3
L                            : 2
LHSfeatures                  : 1
LHSthreshold                 : 1
M                            : 10
Mask                         : 1
P                            : 16
Q                            : 2
Q1                           : 1
Q3                           : 1
R                            : 9
RHSfeatures                  : 1
RHSthreshold                 : 1
S       

In [10]:
# (name, count) pairs, most frequent first; ties keep their insertion order
para_by_count = sorted(unique_parameters.items(), key=lambda item: item[1], reverse=True)

In [11]:
# One line per parameter name, ordered by descending occurrence count
for pair in para_by_count:
    n, c = pair
    print(f"{n: <28} : {c}")

X                            : 131
verbose                      : 53
Y                            : 38
img_in                       : 19
seed                         : 18
tol                          : 17
P                            : 16
y                            : 16
reg                          : 16
maxIter                      : 15
k                            : 15
threshold                    : 10
M                            : 10
intercept                    : 10
fill_value                   : 10
mask                         : 9
R                            : 9
maxi                         : 7
alpha                        : 7
C                            : 6
model                        : 6
ctypes                       : 5
rank                         : 5
repairMethod                 : 5
lr                           : 5
d                            : 5
e                            : 5
s_cols                       : 5
s_rows                       : 5
w                          