In [94]:
import ast
from vector_db import VectorDBTextAnalyzerBase

In [130]:
import ast

class DocStringExtractor(ast.NodeVisitor):
    """Collect name, arguments and docstring of every function found in an AST.

    Each visited ``def`` / ``async def`` appends one dict to ``all_docstrings``
    with the keys ``unique_identifier``, ``arguments`` and ``text``.
    """

    def __init__(self):
        # One info dict per visited function, in visit order.
        self.all_docstrings = []
        # Name of the class whose body is currently being visited (None outside any class).
        self.current_class_name = None

    def visit_ClassDef(self, node):
        """Visit a class body with ``current_class_name`` set to that class' name."""
        # Remember the enclosing class and restore it afterwards (rather than
        # resetting to None), so that methods defined after a nested class
        # still get the outer class name as prefix.
        enclosing_class_name = self.current_class_name
        self.current_class_name = node.name
        try:
            self.generic_visit(node)
        finally:
            self.current_class_name = enclosing_class_name

    def visit_FunctionDef(self, node):
        """Record a regular function or method."""
        self._extract_info(node)

    def visit_AsyncFunctionDef(self, node):
        """Record an ``async def`` function or method."""
        self._extract_info(node)

    def _extract_info(self, node):
        """Append the info dict for ``node`` and continue into its body.

        :param node: an ``ast.FunctionDef`` or ``ast.AsyncFunctionDef`` node
        """
        # Prefix with the class name while inside a class body; functions nested
        # inside a method are visited with the same prefix.
        full_name = f"{self.current_class_name}.{node.name}" if self.current_class_name else node.name
        docstring = ast.get_docstring(node)

        # Only the regular positional parameters are listed; positional-only,
        # keyword-only, *args and **kwargs live in other fields of node.args.
        args = [arg.arg for arg in node.args.args]

        signature = f"{full_name}({', '.join(args)})"
        info = {
            "unique_identifier": full_name,
            "arguments": args,
            # A missing or empty docstring leaves just the signature as text.
            "text": f"{signature}:\n{docstring}" if docstring else signature,
        }
        self.all_docstrings.append(info)

        # Descend so that nested functions and classes are recorded as well.
        self.generic_visit(node)

def extract_docstrings(filename):
    """Parse a Python source file and return the info dicts of all its functions.

    :param filename: path to the Python file to analyze
    :return: the ``all_docstrings`` list filled by a ``DocStringExtractor``
    :raises OSError: if the file cannot be opened
    :raises SyntaxError: if the content is not valid Python source
    """
    # Read raw bytes and let the parser decode them: it honours a BOM or a
    # PEP 263 coding cookie and defaults to UTF-8, instead of depending on the
    # locale's default text encoding.
    with open(filename, "rb") as source:
        tree = ast.parse(source.read(), filename=filename)

    docstring_extractor = DocStringExtractor()
    docstring_extractor.visit(tree)
    return docstring_extractor.all_docstrings

# Example usage
# Assuming `filename` is the path to the Python file you want to analyze
# docstrings = extract_docstrings(filename)
# for doc in docstrings:
#     print(doc)


In [131]:
def extract_python_files_in_folder(folder):
    """Recursively list the ``.py`` files found under ``folder``.

    Every returned path is the file name joined onto the directory it was
    found in; entries appear in the order ``os.walk`` yields them.
    """
    import os

    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(folder)
        for filename in filenames
        if filename.endswith(".py")
    ]

In [132]:
# taken from easybuild-framework
# Relative path of the folder that is scanned recursively for .py files.
folder_name = "framework"
python_files = extract_python_files_in_folder(folder_name)
# Print each discovered path as a quick check of what will be parsed next.
for f in python_files:
    print(f)

framework/easyblock.py
framework/extensioneasyblock.py
framework/easystack.py
framework/__init__.py
framework/extension.py
framework/easyconfig/style.py
framework/easyconfig/tools.py
framework/easyconfig/constants.py
framework/easyconfig/__init__.py
framework/easyconfig/types.py
framework/easyconfig/parser.py
framework/easyconfig/templates.py
framework/easyconfig/default.py
framework/easyconfig/tweak.py
framework/easyconfig/licenses.py
framework/easyconfig/easyconfig.py
framework/easyconfig/format/pyheaderconfigobj.py
framework/easyconfig/format/one.py
framework/easyconfig/format/version.py
framework/easyconfig/format/convert.py
framework/easyconfig/format/__init__.py
framework/easyconfig/format/two.py
framework/easyconfig/format/format.py
framework/easyconfig/format/yeb.py


In [133]:

# docstrings = extract_docstrings("vector_db.py")

# Gather the info dicts of every discovered file into one flat list.
docstrings = []
for f in python_files:
    docstrings.extend(extract_docstrings(f))

# NOTE(review): this prints one line per extracted function, so the output
# grows with the size of the scanned code base.
for d in docstrings:
    print(d)

{'unique_identifier': 'EasyBlock.extra_options', 'arguments': ['extra'], 'text': 'EasyBlock.extra_options(extra):\nExtra options method which will be passed to the EasyConfig constructor.'}
{'unique_identifier': 'EasyBlock.__init__', 'arguments': ['self', 'ec'], 'text': 'EasyBlock.__init__(self, ec):\nInitialize the EasyBlock instance.\n:param ec: a parsed easyconfig file (EasyConfig instance)'}
{'unique_identifier': 'EasyBlock.post_init', 'arguments': ['self'], 'text': 'EasyBlock.post_init(self):\nRun post-initialization tasks.'}
{'unique_identifier': 'EasyBlock._init_log', 'arguments': ['self'], 'text': 'EasyBlock._init_log(self):\nInitialize the logger.'}
{'unique_identifier': 'EasyBlock.close_log', 'arguments': ['self'], 'text': 'EasyBlock.close_log(self):\nShutdown the logger.'}
{'unique_identifier': 'EasyBlock.init_dry_run', 'arguments': ['self'], 'text': 'EasyBlock.init_dry_run(self):\nInitialise easyblock instance for performing a dry run.'}
{'unique_identifier': 'EasyBlock.dry

In [134]:
# Name passed to VectorDBTextAnalyzerBase; the query cells below reuse it.
db_name = "docstring_db"
with VectorDBTextAnalyzerBase(db_name) as vdb:
    # cleanup=True presumably discards an existing database first -- TODO confirm in vector_db.
    vdb.create_db(cleanup=True)
    # Load the extracted records (dicts with unique_identifier / arguments / text).
    vdb.populate_db(docstrings)

In [135]:
# Free-text query searched against the docstring database.
query = "function for naming easyconfig files"
with VectorDBTextAnalyzerBase(db_name) as vdb:
    # No limit is passed here, unlike the later query cells (limit=3);
    # the number of hits therefore depends on search()'s default.
    response = vdb.search(query=query)
    vdb.print_search_results(query=query, response=response)

_______QUERY_______
function for naming easyconfig files
-------------------
_______RESULTS_______
-------------------
unique_identifier: EasyConfig.name
text: EasyConfig.name(self):
returns name
Distance: 0.18315637111663818
-------------------
unique_identifier: EasyConfig.filename
text: EasyConfig.filename(self):
Determine correct filename for this easyconfig file.
Distance: 0.1854965090751648
-------------------
unique_identifier: robot_find_easyconfig
text: robot_find_easyconfig(name, version):
Find an easyconfig for module in path, returns (absolute) path to easyconfig file (or None, if none is found).
Distance: 0.21934425830841064


In [136]:
# Free-text query searched against the docstring database.
query = "composing path to easyconfig given software name"
with VectorDBTextAnalyzerBase(db_name) as vdb:
    # limit=3 presumably caps the number of returned hits -- confirm in vector_db.
    response = vdb.search(query=query, limit=3)
    vdb.print_search_results(query=query, response=response)

_______QUERY_______
composing path to easyconfig given software name
-------------------
_______RESULTS_______
-------------------
unique_identifier: EasyConfig.name
text: EasyConfig.name(self):
returns name
Distance: 0.19211065769195557
-------------------
unique_identifier: robot_find_easyconfig
text: robot_find_easyconfig(name, version):
Find an easyconfig for module in path, returns (absolute) path to easyconfig file (or None, if none is found).
Distance: 0.19965869188308716
-------------------
unique_identifier: EasyConfig.filename
text: EasyConfig.filename(self):
Determine correct filename for this easyconfig file.
Distance: 0.2114248275756836


In [137]:
# Free-text query searched against the docstring database.
query = "return easyblock class from string"
with VectorDBTextAnalyzerBase(db_name) as vdb:
    # limit=3 presumably caps the number of returned hits -- confirm in vector_db.
    response = vdb.search(query=query, limit=3)
    vdb.print_search_results(query=query, response=response)

_______QUERY_______
return easyblock class from string
-------------------
_______RESULTS_______
-------------------
unique_identifier: get_easyblock_class
text: get_easyblock_class(easyblock, name, error_on_failed_import, error_on_missing_easyblock):
Get class for a particular easyblock (or use default)
Distance: 0.1510791778564453
-------------------
unique_identifier: EasyBlock.name
text: EasyBlock.name(self):
Shortcut the get the module name.
Distance: 0.2112443447113037
-------------------
unique_identifier: get_easyblock_instance
text: get_easyblock_instance(ecdict):
Get an instance for this easyconfig
:param ecdict: parsed easyconfig (EasyConfig instance)

returns an instance of EasyBlock (or subclass thereof)
Distance: 0.21596300601959229


In [138]:
# Free-text query searched against the docstring database.
query = "draw a dependency graph"
with VectorDBTextAnalyzerBase(db_name) as vdb:
    # limit=3 presumably caps the number of returned hits -- confirm in vector_db.
    response = vdb.search(query=query, limit=3)
    vdb.print_search_results(query=query, response=response)

_______QUERY_______
draw a dependency graph
-------------------
_______RESULTS_______
-------------------
unique_identifier: _dep_graph_gv
text: _dep_graph_gv(dottxt, filename):
Render dependency graph to file using graphviz.
Distance: 0.20450878143310547
-------------------
unique_identifier: _dep_graph_dump
text: _dep_graph_dump(dgr, filename):
Dump dependency graph to file, in specified format.
Distance: 0.22117376327514648
-------------------
unique_identifier: dep_graph
text: dep_graph(filename, specs):
Create a dependency graph for the given easyconfigs.
Distance: 0.22357124090194702


In [139]:
# Free-text query searched against the docstring database.
query = "steps to take after installing extension"
with VectorDBTextAnalyzerBase(db_name) as vdb:
    # limit=3 presumably caps the number of returned hits -- confirm in vector_db.
    response = vdb.search(query=query, limit=3)
    vdb.print_search_results(query=query, response=response)

_______QUERY_______
steps to take after installing extension
-------------------
_______RESULTS_______
-------------------
unique_identifier: Extension.postrun
text: Extension.postrun(self):
Stuff to do after installing a extension.
Distance: 0.17652273178100586
-------------------
unique_identifier: Extension.sanity_check_step
text: Extension.sanity_check_step(self):
Sanity check to run after installing extension
Distance: 0.2012847661972046
-------------------
unique_identifier: Extension.prerun
text: Extension.prerun(self):
Stuff to do before installing a extension.
Distance: 0.213120698928833


In [140]:
# Free-text query searched against the docstring database.
query = "gather a list of patches to apply to the source code"
with VectorDBTextAnalyzerBase(db_name) as vdb:
    # limit=3 presumably caps the number of returned hits -- confirm in vector_db.
    response = vdb.search(query=query, limit=3)
    vdb.print_search_results(query=query, response=response)

_______QUERY_______
gather a list of patches to apply to the source code
-------------------
_______RESULTS_______
-------------------
unique_identifier: EasyBlock.apply_post_install_patches
text: EasyBlock.apply_post_install_patches(self, patches):
Apply post-install patch files that are specified via the 'postinstallpatches' easyconfig parameter.
Distance: 0.2198238968849182
-------------------
unique_identifier: EasyBlock.patch_step
text: EasyBlock.patch_step(self, beginpath, patches):
Apply the patches
Distance: 0.22515273094177246
-------------------
unique_identifier: EasyBlock.fetch_patches
text: EasyBlock.fetch_patches(self, patch_specs, extension, checksums):
Add a list of patches.
All patches will be checked if a file exists (or can be located)
Distance: 0.22562217712402344

