In [1]:
import pydoc
import re

from nltk.tokenize import sent_tokenize

In [2]:
class DocstringsMiner:
    """Extract ``(call signature, description)`` pairs from pydoc output.

    The plain-text documentation rendered by ``pydoc`` is scanned line by
    line. Every line that starts like a call (see ``func_call_regex``) is
    paired with the text that follows it, up to and including the first
    sentence-ending ``.``.
    """

    # A line "looks like a call" when it starts with an optional ``name.``
    # qualifier, an alphanumeric name and a parenthesised argument list,
    # optionally followed by one chained ``.something(...)``.
    func_call_regex = re.compile(r"([a-zA-Z]+[.]{1})?[a-zA-Z0-9]+\([^\)]*\)(\.[^\)]*\))?")

    def __init__(self):
        """Create a miner; no per-instance state is kept."""

    def _collect_description(self, lines, start):
        """Gather the description that follows a call-like line.

        Scans ``lines`` from index ``start`` and joins the non-call-like lines
        with single spaces, stopping at the first such line containing a ``.``
        or at the end of ``lines``. Anything after the last ``.`` is trimmed.

        Returns:
            ``(description, stop_index)`` where ``stop_index`` is the index of
            the line that ended the scan, or ``len(lines)`` if the input ran
            out first. ``description`` is ``""`` when no prose line was found.
        """
        parts = []
        j = start
        # The bounds check matters: without it, a call-like line with no
        # following "." anywhere in the document walks past the end of `lines`.
        while j < len(lines):
            candidate = lines[j]
            # NOTE(review): call-like lines inside the scanned range are
            # skipped, so a signature directly followed by another signature
            # receives the second one's description. Kept as-is; confirm
            # whether that is intended.
            if not self.func_call_regex.match(candidate):
                parts.append(candidate)
                if "." in candidate:
                    break
            j += 1

        desc = " ".join(parts)
        # Keep the text up to and including the last "." (drops trailing
        # fragments of the next sentence). Text without any "." is kept whole.
        if "." in desc:
            desc = desc[:desc.rindex(".") + 1]
        return desc, j

    @staticmethod
    def _with_placeholder_args(signature):
        """Return ``signature`` with each argument replaced by ``arg_<n>``.

        Only the text up to the first ``)`` is kept. An empty argument list
        stays empty instead of becoming ``arg_0``.
        """
        open_idx, close_idx = signature.index('('), signature.index(')')
        raw_args = signature[open_idx + 1:close_idx]
        arg_count = len(raw_args.split(",")) if raw_args.strip() else 0
        placeholders = ", ".join("arg_%d" % n for n in range(arg_count))
        return signature[:open_idx] + "(" + placeholders + ")"

    def get_examples(self, module, replace_args=False, return_lines=False):
        """Mine ``(signature, description)`` pairs from a module's documentation.

        Args:
            module: Passed straight to ``pydoc.render_doc``; the notebook calls
                this with both a module object and a module name string.
            replace_args: When true, the arguments of each signature are
                replaced with positional placeholders ``arg_0, arg_1, ...``.
            return_lines: When true, also return the cleaned documentation
                lines that were scanned.

        Returns:
            A list of unique ``(signature, description)`` tuples in the order
            they were first encountered. Signatures with no description text
            after them are omitted. With ``return_lines=True`` the result is
            ``(pairs, lines)``.
        """
        rendered = pydoc.render_doc(module, renderer=pydoc.plaintext)
        # pydoc draws class bodies with a leading "|" gutter; remove it, then
        # drop surrounding whitespace and blank lines.
        lines = [
            stripped
            for stripped in (raw.strip() for raw in rendered.replace("|", "").split("\n"))
            if stripped != ''
        ]

        pairs = []
        seen = set()

        i = 0
        while i < len(lines):
            if not self.func_call_regex.match(lines[i]):
                i += 1
                continue

            # Current line looks like a function definition.
            desc, stop = self._collect_description(lines, i + 1)
            func = self._with_placeholder_args(lines[i]) if replace_args else lines[i]

            pair = (func, desc)
            # De-duplicate while preserving first-seen order so the output is
            # reproducible across kernel restarts (set ordering is not).
            if desc and pair not in seen:
                seen.add(pair)
                pairs.append(pair)

            # Resume at the line that ended the description; it is not
            # call-like, so the next iteration simply steps past it.
            i = stop

        if return_lines:
            return pairs, lines
        return pairs

In [4]:
import scipy
# --

# Mine the standard-library `re` module, keeping the cleaned pydoc lines
# alongside the pairs so they can be inspected in later cells.
dm = DocstringsMiner()
xs, lines = dm.get_examples(module=re, return_lines=True, replace_args=False)

# One signature, its description, then a blank separator line per pair.
for func, desc in xs:
    print(func, desc, "", sep="\n")

template(pattern, flags=0)
Compile a template pattern, returning a pattern object DATA A = 256 ASCII = 256 DOTALL = 16 I = 2 IGNORECASE = 2 L = 4 LOCALE = 4 M = 8 MULTILINE = 8 S = 16 U = 32 UNICODE = 32 VERBOSE = 64 X = 64 __all__ = ['match', 'fullmatch', 'search', 'sub', 'subn', 'split', 'fi...

builtins.Exception(builtins.BaseException)
sre_constants.

compile(pattern, flags=0)
Compile a regular expression pattern, returning a pattern object.

subn(pattern, repl, string, count=0, flags=0)
Return a 2-tuple containing (new_string, number).

fullmatch(pattern, string, flags=0)
Try to apply the pattern to all of the string, returning a match object, or None if no match was found.

finditer(pattern, string, flags=0)
Return an iterator over all non-overlapping matches in the string.  For each match, the iterator returns a match object.

split(pattern, string, maxsplit=0, flags=0)
Split the source string by the occurrences of the pattern, returning a list containing the resulting substring

In [None]:
dm = DocstringsMiner()
# Alternative selection kept for reference:
# modules = ["re", "os", "sys", "str"]
modules = ["scipy"]

for m in modules:
    try:
        # `modules` is a hard-coded list, so exec never sees untrusted input.
        # A real import statement is used because it also binds the module
        # name in the notebook namespace.
        exec("import %s" % m)
    except ImportError:
        print(">>> %s is not importable" % m, "\n")
        # Skip this entry instead of falling through: documenting a name that
        # cannot be imported would otherwise raise from pydoc and abort the
        # whole loop. Names that are not modules (e.g. "str") are skipped too.
        continue

    print(">>> generating examples from %s" % m, "\n")

    # The module name string is passed through; pydoc resolves it itself.
    for func, desc in dm.get_examples(module=m):
        print("\t", func)
        print("\t", desc, "\n")