In [1]:

# Set up notebook
%pprint
%matplotlib inline
import sys
import os.path as osp, os as os

executable_path = sys.executable
scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts'); assert osp.exists(scripts_folder)
py_folder = osp.abspath(osp.join(os.pardir, 'py')); assert osp.exists(py_folder), "Create the py folder"
ffmpeg_folder = r'C:\ffmpeg\bin'; assert osp.exists(ffmpeg_folder)
shared_folder = osp.abspath(osp.join(os.pardir, 'share')); assert osp.exists(shared_folder)

if (scripts_folder not in sys.path): sys.path.insert(1, scripts_folder)
if (py_folder not in sys.path): sys.path.insert(1, py_folder)
if (ffmpeg_folder not in sys.path): sys.path.insert(1, ffmpeg_folder)
if shared_folder not in sys.path: sys.path.insert(1, shared_folder)

from notebook_utils import NotebookUtilities
nu = NotebookUtilities(
    data_folder_path=osp.abspath(osp.join(os.pardir, 'data')),
    saves_folder_path=osp.abspath(osp.join(os.pardir, 'saves'))
)
nu.delete_ipynb_checkpoint_folders()

# Import needed libraries
import ast
import pkgutil
import re

Pretty printing has been turned OFF


In [2]:

def find_imports_in_file(filepath):
    """Parses a Python file with the AST module and regex to find all import statements."""
    with open(filepath, "r", encoding=nu.encoding_type) as f:
        file_content = f.read()

    # Get imports from regex
    regex_imports = []
    for regex_str in IMPORTS_REGEX.findall(file_content):
        parens_list = re.split("[)(]", regex_str.strip(), 0)
        if len(parens_list) > 1:
            prefix = parens_list[0].strip()
            commas_list = [s.strip() for s in re.split(r"\s*,\s*", parens_list[1].strip(), 0) if s.strip()]
            for comma_str in commas_list:
                regex_imports.append(prefix + ' ' + comma_str)
        elif len(parens_list) == 1:
            imports_list = [s.strip() for s in re.split(r"(from\s+[\w.]+\s+import)\s+", parens_list[0].strip(), 0) if s.strip()]
            if len(imports_list) > 1:
                prefix = imports_list[0]
                commas_list = [s.strip() for s in re.split(r"\s*,\s*", imports_list[1], 0) if s.strip()]
                for comma_str in commas_list:
                    regex_imports.append(prefix + ' ' + comma_str)
            elif len(imports_list) == 1:
                regex_imports.append(imports_list[0].strip())

    # Get imports from ast
    ast_imports = []
    try:
        root = ast.parse(file_content, filename=filepath)
    
    # If there's a syntax error in the file, return ast_imports as empty
    except SyntaxError:
        return ast_imports, regex_imports

    for node in ast.walk(root):
        
        # Collect statements like "import math, sys"
        if isinstance(node, ast.Import):
            for alias in node.names:
                statement = f"import {alias.name}"
                if alias.asname:
                    statement += f" as {alias.asname}"
                ast_imports.append(statement)
        
        # Collect statements like "from x import y as z"
        elif isinstance(node, ast.ImportFrom):
            for alias in node.names:
                module = node.module if node.module else ""
                statement = f"from {module} import {alias.name}"
                if alias.asname:
                    statement += f" as {alias.asname}"
                ast_imports.append(statement)

    return ast_imports, regex_imports

In [3]:

def scan_all_imports_in_folder(folder):
    """Recursively scans all .py files in a folder for import statements."""
    ast_all_imports = {}
    regex_all_imports = {}
    for root, _, files in os.walk(folder):
        for filename in files:
            if filename.endswith(".py"):
                filepath = osp.join(root, filename)
                ast_file_imports, regex_file_imports = find_imports_in_file(filepath)
                ast_all_imports[filepath] = ast_file_imports
                regex_all_imports[filepath] = regex_file_imports
    return ast_all_imports, regex_all_imports

In [4]:

# Get built-in module names
built_in_modules = set(sys.builtin_module_names)

# Get pure Python modules from the standard library
std_lib_path = os.path.dirname(os.__file__)
std_lib_modules = set([
    module_info.name
    for module_info in pkgutil.iter_modules([std_lib_path])
])

# Combine both lists
standard_library_modules = built_in_modules | std_lib_modules

# Sort the list for easier reading
standard_library_modules = sorted(standard_library_modules)

In [5]:

IMPORTS_REGEX = re.compile(r"^\s*(?:import\s+[\w.]+(?:\s+as\s+\w+)?(?:\s*,\s*[\w.]+(?:\s+as\s+\w+)?)*|from\s+[\w.]+\s+import\s+(?:[\w.]+(?:\s+as\s+\w+)?(?:\s*,\s*[\w.]+(?:\s+as\s+\w+)?)*|\(\s*[\w.]+(?:\s+as\s+\w+)?(?:\s*,\s*[\w.]+(?:\s+as\s+\w+)?)*,?\s*\)))", re.MULTILINE)

ast_results, regex_results = scan_all_imports_in_folder(shared_folder)
filepaths = sorted(set(ast_results.keys()).union(set(regex_results.keys())))
for filepath in filepaths:
    print()
    print(f"File: {filepath}")
    ast_set = set(ast_results[filepath])
    regex_set = set(regex_results[filepath])
    print(f'Imports only in ast: {ast_set.difference(regex_set)}')
    print(f'Imports only in regex: {regex_set.difference(ast_set)}')


File: C:\Users\daveb\OneDrive\Documents\GitHub\notebooks\share\base_config.py
Imports only in ast: set()
Imports only in regex: set()

File: C:\Users\daveb\OneDrive\Documents\GitHub\notebooks\share\data_analysis.py
Imports only in ast: set()
Imports only in regex: set()

File: C:\Users\daveb\OneDrive\Documents\GitHub\notebooks\share\data_preparation.py
Imports only in ast: set()
Imports only in regex: {'from notebook_utils import NotebookUtilities'}

File: C:\Users\daveb\OneDrive\Documents\GitHub\notebooks\share\data_validation.py
Imports only in ast: set()
Imports only in regex: set()

File: C:\Users\daveb\OneDrive\Documents\GitHub\notebooks\share\file_operations.py
Imports only in ast: set()
Imports only in regex: set()

File: C:\Users\daveb\OneDrive\Documents\GitHub\notebooks\share\notebook_utils.py
Imports only in ast: set()
Imports only in regex: {'from notebook_utils import NotebookUtilities', 'import os.path as osp, os as os', 'import sys'}

File: C:\Users\daveb\OneDrive\Docume


The "Imports only in regex" are from the examples in the docstrings. It's not worth the trouble to craft a regex and then loop through all the edge cases.