In [1]:

%pprint
%matplotlib inline
import sys
import os.path as osp, os as os

executable_path = sys.executable
scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts'); assert osp.exists(scripts_folder)
py_folder = osp.abspath(osp.join(os.pardir, 'py')); assert osp.exists(py_folder), "Create the py folder"
ffmpeg_folder = r'C:\ffmpeg\bin'; assert osp.exists(ffmpeg_folder)
shared_folder = osp.abspath(osp.join(os.pardir, 'share')); assert osp.exists(shared_folder)

if (scripts_folder not in sys.path): sys.path.insert(1, scripts_folder)
if (py_folder not in sys.path): sys.path.insert(1, py_folder)
if (ffmpeg_folder not in sys.path): sys.path.insert(1, ffmpeg_folder)
if shared_folder not in sys.path: sys.path.insert(1, shared_folder)

from notebook_utils import NotebookUtilities
nu = NotebookUtilities(
    data_folder_path=osp.abspath(osp.join(os.pardir, 'data')),
    saves_folder_path=osp.abspath(osp.join(os.pardir, 'saves'))
)

# Import needed libraries
import ast

Pretty printing has been turned OFF


In [2]:

nu.update_modules_list(modules_list=None, verbose=False)
nu.modules_list

['aiobotocore', 'aiohappyeyeballs', 'aiohttp', 'aioitertools', 'aiosignal', 'alabaster', 'altair', 'anaconda-anon-usage', 'anaconda-catalogs', 'anaconda-client', 'anaconda-cloud-auth', 'anaconda-navigator', 'anaconda-project', 'annotated-types', 'anyio', 'appdirs', 'archspec', 'argon2-cffi', 'argon2-cffi-bindings', 'arro3-core', 'arrow', 'asgiref', 'astroid', 'astropy', 'astropy-iers-data', 'asttokens', 'async-lru', 'atomicwrites', 'attrs', 'Automat', 'autopep8', 'Babel', 'backoff', 'bcrypt', 'beautifulsoup4', 'binaryornot', 'black', 'bleach', 'blinker', 'bokeh', 'boltons', 'botocore', 'Bottleneck', 'Brotli', 'build', 'cachetools', 'certifi', 'cffi', 'chardet', 'charset-normalizer', 'chroma-hnswlib', 'chromadb', 'click', 'cloudpickle', 'colorama', 'colorcet', 'coloredlogs', 'colour-science', 'comm', 'comtypes', 'conda', 'conda-build', 'conda-content-trust', 'conda_index', 'conda-libmamba-solver', 'conda-pack', 'conda-package-handling', 'conda_package_streaming', 'conda-repo-cli', 'cond

In [3]:

import pkgutil

# Get built-in module names
built_in_modules = set(sys.builtin_module_names)

# Get pure Python modules from the standard library
std_lib_path = os.path.dirname(os.__file__)
std_lib_modules = set([
    module_info.name
    for module_info in pkgutil.iter_modules([std_lib_path, shared_folder])
])

# Combine both lists
STANDARD_LIBRARY_MODULES = built_in_modules | std_lib_modules

# Sort the list for easier reading
STANDARD_LIBRARY_MODULES = sorted(STANDARD_LIBRARY_MODULES)

print(STANDARD_LIBRARY_MODULES)



In [4]:

set(nu.modules_list).intersection(set(STANDARD_LIBRARY_MODULES))

set()

In [5]:

# A helper function to check if a module is part of the standard library
def is_standard_library(module_name):
    return module_name in STANDARD_LIBRARY_MODULES

In [6]:

import pkg_resources

def get_actual_name(module_name):
    actual_name = module_name
    try:
        # Try to get the distribution for module_name
        dist = pkg_resources.get_distribution(module_name)
        actual_name = dist.project_name
    except pkg_resources.DistributionNotFound:
        print(f"import {module_name}")
    
    return actual_name

In [7]:

def generate_requirements(file_paths, output_file='requirements.txt', excludes_list=[]):
    """
    Generate a requirements.txt file listing all external libraries imported
    in the specified Python files.
    
    Parameters:
        file_paths (list): A list of file paths to Python scripts.
        output_file (str): The name of the output file for the requirements.
                           Defaults to 'requirements.txt'.
    
    Returns:
        None
    """
    
    # A set to store all unique external libraries imported
    external_imports = set()
    
    # Process each file
    for file_path in file_paths:
        try:
            
            # Open the file and parse it with ast
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                tree = ast.parse(content)
            
            # Analyze the AST to find imports
            for node in ast.walk(tree):
                
                # Process "import module" statements
                if isinstance(node, ast.Import):
                    for alias in node.names:
                        module_name = alias.name.split('.')[0]  # Get the top-level module
                        if not is_standard_library(module_name):
                            external_imports.add(module_name)
                
                # Process "from module import ..." statements
                elif isinstance(node, ast.ImportFrom):
                    if node.module:
                        module_name = node.module.split('.')[0]  # Get the top-level module
                        if not is_standard_library(module_name):
                            external_imports.add(module_name)
        
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

    # Get what are the actual package names on PyPI
    external_imports = [get_actual_name(library) for library in sorted(external_imports) if library not in excludes_list]
    
    # Write the requirements.txt file
    with open(output_file, 'w', encoding='utf-8') as req_file:
        for library in sorted(external_imports):  # Sort for consistency
            req_file.write(f"{library}\n")
    
    return external_imports

In [10]:

import bs4
import imblearn
import pysan
import wikipedia

# Create it for the share repo
file_paths_list = [
    'C:\\Users\\daveb\\OneDrive\\Documents\\GitHub\\notebooks\\share\\base_config.py',
    'C:\\Users\\daveb\\OneDrive\\Documents\\GitHub\\notebooks\\share\\data_analysis.py',
    'C:\\Users\\daveb\\OneDrive\\Documents\\GitHub\\notebooks\\share\\data_preparation.py',
    'C:\\Users\\daveb\\OneDrive\\Documents\\GitHub\\notebooks\\share\\data_validation.py',
    'C:\\Users\\daveb\\OneDrive\\Documents\\GitHub\\notebooks\\share\\file_operations.py',
    'C:\\Users\\daveb\\OneDrive\\Documents\\GitHub\\notebooks\\share\\notebook_utils.py',
    'C:\\Users\\daveb\\OneDrive\\Documents\\GitHub\\notebooks\\share\\uncategorized.py'
]
requirements_file = osp.abspath(osp.join(nu.github_folder, os.pardir, 'share', 'requirements.txt'))
external_imports = generate_requirements(file_paths_list, output_file=requirements_file, excludes_list=['pickle5'])

In [11]:

external_imports

['ipython', 'bs4', 'chromadb', 'cycler', 'dill', 'humanize', 'imblearn', 'matplotlib', 'networkx', 'numpy', 'pandas', 'pysan', 'roman', 'scipy', 'seaborn', 'tqdm', 'webcolors', 'wikipedia']

In [19]:

std_lib_path = osp.dirname(os.__file__)
local_standard_library = set([
    module_info.name
    for module_info in pkgutil.iter_modules([std_lib_path])
]).union(built_in_modules)

In [20]:

local_standard_library.intersection(set(external_imports))

{'importlib', 'subprocess', 'random', 'pickle', 'tokenize', 'shutil', 'pkgutil', 'textwrap', 'difflib', 're', 'fractions', 'datetime', 'inspect', 'statistics', 'urllib', 'os', 'io'}

In [23]:

set(STANDARD_LIBRARY_MODULES).difference(set(local_standard_library))

{'notebook_utils_old', 'uncategorized', 'data_preparation', 'data_analysis', 'file_operations', 'base_config', 'notebook_utils', 'data_validation'}

In [24]:

local_standard_library.intersection(set(external_imports)).difference(set(STANDARD_LIBRARY_MODULES))

{'importlib', 'random', 'subprocess', 'pickle', 'tokenize', 'shutil', 'pkgutil', 'textwrap', 'difflib', 're', 'fractions', 'datetime', 'inspect', 'statistics', 'urllib', 'os', 'io'}