In [1]:

%pprint
%matplotlib inline
import sys
import os.path as osp
import os

# Locate the folders this notebook depends on. Each assert stops the cell
# immediately if the folder does not exist on this machine.
executable_path = sys.executable
# 'Scripts' folder next to the running interpreter executable
scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts'); assert osp.exists(scripts_folder)
# Relative paths below are resolved against the current working directory
py_folder = osp.abspath(osp.join(os.pardir, 'py')); assert osp.exists(py_folder), "Create the py folder"
# NOTE(review): hard-coded absolute Windows path; the assert fails on any machine without it.
# NOTE(review): this folder is added to sys.path below, which only affects Python module
# imports, not executable lookup -- confirm whether the PATH environment variable was intended.
ffmpeg_folder = r'C:\ffmpeg\bin'; assert osp.exists(ffmpeg_folder)
shared_folder = osp.abspath(osp.join(os.pardir, os.pardir, 'share')); assert osp.exists(shared_folder)

# Insert at index 1 so sys.path[0] is left in place. Every insert lands ahead of the
# previous one, so when all four are newly added shared_folder ends up searched first among them.
if (scripts_folder not in sys.path): sys.path.insert(1, scripts_folder)
if (py_folder not in sys.path): sys.path.insert(1, py_folder)
if (ffmpeg_folder not in sys.path): sys.path.insert(1, ffmpeg_folder)
if shared_folder not in sys.path: sys.path.insert(1, shared_folder)

# presumably resolved through one of the folders added to sys.path above -- TODO confirm which
from notebook_utils import NotebookUtilities
# Helper object configured with this project's data and saves folders (sibling folders of the
# current working directory's parent)
nu = NotebookUtilities(
    data_folder_path=osp.abspath(osp.join(os.pardir, 'data')),
    saves_folder_path=osp.abspath(osp.join(os.pardir, 'saves')),
    verbose=False
)

# Import needed libraries
from bs4 import BeautifulSoup as bs
from pandas import DataFrame
import ast

Pretty printing has been turned OFF


In [2]:

def find_self_references_and_definitions(file_path, verbose=False):
    """
    Analyze a file to find all references to `self.` attributes and methods,
    and check if they are defined.

    Only classes at the top level of the module are analyzed. For each class, a
    name counts as *defined* when it is:

    - a method of the class (`def` or `async def`),
    - a class-level variable (plain or annotated assignment in the class body), or
    - the target of a binding to `self.<name>` inside a method. This covers plain,
      annotated and tuple-unpacking assignments as well as `for` and `with` targets.

    Augmented assignments (`self.x += 1`) read the attribute before writing it, so
    they count as references only. Attributes inherited from base classes or created
    dynamically (e.g. via `setattr`) cannot be seen by this static analysis and will
    be reported as missing.

    Args:
        file_path (str): Path to the Python file to analyze.
        verbose (bool): If True, print the file path before parsing it.

    Returns:
        dict: A dictionary with class names as keys and a report of all `self.` attributes
              and methods (whether they are defined or missing) for each class. Each report
              is a dict with the keys "self_references", "self_definitions" and
              "missing_definitions", all of which are sets of attribute names.

    Raises:
        OSError: If the file cannot be read.
        SyntaxError: If the file is not valid Python source.
    """
    # Read raw bytes so the parser applies the file's own encoding (BOM or coding
    # cookie, UTF-8 otherwise) instead of the platform's locale default
    with open(file_path, "rb") as f:
        source_code = f.read()

    # Parse the source code into an AST
    if verbose: print(file_path)
    tree = ast.parse(source_code, filename=file_path)

    method_types = (ast.FunctionDef, ast.AsyncFunctionDef)

    # Store results
    results = {}

    # Traverse the AST to find class definitions
    for node in tree.body:
        if not isinstance(node, ast.ClassDef):
            continue

        # Collect all `self.<attribute>` references and definitions
        self_references = set()
        self_definitions = set()

        # Traverse the class body
        for class_node in node.body:
            if isinstance(class_node, method_types):
                # Methods are callable via `self.<method>`, so they count as definitions
                self_definitions.add(class_node.name)

                method_nodes = list(ast.walk(class_node))

                # Targets of `self.x += ...` carry a Store context but require the
                # attribute to exist already, so they must not count as definitions
                augmented_targets = {
                    id(stmt.target) for stmt in method_nodes if isinstance(stmt, ast.AugAssign)
                }

                for stmt in method_nodes:
                    is_self_attribute = (
                        isinstance(stmt, ast.Attribute)
                        and isinstance(stmt.value, ast.Name)
                        and stmt.value.id == "self"
                    )
                    if not is_self_attribute:
                        continue

                    # Every `self.<attribute>` occurrence is a reference
                    if stmt.attr not in ['__class__']:
                        self_references.add(stmt.attr)

                    # A Store context marks any binding form (assignment, unpacking,
                    # annotated assignment, for/with target)
                    if isinstance(stmt.ctx, ast.Store) and id(stmt) not in augmented_targets:
                        self_definitions.add(stmt.attr)

            elif isinstance(class_node, ast.Assign):
                # Class-level variables are reachable through `self.<name>`
                for target in class_node.targets:
                    for name_node in ast.walk(target):
                        if isinstance(name_node, ast.Name) and isinstance(name_node.ctx, ast.Store):
                            self_definitions.add(name_node.id)

            elif isinstance(class_node, ast.AnnAssign) and isinstance(class_node.target, ast.Name):
                # Annotated class-level declarations (e.g. dataclass-style fields)
                self_definitions.add(class_node.target.id)

        # Save results for this class
        results[node.name] = {
            "self_references": self_references,
            "self_definitions": self_definitions,
            "missing_definitions": self_references - self_definitions,
        }

    return results

In [3]:

def analyze_modules_in_folder(folder_path):
    """
    Run the `self.` reference/definition analysis on every `.py` file in a folder.

    Only the folder's direct entries are considered (no recursion); entries whose
    name does not end in ".py" are ignored.

    Args:
        folder_path (str): Path to the folder containing Python files to analyze.

    Returns:
        dict: Maps each analyzed file name to the per-class report returned by
              find_self_references_and_definitions for that file.
    """
    return {
        entry_name: find_self_references_and_definitions(os.path.join(folder_path, entry_name))
        for entry_name in os.listdir(folder_path)
        if entry_name.endswith(".py")  # Only process Python files
    }

In [4]:

def print_analysis_results(analysis_results, filter_non_missing=False):
    """
    Pretty-print the analysis results.

    Args:
        analysis_results (dict): The results from analyzing modules, mapping each
            file name to a dict of class name -> report, where a report has the
            keys 'self_references', 'self_definitions' and 'missing_definitions'.
        filter_non_missing (bool): If True, skip every file in which no class has
            any missing definitions (this includes files that contain no classes).
            If False, print every file.
    """
    for file_name, classes in analysis_results.items():
        # Look at every class in the file, not just the first one; any() is also
        # safe for a file without classes, where indexing the first report would fail
        if filter_non_missing and not any(
            data['missing_definitions'] for data in classes.values()
        ):
            continue

        print(f"File: {file_name}")
        for class_name, data in classes.items():
            print(f"  Class: {class_name}")
            print(f"    - self references: {data['self_references']}")
            print(f"    - self definitions: {data['self_definitions']}")
            print(f"    - missing definitions: {data['missing_definitions']}")
        print()

In [5]:

# Folder containing the Python modules to analyze: the 'share' folder two levels
# above the current working directory, made absolute
folder_path = osp.abspath(osp.join(os.pardir, os.pardir, 'share'))

# Analyze all modules in the folder; `results` maps each .py file name to a
# dict of class name -> {'self_references', 'self_definitions', 'missing_definitions'}
# and is read by the cells below
results = analyze_modules_in_folder(folder_path)

In [17]:

# Build a presence matrix of `self` definitions: one row per module, one column per
# defined name, holding 1 where that module's class defines the name (NaN elsewhere).
# Summing the columns then shows which names are defined by more than one class.
rows_list = []
for file_name in [
    'data_analysis.py', 'data_preparation.py', 'data_validation.py', 'file_operations.py', 'uncategorized.py'
]:
    # The class is looked up under the CamelCase form of the module's file name,
    # e.g. data_analysis.py -> DataAnalysis
    key = ''.join(map(str.title, file_name.split('.')[0].split('_')))
    row_dict = dict.fromkeys(results[file_name][key]['self_definitions'], 1)
    rows_list.append(row_dict)
df = DataFrame(rows_list)

In [24]:

# List every `self` name that more than one of the analyzed classes defines
print()
# Each cell of df is 1 or NaN, so a column's sum is the number of classes defining
# that name; sort ascending so the most widely shared names are printed last
srs = df.sum().sort_values()
# Keep only names defined by at least two classes; the count itself is not printed
for var_name, _ in srs[srs > 1].items():
    print(f'        self.{var_name}')


        self.lower_ascii_regex
        self.saves_mp3_folder
        self.facebook_aspect_ratio
        self.convert_strings_to_integers
        self.saves_csv_folder
        self.ipynb_defs_regex
        self.standard_lib_modules
        self.get_coordinates
        self.get_color_cycler
        self.object_evaluators
        self.url_regex
        self.simple_defs_regex
        self.twitter_aspect_ratio
        self.filepath_regex
        self.data_csv_folder
        self.saves_folder
        self.data_folder
        self.github_folder
        self.saves_pickle_folder
        self.encoding_type
        self.encoding_types_list
        self.__init__


In [6]:

# Print the results
print_analysis_results(results, filter_non_missing=False)

File: base_config.py
  Class: BaseConfig
    - self references: set()
    - self definitions: {'__init__'}
    - missing definitions: set()

File: data_analysis.py
  Class: DataAnalysis
    - self references: {'get_r_squared_value_latex', 'get_euclidean_distance', 'facebook_aspect_ratio', 'split_list_by_gap', 'get_flattened_dictionary', 'get_inf_nan_mask', 'lower_ascii_regex'}
    - self definitions: {'get_jitter_list', 'get_spearman_rho_value_latex', 'first_order_linear_scatterplot', 'plot_histogram', 'get_regexed_dataframe', 'modalize_columns', 'lower_ascii_regex', 'open_path_in_notepad', 'plot_line_with_error_bars', 'get_nearest_neighbor', 'get_r_squared_value_latex', 'get_euclidean_distance', 'facebook_aspect_ratio', 'convert_strings_to_integers', 'one_hot_encode', 'get_regexed_columns', 'split_list_by_gap', 'get_flattened_dictionary', 'get_wiki_infobox_data_frame', 'get_inf_nan_mask', 'get_minority_combinations', 'get_numeric_columns', 'count_swaps_to_perfect_order', 'get_column_d


----