In [1]:
import os
import pandas as pd
from pathlib import Path
import json
from datetime import datetime

def create_data_inventory(root_folder="Cleaned_Data"):
    """
    Create an inventory of all CSV files and their columns in the specified folder.

    The folder tree below ``root_folder`` is walked recursively. Every file
    whose name ends in ``.csv`` (case-insensitive) is read with
    ``pd.read_csv`` and described by its column names, column/row counts,
    size on disk and last-modified time.

    Args:
        root_folder: Path of the directory to scan.

    Returns:
        dict: On success a dict with the keys ``"inventory_date"``,
        ``"root_folder"`` and ``"folders"``. ``"folders"`` maps each folder's
        path relative to ``root_folder`` (``"root"`` for the top level) to
        ``{"path": ..., "files": {...}}``; only folders containing at least
        one CSV file are listed. A file that could not be read is recorded as
        ``{"error": <message>}``.
        If the scan itself fails, or ``root_folder`` is not an existing
        directory, ``{"error": <message>}`` is returned instead.
    """
    inventory = {
        "inventory_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "root_folder": root_folder,
        "folders": {}
    }

    try:
        # os.walk() swallows listing errors by default and simply yields
        # nothing for a missing path, so a wrong root must be detected here.
        if not os.path.isdir(root_folder):
            return {"error": f"Root folder not found or not a directory: {root_folder}"}

        # Walk through all folders
        for folder_path, subfolders, files in os.walk(root_folder):
            # Sorting in place makes os.walk descend in a stable order.
            subfolders.sort()

            relative_path = os.path.relpath(folder_path, root_folder)
            if relative_path == '.':
                relative_path = 'root'

            current_folder = {
                "path": folder_path,
                "files": {}
            }

            # Process each CSV file in the current folder (stable order)
            for file in sorted(files):
                if not file.lower().endswith('.csv'):
                    continue

                file_path = os.path.join(folder_path, file)
                try:
                    # Read the CSV file
                    df = pd.read_csv(file_path)

                    # Get file info
                    file_stat = os.stat(file_path)
                    current_folder["files"][file] = {
                        "columns": list(df.columns),
                        "num_columns": len(df.columns),
                        "num_rows": len(df),
                        "size_bytes": file_stat.st_size,
                        "size_mb": round(file_stat.st_size / (1024 * 1024), 2),
                        "last_modified": datetime.fromtimestamp(file_stat.st_mtime).strftime("%Y-%m-%d %H:%M:%S")
                    }
                except Exception as e:
                    # Best effort: one unreadable file must not abort the scan.
                    current_folder["files"][file] = {
                        "error": str(e)
                    }

            if current_folder["files"]:  # Only add folders that contain CSV files
                inventory["folders"][relative_path] = current_folder

        return inventory

    except Exception as e:
        return {"error": str(e)}

def _render_text_report(inventory):
    """Return the plain-text inventory report as a single string."""
    chunks = [
        "Data Inventory Report\n",
        f"Generated: {inventory['inventory_date']}\n",
        f"Root Folder: {inventory['root_folder']}\n",
        "\n" + "=" * 80 + "\n\n",
    ]

    for folder_name, folder_info in inventory['folders'].items():
        chunks.append(f"\nFolder: {folder_name}\n")
        chunks.append("-" * 40 + "\n")

        for file_name, file_info in folder_info['files'].items():
            chunks.append(f"\nFile: {file_name}\n")

            # Unreadable files carry only an error message.
            if "error" in file_info:
                chunks.append(f"ERROR: {file_info['error']}\n")
                continue

            chunks.append(f"Number of rows: {file_info['num_rows']}\n")
            chunks.append(f"Number of columns: {file_info['num_columns']}\n")
            chunks.append(f"File size: {file_info['size_mb']} MB\n")
            chunks.append(f"Last modified: {file_info['last_modified']}\n")
            chunks.append("\nColumns:\n")
            chunks.extend(f"- {col}\n" for col in file_info['columns'])
            chunks.append("\n")

    return "".join(chunks)


def save_inventory_report(inventory, output_format="both"):
    """
    Save the inventory report in specified format(s).

    Files are written to the current working directory as
    ``data_inventory_<YYYYmmdd_HHMMSS>.json`` and/or ``.txt``; both share the
    same timestamp. Files are written as UTF-8 so that non-ASCII column or
    file names do not depend on the platform's default encoding.

    Args:
        inventory: Inventory dict with the keys ``"inventory_date"``,
            ``"root_folder"`` and ``"folders"``.
        output_format: ``"json"``, ``"text"`` or ``"both"``. Any other value
            writes nothing.

    Returns:
        None. The path of each written file is printed.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save as JSON
    if output_format in ["json", "both"]:
        json_path = f"data_inventory_{timestamp}.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(inventory, f, indent=2)
        print(f"JSON inventory saved to: {json_path}")

    # Save as text report
    if output_format in ["text", "both"]:
        text_path = f"data_inventory_{timestamp}.txt"
        # Render first so a malformed inventory cannot leave a truncated file.
        report_text = _render_text_report(inventory)
        with open(text_path, 'w', encoding='utf-8') as f:
            f.write(report_text)

        print(f"Text inventory saved to: {text_path}")

def print_inventory_summary(inventory):
    """
    Write a human-readable overview of the inventory to stdout.

    For each folder the number of files, the summed size of the readable
    files and one line per file are printed, followed by overall totals.
    Files recorded with an ``"error"`` entry are counted as files but do not
    contribute to the size totals.
    """
    print("\nData Inventory Summary")
    print("=" * 40)
    print(f"Inventory Date: {inventory['inventory_date']}")
    print(f"Root Folder: {inventory['root_folder']}")
    print("\nFolders and Files:")

    file_count = 0
    size_total = 0

    for name, details in inventory['folders'].items():
        entries = details['files']
        entry_count = len(entries)
        file_count += entry_count

        # Only successfully read files carry a size.
        readable_sizes = [
            meta.get('size_mb', 0)
            for meta in entries.values()
            if 'error' not in meta
        ]
        folder_mb = sum(readable_sizes)
        size_total += folder_mb

        print(f"\n{name}:")
        print(f"  Files: {entry_count}")
        print(f"  Total Size: {folder_mb:.2f} MB")

        for entry_name, meta in entries.items():
            if 'error' not in meta:
                print(f"  - {entry_name}: {meta['num_columns']} columns, {meta['num_rows']} rows")
                continue
            print(f"  - {entry_name}: ERROR - {meta['error']}")

    print("\nOverall Summary:")
    print(f"Total Folders: {len(inventory['folders'])}")
    print(f"Total Files: {file_count}")
    print(f"Total Size: {size_total:.2f} MB")

if __name__ == "__main__":
    # Scan the default data folder.
    print("Creating data inventory...")
    inventory = create_data_inventory()

    if "error" not in inventory:
        # Persist the JSON/text reports, then echo a summary to the console.
        save_inventory_report(inventory)
        print_inventory_summary(inventory)
    else:
        # The scan failed as a whole; report why and write nothing.
        print(f"Error creating inventory: {inventory['error']}")

Creating data inventory...
JSON inventory saved to: data_inventory_20241215_190831.json
Text inventory saved to: data_inventory_20241215_190831.txt

Data Inventory Summary
Inventory Date: 2024-12-15 19:08:29
Root Folder: Cleaned_Data

Folders and Files:

Accidents:
  Files: 4
  Total Size: 71.18 MB
  - Accidents_2015_cleaned_20241215_184456.csv: 32 columns, 140056 rows
  - Accidents_2016_cleaned_20241215_184458.csv: 32 columns, 136621 rows
  - Accidents_2017_cleaned_20241215_184500.csv: 32 columns, 129982 rows
  - Accidents_2018_cleaned_20241215_184501.csv: 32 columns, 122635 rows

Casualties:
  Files: 4
  Total Size: 30.86 MB
  - Casualties_2015_cleaned_20241215_184502.csv: 16 columns, 186189 rows
  - Casualties_2017_cleaned_20241215_184503.csv: 16 columns, 170993 rows
  - Casualties_2018_cleaned_20241215_184503.csv: 16 columns, 160597 rows
  - Causalties_2016_cleaned_20241215_184504.csv: 16 columns, 181384 rows

Vehicles:
  Files: 4
  Total Size: 60.46 MB
  - Vehicles_2015_cleaned_20

In [3]:
import html
import json
import os
import webbrowser
from datetime import datetime
from pathlib import Path

import pandas as pd

def create_html_report(inventory):
    """
    Create an HTML report from the inventory data.

    Every value taken from the inventory (dates, folder names, file names,
    column names, error messages) is HTML-escaped before being embedded, so
    names containing ``<``, ``>`` or ``&`` cannot corrupt the markup.

    Args:
        inventory: Inventory dict with the keys ``"inventory_date"``,
            ``"root_folder"`` and ``"folders"``. Each folder entry holds a
            ``"files"`` dict whose values contain either ``"error"`` or the
            keys ``"num_rows"``, ``"num_columns"``, ``"size_mb"``,
            ``"last_modified"`` and ``"columns"``.

    Returns:
        str: Name of the written file,
        ``data_inventory_<YYYYmmdd_HHMMSS>.html``, created in the current
        working directory.
    """
    def esc(value):
        # Inventory values originate from the file system / CSV headers.
        return html.escape(str(value))

    css_style = """
        body {
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1200px;
            margin: 0 auto;
            background-color: white;
            padding: 20px;
            border-radius: 8px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }
        h1, h2, h3 {
            color: #333;
        }
        .summary-box {
            background-color: #f8f9fa;
            border: 1px solid #dee2e6;
            border-radius: 4px;
            padding: 15px;
            margin: 10px 0;
        }
        .folder-section {
            margin: 20px 0;
            padding: 15px;
            border: 1px solid #dee2e6;
            border-radius: 4px;
        }
        .file-section {
            margin: 10px 0;
            padding: 15px;
            background-color: #f8f9fa;
            border-radius: 4px;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 15px 0;
        }
        th, td {
            border: 1px solid #dee2e6;
            padding: 8px;
            text-align: left;
        }
        th {
            background-color: #f8f9fa;
        }
        .error {
            color: #dc3545;
            padding: 10px;
            background-color: #f8d7da;
            border-radius: 4px;
            margin: 5px 0;
        }
        .stats {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 10px;
            margin: 10px 0;
        }
        .stat-item {
            background-color: #e9ecef;
            padding: 10px;
            border-radius: 4px;
            text-align: center;
        }
        .column-list {
            background-color: #fff;
            padding: 10px;
            border-radius: 4px;
            columns: 3;
            margin: 10px 0;
        }
        .column-list li {
            margin: 5px 0;
            break-inside: avoid;
        }
    """

    # Overview totals cover only files that were read successfully;
    # entries recorded with an "error" are excluded from count and size.
    readable_files = [
        file_info
        for folder_info in inventory['folders'].values()
        for file_info in folder_info['files'].values()
        if 'error' not in file_info
    ]
    total_files = len(readable_files)
    total_size_mb = sum(file_info['size_mb'] for file_info in readable_files)

    # Collect fragments and join once instead of repeated string +=.
    html_parts = [f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Data Inventory Report</title>
        <style>
            {css_style}
        </style>
    </head>
    <body>
        <div class="container">
            <h1>Data Inventory Report</h1>
            <div class="summary-box">
                <h2>Overview</h2>
                <p>Generated: {esc(inventory['inventory_date'])}</p>
                <p>Root Folder: {esc(inventory['root_folder'])}</p>
                <div class="stats">
                    <div class="stat-item">
                        <strong>Total Folders</strong>
                        <p>{len(inventory['folders'])}</p>
                    </div>
                    <div class="stat-item">
                        <strong>Total Files</strong>
                        <p>{total_files}</p>
                    </div>
                    <div class="stat-item">
                        <strong>Total Size</strong>
                        <p>{total_size_mb:.2f} MB</p>
                    </div>
                </div>
            </div>
    """]

    # Add folder sections
    for folder_name, folder_info in inventory['folders'].items():
        html_parts.append(f"""
            <div class="folder-section">
                <h2>📁 {esc(folder_name)}</h2>
        """)

        for file_name, file_info in folder_info['files'].items():
            if 'error' in file_info:
                html_parts.append(f"""
                    <div class="file-section">
                        <h3>📄 {esc(file_name)}</h3>
                        <div class="error">
                            <strong>Error:</strong> {esc(file_info['error'])}
                        </div>
                    </div>
                """)
            else:
                column_items = "".join(
                    f"<li>{esc(col)}</li>" for col in file_info['columns']
                )
                html_parts.append(f"""
                    <div class="file-section">
                        <h3>📄 {esc(file_name)}</h3>
                        <div class="stats">
                            <div class="stat-item">
                                <strong>Rows</strong>
                                <p>{file_info['num_rows']:,}</p>
                            </div>
                            <div class="stat-item">
                                <strong>Columns</strong>
                                <p>{file_info['num_columns']}</p>
                            </div>
                            <div class="stat-item">
                                <strong>Size</strong>
                                <p>{file_info['size_mb']:.2f} MB</p>
                            </div>
                            <div class="stat-item">
                                <strong>Last Modified</strong>
                                <p>{esc(file_info['last_modified'])}</p>
                            </div>
                        </div>
                        <h4>Columns:</h4>
                        <ul class="column-list">
                            {column_items}
                        </ul>
                    </div>
                """)

        html_parts.append("""
            </div>
        """)

    # Close HTML
    html_parts.append("""
        </div>
    </body>
    </html>
    """)

    # Save HTML report
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    html_path = f"data_inventory_{timestamp}.html"

    with open(html_path, 'w', encoding='utf-8') as f:
        f.write("".join(html_parts))

    return html_path

def create_data_inventory(root_folder="Cleaned_Data"):
    """
    Create an inventory of all CSV files and their columns in the specified folder.

    NOTE: a function with this name is also defined in an earlier cell of
    this notebook; whichever definition runs last is the one in effect.

    The folder tree below ``root_folder`` is walked recursively. Every file
    whose name ends in ``.csv`` (case-insensitive) is read with
    ``pd.read_csv`` and described by its column names, column/row counts,
    size on disk and last-modified time.

    Args:
        root_folder: Path of the directory to scan.

    Returns:
        dict: On success a dict with the keys ``"inventory_date"``,
        ``"root_folder"`` and ``"folders"``. ``"folders"`` maps each folder's
        path relative to ``root_folder`` (``"root"`` for the top level) to
        ``{"path": ..., "files": {...}}``; only folders containing at least
        one CSV file are listed. A file that could not be read is recorded as
        ``{"error": <message>}``.
        If the scan itself fails, or ``root_folder`` is not an existing
        directory, ``{"error": <message>}`` is returned instead.
    """
    inventory = {
        "inventory_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "root_folder": root_folder,
        "folders": {}
    }

    try:
        # os.walk() swallows listing errors by default and simply yields
        # nothing for a missing path, so a wrong root must be detected here.
        if not os.path.isdir(root_folder):
            return {"error": f"Root folder not found or not a directory: {root_folder}"}

        # Walk through all folders
        for folder_path, subfolders, files in os.walk(root_folder):
            # Sorting in place makes os.walk descend in a stable order.
            subfolders.sort()

            relative_path = os.path.relpath(folder_path, root_folder)
            if relative_path == '.':
                relative_path = 'root'

            current_folder = {
                "path": folder_path,
                "files": {}
            }

            # Process each CSV file in the current folder (stable order)
            for file in sorted(files):
                if not file.lower().endswith('.csv'):
                    continue

                file_path = os.path.join(folder_path, file)
                try:
                    # Read the CSV file
                    df = pd.read_csv(file_path)

                    # Get file info
                    file_stat = os.stat(file_path)
                    current_folder["files"][file] = {
                        "columns": list(df.columns),
                        "num_columns": len(df.columns),
                        "num_rows": len(df),
                        "size_bytes": file_stat.st_size,
                        "size_mb": round(file_stat.st_size / (1024 * 1024), 2),
                        "last_modified": datetime.fromtimestamp(file_stat.st_mtime).strftime("%Y-%m-%d %H:%M:%S")
                    }
                except Exception as e:
                    # Best effort: one unreadable file must not abort the scan.
                    current_folder["files"][file] = {
                        "error": str(e)
                    }

            if current_folder["files"]:  # Only add folders that contain CSV files
                inventory["folders"][relative_path] = current_folder

        return inventory

    except Exception as e:
        return {"error": str(e)}

if __name__ == "__main__":
    # Create inventory
    print("Creating data inventory...")
    inventory = create_data_inventory()

    if "error" in inventory:
        print(f"Error creating inventory: {inventory['error']}")
    else:
        # Generate and open HTML report
        html_path = create_html_report(inventory)
        print(f"\nHTML report generated: {html_path}")

        # Open the report in the default web browser. Path.as_uri() yields a
        # well-formed file:// URI on every platform (drive letters, spaces
        # and other special characters are handled), unlike concatenating
        # 'file://' with an OS path.
        webbrowser.open(Path(html_path).resolve().as_uri())

Creating data inventory...

HTML report generated: data_inventory_20241215_191024.html
