In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from ydata_profiling import ProfileReport
import warnings
from io import StringIO
from contextlib import redirect_stdout
import logging
from multiprocessing import Pool
import tqdm  # Import tqdm to disable progress bars

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Disable tqdm progress bars globally
tqdm.tqdm().disable = True

# -----------------------------
# CONFIGURATION
# -----------------------------

# Dynamic data folder input with validation
def get_valid_directory():
    """Prompts for and validates a directory path."""
    while True:
        data_folder = input("Enter dataset folder path (e.g., C:\\Users\\Admin\\Documents\\Datasets) [default: current directory]: ").strip() or os.getcwd()
        if os.path.isfile(data_folder):
            logging.warning(f"Provided path is a file, not a directory: {data_folder}")
            print(f"⚠️ Error: '{data_folder}' is a file, not a directory. Using parent directory.")
            data_folder = os.path.dirname(data_folder)
        if os.path.isdir(data_folder):
            return data_folder
        else:
            logging.error(f"Invalid directory: {data_folder}")
            print(f"❌ Error: '{data_folder}' is not a valid directory. Please try again.")

DATA_FOLDER = get_valid_directory()
OUTPUT_FOLDER = "output"
SHOW_PREVIEW = True
EXCLUDE_PATTERNS = ['id', 'desc', 'number', 'phone', 'contact', 'name']
VISUALIZATION_OPTIONS = {
    'figure_size': (10, 6),  # Hardcoded default figure size
    'colors': ['#FF6384', '#36A2EB', '#FFCE56', '#4BC0C0', '#9966FF']
}

# Setup logging
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(OUTPUT_FOLDER, f'processing_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# =============================================================================
# FUNCTIONS
# =============================================================================

def list_and_select_files(data_folder: str) -> list:
    """Lists available datasets and lets user pick one or more for processing."""
    try:
        files = [f for f in os.listdir(data_folder) if f.lower().endswith((".xlsx", ".xls", ".csv", ".json", ".parquet"))]
    except FileNotFoundError:
        logging.error(f"The specified data folder does not exist: {data_folder}")
        print(f"❌ ERROR: The specified data folder does not exist: {data_folder}")
        return []
    except OSError as e:
        logging.error(f"Error accessing directory {data_folder}: {e}")
        print(f"❌ ERROR: Unable to access directory {data_folder}: {e}")
        return []

    if not files:
        logging.warning(f"No datasets found in: {data_folder}")
        print(f"❌ No datasets found in: {data_folder}")
        return []

    print("\nAvailable datasets:")
    for i, f in enumerate(files, 1):
        print(f"  {i}. {f}")

    choice_str = input("\nEnter number(s) of dataset(s) to process, comma-separated (e.g., 1,3) [default: last]: ").strip()

    selected_files = []
    if not choice_str:
        selected_files.append(os.path.join(data_folder, files[-1]))  # Default to last file
    else:
        try:
            choices = [int(c.strip()) - 1 for c in choice_str.split(",")]
            selected_files = [os.path.join(data_folder, files[i]) for i in choices if 0 <= i < len(files)]
            if not selected_files:
                logging.warning("No valid file indices selected. Defaulting to the last dataset.")
                print("⚠️ No valid file indices selected. Defaulting to the last dataset.")
                selected_files.append(os.path.join(data_folder, files[-1]))
        except (ValueError, IndexError):
            logging.error(f"Invalid input: {choice_str}. Defaulting to the last dataset.")
            print("⚠️ Invalid input. Defaulting to the last dataset.")
            selected_files.append(os.path.join(data_folder, files[-1]))

    return selected_files

def load_data(file_path: str) -> pd.DataFrame | None:
    """Loads dataset (CSV, Excel, JSON, Parquet) into a pandas DataFrame."""
    print(f"\n⏳ Loading data from: {os.path.basename(file_path)}")
    logging.info(f"Loading data from: {file_path}")
    try:
        if file_path.lower().endswith(".csv"):
            encodings = ['utf-8', 'latin1', 'iso-8859-1']
            for enc in encodings:
                try:
                    df = pd.read_csv(file_path, encoding=enc)
                    break
                except Exception as e:
                    logging.warning(f"Tried encoding {enc} for {file_path}: {e}")
            else:
                raise ValueError("Failed to load CSV with any encoding")
        elif file_path.lower().endswith((".xlsx", ".xls")):
            df = pd.read_excel(file_path)
        elif file_path.lower().endswith(".json"):
            df = pd.read_json(file_path)
        elif file_path.lower().endswith(".parquet"):
            df = pd.read_parquet(file_path)
        else:
            raise ValueError("Unsupported file format")
        
        print(f"✅ Loaded successfully with shape {df.shape}")
        logging.info(f"Loaded successfully: {file_path}, shape {df.shape}")
        if SHOW_PREVIEW:
            print(df.head())
        return df
    except Exception as e:
        logging.error(f"Error loading {os.path.basename(file_path)}: {e}")
        print(f"❌ Error loading {os.path.basename(file_path)}: {e}")
        return None

def get_cleaning_choices(df_columns: list, row_count: int) -> dict:
    """Asks the user for cleaning, visualization, and sampling preferences with validation."""
    print("\n--- Data Cleaning Options ---")
    
    while True:
        mv_choice = input("Handle missing values?\n [1] Drop rows\n [2] Fill mean/mode\n [3] Do nothing\n Choice: ").strip()
        if mv_choice in ['1', '2', '3']:
            break
        print("⚠️ Invalid choice. Please select 1, 2, or 3.")

    while True:
        dd_choice = input("Drop duplicate rows? (y/n): ").strip().lower()
        if dd_choice in ['y', 'n']:
            break
        print("⚠️ Invalid choice. Please select y or n.")

    bar_cols, pie_cols = [], []
    print("\n--- Visualization Options ---")
    print(f"Available columns: {df_columns}")
    if input("Manually select columns for visualizations? (y/n): ").strip().lower() == 'y':
        bar_input = input("Columns for bar charts (comma-separated) or 'none': ").strip()
        if bar_input.lower() != 'none':
            bar_cols = [c.strip() for c in bar_input.split(",") if c.strip() in df_columns]
            if not bar_cols:
                print("⚠️ No valid columns selected for bar charts.")

        pie_input = input("Columns for pie charts (comma-separated) or 'none': ").strip()
        if pie_input.lower() != 'none':
            pie_cols = [c.strip() for c in pie_input.split(",") if c.strip() in df_columns]
            if not pie_cols:
                print("⚠️ No valid columns selected for pie charts.")

    # Sampling prompt
    sample_size = None
    default_sample_size = 50000 if row_count > 150000 else None
    while True:
        sample_prompt = f"Enter sample size for visualizations (e.g., 50000) [default: {default_sample_size if default_sample_size else 'full dataset'}]: "
        sample_input = input(sample_prompt).strip().lower()
        if sample_input == '' and default_sample_size is not None:
            sample_size = default_sample_size
            break
        if sample_input == 'none':
            break
        try:
            sample_size = int(sample_input)
            if sample_size > 0:
                break
            print("⚠️ Sample size must be positive.")
        except ValueError:
            print("⚠️ Invalid input. Enter a number or 'none'.")

    return {
        'missing_values': {'1': 'drop', '2': 'fill', '3': 'none'}[mv_choice],
        'drop_duplicates': dd_choice == 'y',
        'bar_columns': bar_cols,
        'pie_columns': pie_cols,
        'sample_size': sample_size
    }

def clean_data(df: pd.DataFrame, choices: dict) -> tuple[pd.DataFrame, dict]:
    """Cleans the DataFrame based on user-specified rules."""
    print("⏳ Cleaning data...")
    logging.info("Starting data cleaning")
    df_cleaned = df.copy()
    original_cols = df_cleaned.columns.tolist()

    if choices['drop_duplicates']:
        before = len(df_cleaned)
        df_cleaned.drop_duplicates(inplace=True)
        print(f"  - Dropped {before - len(df_cleaned)} duplicate rows.")
        logging.info(f"Dropped {before - len(df_cleaned)} duplicate rows")

    if choices['missing_values'] == 'drop':
        before = len(df_cleaned)
        df_cleaned.dropna(inplace=True)
        print(f"  - Dropped {before - len(df_cleaned)} rows with NA.")
        logging.info(f"Dropped {before - len(df_cleaned)} rows with NA")
    elif choices['missing_values'] == 'fill':
        for col in df_cleaned.columns:
            if df_cleaned[col].isnull().any():
                if pd.api.types.is_numeric_dtype(df_cleaned[col]):
                    df_cleaned[col].fillna(df_cleaned[col].mean(), inplace=True)
                else:
                    df_cleaned[col].fillna(df_cleaned[col].mode().iloc[0] if not df_cleaned[col].mode().empty else "Unknown", inplace=True)
        print("  - Filled missing values with mean/mode.")
        logging.info("Filled missing values with mean/mode")

    cleaned_cols = [c.strip().replace(" ", "_") for c in df_cleaned.columns]
    mapping = dict(zip(original_cols, cleaned_cols))
    df_cleaned.columns = cleaned_cols

    df_cleaned = df_cleaned.apply(lambda s: pd.to_numeric(s, errors='ignore'))

    print(f"✅ Cleaning complete. Shape: {df_cleaned.shape}")
    logging.info(f"Cleaning complete. Shape: {df_cleaned.shape}")
    return df_cleaned, mapping

def generate_visualizations(df: pd.DataFrame, base_filename: str, output_folder: str, choices: dict):
    """Generates bar and pie chart visualizations."""
    print("📊 Creating visualizations...")
    logging.info(f"Creating visualizations for {base_filename}")
    plot_folder = os.path.join(output_folder, f"{base_filename}_plots")
    os.makedirs(plot_folder, exist_ok=True)

    # Apply sampling if specified
    df_plot = df
    if choices['sample_size'] is not None and choices['sample_size'] < len(df):
        df_plot = df.sample(n=choices['sample_size'], random_state=42)
        print(f"  - Using sample of {choices['sample_size']} rows for visualizations")
        logging.info(f"Using sample of {choices['sample_size']} rows for visualizations")

    categorical_cols = df_plot.select_dtypes(include=['object', 'category']).columns
    filtered_cols = [c for c in categorical_cols if not any(pat in c.lower() for pat in EXCLUDE_PATTERNS)]

    bar_columns = choices['bar_columns']
    pie_columns = choices['pie_columns']
    if not (bar_columns or pie_columns):
        pie_columns = [c for c in filtered_cols if 2 <= df_plot[c].nunique() <= 10][:2]
        bar_columns = [c for c in filtered_cols if 5 <= df_plot[c].nunique() <= 100][:2]

    for col in bar_columns:
        if col in df_plot:
            plt.figure(figsize=VISUALIZATION_OPTIONS['figure_size'])
            top10 = df_plot[col].value_counts().nlargest(10)
            sns.barplot(x=top10.values, y=top10.index, palette=VISUALIZATION_OPTIONS['colors'][:len(top10)])
            plt.title(f"Top 10 {col}")
            plt.savefig(os.path.join(plot_folder, f"bar_{col}.png"))
            plt.close()
            print(f"  - Bar chart: {col}")
            logging.info(f"Generated bar chart for {col}")

    for col in pie_columns:
        if col in df_plot and 2 <= df_plot[col].nunique() <= 10:
            plt.figure(figsize=VISUALIZATION_OPTIONS['figure_size'])
            counts = df_plot[col].value_counts()
            plt.pie(counts, labels=counts.index, autopct='%1.1f%%', colors=VISUALIZATION_OPTIONS['colors'][:len(counts)])
            plt.title(f"Distribution of {col}")
            plt.savefig(os.path.join(plot_folder, f"pie_{col}.png"))
            plt.close()
            print(f"  - Pie chart: {col}")
            logging.info(f"Generated pie chart for {col}")

    if not (bar_columns or pie_columns):
        print("⚠️ No suitable columns for visualization.")
        logging.warning("No suitable columns for visualization")

    print(f"✅ Plots saved in {plot_folder}")
    logging.info(f"Plots saved in {plot_folder}")

def generate_ydata_report(df: pd.DataFrame, base_filename: str, output_folder: str):
    """Generates an interactive HTML profiling report in minimal mode for large datasets."""
    print("📑 Building profiling report...")
    logging.info(f"Building profiling report for {base_filename}")
    try:
        with redirect_stdout(StringIO()):
            profile = ProfileReport(
                df,
                title=f"Profiling Report for {base_filename}",
                minimal=len(df) > 10000,
                explorative=True,
                progress_bar=False  # Disable progress bar and widget
            )
            outpath = os.path.join(output_folder, f"{base_filename}_report_{datetime.now().strftime('%Y%m%d')}.html")
            profile.to_file(outpath, silent=True)
        print(f"✅ Report saved -> {outpath}")
        logging.info(f"Report saved: {outpath}")
    except Exception as e:
        logging.error(f"Report generation failed for {base_filename}: {e}")
        print(f"❌ Report generation failed: {e}")

def save_outputs(df_cleaned: pd.DataFrame, base_filename: str, output_folder: str):
    """Saves the cleaned DataFrame to CSV and Excel (if within row limit)."""
    print("💾 Saving cleaned dataset...")
    logging.info(f"Saving cleaned dataset for {base_filename}")
    os.makedirs(output_folder, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    csv_path = os.path.join(output_folder, f"{base_filename}_cleaned_{timestamp}.csv")
    df_cleaned.to_csv(csv_path, index=False)
    print(f"✅ Saved CSV -> {csv_path}")
    logging.info(f"Saved CSV: {csv_path}")

    if df_cleaned.shape[0] <= 1_048_576:
        try:
            xlsx_path = os.path.join(output_folder, f"{base_filename}_cleaned_{timestamp}.xlsx")
            df_cleaned.to_excel(xlsx_path, index=False)
            print(f"✅ Saved Excel -> {xlsx_path}")
            logging.info(f"Saved Excel: {xlsx_path}")
        except Exception as e:
            logging.error(f"Failed to save Excel for {base_filename}: {e}")
            print(f"❌ Failed to save Excel: {e}")
    else:
        print("⚠️ Skipped Excel export (too many rows for Excel).")
        logging.warning(f"Skipped Excel export for {base_filename}: too many rows")

def process_file(file_path: str):
    """Processes a single file (used for parallel processing)."""
    base = os.path.splitext(os.path.basename(file_path))[0]
    print(f"\n{'='*20} Processing {base} {'='*20}")
    logging.info(f"Processing file: {base}")

    raw_df = load_data(file_path)
    if raw_df is None:
        return

    cleaned_df, _ = clean_data(raw_df, {'missing_values': 'none', 'drop_duplicates': False})
    cleaning_choices = get_cleaning_choices(cleaned_df.columns, len(cleaned_df))
    cleaned_df, _ = clean_data(cleaned_df, cleaning_choices)
    save_outputs(cleaned_df, base, OUTPUT_FOLDER)
    generate_visualizations(cleaned_df, base, OUTPUT_FOLDER, cleaning_choices)
    generate_ydata_report(raw_df, base, OUTPUT_FOLDER)

# =============================================================================
# MAIN
# =============================================================================

def main():
    logging.info("Starting data processing script")
    selected_files = list_and_select_files(DATA_FOLDER)
    if not selected_files:
        logging.info("No files selected. Exiting.")
        print("No files selected. Exiting.")
        return

    for file_path in selected_files:
        process_file(file_path)

    print("\n🎉 Automation complete!")
    logging.info("Automation complete")

if __name__ == "__main__":
    main()

0it [00:00, ?it/s]

Enter dataset folder path (e.g., C:\Users\Admin\Documents\Datasets) [default: current directory]:  C:\Users\Admin\Documents\Automation\Datasets



Available datasets:
  1. ncr_ride_bookings.csv
  2. SaaS_Customer_Retention_Data.xlsx
  3. sales_data_sample.csv
  4. training.1600000.processed.noemoticon.csv



Enter number(s) of dataset(s) to process, comma-separated (e.g., 1,3) [default: last]:  1




⏳ Loading data from: ncr_ride_bookings.csv
✅ Loaded successfully with shape (150000, 21)
         Date      Time    Booking ID   Booking Status   Customer ID  \
0  2024-03-23  12:29:38  "CNR5884300"  No Driver Found  "CID1982111"   
1  2024-11-29  18:01:39  "CNR1326809"       Incomplete  "CID4604802"   
2  2024-08-23  08:56:10  "CNR8494506"        Completed  "CID9202816"   
3  2024-10-21  17:17:25  "CNR8906825"        Completed  "CID2610914"   
4  2024-09-16  22:08:00  "CNR1950162"        Completed  "CID9933542"   

    Vehicle Type      Pickup Location      Drop Location  Avg VTAT  Avg CTAT  \
0          eBike          Palam Vihar            Jhilmil       NaN       NaN   
1       Go Sedan        Shastri Nagar  Gurgaon Sector 56       4.9      14.0   
2           Auto              Khandsa      Malviya Nagar      13.4      25.8   
3  Premier Sedan  Central Secretariat           Inderlok      13.1      28.5   
4           Bike     Ghitorni Village        Khan Market       5.3      19.6

Handle missing values?
 [1] Drop rows
 [2] Fill mean/mode
 [3] Do nothing
 Choice:  2
Drop duplicate rows? (y/n):  y



--- Visualization Options ---
Available columns: Index(['Date', 'Time', 'Booking_ID', 'Booking_Status', 'Customer_ID',
       'Vehicle_Type', 'Pickup_Location', 'Drop_Location', 'Avg_VTAT',
       'Avg_CTAT', 'Cancelled_Rides_by_Customer',
       'Reason_for_cancelling_by_Customer', 'Cancelled_Rides_by_Driver',
       'Driver_Cancellation_Reason', 'Incomplete_Rides',
       'Incomplete_Rides_Reason', 'Booking_Value', 'Ride_Distance',
       'Driver_Ratings', 'Customer_Rating', 'Payment_Method'],
      dtype='object')


Manually select columns for visualizations? (y/n):  n
Enter sample size for visualizations (e.g., 50000) [default: full dataset]:  none


⏳ Cleaning data...
  - Dropped 0 duplicate rows.
  - Filled missing values with mean/mode.
✅ Cleaning complete. Shape: (150000, 21)
💾 Saving cleaned dataset...
✅ Saved CSV -> output\ncr_ride_bookings_cleaned_20250902_020728.csv
✅ Saved Excel -> output\ncr_ride_bookings_cleaned_20250902_020728.xlsx
📊 Creating visualizations...
  - Bar chart: Booking_Status
  - Bar chart: Vehicle_Type
  - Pie chart: Booking_Status
  - Pie chart: Vehicle_Type
✅ Plots saved in output\ncr_ride_bookings_plots
📑 Building profiling report...


100%|██████████| 21/21 [00:02<00:00,  7.69it/s]


✅ Report saved -> output\ncr_ride_bookings_report_20250902.html

🎉 Automation complete!
