In [None]:
import pandas as pd
from ctgan import CTGAN
import pickle
import os
import time

# --- Configuration ---
# Locations of the source CSV, the pickled CTGAN model, and the generated CSV.
INPUT_DATA_FILE = '/content/BMW sales data (2010-2024) (1).csv'
MODEL_FILE = 'ctgan_model.pkl'
SYNTHETIC_OUTPUT_FILE = 'synthetic_data.csv'

# Names of the columns to treat as categorical.
# IMPORTANT: keep this list EMPTY to have the columns auto-detected from their
# data type and cardinality; list column names here only to override that.
# NOTE: load_data() overwrites this list at runtime with the columns in use.
CATEGORICAL_COLUMNS = []

# --- Core Functions ---

def detect_categorical_columns(data: pd.DataFrame) -> list:
    """
    Automatically detects columns that should be treated as categorical for CTGAN.

    Heuristic:
    1. Any column with dtype 'object' (string) or 'category'.
    2. Any int64/float64 column with low cardinality:
       - At most 10 unique values AND
       - Unique values are less than 5% of the total rows.

    Args:
        data: The DataFrame to inspect.

    Returns:
        The detected column names without duplicates, in the order in which
        they appear in ``data.columns`` (so the result is stable across runs).
    """
    # 1. String-like and already-categorical columns always qualify.
    detected = set(data.select_dtypes(include=['object', 'category']).columns)

    # 2. Low-cardinality numeric columns. With zero rows the ratio below is
    #    undefined (0 / 0), so numeric columns are simply not considered.
    n_rows = len(data)
    if n_rows > 0:
        for col in data.select_dtypes(include=['int64', 'float64']).columns:
            n_unique = data[col].nunique()
            # Both the absolute cap and the relative cap must hold.
            if n_unique <= 10 and (n_unique / n_rows) < 0.05:
                detected.add(col)

    # Walk the frame's own column order instead of returning the set directly:
    # set iteration order for strings changes from one interpreter run to the
    # next, which made the reported/used column order non-deterministic.
    return list(dict.fromkeys(col for col in data.columns if col in detected))


def load_data(filepath: str) -> "pd.DataFrame | None":
    """Loads the input dataset from a CSV file and enforces categorical types.

    If ``filepath`` does not exist, a file with the same base name in the
    current working directory is tried instead.

    The categorical columns come from the module-level ``CATEGORICAL_COLUMNS``
    list when it is non-empty (names absent from the data are reported and
    ignored); otherwise they are auto-detected with
    ``detect_categorical_columns``. Every selected column is cast to the
    pandas ``category`` dtype.

    Side effect: ``CATEGORICAL_COLUMNS`` is overwritten with the columns that
    are actually in use.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or ``None`` if the file is missing, the dataset
        is empty, or any error occurs while reading/processing it.
    """
    global CATEGORICAL_COLUMNS

    if not os.path.exists(filepath):
        # Fallback: the configured directory may not exist on this machine,
        # so look for the same file name in the working directory.
        local_filepath = os.path.basename(filepath)
        if not os.path.exists(local_filepath):
            print(f"ERROR: Input file not found at {filepath}")
            return None
        filepath = local_filepath

    print(f"Loading data from {filepath}...")
    try:
        data = pd.read_csv(filepath)

        print("\n--- Available Columns in the Dataset ---")
        print(data.columns.tolist())
        print("----------------------------------------\n")

        # A frame without rows or without columns gives the model nothing to
        # learn from; this is the real "no features" condition.
        if data.empty:
            print("CRITICAL ERROR: The dataset has no rows or no columns. CTGAN requires at least one discrete or continuous column.")
            return None

        # Determine categorical columns (use manual list if provided, otherwise auto-detect)
        if CATEGORICAL_COLUMNS:
            source_description = "Manual list"
            active_categorical_cols = [col for col in CATEGORICAL_COLUMNS if col in data.columns]
            missing_cols = [col for col in CATEGORICAL_COLUMNS if col not in data.columns]
            if missing_cols:
                print(f"WARNING: The following columns are in the MANUAL CATEGORICAL_COLUMNS list but MISSING from the data: {missing_cols}. Using only available columns.")
        else:
            source_description = "Auto-detected"
            active_categorical_cols = detect_categorical_columns(data)

        # --- Enforce categorical data types for CTGAN ---
        for col in active_categorical_cols:
            # Force the column to be treated as a Pandas Category type
            data[col] = data[col].astype('category')

        # A dataset made only of continuous columns is still usable, so this
        # is reported as a warning rather than aborting the pipeline.
        if not active_categorical_cols:
            print("WARNING: No categorical columns were found/defined. All columns will be treated as continuous.")
        # --------------------------------------------------

        # Display information for verification
        print(f"Data loaded successfully. Shape: {data.shape}")
        print(f"Source: {source_description}")
        print(f"Categorical Columns being used ({len(active_categorical_cols)}): {active_categorical_cols}")

        # IMPORTANT: Overwrite the global list with the columns in use before
        # returning; the training step reads it from there.
        CATEGORICAL_COLUMNS = active_categorical_cols

        return data
    except Exception as e:
        # Deliberate best-effort boundary: any read/processing failure is
        # reported and signalled to the caller as None.
        print(f"ERROR loading data: {e}")
        return None


def train_and_save_model(data: pd.DataFrame, model_path: str, *,
                         epochs: int = 300, batch_size: int = 500,
                         discrete_columns=None):
    """Trains a new CTGAN model and saves it to a pickle file.

    Args:
        data: Training data.
        model_path: Destination path of the pickled model.
        epochs: Number of training epochs passed to ``CTGAN``.
        batch_size: Batch size passed to ``CTGAN``.
            NOTE(review): CTGAN documents that this must be a multiple of its
            ``pac`` setting — confirm before changing the default.
        discrete_columns: Names of the discrete columns passed to ``fit``.
            Defaults to the module-level ``CATEGORICAL_COLUMNS`` list.

    Returns:
        The fitted model.
    """
    if discrete_columns is None:
        # Default: the list that load_data() stored globally.
        discrete_columns = CATEGORICAL_COLUMNS

    print("Starting CTGAN model training...")
    start_time = time.time()

    # Initialize CTGAN model
    model = CTGAN(
        epochs=epochs,
        batch_size=batch_size,
        verbose=True
    )

    # Fit the model to the real data
    model.fit(data, discrete_columns)

    # Save the trained model. Write to a temporary sibling file and rename it
    # into place so an interrupted write never leaves a truncated pickle at
    # model_path that a later run would mistake for an up-to-date model.
    tmp_path = f"{model_path}.tmp"
    with open(tmp_path, 'wb') as f:
        pickle.dump(model, f)
    os.replace(tmp_path, model_path)

    end_time = time.time()
    print(f"Model training complete and saved to {model_path}.")
    print(f"Total training time: {end_time - start_time:.2f} seconds.")
    return model


def load_model(model_path: str):
    """Restore a pickled CTGAN model from disk.

    Returns the unpickled object, or ``None`` (after printing an error) when
    the file does not exist or cannot be unpickled.
    """
    if os.path.exists(model_path):
        print(f"Loading trained model from {model_path}...")
        try:
            with open(model_path, 'rb') as handle:
                restored = pickle.load(handle)
            print("Model loaded successfully.")
            return restored
        except Exception as exc:
            print(f"ERROR loading model: {exc}")
            return None

    print(f"ERROR: Model file not found at {model_path}.")
    return None


def generate_synthetic_data(model: CTGAN, num_samples: int, output_path: str):
    """Sample rows from a trained model and write them to a CSV file.

    Draws ``num_samples`` rows via ``model.sample``, saves them to
    ``output_path`` without the index column, then prints the elapsed time
    and a preview of the first rows.
    """
    print(f"Generating {num_samples} synthetic samples...")
    started_at = time.time()

    # Sample and persist in one go; the index is not part of the output.
    samples = model.sample(num_samples)
    samples.to_csv(output_path, index=False)

    elapsed = time.time() - started_at
    for line in (
        f"Synthetic data generated and saved to {output_path}.",
        f"Generation time: {elapsed:.2f} seconds.",
    ):
        print(line)
    print(samples.head())


def run_automation():
    """Main function to check conditions and execute the pipeline.

    Steps:
    1. Load the input data (stops silently if loading fails; load_data has
       already printed the reason).
    2. Decide whether to retrain: the model file is missing, the input file
       is newer than the model file, the modification times cannot be read,
       or the existing model file cannot be loaded.
    3. Train-and-save or load the model accordingly.
    4. Generate as many synthetic rows as the input data has.
    """
    data = load_data(INPUT_DATA_FILE)
    if data is None:
        return

    # load_data falls back to the file's base name in the working directory
    # when the configured path does not exist. Use the same resolution here,
    # otherwise the freshness check below would fail on the configured path
    # and force a full retrain on every run.
    input_path = INPUT_DATA_FILE
    if not os.path.exists(input_path):
        input_path = os.path.basename(INPUT_DATA_FILE)

    # Determine if retraining is needed
    retrain_needed = False

    if not os.path.exists(MODEL_FILE):
        print("ACTION: Model file does not exist. Retraining is required.")
        retrain_needed = True
    else:
        try:
            # Check if the input data file is newer than the model file
            input_time = os.path.getmtime(input_path)
            model_time = os.path.getmtime(MODEL_FILE)
        except OSError as e:
            # Handle case where file metadata cannot be read (e.g., permissions)
            print(f"WARNING: Could not check file modification times ({e}). Assuming retraining is needed.")
            retrain_needed = True
        else:
            if input_time > model_time:
                print("ACTION: Input data is newer than the existing model. Retraining is required.")
                retrain_needed = True
            else:
                print("STATUS: Existing model is up-to-date with the input data. Reusing model.")

    ctgan_model = None
    if not retrain_needed:
        # Step 1: Load the existing model
        ctgan_model = load_model(MODEL_FILE)
        if ctgan_model is None:
            # An unreadable model file should not end the pipeline; rebuild it.
            print("ACTION: Existing model could not be loaded. Retraining is required.")
            retrain_needed = True

    if retrain_needed:
        # Step 1: Train the model on the new data
        ctgan_model = train_and_save_model(data, MODEL_FILE)

    # Explicit None check: the decision must not depend on whether the model
    # object happens to be truthy.
    if ctgan_model is not None:
        # Step 2: Generate synthetic data (generate the same number of rows as the input data)
        NUM_SAMPLES = data.shape[0]
        generate_synthetic_data(ctgan_model, NUM_SAMPLES, SYNTHETIC_OUTPUT_FILE)
        print("\nPipeline complete. Synthetic data is ready.")
    else:
        print("\nPipeline failed: Could not load or train the model.")

if __name__ == '__main__':
    # NOTE: Before running, ensure the CSV referenced by INPUT_DATA_FILE exists
    # (a file with the same base name in the working directory also works).
    run_automation()


Loading data from /content/BMW sales data (2010-2024) (1).csv...

--- Available Columns in the Dataset ---
['Model', 'Year', 'Region', 'Color', 'Fuel_Type', 'Transmission', 'Engine_Size_L', 'Mileage_KM', 'Price_USD', 'Sales_Volume', 'Sales_Classification']
----------------------------------------

Data loaded successfully. Shape: (50000, 11)
Source: Auto-detected
Categorical Columns being used (6): ['Model', 'Transmission', 'Sales_Classification', 'Fuel_Type', 'Color', 'Region']
ACTION: Input data is newer than the existing model. Retraining is required.
Starting CTGAN model training...


Gen. (-2.67) | Discrim. (-0.03):  32%|███▏      | 97/300 [16:07<33:45,  9.98s/it]


KeyboardInterrupt: 