In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import numpy as np

def pca_pipeline(
    df,                         # Input DataFrame
    numeric_features,           # List of numeric feature column names
    categorical_features=None,  # List of categorical feature column names (optional)
    n_components=None,          # Number of principal components to keep (default None = all)
    custom_input=None           # Optional custom input for transformation (as DataFrame)
):
    """Preprocess selected columns of ``df`` and project them with PCA.

    Numeric columns are standardized with ``StandardScaler`` and categorical
    columns are one-hot encoded with ``OneHotEncoder(handle_unknown='ignore')``;
    every other column is dropped. The preprocessed matrix is then fed to
    ``PCA(n_components=n_components)``. The pipeline is fitted on ``df`` and
    the shapes plus the first five transformed rows are printed.

    Parameters
    ----------
    df : DataFrame the pipeline is fitted on and transformed.
    numeric_features : column names passed to the scaler.
    categorical_features : column names passed to the one-hot encoder;
        ``None`` (the default) means no categorical columns.
    n_components : forwarded unchanged to ``PCA``.
    custom_input : optional DataFrame transformed with the already fitted
        pipeline; its projection is only printed, not returned.

    Returns
    -------
    The PCA-transformed representation of ``df`` (the output of
    ``pipeline.transform(df)``).
    """
    # Use None as the default instead of a shared mutable list.
    if categorical_features is None:
        categorical_features = []

    # sparse_threshold=0 makes the ColumnTransformer always return a dense
    # array. With the default threshold, one-hot encoding a column with many
    # categories can produce a sparse matrix, which PCA does not accept on
    # scikit-learn releases without sparse PCA support.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='drop',  # drop any other columns
        sparse_threshold=0
    )

    # Preprocessing followed by PCA, fitted as a single estimator
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('pca', PCA(n_components=n_components))
    ])

    # The pipeline does not modify its input, so df is used directly
    # (no defensive copy needed).
    pipeline.fit(df)

    # Project the training data onto the principal components
    X_pca = pipeline.transform(df)

    print(f"Original shape: {df.shape}")
    print(f"Transformed shape after PCA: {X_pca.shape}")
    print("First 5 rows of transformed data:\n", X_pca[:5])

    # Optional: project extra rows with the fitted pipeline (printed only)
    if custom_input is not None:
        custom_pca = pipeline.transform(custom_input)
        print("\nCustom input after PCA transformation:")
        print(custom_pca)

    return X_pca

# ------------------ Example usage ------------------

# Toy dataset: two numeric columns plus one categorical column
df = pd.DataFrame(
    {
        'age': [25, 45, 35, 50, 23],
        'income': [50000, 64000, 58000, 72000, 52000],
        'gender': ['M', 'F', 'F', 'M', 'M'],
    }
)

# A single extra record, projected with the pipeline fitted on df
custom_input = pd.DataFrame(
    {'age': [30], 'income': [60000], 'gender': ['F']}
)

# Kept as the final expression so a notebook cell also displays the result
pca_pipeline(
    df,
    ['age', 'income'],
    categorical_features=['gender'],
    n_components=1,  # number of principal components to keep
    custom_input=custom_input,
)


Original shape: (5, 3)
Transformed shape after PCA: (5, 1)
First 5 rows of transformed data:
 [[-1.58242043]
 [ 1.17123178]
 [-0.00384156]
 [ 1.95726456]
 [-1.54223435]]

Custom input after PCA transformation:
[[-0.16137707]]


array([[-1.58242043],
       [ 1.17123178],
       [-0.00384156],
       [ 1.95726456],
       [-1.54223435]])