In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def random_forest_pipeline(
    df,                      # Input DataFrame
    target_column,           # Name of the target column (binary/class)
    numeric_features,        # List of numeric feature column names
    categorical_features,    # List of categorical feature column names
    test_size=0.2,           # Test set size
    custom_input=None        # Optional custom input for prediction (as DataFrame)
):
    """Fit a preprocessing + RandomForest pipeline and print its predictions.

    The frame is split into features and target, divided into train/test
    partitions (fixed seed 42), and a scikit-learn ``Pipeline`` is fitted on
    the training partition. Predictions for the test partition are printed;
    when ``custom_input`` is supplied, its predicted class and class
    probabilities are printed as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and ``target_column``.
    target_column : str
        Label of the column to predict.
    numeric_features : list
        Columns routed through ``StandardScaler``.
    categorical_features : list
        Columns routed through ``OneHotEncoder(handle_unknown='ignore')``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    custom_input : pandas.DataFrame, optional
        Rows to predict after training; must contain the feature columns.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``'preprocess'`` and ``'classifier'``.
    """
    # Separate predictors from the label column.
    features = df.drop(columns=target_column)
    labels = df[target_column]

    # Hold out a test partition; the test labels are not used by this function.
    X_train, X_test, y_train, _ = train_test_split(
        features, labels, test_size=test_size, random_state=42
    )

    # Scaling is not required by a random forest but is kept so numeric and
    # categorical inputs both pass through an explicit transformer.
    scale_numeric = ('num', StandardScaler(), numeric_features)
    encode_categorical = (
        'cat',
        OneHotEncoder(handle_unknown='ignore'),
        categorical_features,
    )

    model = Pipeline(steps=[
        ('preprocess', ColumnTransformer(transformers=[scale_numeric, encode_categorical])),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])
    model.fit(X_train, y_train)

    print("Test set predictions:\n", model.predict(X_test))

    # Nothing more to report when no custom rows were supplied.
    if custom_input is None:
        return model

    predicted_class = model.predict(custom_input)
    class_probabilities = model.predict_proba(custom_input)

    print("\nCustom input:")
    print(custom_input)
    print("Predicted class:", predicted_class)
    print("Class probabilities:", class_probabilities)

    return model

# ------------------ Example usage ------------------

# Read the sample dataset and drop the 'day' column before modelling
df = pd.read_csv('play_tennis.csv').drop(columns='day')

# Single-row custom input; its columns must match df.drop('play', axis=1)
custom_input = pd.DataFrame({
    'outlook': ['Sunny'],    # categorical
    'temp': ['Hot'],         # categorical
    'humidity': ['High'],    # categorical
    'wind': ['Weak'],        # categorical
})

# Train the Random Forest pipeline and report predictions
random_forest_pipeline(
    df=df,
    target_column='play',    # class label column (e.g., 'Yes' / 'No')
    numeric_features=[],     # no numeric columns in this dataset
    categorical_features=['outlook', 'temp', 'humidity', 'wind'],  # one-hot encoded
    test_size=0.3,
    custom_input=custom_input,
)


Test set predictions:
 ['Yes' 'Yes' 'No' 'Yes' 'Yes']

Custom input:
  outlook temp humidity  wind
0   Sunny  Hot     High  Weak
Predicted class: ['No']
Class probabilities: [[0.63 0.37]]


In [7]:


def manual_random_forest(
    df,                      # Input DataFrame
    target_column,           # Target column name
    numeric_features,        # List of numeric feature column names
    categorical_features,    # List of categorical feature column names
    test_size=0.2,           # Test split size
    custom_input=None        # Optional custom input for prediction
):
    """Train a RandomForestClassifier with hand-written preprocessing.

    Numeric columns are standardised and categorical columns are one-hot
    encoded; both transformers are fitted on the training partition only and
    then reused, through a single helper, for the test partition and for
    ``custom_input``. Test-set predictions are printed, and so are the
    predicted class and class probabilities for ``custom_input`` when given.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and ``target_column``.
    target_column : str
        Label of the column to predict.
    numeric_features : list or None
        Columns to standardise; empty/None means no numeric columns.
    categorical_features : list or None
        Columns to one-hot encode; empty/None means no categorical columns.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    custom_input : pandas.DataFrame, optional
        Rows to predict after training; must contain the listed columns.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier. It was trained on the preprocessed matrix, and
        the fitted scaler/encoder are not returned with it.

    Raises
    ------
    ValueError
        If both ``numeric_features`` and ``categorical_features`` are empty.
    """
    # Fail early with a clear message instead of letting the classifier
    # reject a zero-column feature matrix later on.
    if not numeric_features and not categorical_features:
        raise ValueError(
            "At least one numeric or categorical feature column is required."
        )

    # -----------------------------------------
    # STEP 1: Split input and target
    # -----------------------------------------
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # The held-out labels are not used by this function.
    X_train, X_test, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # -----------------------------------------
    # STEP 2: Fit the preprocessing on the training partition only
    # -----------------------------------------
    scaler = StandardScaler() if numeric_features else None
    # sparse_output=False yields a dense array so it can be stacked with the
    # scaled numeric block via np.hstack.
    encoder = (
        OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        if categorical_features
        else None
    )
    if scaler is not None:
        scaler.fit(X_train[numeric_features])
    if encoder is not None:
        encoder.fit(X_train[categorical_features])

    def _preprocess(frame):
        """Apply the fitted scaler/encoder to ``frame`` and stack the results."""
        blocks = []
        if scaler is not None:
            blocks.append(scaler.transform(frame[numeric_features]))
        if encoder is not None:
            blocks.append(encoder.transform(frame[categorical_features]))
        # Numeric columns first, then the one-hot columns.
        return np.hstack(blocks)

    X_train_processed = _preprocess(X_train)
    X_test_processed = _preprocess(X_test)

    # -----------------------------------------
    # STEP 3: Train Random Forest model
    # -----------------------------------------
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train_processed, y_train)

    # -----------------------------------------
    # STEP 4: Predict on test set
    # -----------------------------------------
    test_pred = clf.predict(X_test_processed)
    print("Test predictions:\n", test_pred)

    # -----------------------------------------
    # STEP 5: Predict on custom input
    # -----------------------------------------
    if custom_input is not None:
        # Same fitted transformers as the train/test partitions.
        custom_processed = _preprocess(custom_input)

        custom_pred = clf.predict(custom_processed)
        custom_proba = clf.predict_proba(custom_processed)

        print("\nCustom input:")
        print(custom_input)
        print("Predicted class:", custom_pred)
        print("Class probabilities:", custom_proba)

    return clf  # Return the trained model

# ------------------ Example usage ------------------

# Read the sample dataset and drop the 'day' column before modelling
df = pd.read_csv('play_tennis.csv').drop(columns='day')

# Single-row custom input with the same feature columns as the training data
custom_input = pd.DataFrame({
    'outlook': ['Sunny'],
    'temp': ['Hot'],
    'humidity': ['High'],
    'wind': ['Weak'],
})

manual_random_forest(
    df=df,
    target_column='play',
    numeric_features=[],  # no numeric columns in this dataset
    categorical_features=['outlook', 'temp', 'humidity', 'wind'],
    test_size=0.3,
    custom_input=custom_input,
)


Test predictions:
 ['Yes' 'Yes' 'No' 'Yes' 'Yes']

Custom input:
  outlook temp humidity  wind
0   Sunny  Hot     High  Weak
Predicted class: ['No']
Class probabilities: [[0.63 0.37]]
