<a href="https://colab.research.google.com/github/dreamingv-oid/CS290/blob/main/Project3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Khush Dakwala and Benjamin Jackson

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

def Attribute_selection_method(task, dataset, target_var, criterion=None):
    # Separate features and target
    X = dataset.drop(columns=[target_var])
    y = dataset[target_var]

    # Identify categorical columns
    categorical_cols = X.select_dtypes(include=['object']).columns

    # Create a preprocessor for categorical columns (One-Hot Encoding)
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ], remainder='passthrough'
    )

    # Select the appropriate model based on task type and criterion
    if task == "classification":
        if criterion == "entropy":
            model = DecisionTreeClassifier(criterion="entropy", random_state=42)
        elif criterion == "gini":
            model = DecisionTreeClassifier(criterion="gini", random_state=42)
        else:
            raise ValueError("Invalid criterion for classification. Choose 'entropy' or 'gini'.")
    elif task == "regression":
        model = DecisionTreeRegressor(criterion="squared_error", random_state=42)
    else:
        raise ValueError("Invalid task type. Choose 'classification' or 'regression'.")

    # Create a pipeline with preprocessing and the model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Fit the model
    pipeline.fit(X, y)

    # Get feature importances and find the best feature
    importances = pipeline.named_steps['model'].feature_importances_
    best_feature_idx = np.argmax(importances)
    best_feature = X.columns[best_feature_idx]

    return best_feature


## Applying the method to Categorical and Numerical targets and comparing the split chosen between our method and sklearn's.

In [2]:
# Load the dataset
url = 'https://raw.githubusercontent.com/dreamingv-oid/CS290/main/train.csv'
df = pd.read_csv(url)

# Strip any whitespace characters from column names
df.columns = df.columns.str.strip()

# Drop rows with missing values
df.dropna(inplace=True)

# Apply the method to the classification dataset
# Create a copy of the DataFrame for classification
df_classification = df.copy()
best_feature_classification = Attribute_selection_method("classification", df_classification, "satisfaction", criterion="gini")

# Fit sklearn's DecisionTreeClassifier to compare first split
X_classification = df_classification.drop(columns=["satisfaction"])
y_classification = df_classification["satisfaction"]

# Create and fit the classifier within a pipeline
classifier = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), X_classification.select_dtypes(include=['object']).columns)
        ], remainder='passthrough'
    )),
    ('model', DecisionTreeClassifier(criterion="gini", random_state=42))
])

classifier.fit(X_classification, y_classification)

# Get the first split chosen by sklearn's DecisionTreeClassifier
first_split_classification = classifier.named_steps['model'].tree_.feature[classifier.named_steps['model'].tree_.children_left[0]]

# Apply the method to the regression dataset
# Create a copy of the DataFrame for regression
df_regression = df.copy()
best_feature_regression = Attribute_selection_method("regression", df_regression, "Flight Distance")

# Fit sklearn's DecisionTreeRegressor to compare first split
X_regression = df_regression.drop(columns=["Flight Distance"])
y_regression = df_regression["Flight Distance"]

# Create and fit the regressor within a pipeline
regressor = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), X_regression.select_dtypes(include=['object']).columns)
        ], remainder='passthrough'
    )),
    ('model', DecisionTreeRegressor(criterion="squared_error", random_state=42))
])

regressor.fit(X_regression, y_regression)

# Get the first split chosen by sklearn's DecisionTreeRegressor
first_split_regression = regressor.named_steps['model'].tree_.feature[regressor.named_steps['model'].tree_.children_left[0]]

# Print the results together
print(f"Best splitting criterion for classification (Gini) via our method: {best_feature_classification}")
print(f"Best splitting criterion for regression via our method: {best_feature_regression}")

# Print sklearn's first split features
print(f"First split feature chosen by sklearn's DecisionTreeClassifier: {X_classification.columns[first_split_classification]}")
print(f"First split feature chosen by sklearn's DecisionTreeRegressor: {X_regression.columns[first_split_regression]}")

Best splitting criterion for classification (Gini) via our method: Baggage handling
Best splitting criterion for regression via our method: Online boarding
First split feature chosen by sklearn's DecisionTreeClassifier: Online boarding
First split feature chosen by sklearn's DecisionTreeRegressor: Online boarding
