In [1]:
#### Fit, Evaluate, Interpret Random Forests ####

# Importing necessary libraries
import pandas as pd
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
from plotnine import ggplot, aes, geom_point, xlab, ylab

# Seed passed as random_state to every random forest fitted in this notebook
ran_state = 1234

# Load the pre-split train/test files, discarding any row with a missing value
carseats_train = pd.read_csv("carseats_train.csv").dropna()
carseats_test = pd.read_csv("carseats_test.csv").dropna()

# Split each frame into predictors (X) and the numeric response (y = Sales).
# Sales_cat is removed from the predictors together with Sales
# (presumably a categorical version of the response -- confirm against the data).
X_train = carseats_train.drop(columns=['Sales', 'Sales_cat'])
y_train = carseats_train['Sales']

X_test = carseats_test.drop(columns=['Sales', 'Sales_cat'])
y_test = carseats_test['Sales']

# Preprocessing step shared by the pipelines below: one-hot encode the three
# categorical predictors (dense output) and pass every other column through
# unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(sparse_output=False),
            ["US", "Urban", "Shelf_Location"],
        ),
    ],
    remainder="passthrough",
)

## Regression RFs over a grid of min_samples_split ("min n") values
# The five fits differ only in min_samples_split, so they run in a single loop
# rather than five copy-pasted stanzas. The grid keeps the original order
# (30 -> 5), so after the loop rf_pipeline, rf_fitted and y_pred hold the
# min n = 5 model, exactly as before.
min_n_values = [30, 20, 15, 10, 5]
importance_min_n = 30  # variable importances are reported for this fit only

r2_scores = []
for min_n in min_n_values:
    rf_pipeline = Pipeline(
        [
            ("preprocessing", ct),
            ("rf", RandomForestRegressor(min_samples_split=min_n, random_state=ran_state)),
        ]
    ).set_output(transform="pandas")

    rf_fitted = rf_pipeline.fit(X_train, y_train)

    # Evaluate on the test set
    y_pred = rf_fitted.predict(X_test)
    r2_scores.append(r2_score(y_test, y_pred))

    if min_n == importance_min_n:
        importances = rf_fitted["rf"].feature_importances_
        forest_importances = pd.DataFrame({
            "Variable Importance": importances,
            # Ask the fitted preprocessing step for its output column names
            # instead of re-transforming the whole training set just to read
            # the resulting DataFrame's columns.
            "Variable": rf_fitted["preprocessing"].get_feature_names_out(),
        })
        print(f"Variable Importances for Model with Min n of {importance_min_n}:")
        print(forest_importances)

# Keep the original per-model names in case later cells still reference them
r2_1, r2_2, r2_3, r2_4, r2_5 = r2_scores

# Test-set R^2 for each min n value
# NOTE(review): these scores are computed on the test set, so choosing min n
# from this table uses test data for model selection -- confirm that is intended.
d = {'min_n': min_n_values, 'r2': r2_scores}
results = pd.DataFrame(data = d)

print(results)

Variable Importances for Model with Min n of 30:
    Variable Importance                        Variable
0              0.000663                  dummify__US_No
1              0.000000                 dummify__US_Yes
2              0.000000               dummify__Urban_No
3              0.000382              dummify__Urban_Yes
4              0.047358     dummify__Shelf_Location_Bad
5              0.303142    dummify__Shelf_Location_Good
6              0.032520  dummify__Shelf_Location_Medium
7              0.062522     remainder__Competitor_Price
8              0.019444               remainder__Income
9              0.090383          remainder__Advertising
10             0.008162           remainder__Population
11             0.382861                remainder__Price
12             0.047957                  remainder__Age
13             0.004607            remainder__Education
   min_n        r2
0     30  0.604044
1     20  0.645019
2     15  0.667721
3     10  0.694950
4      5  0.7094

In [2]:
# Predicted Sales for one hypothetical store at two candidate prices

# Two rows describing the same store; only Price differs (100 vs 130)
new_store = pd.DataFrame(
    {
        "US": ["Yes"] * 2,
        "Urban": ["Yes"] * 2,
        "Advertising": [6.3] * 2,
        "Shelf_Location": ["Good"] * 2,
        "Population": [245.78] * 2,
        "Competitor_Price": [125] * 2,
        "Income": [67] * 2,
        "Price": [100, 130],
        "Age": [53] * 2,
        "Education": [14] * 2,
    }
)

# Random forest with min_samples_split = 5, fitted on the training data
rf_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("rf", RandomForestRegressor(min_samples_split=5, random_state=ran_state)),
    ]
)
rf_pipeline.set_output(transform="pandas")

rf_fitted = rf_pipeline.fit(X_train, y_train)

# Predict Sales for both price options and show them
y_pred = rf_fitted.predict(new_store)
print("Predictions for Two Price Options:")
print(y_pred)

Predictions for Two Price Options:
[11.1323356  10.08316103]
