In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict,GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score


In [2]:
# Load the steel dataset; the path is relative to the notebook's working directory.
df_train = pd.read_csv("steel.csv")

# Predictor columns, in the order they are handed to the models.
feature_columns = [
    "normalising_temperature",
    "tempering_temperature",
    "percent_silicon",
    "percent_chromium",
    "percent_copper",
    "percent_nickel",
    "percent_sulphur",
    "percent_carbon",
    "percent_manganese",
]

# Regression target (Y) and feature matrix (X) consumed by the modelling cells.
Y = df_train["tensile_strength"]
X = df_train[feature_columns]


In [3]:
# Baseline: an untuned decision tree evaluated with shuffled k-fold cross-validation.
default_model = DecisionTreeRegressor(random_state=42)
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Out-of-fold predictions for every row, pooled into a single MAE over the dataset.
y_pred = cross_val_predict(default_model, X, Y, cv=kf)
mae = mean_absolute_error(Y, y_pred)

# Per-fold R² using the same seeded splitter as the prediction pass above.
r2_scores = cross_val_score(default_model, X, Y, cv=kf, scoring='r2')
average_r2 = np.mean(r2_scores)

# Convert to built-in floats before rounding: rounding a NumPy scalar yields
# another NumPy scalar, and NumPy >= 2 renders those inside a list as
# `np.float64(0.8644)` instead of `0.8644`.
fold_r2_display = [round(float(score), 4) for score in r2_scores]

print(f"MAE: {mae:.3f}")
print(f"R² Score for each fold: {fold_r2_display}")
print(f"Average R² across {k} folds: {average_r2:.2f}")


Mean Absolute Error (MAE): 26.496
R² Score for each fold: [np.float64(0.8644), np.float64(0.7942), np.float64(0.7077), np.float64(0.8386), np.float64(0.7185), np.float64(0.7201), np.float64(0.8018), np.float64(0.6224), np.float64(0.867), np.float64(0.8525)]
Average R² across 10 folds: 0.78


In [8]:
# Hyperparameter grid: tree depth limits crossed with max_features settings.
# NOTE(review): in the recorded output 'sqrt' and 'log2' give identical scores for
# the 9 predictors selected earlier — presumably both resolve to the same feature
# count; confirm against the DecisionTreeRegressor docs before keeping both.
max_depth_values = [None, 3, 5, 7, 10, 12, 15, 17, 21]
max_features_values = [None, 'sqrt', 'log2']
param_grid = dict(max_depth=max_depth_values, max_features=max_features_values)

# One seeded 10-fold splitter shared by the search and by the MAE pass below.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    cv=kf,
    scoring='r2',
    return_train_score=True,  # required for the mean_train_score entries read below
)
grid_search.fit(X, Y)

# Report every configuration the search evaluated, in the order it stored them.
cv_results = grid_search.cv_results_
for i, params in enumerate(cv_results['params']):
    max_depth = params['max_depth']
    max_features = params['max_features']
    mean_train = cv_results['mean_train_score'][i]
    mean_test = cv_results['mean_test_score'][i]

    # The search is scored on R² only, so MAE comes from a separate pass of
    # pooled out-of-fold predictions for this configuration.
    model = DecisionTreeRegressor(max_depth=max_depth, max_features=max_features, random_state=42)
    y_pred = cross_val_predict(model, X, Y, cv=kf)
    mae = mean_absolute_error(Y, y_pred)

    print(
        f"For max_depth={max_depth}, max_features={max_features} = "
        f"Train R²={mean_train:.3f}, Avg Test R²={mean_test:.3f}, MAE={mae:.3f}"
    )

print("\nBest hyperparameters:", grid_search.best_params_)


For max_depth=None, max_features=None = Train R²=1.000, Avg Test R²=0.779, MAE=26.496
For max_depth=None, max_features=sqrt = Train R²=1.000, Avg Test R²=0.738, MAE=29.513
For max_depth=None, max_features=log2 = Train R²=1.000, Avg Test R²=0.738, MAE=29.513
For max_depth=3, max_features=None = Train R²=0.625, Avg Test R²=0.519, MAE=48.288
For max_depth=3, max_features=sqrt = Train R²=0.582, Avg Test R²=0.463, MAE=50.772
For max_depth=3, max_features=log2 = Train R²=0.582, Avg Test R²=0.463, MAE=50.772
For max_depth=5, max_features=None = Train R²=0.826, Avg Test R²=0.701, MAE=36.222
For max_depth=5, max_features=sqrt = Train R²=0.769, Avg Test R²=0.651, MAE=40.385
For max_depth=5, max_features=log2 = Train R²=0.769, Avg Test R²=0.651, MAE=40.385
For max_depth=7, max_features=None = Train R²=0.921, Avg Test R²=0.778, MAE=30.396
For max_depth=7, max_features=sqrt = Train R²=0.880, Avg Test R²=0.710, MAE=34.729
For max_depth=7, max_features=log2 = Train R²=0.880, Avg Test R²=0.710, MAE=34