In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

file_path = "/home/dasun/Projects/Research/ICUG/NoteBooks/data/Data.xlsx"

# --- 1. Load and Prepare Data ---
try:
    # Load data from "sheet 3" of the specified Excel file
    df = pd.read_excel(file_path, sheet_name='Sheet3')
except FileNotFoundError:
    print("Error: 'Data.xlsx' not found. Please ensure the file is in the correct directory.")
    exit()

# Clean the data: drop the 'name' column to prevent overfitting and remove any missing values
df = df.drop(columns=['Name '])
df = df.dropna()

# --- 2. Define Features (X) and Target (y) ---
X = df.drop(columns=['IAUC'])
y = df['IAUC']

# --- 3. Identify Feature Types for Preprocessing ---
# Automatically identify numerical and categorical columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = ['health conditions'] 

# --- 4. Create the Data Preprocessing Pipeline ---
# This transformer scales numerical features and one-hot encodes categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# --- 5. Define the Final, Tuned Model ---
# We use the exact best parameters discovered during the hyperparameter tuning phase.
# These specific settings resulted in the high accuracy score.
final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=50,          # The optimal number of trees found
        max_depth=10,             # The optimal max depth
        min_samples_split=2,      # The optimal min samples to split
        min_samples_leaf=1,       # The optimal min samples per leaf
        random_state=42           # Use a fixed random state for reproducible results
    ))
])

# --- 6. Split Data into Training and Testing Sets ---
# We use the same random_state to ensure the split is the same every time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 7. Train the Model ---
# Fit the entire pipeline on the training data
print("🚀 Training the final model with optimal parameters...")
final_model.fit(X_train, y_train)
print("✅ Model training complete.")

# --- 8. Evaluate the Model's Performance ---
# Make predictions on the unseen test data
y_pred = final_model.predict(X_test)

# Calculate the final performance metrics
final_r2 = r2_score(y_test, y_pred)
final_rmse = mean_squared_error(y_test, y_pred, squared=False)

# --- 9. Display the Final Results ---
print("\n-------------------------------------------")
print("      Final Model Performance")
print("-------------------------------------------")
print(f"R-squared (R²): {final_r2:.4f}")
print(f"Root Mean Squared Error (RMSE): {final_rmse:.2f}")
print("-------------------------------------------")

🚀 Training the final model with optimal parameters...


ValueError: A given column is not a column of the dataframe

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

file_path = "/home/dasun/Projects/Research/ICUG/NoteBooks/data/Data.xlsx"


# --- 1. Load and Prepare Data (Corrected) ---
try:
    # Correctly load from 'Sheet3'
    df = pd.read_excel(file_path, sheet_name='Sheet3')
except FileNotFoundError:
    print("Error: 'Data.xlsx' not found. Please ensure the file is in the correct directory.")
    exit()

# Clean data: drop the 'Name ' column (with trailing space) and remove missing values
df = df.drop(columns=['Name '])
df = df.dropna()

# --- 2. Define Features (X) and Target (y) ---
X = df.drop(columns=['IAUC'])
y = df['IAUC']

# --- 3. Identify Feature Types (Corrected) ---
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Use 'Food Item' as the key categorical feature
categorical_features = ['Food Item'] 

# --- 4. Create the Data Preprocessing Pipeline ---
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# --- 5. Define the Final, Tuned Model ---
# These are the new optimal parameters found after re-running the analysis
final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=200,         # Optimal number of trees
        max_depth=None,           # Optimal max depth
        min_samples_split=2,      # Optimal min samples to split
        min_samples_leaf=1,       # Optimal min samples per leaf
        random_state=42           # For reproducible results
    ))
])

# --- 6. Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 7. Train and Evaluate the Model ---
print("🚀 Training the final, corrected model...")
final_model.fit(X_train, y_train)
print("✅ Model training complete.")

y_pred = final_model.predict(X_test)

final_r2 = r2_score(y_test, y_pred)
final_rmse = mean_squared_error(y_test, y_pred, squared=False)

# --- 8. Display Final Results ---
print("\n-------------------------------------------")
print("      Final Model Performance (Corrected)")
print("-------------------------------------------")
print(f"R-squared (R²): {final_r2:.4f}")
print(f"Root Mean Squared Error (RMSE): {final_rmse:.2f}")
print("-------------------------------------------")

🚀 Training the final, corrected model...
✅ Model training complete.


TypeError: got an unexpected keyword argument 'squared'