In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from flaml import AutoML

Load the data

In [5]:
# Raw string: the backslashes of the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences such as "\T" and "\F".
file_path = r'C:\Texas\Final_Texas_cbsa_SVI_2016.csv'
# Read the full CSV into a DataFrame; later cells filter it by the 'NAME' column.
data = pd.read_csv(file_path)

Filter the data to the four major Texas metro areas, then to the single city being modelled

In [6]:
# Keep only rows whose 'NAME' is one of the four metro areas listed here.
metro_areas = [
    "San Antonio-New Braunfels, TX",
    "Dallas-Fort Worth-Arlington, TX",
    "Austin-Round Rock, TX",
    "Houston-The Woodlands-Sugar Land, TX",
]
in_metro_areas = data['NAME'].isin(metro_areas)
data = data.loc[in_metro_areas]

# Narrow further to the single city analysed in the rest of the notebook;
# `city` is also reused later when building output file names.
city = "Austin-Round Rock, TX"
is_selected_city = data['NAME'].eq(city)
data = data.loc[is_selected_city]

Define target and features

In [7]:
# Predictor columns used as model inputs.
feature_columns = [
    'theme1',
    'theme2',
    'theme3',
    'theme4',
    'RH_mean',
    'LST_mean',
    'PopDens',
]

# Target is the 'Total' column; features are the columns listed above.
y = data['Total']
X = data[feature_columns]

In [8]:
# Display the feature matrix dimensions as (rows, columns).
X.shape

(87, 7)

In [15]:
from sklearn.model_selection import train_test_split

# Single hold-out split: 25% of the rows go to the test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=111,
)

# Put the target next to its features so each CSV is self-contained.
train_df = pd.concat((X_train, y_train), axis='columns')
test_df = pd.concat((X_test, y_test), axis='columns')

# Write both partitions to disk, tagged with the city name, without the index column.
train_path = f'train_{city}.csv'
test_path = f'test_{city}.csv'
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

In [None]:
# final = pd.concat([X, y], axis=1)
# final.to_csv(f"Data_{city}.csv", index=False)

Initialize KFold

In [12]:
# Outer cross-validation splitter: shuffled, with a fixed seed.
# Five folds, consistent with the per-fold output recorded for the loop below
# (which runs past a second fold) and with the "5-Fold" results file name.
kf = KFold(n_splits=5, shuffle=True, random_state=111)

Initialize the list that will store the per-fold performance metrics

In [44]:
# Empty accumulator; the cross-validation loop appends one row of metrics per fold.
results = list()

Perform K-fold cross-validation

In [45]:
# FLAML search settings shared by every outer fold. Built once, outside the loop,
# because nothing in them varies from fold to fold.
automl_settings = {
    "time_budget": 300,  # running time in seconds for each fit() call
    "metric": 'rmse',      # metric for evaluation
    "task": 'regression',
    "n_splits": 5,      # number of cross-validation splits
    "eval_method": 'cv',
    "seed": 111,        # random seed for reproducibility
    "verbose": 2,
    # "estimator_list": ['xgboost'],  # use XGBoost (xgboost), LGBM (lgbm)
}

for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    fold_no = fold + 1  # 1-based fold number used in file names, logs and results
    print(f"Processing fold {fold_no}...")

    # Split the data by the positional indices produced by the splitter
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Save train and test sets (features plus target) for this fold
    train_df = pd.concat([X_train, y_train], axis=1)
    test_df = pd.concat([X_test, y_test], axis=1)
    train_df.to_csv(f'train_fold_{fold_no}.csv', index=False)
    test_df.to_csv(f'test_fold_{fold_no}.csv', index=False)

    # A fresh AutoML instance per fold so no search state carries over
    automl = AutoML()

    # Fit on the training part of the fold only
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    best_model = automl.model
    # Record the estimator's class name rather than the object itself: the object's
    # repr embeds a memory address, which is meaningless once written to CSV and
    # differs on every run.
    best_model_name = type(best_model).__name__

    # Make predictions
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    # Evaluate performance on both partitions
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)

    # Store results: one row per fold, in the column order used by the results DataFrame
    results.append([fold_no, best_model_name, rmse_train, rmse_test, mae_train, mae_test, r2_train, r2_test])

    print(f"Fold {fold_no}: Best Model = {best_model_name}")
    print(f"Fold {fold_no}: RMSE Train = {rmse_train:.2f}, RMSE Test = {rmse_test:.2f}")
    print(f"Fold {fold_no}: MAE Train = {mae_train:.2f}, MAE Test = {mae_test:.2f}")
    # The "R²" label was previously mis-encoded as "RÂ²" for the train metric.
    print(f"Fold {fold_no}: R² Train = {r2_train:.2f}, R² Test = {r2_test:.2f}\n")

Processing fold 1...
Fold 1: Best Model = <flaml.automl.model.ExtraTreesEstimator object at 0x000001A3BF2FC7C0>
Fold 1: RMSE Train = 88.74, RMSE Test = 151.18
Fold 1: MAE Train = 70.57, MAE Test = 99.69
Fold 1: RÂ² Train = 0.80, R² Test = 0.53

Processing fold 2...
Fold 2: Best Model = <flaml.automl.model.RandomForestEstimator object at 0x000001A3BD302F10>
Fold 2: RMSE Train = 56.57, RMSE Test = 170.25
Fold 2: MAE Train = 40.57, MAE Test = 119.22
Fold 2: RÂ² Train = 0.92, R² Test = 0.41

Processing fold 3...
Fold 3: Best Model = <flaml.automl.model.ExtraTreesEstimator object at 0x000001A3BF052DC0>
Fold 3: RMSE Train = 42.19, RMSE Test = 120.35
Fold 3: MAE Train = 34.74, MAE Test = 90.65
Fold 3: RÂ² Train = 0.96, R² Test = 0.49

Processing fold 4...
Fold 4: Best Model = <flaml.automl.model.ExtraTreesEstimator object at 0x000001A3BDFD2820>
Fold 4: RMSE Train = 12.86, RMSE Test = 176.65
Fold 4: MAE Train = 10.01, MAE Test = 127.47
Fold 4: RÂ² Train = 1.00, R² Test = 0.32

Processing fold 

Collect the results in a DataFrame and save them to a CSV file

In [46]:
# One row per outer fold, in the same order the loop appended them.
result_columns = [
    'Fold', 'Best Model',
    'RMSE_Train', 'RMSE_Test',
    'MAE_Train', 'MAE_Test',
    'R2_Train', 'R2_Test',
]
results_df = pd.DataFrame(results, columns=result_columns)

# Take the fold count for the file name from the splitter itself, so the label
# cannot drift from the number of folds actually configured.
n_folds = kf.get_n_splits()
results_df.to_csv(f"{n_folds}-Fold_results_{city}.csv", index=False)