In [1]:
# Packages
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score

import PyGRF

## Read data and split into training and test sets

In [2]:
# Load the obesity dataset (path is relative to the notebook's working directory).
data_obesity = pd.read_csv("../Data/Obesity.csv")

# Tract identifier together with the response column.
y = data_obesity.loc[:, ["Census tract code", "obesity rate"]]

# 70/30 hold-out split with a fixed seed. The whole frame is passed as the
# feature side, so X_train / X_test keep the coordinate and response columns
# that the bandwidth search reads from X_train.
X_train, X_test, y_train, y_test = train_test_split(
    data_obesity,
    y,
    test_size=0.3,
    random_state=42,
)

## Search the optimal bandwidth and local model weight using incremental spatial autocorrelation

In [None]:
# Incremental-spatial-autocorrelation search on the hold-out training split:
# the response column goes in first, the Lon/Lat coordinates second.
train_response = X_train["obesity rate"]
train_coords = X_train[["Lon", "Lat"]]
bandwidth, local_weight, p_value = PyGRF.search_bw_lw_ISA(train_response, train_coords)

## Evaluate performance of PyGRF using 10-fold cross validation

In [None]:
def standarize_data(data, stats):
    """Z-score ``data`` column-wise using precomputed summary statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns to scale.
    stats : pandas.DataFrame
        Per-feature summary indexed by feature name with ``'mean'`` and
        ``'std'`` columns, e.g. ``X.describe().transpose()``.

    Returns
    -------
    pandas.DataFrame
        ``(data - mean) / std``, aligned on column names.
    """
    return (data - stats['mean']) / stats['std']


# Model hyperparameters, kept at the values this cell previously hard-coded.
# NOTE(review): presumably copied from the output of the ISA search cell -
# confirm, or pass `bandwidth` / `local_weight` from that cell instead.
BAND_WIDTH = 152
LOCAL_WEIGHT = 0.4488

# keep only the independent (predictor) variables
columns_to_exclude = ['Census tract code', 'Lon', 'Lat', 'obesity rate']
X_columns = [column for column in data_obesity.columns if column not in columns_to_exclude]

y_predict = []
y_true = []
fold_feature_importance = []

K_fold = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in K_fold.split(data_obesity):
    # Training and test rows for this fold. Fold-specific names are used so the
    # hold-out split (X_train, X_test, y_train, y_test) from the earlier cell is
    # not overwritten and the search cell stays re-runnable.
    X_train_all, X_test_all = data_obesity.iloc[train_index], data_obesity.iloc[test_index]
    y_fold_train, y_fold_test = X_train_all['obesity rate'], X_test_all['obesity rate']
    X_fold_train = X_train_all[X_columns]
    X_fold_test = X_test_all[X_columns]
    xy_coord = X_train_all[['Lon', 'Lat']]
    coords_test = X_test_all[['Lon', 'Lat']]

    # standardize the independent variables; statistics come from the training
    # rows only, so nothing about the test rows leaks into the scaling
    training_stat = X_fold_train.describe().transpose()
    X_scaled_train = standarize_data(X_fold_train, training_stat)
    X_scaled_test = standarize_data(X_fold_test, training_stat)

    # create a fresh PyGRF model for this fold
    pygrf_obesity = PyGRF.PyGRFBuilder(ntree=400, mtry=1/3, band_width=BAND_WIDTH, train_weighted=True,
                                       predict_weighted=True, bootstrap=False, resampled=True, random_seed=42)

    # fit the model and use it to make predictions
    pygrf_obesity.fit(X_scaled_train, y_fold_train, xy_coord)
    predict_combined, predict_global, predict_local = pygrf_obesity.predict(
        X_scaled_test, coords_test, local_weight=LOCAL_WEIGHT)

    # collect the feature importance output by the local models
    local_feature_importance = pygrf_obesity.get_local_feature_importance()
    fold_feature_importance.append(local_feature_importance)

    y_predict.extend(predict_combined)
    y_true.extend(y_fold_test.tolist())

# concatenate once after the loop instead of re-copying the frame on every fold
df_feature_importance = pd.concat(fold_feature_importance)

In [None]:
# Compute the RMSE and R-squared over the pooled out-of-fold predictions.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE gives
# the same value on every version.
rmse = mean_squared_error(y_true, y_predict) ** 0.5
r2 = r2_score(y_true, y_predict)
print(f"rmse: {round(rmse, 4)}", f"r2: {round(r2, 4)}")