In [1]:
# Packages
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score

import PyGRF

### Read data and split into training and test sets

In [2]:
# Load the income table from the CSV file (path is relative to the notebook).
data_income = pd.read_csv("../Data/Income.csv")

# Target frame: the CodeELSTAT and Income01 columns.
y = data_income.loc[:, ["CodeELSTAT", "Income01"]]

# 70/30 hold-out split with a fixed seed. The whole table is passed as the
# feature frame, so X_train / X_test keep every column of data_income.
X_train, X_test, y_train, y_test = train_test_split(
    data_income,
    y,
    test_size=0.3,
    random_state=42,
)

### Search the optimal bandwidth and local model weight using incremental spatial autocorrelation

In [3]:
# Run the incremental-spatial-autocorrelation search on the training split:
# the first argument is the Income01 column, the second the X/Y coordinate columns.
bandwidth, local_weight, p_value = PyGRF.search_bw_lw_ISA(
    X_train.loc[:, "Income01"],
    X_train.loc[:, ['X', 'Y']],
)

 There are 60 disconnected components.
 There are 3 disconnected components.


bandwidth: 39, moran's I: 0.46002225416187353, p-value: 5.07564088419884e-309


  self.seI_norm = self.VI_norm ** (1 / 2.0)
  self.seI_rand = VIR ** (1 / 2.0)


### Evaluate performance of PyGRF using 10-fold cross validation

In [4]:
def standarize_data(data, stats):
    """Standardize ``data`` to z-scores using precomputed statistics.

    Parameters
    ----------
    data : pandas.DataFrame (or anything supporting ``-`` and ``/``)
        Values to standardize.
    stats : mapping with ``'mean'`` and ``'std'`` entries
        For a DataFrame input this is typically ``data.describe().transpose()``
        computed on the training data, so ``stats['mean']`` / ``stats['std']``
        are Series indexed by column name.

    Returns
    -------
    ``(data - stats['mean']) / std``, where a zero standard deviation is
    replaced by 1 so that constant columns become 0 instead of NaN/inf.
    """
    std = stats['std']
    # A constant column has std == 0; dividing by it would yield NaN (0/0) or
    # inf, which breaks model fitting. Use a divisor of 1 for such columns.
    if hasattr(std, 'where'):
        std = std.where(std != 0, 1.0)
    elif std == 0:
        std = 1.0
    return (data - stats['mean']) / std

In [5]:
# Predictor (independent) variables fed to the model; the response is 'Income01'.
X_columns = ['UnemrT01', 'PrSect01', 'Foreig01']

# Out-of-fold predictions and the matching observed values, pooled over all folds.
y_predict = []
y_true = []

# Per-fold feature-importance tables. They are concatenated once after the loop:
# growing a DataFrame with pd.concat inside the loop re-copies it on every fold
# and starts from an empty frame.
local_fi_folds = []
global_fi_folds = []

K_fold = KFold(n_splits=10, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(K_fold.split(data_income)):
    print("fold:", i)

    # Fold-specific names: reusing X_train / X_test / y_train / y_test here would
    # overwrite the hold-out split created earlier, and the bandwidth-search cell
    # (which reads X_train["Income01"] and X_train[['X', 'Y']]) could then no
    # longer be re-run.
    fold_train_all = data_income.iloc[train_index]
    fold_test_all = data_income.iloc[test_index]
    y_fold_train = fold_train_all['Income01']
    y_fold_test = fold_test_all['Income01']
    X_fold_train = fold_train_all[X_columns]
    X_fold_test = fold_test_all[X_columns]
    xy_coord = fold_train_all[['X', 'Y']]
    coords_test = fold_test_all[['X', 'Y']]

    # Standardize the predictors using statistics of the training fold only, so
    # no information from the test fold enters the scaling.
    training_stat = X_fold_train.describe().transpose()
    X_scaled_train = standarize_data(X_fold_train, training_stat)
    X_scaled_test = standarize_data(X_fold_test, training_stat)

    # Create a PyGRF model.
    # NOTE(review): band_width=39 and local_weight=0.46 (below) mirror the
    # bandwidth and Moran's I printed by the search cell above; they are kept as
    # literals so the reported results stay reproducible. Confirm they are still
    # appropriate if the data changes.
    pygrf = PyGRF.PyGRFBuilder(n_estimators=60, max_features=1, band_width=39, train_weighted=True, predict_weighted=True, bootstrap=False,
                          resampled=True, random_state=42)

    # Fit the model and predict the held-out fold.
    pygrf.fit(X_scaled_train, y_fold_train, xy_coord)
    predict_combined, predict_global, predict_local = pygrf.predict(X_scaled_test, coords_test, local_weight=0.46)

    # Feature importance reported by the local models of this fold.
    local_fi_folds.append(pygrf.get_local_feature_importance())

    # Feature importance of the global random forest, stored as a one-row frame.
    global_fi = pygrf.global_model.feature_importances_
    global_fi_folds.append(pd.DataFrame(data=global_fi.reshape(1, -1), columns=X_columns))

    y_predict.extend(predict_combined)
    y_true.extend(y_fold_test.tolist())

# Stack the per-fold tables (row indices are kept as produced in each fold).
df_local_fi = pd.concat(local_fi_folds)
df_global_fi = pd.concat(global_fi_folds)

fold: 0
fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8
fold: 9


In [6]:
# Compute the RMSE and R-squared over the pooled out-of-fold predictions.
# The RMSE is taken as the square root of the MSE rather than passing
# squared=False, which is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mean_squared_error(y_true, y_predict) ** 0.5
r2 = r2_score(y_true, y_predict)
print("rmse: " + str(round(rmse, 4)), "r2: " + str(round(r2, 4)))

rmse: 1546.5187 r2: 0.7231


### Examine the obtained feature importance

In [7]:
# Local feature importance: print the table's (rows, columns) shape,
# then display its first five rows.
print(df_local_fi.shape)
df_local_fi.head(5)

(2925, 4)


Unnamed: 0,model_index,UnemrT01,PrSect01,Foreig01
0,0,0.262599,0.503027,0.234374
1,1,0.247207,0.5084,0.244394
2,2,0.244848,0.500078,0.255075
3,3,0.218492,0.514029,0.267478
4,4,0.271027,0.524014,0.20496


In [8]:
# Global feature importance: print the table's (rows, columns) shape,
# then display its first five rows.
print(df_global_fi.shape)
df_global_fi.head(5)

(10, 3)


Unnamed: 0,UnemrT01,PrSect01,Foreig01
0,0.236782,0.577236,0.185982
0,0.237711,0.586112,0.176177
0,0.226723,0.591093,0.182185
0,0.232569,0.588763,0.178669
0,0.236716,0.580253,0.183031


We used the example income dataset from the R package "SpatialML", developed by Georganos et al. (2019). We would like to express our gratitude to the authors for generously providing this dataset.