In [20]:
# Packages
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score

import PyGRF

## Read data and split into training and test sets

In [21]:
# Load the 311-request table from disk.
data_311 = pd.read_csv("../Data/311Request.csv")

# The target frame keeps the 'CBG ID' column alongside the '311_requests' values.
target_columns = ["CBG ID", "311_requests"]
y = data_311[target_columns]

# Hold out 30% of the rows; the fixed seed makes the split repeatable.
# The full table (target included) is split so that later cells can read the
# target and the coordinates from the same training frame.
X_train, X_test, y_train, y_test = train_test_split(
    data_311,
    y,
    random_state=42,
    test_size=0.3,
)

Unnamed: 0,CBG ID,snow depth,% below poverty,% civilian unemployed,% capita income,% no high school,% 65 older,% 17 younger,% household disability,% single parent household,...,% multi unit,% mobile homes,% crowding,% no vehicle,% group quarters,historical requests,neighbor historical request,Lon,Lat,311_requests
18,360000000000.0,166.325103,0.275636,0.301294,21589,0.185222,0.289582,0.127974,0.436533,1.0,...,0.0,0.0,0.0,0.236842,0.013946,0.182937,0.200163,330130,324994,35.274815
275,360000000000.0,135.436336,0.258929,0.0,17821,0.380665,0.089286,0.367857,0.190476,0.247573,...,0.0,0.0,0.0,0.15873,0.0,0.189286,0.153141,323919,328299,7.142857
86,360000000000.0,136.850184,0.270059,0.005319,27018,0.130934,0.136986,0.270059,0.447109,0.642005,...,0.0,0.011158,0.081805,0.104372,0.0,0.093444,0.125055,324331,328895,11.252446
227,360000000000.0,242.765666,0.463448,0.060241,25611,0.16085,0.191724,0.06069,0.487572,0.842105,...,0.189142,0.0,0.0,0.479924,0.0,0.212414,0.153223,327135,318397,13.793103
92,360000000000.0,152.551669,0.471319,0.014412,16048,0.07197,0.128052,0.113772,0.178836,1.0,...,0.09826,0.0,0.0,0.231746,0.008752,0.108706,0.12411,330110,327614,9.672962


## Search for the optimal bandwidth and local model weight using incremental spatial autocorrelation

In [22]:
# Search for the bandwidth and local model weight on the training split.
# The coordinates are passed as an explicit copy: a frame selected straight out
# of X_train is flagged by pandas as a possible slice, and the search appears to
# add a column to the frame it receives, which raises SettingWithCopyWarning.
train_requests = X_train["311_requests"]
train_coords = X_train[['Lon', 'Lat']].copy()
bandwidth, local_weight, p_value = PyGRF.search_bw_lw_ISA(train_requests, train_coords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coords['coordinate'] = coords.apply(lambda row: tuple(row), axis=1)
 There are 59 disconnected components.


bandwidth: 131, moran's I: 0.04443162328339294, p-value: 5.56700472691986e-35


  self.seI_rand = VIR ** (1 / 2.0)


## Evaluate performance of PyGRF using 10-fold cross-validation

In [17]:
# function for standardizing variables
def standarize_data(data, stats):
    """Standardize `data` as (data - mean) / std using precomputed statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Values to standardize, one column per variable.
    stats : mapping or pandas.DataFrame
        Must provide 'mean' and 'std' entries. When these are Series indexed by
        variable name (e.g. from `data.describe().transpose()`), pandas aligns
        them with the columns of `data`.

    Returns
    -------
    Same type as `data`, with each variable centered and scaled.

    Notes
    -----
    A variable with zero standard deviation is only centered (divided by 1),
    so a constant column becomes all zeros instead of NaN/inf.
    """
    std = stats['std']
    if isinstance(std, pd.Series):
        # Replace exact zeros only; NaN entries are left as they are.
        std = std.where(std != 0, 1.0)
    return (data - stats['mean']) / std

In [18]:
# get columns for only the independent variables (features): everything except
# the identifier, the coordinates and the target
columns_to_exclude = ['CBG ID', 'Lon', 'Lat', '311_requests']
X_columns = [column for column in data_311.columns if column not in columns_to_exclude]
# NOTE(review): if the CSV carries a saved index column (e.g. 'Unnamed: 0'), it
# passes this filter and is used as a feature - confirm against the file.

# Hyper-parameters, kept at the values this analysis was run with.
# NOTE(review): these look like the bandwidth and the rounded Moran's I printed
# by the ISA search cell - confirm, and consider reusing those variables.
BAND_WIDTH = 131
LOCAL_WEIGHT = 0.0444

y_predict = []
y_true = []
# Per-fold importance tables are collected here and concatenated once after the
# loop; concatenating onto a growing frame inside the loop is quadratic.
fold_feature_importance = []

K_fold = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in K_fold.split(data_311):
    # get the training and test data in each fold
    # (fold-specific names so the earlier train/test split is not overwritten)
    X_train_all, X_test_all = data_311.iloc[train_index], data_311.iloc[test_index]
    y_train_fold, y_test_fold = X_train_all['311_requests'], X_test_all['311_requests']
    X_train_fold = X_train_all[X_columns]
    X_test_fold = X_test_all[X_columns]
    xy_coord = X_train_all[['Lon', 'Lat']]
    coords_test = X_test_all[['Lon', 'Lat']]

    # standardize independent variables using statistics of the training fold
    # only, so nothing from the test fold enters the scaling
    training_stat = X_train_fold.describe().transpose()
    X_scaled_train = standarize_data(X_train_fold, training_stat)
    X_scaled_test = standarize_data(X_test_fold, training_stat)

    # create a PyGRF model
    pygrf_311 = PyGRF.PyGRFBuilder(ntree=60, mtry=1/3, band_width=BAND_WIDTH, train_weighted=True,
                                   predict_weighted=True, bootstrap=False, resampled=True, random_seed=42)

    # fit the model and use it to make predictions
    pygrf_311.fit(X_scaled_train, y_train_fold, xy_coord)
    predict_combined, predict_global, predict_local = pygrf_311.predict(X_scaled_test, coords_test,
                                                                        local_weight=LOCAL_WEIGHT)

    # get the feature importance output by the local models
    fold_feature_importance.append(pygrf_311.get_local_feature_importance())

    # pool the out-of-fold predictions and the matching observed values
    y_predict.extend(predict_combined)
    y_true.extend(y_test_fold.tolist())

# stack the per-fold feature importance tables in fold order
df_feature_importance = pd.concat(fold_feature_importance)

In [19]:
# compute the RMSE and r-square over the pooled out-of-fold predictions
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`, which recent scikit-learn releases no longer accept.
rmse = mean_squared_error(y_true, y_predict) ** 0.5
r2 = r2_score(y_true, y_predict)
print("rmse: " + str(round(rmse, 4)), "r2: " + str(round(r2, 4)))

rmse: 15.4855 r2: 0.4205
