In [1]:
# Packages
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score

import PyGRF

### Read data and split into training and test sets

In [2]:
# Load the 311 request table; each row carries an ID, coordinates, the target and the features
data_311 = pd.read_csv("../Data/311Request.csv")

# Keep the ID next to the target so the target rows stay traceable after the split
target_columns = ["CBG ID", "311_requests"]
y = data_311[target_columns]

# 70/30 hold-out split; the full table is passed as X because the bandwidth search
# below reads the target and the coordinates from X_train
X_train, X_test, y_train, y_test = train_test_split(
    data_311,
    y,
    test_size=0.3,
    random_state=42,
)

### Search the optimal bandwidth and local model weight using incremental spatial autocorrelation

In [3]:
# Search the bandwidth and local model weight on the training split.
# The coordinates are passed as an explicit copy: the captured output shows PyGRF assigning a
# 'coordinate' column on the frame it receives, which triggers pandas' SettingWithCopyWarning
# when that frame is a slice of X_train.
# NOTE(review): assumes the warning originates from the frame passed in here - confirm against PyGRF.
train_coords = X_train[['Lon', 'Lat']].copy()
bandwidth, local_weight, p_value = PyGRF.search_bw_lw_ISA(X_train["311_requests"], train_coords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coords['coordinate'] = coords.apply(lambda row: tuple(row), axis=1)
 There are 59 disconnected components.


bandwidth: 131, moran's I: 0.04443162328339294, p-value: 5.56700472691986e-35


  self.seI_rand = VIR ** (1 / 2.0)


### Evaluate performance of PyGRF using 10-fold cross validation

In [4]:
# function for standardizing independent variables
def standarize_data(data, stats):
    """Z-score `data` using precomputed statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns to scale.
    stats : mapping or pandas.DataFrame
        Must expose ``stats['mean']`` and ``stats['std']``; in this notebook it is
        ``X_train.describe().transpose()``, so both are Series indexed by feature name.

    Returns
    -------
    pandas.DataFrame
        ``(data - mean) / std`` with the same shape as `data`.

    Notes
    -----
    A feature that is constant in the training fold has ``std == 0``; dividing by it
    would yield NaN (0/0) or +/-inf. Such features are divided by 1 instead, so they
    come out as 0 wherever the value equals the training mean.
    """
    mean = stats['mean']
    std = stats['std']
    # Only Series are patched; any other container is used exactly as before.
    if isinstance(std, pd.Series):
        std = std.replace(0, 1)
    return (data - mean) / std

In [5]:
# get columns for only independent variables (drop the ID, the coordinates and the target)
columns_to_exclude = ['CBG ID', 'Lon', 'Lat', '311_requests']
X_columns = [column for column in data_311.columns if column not in columns_to_exclude]

# Hyperparameters copied from the ISA search output above (bandwidth 131, Moran's I ~ 0.0444).
# They are kept as literals so the scores reported below stay exactly reproducible.
BAND_WIDTH = 131
LOCAL_WEIGHT = 0.0444

y_predict = []
y_true = []
# per-fold importance tables are collected here and concatenated once after the loop,
# instead of re-copying a growing DataFrame on every iteration
fold_feature_importance = []

K_fold = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in K_fold.split(data_311):
    # get the training and test data in each fold
    # (fold-specific names: X_train / X_test / y_train / y_test from the hold-out split
    # are left untouched, so the bandwidth-search cell can still be re-run afterwards)
    X_train_all, X_test_all = data_311.iloc[train_index], data_311.iloc[test_index]
    y_fold_train, y_fold_test = X_train_all['311_requests'], X_test_all['311_requests']
    X_fold_train = X_train_all[X_columns]
    X_fold_test = X_test_all[X_columns]
    xy_coord = X_train_all[['Lon', 'Lat']]
    coords_test = X_test_all[['Lon', 'Lat']]

    # standardize independent variables using statistics of the training fold only,
    # so nothing from the test fold leaks into the scaling
    training_stat = X_fold_train.describe().transpose()
    X_scaled_train = standarize_data(X_fold_train, training_stat)
    X_scaled_test = standarize_data(X_fold_test, training_stat)

    # create a PyGRF model
    pygrf_311 = PyGRF.PyGRFBuilder(n_estimators=60, max_features=1/3, band_width=BAND_WIDTH,
                                   train_weighted=True, predict_weighted=True, bootstrap=False,
                                   resampled=True, random_seed=42)

    # fit the model and use it to make predictions
    pygrf_311.fit(X_scaled_train, y_fold_train, xy_coord)
    predict_combined, predict_global, predict_local = pygrf_311.predict(
        X_scaled_test, coords_test, local_weight=LOCAL_WEIGHT)

    # get the feature importance output by local models
    fold_feature_importance.append(pygrf_311.get_local_feature_importance())

    # accumulate out-of-fold predictions and the matching ground truth
    y_predict.extend(predict_combined)
    y_true.extend(y_fold_test.tolist())

# stack the per-fold tables (original row indices are kept, as before)
df_feature_importance = pd.concat(fold_feature_importance)

In [6]:
# compute the RMSE and r-square over the pooled out-of-fold predictions
# The square root is taken explicitly: the `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on every version.
rmse = mean_squared_error(y_true, y_predict) ** 0.5
r2 = r2_score(y_true, y_predict)
print("rmse: " + str(round(rmse, 4)), "r2: " + str(round(r2, 4)))

rmse: 15.4855 r2: 0.4205


In [7]:
# report the size of the stacked local feature-importance table, then preview its first rows
importance_shape = df_feature_importance.shape
print(importance_shape)
df_feature_importance.head()

(2610, 19)


Unnamed: 0,model_index,snow depth,% below poverty,% civilian unemployed,% capita income,% no high school,% 65 older,% 17 younger,% household disability,% single parent household,% minority,% not well english,% multi unit,% mobile homes,% crowding,% no vehicle,% group quarters,historical requests,neighbor historical request
0,0,0.024639,0.013803,0.051702,0.013589,0.031262,0.021775,0.02523,0.035325,0.013596,0.011449,0.00584,0.003662,0.244129,0.003041,0.020844,0.004406,0.281611,0.194095
1,1,0.028372,0.012235,0.054203,0.037277,0.015287,0.033815,0.024023,0.028019,0.007463,0.017211,0.007487,0.006333,0.13094,0.011391,0.019245,0.005361,0.3066,0.254736
2,2,0.021804,0.022529,0.042661,0.023111,0.028895,0.022575,0.030884,0.018162,0.013939,0.013383,0.006626,0.004036,0.11618,0.006657,0.009674,0.004737,0.348129,0.266018
3,3,0.033417,0.018874,0.04256,0.056637,0.025085,0.025075,0.022026,0.018858,0.026423,0.052048,0.025751,0.013664,0.002228,0.013962,0.025462,0.006486,0.339474,0.25197
4,4,0.048855,0.021553,0.028368,0.04491,0.023554,0.033304,0.020745,0.028333,0.022154,0.050736,0.02281,0.014873,0.002996,0.011635,0.024614,0.014393,0.353071,0.233095
