In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd

import tensorflow as tf
from tensorflow import keras

from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from scipy.spatial import distance

import spacv

In [None]:
# Read the obesity-prevalence table. GEOID is cast to text because it is used
# later as the key when merging with the tract/borough table.
# NOTE(review): casting after the read cannot restore leading zeros that a
# numeric parse would have dropped - confirm GEOIDs never start with 0.
obesity_csv_path = "../Data/Obesity/Obesity.csv"
df_obesity = pd.read_csv(obesity_csv_path)
df_obesity["GEOID"] = df_obesity["GEOID"].astype(str)

# Prediction target used by every cross-validation run in this notebook.
y = df_obesity['obesity_cr']

df_obesity.shape

In [None]:
# Predictor columns fed to the network, in the order they are selected from the data.
using_columns = [
    # race / ethnicity / sex
    '% Black', '% Ame Indi and AK Native', '% Asian', '% Nati Hawa and Paci Island',
    '% Hispanic or Latino', '% male',
    # marital status, age bands, education
    '% married', '% age 18-29', '% age 30-39', '% age 40-49', '% age 50-59', '% age >=60',
    '% <highschool',
    # income and economic hardship
    'median income', '% unemployment', '% below poverty line', '% food stamp/SNAP',
    # housing and density
    'median value units built', 'median year units built', '% renter-occupied housing units',
    'population density',
]

# Number of predictors selected above.
num_features = len(using_columns)
num_features

In [None]:
# Build a GeoDataFrame whose point geometry comes from the 'Lonpro' / 'Latpro' columns.
# NOTE(review): no CRS is attached here; the column names suggest projected
# coordinates - confirm the units before interpreting any buffer radius.
tract_points = gpd.points_from_xy(x=df_obesity['Lonpro'], y=df_obesity['Latpro'])
gdf_obesity = gpd.GeoDataFrame(df_obesity, geometry=tract_points)

In [None]:
# Standardization function

def standarize_data(data, stats):
    """Z-score ``data`` using precomputed statistics.

    Parameters
    ----------
    data : array-like or pandas object
        Values to scale (in this notebook, a DataFrame of predictor columns).
    stats : mapping-like
        Must provide ``stats['mean']`` and ``stats['std']``. In this notebook it
        is the transposed ``describe()`` table of the training fold, so both
        entries are Series indexed by feature name and align with the columns
        of ``data``.

    Returns
    -------
    ``(data - mean) / std``, where any standard deviation that is exactly zero
    is treated as 1. A feature that is constant in the statistics therefore maps
    to 0 when the data equals the mean, instead of producing NaN/inf from a
    division by zero.
    """
    center = stats['mean']
    scale = stats['std']
    # `scale == 0` is True (i.e. 1) only where the std is zero, so adding it turns
    # those entries into 1 and leaves every other entry untouched. This form works
    # for scalars, arrays and Series alike and keeps any pandas index for alignment.
    safe_scale = scale + (scale == 0)
    return (data - center) / safe_scale

## Random CV

In [None]:
# Random split: shuffled 10-fold cross-validation of the DNN on the selected features.
# Out-of-fold predictions and their true values are pooled into
# y_dnn_socio_predict / y_true so they can be scored together afterwards.

y_dnn_socio_predict = []
y_true = []

ten_fold = KFold(n_splits=10, shuffle=True, random_state=42)

for fold_number, (train_index, test_index) in enumerate(ten_fold.split(df_obesity), start=1):
    print("fold:", str(fold_number))

    # A fresh model is built for every fold; clear Keras' global state first so
    # models from earlier folds do not keep accumulating in memory.
    keras.backend.clear_session()

    X_train = df_obesity.iloc[train_index][using_columns]
    X_test = df_obesity.iloc[test_index][using_columns]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Scale both splits with the training fold's mean/std only, so nothing from
    # the held-out rows enters the preprocessing.
    training_stat = X_train.describe().transpose()
    scaled_X_train = standarize_data(X_train, training_stat)
    scaled_X_test = standarize_data(X_test, training_stat)

    # Re-seed TensorFlow so every fold starts from the same random state.
    tf.random.set_seed(42)

    # Nine ReLU hidden layers (dropout after the second) and one linear output unit.
    dnn_model = keras.models.Sequential([
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(208,activation="relu"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(256,activation="relu"),
        keras.layers.Dense(32,activation="relu"),
        keras.layers.Dense(240,activation="relu"),
        keras.layers.Dense(96,activation="relu"),
        keras.layers.Dense(208,activation="relu"),
        keras.layers.Dense(1)
    ])

    dnn_model.compile(optimizer="adam", loss=keras.losses.mean_squared_error, metrics=[keras.metrics.mean_squared_error])
    dnn_model.fit(x=scaled_X_train, y=y_train, epochs=50, verbose=2)

    # predict() returns one column per output unit; flatten to a 1-D vector.
    this_y_predict = dnn_model.predict(scaled_X_test).flatten()
    y_dnn_socio_predict.extend(this_y_predict.tolist())
    y_true.extend(y_test.tolist())

In [None]:
# Pooled RMSE and R^2 over the predictions currently held in y_true / y_dnn_socio_predict.
# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the old call fails on current releases.
dnn_socio_rmse = float(np.sqrt(mean_squared_error(y_true, y_dnn_socio_predict)))
dnn_socio_r2 = r2_score(y_true, y_dnn_socio_predict)
print("rmse: " + str(round(dnn_socio_rmse,4)), "r2: " + str(round(dnn_socio_r2,4)))

## Clustering-based spatial CV

In [None]:
# Partition the samples into 10 spatial clusters by running k-means on their coordinates.
# NOTE(review): n_init is left at the library default, which is not the same in
# every scikit-learn release - consider pinning it for reproducible clusters.
coords = df_obesity[['Lonpro','Latpro']]
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(coords)
centroids = kmeans.cluster_centers_

# Samples coloured by cluster label, with the cluster centres overlaid in red.
fig, ax = plt.subplots()
ax.scatter(df_obesity['Lonpro'], df_obesity['Latpro'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
ax.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.show()

In [None]:
# Copy of the data with each sample's k-means cluster index attached,
# followed by the number of samples per cluster.

df_obesity_cluster = df_obesity.assign(cluster=kmeans.labels_.tolist())
df_obesity_cluster["cluster"].value_counts()

In [None]:
# Clustering-based spatial CV: each k-means cluster is kept whole, so all samples
# of a cluster land in the same fold (GroupKFold with the cluster index as group).
# Out-of-fold predictions and true values are pooled for scoring afterwards.

y_dnn_socio_predict = []
y_true = []

group_index = df_obesity_cluster['cluster'].values

group_kfold = GroupKFold(n_splits=10)

cluster_folds = group_kfold.split(df_obesity_cluster, y, group_index)
for fold_number, (train_index, test_index) in enumerate(cluster_folds, start=1):
    print("fold:", str(fold_number))

    # A fresh model is built for every fold; clear Keras' global state first so
    # models from earlier folds do not keep accumulating in memory.
    keras.backend.clear_session()

    # Positional indices from the split are applied to df_obesity, which
    # df_obesity_cluster was copied from.
    X_train = df_obesity.iloc[train_index][using_columns]
    X_test = df_obesity.iloc[test_index][using_columns]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Scale both splits with the training fold's mean/std only, so nothing from
    # the held-out rows enters the preprocessing.
    training_stat = X_train.describe().transpose()
    scaled_X_train = standarize_data(X_train, training_stat)
    scaled_X_test = standarize_data(X_test, training_stat)

    # Re-seed TensorFlow so every fold starts from the same random state.
    tf.random.set_seed(42)

    # Nine ReLU hidden layers (dropout after the second) and one linear output unit.
    dnn_model = keras.models.Sequential([
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(208,activation="relu"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(256,activation="relu"),
        keras.layers.Dense(32,activation="relu"),
        keras.layers.Dense(240,activation="relu"),
        keras.layers.Dense(96,activation="relu"),
        keras.layers.Dense(208,activation="relu"),
        keras.layers.Dense(1)
    ])

    dnn_model.compile(optimizer="adam", loss=keras.losses.mean_squared_error, metrics=[keras.metrics.mean_squared_error])
    dnn_model.fit(x=scaled_X_train, y=y_train, epochs=50, verbose=2)

    # predict() returns one column per output unit; flatten to a 1-D vector.
    this_y_predict = dnn_model.predict(scaled_X_test).flatten()
    y_dnn_socio_predict.extend(this_y_predict.tolist())
    y_true.extend(y_test.tolist())

In [None]:
# Pooled RMSE and R^2 over the predictions currently held in y_true / y_dnn_socio_predict.
# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the old call fails on current releases.
dnn_socio_rmse = float(np.sqrt(mean_squared_error(y_true, y_dnn_socio_predict)))
dnn_socio_r2 = r2_score(y_true, y_dnn_socio_predict)
print("rmse: " + str(round(dnn_socio_rmse,4)), "r2: " + str(round(dnn_socio_r2,4)))

## Grid-based spatial CV

In [None]:
# Grid-block spatial CV (HBLOCK with 3 x 3 tiles, no buffer) over the sample geometries.
# The splits are materialised into a list because .split() yields them lazily:
# a bare generator is exhausted after one pass, so re-running the training cell
# without re-running this one would silently iterate over zero folds.
grid_cv = list(spacv.HBLOCK(3, 3, method='unique', buffer_radius=0).split(gdf_obesity['geometry']))

In [None]:
# Grid-based spatial CV: train/test indices come from the precomputed `grid_cv` splits.
# Out-of-fold predictions and true values are pooled for scoring afterwards.
# NOTE(review): if `grid_cv` is a one-shot generator, this cell only works once
# per run of the cell that creates it.

y_dnn_socio_predict = []
y_true = []

for fold_number, (train_index, test_index) in enumerate(grid_cv, start=1):
    print("fold:", str(fold_number))

    # A fresh model is built for every fold; clear Keras' global state first so
    # models from earlier folds do not keep accumulating in memory.
    keras.backend.clear_session()

    X_train = df_obesity.iloc[train_index][using_columns]
    X_test = df_obesity.iloc[test_index][using_columns]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Scale both splits with the training fold's mean/std only, so nothing from
    # the held-out rows enters the preprocessing.
    training_stat = X_train.describe().transpose()
    scaled_X_train = standarize_data(X_train, training_stat)
    scaled_X_test = standarize_data(X_test, training_stat)

    # Re-seed TensorFlow so every fold starts from the same random state.
    tf.random.set_seed(42)

    # Nine ReLU hidden layers (dropout after the second) and one linear output unit.
    dnn_model = keras.models.Sequential([
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(208,activation="relu"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(256,activation="relu"),
        keras.layers.Dense(32,activation="relu"),
        keras.layers.Dense(240,activation="relu"),
        keras.layers.Dense(96,activation="relu"),
        keras.layers.Dense(208,activation="relu"),
        keras.layers.Dense(1)
    ])

    dnn_model.compile(optimizer="adam", loss=keras.losses.mean_squared_error, metrics=[keras.metrics.mean_squared_error])
    dnn_model.fit(x=scaled_X_train, y=y_train, epochs=50, verbose=2)

    # predict() returns one column per output unit; flatten to a 1-D vector.
    this_y_predict = dnn_model.predict(scaled_X_test).flatten()
    y_dnn_socio_predict.extend(this_y_predict.tolist())
    y_true.extend(y_test.tolist())

In [None]:
# Pooled RMSE and R^2 over the predictions currently held in y_true / y_dnn_socio_predict.
# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the old call fails on current releases.
dnn_socio_rmse = float(np.sqrt(mean_squared_error(y_true, y_dnn_socio_predict)))
dnn_socio_r2 = r2_score(y_true, y_dnn_socio_predict)
print("rmse: " + str(round(dnn_socio_rmse,4)), "r2: " + str(round(dnn_socio_r2,4)))

## Geo-attribute-based spatial CV

In [None]:
# Read the shapefile that records which borough each census tract is located in.

tract_borough_path = "../Data/Obesity/gdf_tract_borough.shp"
gdf_tract_borough = gpd.read_file(tract_borough_path)

In [None]:
# Attach 'index_righ' (presumably the borough identifier - confirm) to every tract by GEOID.
# A left join keeps every obesity row; tracts with no match get a missing value.
# NOTE(review): the CV cell pairs these rows with df_obesity by position, which
# assumes GEOID is unique in the lookup table - verify.
borough_lookup = gdf_tract_borough[['GEOID','index_righ']]
df_obesity_block = df_obesity.merge(borough_lookup, how='left', left_on="GEOID", right_on="GEOID")
df_obesity_block.head()

In [None]:
# Number of distinct 'index_righ' values, i.e. how many groups the grouped CV can use.
df_obesity_block['index_righ'].nunique()

In [None]:
# Geo-attribute-based spatial CV: samples are grouped by 'index_righ' and every group
# is kept whole inside a single fold (GroupKFold with 5 splits).
# Out-of-fold predictions and true values are pooled for scoring afterwards.
# NOTE(review): `block` comes from df_obesity_block but is paired with df_obesity by
# position, so the merge must have kept the row count and order - verify.

y_dnn_socio_predict = []
y_true = []

block = df_obesity_block['index_righ'].values
group_kfold = GroupKFold(n_splits=5)

block_folds = group_kfold.split(df_obesity, y, block)
for fold_number, (train_index, test_index) in enumerate(block_folds, start=1):
    print("fold:", str(fold_number))

    # A fresh model is built for every fold; clear Keras' global state first so
    # models from earlier folds do not keep accumulating in memory.
    keras.backend.clear_session()

    X_train = df_obesity.iloc[train_index][using_columns]
    X_test = df_obesity.iloc[test_index][using_columns]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Scale both splits with the training fold's mean/std only, so nothing from
    # the held-out rows enters the preprocessing.
    training_stat = X_train.describe().transpose()
    scaled_X_train = standarize_data(X_train, training_stat)
    scaled_X_test = standarize_data(X_test, training_stat)

    # Re-seed TensorFlow so every fold starts from the same random state.
    tf.random.set_seed(42)

    # Nine ReLU hidden layers (dropout after the second) and one linear output unit.
    dnn_model = keras.models.Sequential([
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(208,activation="relu"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(256,activation="relu"),
        keras.layers.Dense(32,activation="relu"),
        keras.layers.Dense(240,activation="relu"),
        keras.layers.Dense(96,activation="relu"),
        keras.layers.Dense(208,activation="relu"),
        keras.layers.Dense(1)
    ])

    dnn_model.compile(optimizer="adam", loss=keras.losses.mean_squared_error, metrics=[keras.metrics.mean_squared_error])
    dnn_model.fit(x=scaled_X_train, y=y_train, epochs=50, verbose=2)

    # predict() returns one column per output unit; flatten to a 1-D vector.
    this_y_predict = dnn_model.predict(scaled_X_test).flatten()
    y_dnn_socio_predict.extend(this_y_predict.tolist())
    y_true.extend(y_test.tolist())

In [None]:
# Pooled RMSE and R^2 over the predictions currently held in y_true / y_dnn_socio_predict.
# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the old call fails on current releases.
dnn_socio_rmse = float(np.sqrt(mean_squared_error(y_true, y_dnn_socio_predict)))
dnn_socio_r2 = r2_score(y_true, y_dnn_socio_predict)
print("rmse: " + str(round(dnn_socio_rmse,4)), "r2: " + str(round(dnn_socio_r2,4)))

## Spatial leave-one-out CV

In [None]:
# Compute the radius of buffer as the 0.05 quantile of distances of data
# NOTE(review): the value displayed here is not passed on automatically; the cell
# that builds the SKCV splitter hard-codes its buffer_radius - keep them in sync.

from itertools import combinations  # not used in this cell; kept so any other cell relying on the name still works

lng_lat_coords = np.array(df_obesity[['Lonpro','Latpro']])

# pdist returns the condensed vector of Euclidean distances for every unordered pair
# of rows, in the same (0,1), (0,2), ..., (1,2), ... order a combinations() loop would
# visit, but computed in compiled code instead of one Python call per pair.
distances_array = distance.pdist(lng_lat_coords, metric='euclidean')
np.quantile(distances_array, 0.05)

In [None]:
# Split the training and test data for each fold using the buffer_radius
# NOTE(review): n_splits=1995 presumably equals the number of samples (leave-one-out)
# and buffer_radius=3219 is in the units of the point coordinates - confirm both.
# The splits are materialised into a list because .split() yields them lazily:
# a bare generator is exhausted after one pass, so re-running the training cell
# without re-running this one would silently iterate over zero folds.

skcv = list(spacv.SKCV(n_splits=1995, buffer_radius=3219, random_state=42).split(gdf_obesity['geometry']))

In [None]:
# Spatial leave-one-out CV: train/test indices come from the precomputed `skcv` splits.
# Out-of-fold predictions and true values are pooled for scoring afterwards.
# NOTE(review): if `skcv` is a one-shot generator, this cell only works once
# per run of the cell that creates it.

y_dnn_socio_predict = []
y_true = []

for fold_number, (train_index, test_index) in enumerate(skcv, start=1):
    print("fold:", str(fold_number))

    # A fresh model is built for every fold; with this many folds it is essential
    # to clear Keras' global state first, otherwise models from earlier folds keep
    # accumulating in memory for the whole run.
    keras.backend.clear_session()

    X_train = df_obesity.iloc[train_index][using_columns]
    X_test = df_obesity.iloc[test_index][using_columns]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Scale both splits with the training fold's mean/std only, so nothing from
    # the held-out rows enters the preprocessing.
    training_stat = X_train.describe().transpose()
    scaled_X_train = standarize_data(X_train, training_stat)
    scaled_X_test = standarize_data(X_test, training_stat)

    # Re-seed TensorFlow so every fold starts from the same random state.
    tf.random.set_seed(42)

    # Nine ReLU hidden layers (dropout after the second) and one linear output unit.
    dnn_model = keras.models.Sequential([
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(208,activation="relu"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(160,activation="relu"),
        keras.layers.Dense(256,activation="relu"),
        keras.layers.Dense(32,activation="relu"),
        keras.layers.Dense(240,activation="relu"),
        keras.layers.Dense(96,activation="relu"),
        keras.layers.Dense(208,activation="relu"),
        keras.layers.Dense(1)
    ])

    dnn_model.compile(optimizer="adam", loss=keras.losses.mean_squared_error, metrics=[keras.metrics.mean_squared_error])
    dnn_model.fit(x=scaled_X_train, y=y_train, epochs=50, verbose=2)

    # predict() returns one column per output unit; flatten to a 1-D vector.
    this_y_predict = dnn_model.predict(scaled_X_test).flatten()
    y_dnn_socio_predict.extend(this_y_predict.tolist())
    y_true.extend(y_test.tolist())

In [None]:
# Pooled RMSE and R^2 over the predictions currently held in y_true / y_dnn_socio_predict.
# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the old call fails on current releases.
dnn_socio_rmse = float(np.sqrt(mean_squared_error(y_true, y_dnn_socio_predict)))
dnn_socio_r2 = r2_score(y_true, y_dnn_socio_predict)
print("rmse: " + str(round(dnn_socio_rmse,4)), "r2: " + str(round(dnn_socio_r2,4)))