In [206]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from tqdm import tqdm
import seaborn as sns
import re
import datetime
import os
import random
import h3 
import shap 

In [207]:
import json

h3_l7_df = pd.read_json("h3_l7_df.json", orient="records", lines=True)

In [208]:
# 設定plt環境
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [209]:
# 設定訓練、驗證和測試集的比例
train_proportion = 0.7  

train_size = int(h3_l7_df.shape[0]*train_proportion)

seed = 7578 #666
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# h3_l7_id = np.random.choice(spatial_data.shape[0], spatial_data.shape[0])
# spatial_data = spatial_data[h3_l7_id]
# train_spatial_data = spatial_data[:train_size, :]
# test_spatial_data = spatial_data[train_size:, :]

# 隨機打亂 DataFrame
shuffled_h3_l7_df = h3_l7_df.sample(frac=1, random_state=seed).reset_index(drop=True)

# 分割訓練集和測試集
train_h3_l7_df = shuffled_h3_l7_df.iloc[:train_size]
test_h3_l7_df = shuffled_h3_l7_df.iloc[train_size:]

In [210]:
# 將 h3_l7_df 資料框中的 'id' 列移除，僅保留數據進行正規化
shuffled_spatial_data = shuffled_h3_l7_df.drop('id', axis=1).drop('police', axis=1)


# # # 對數據進行正規化：將每個數據列的最小值調整為 0，最大值調整為 1
normalized_spatial_data = (shuffled_spatial_data - shuffled_spatial_data.min()) / (shuffled_spatial_data.max() - shuffled_spatial_data.min())

#設定OHCA正規化反函數 方便把預測結果返回原本scale
ohca_reguli_inverse = (h3_l7_df.ohca.max()-h3_l7_df.ohca.min()) + h3_l7_df.ohca.min()

# 將 DataFrame 轉換為 numpy array，並設定數據類型為 np.float64
spatial_data = np.array(normalized_spatial_data).astype(np.float64)


train_spatial_data = spatial_data[:train_size, :]
test_spatial_data = spatial_data[train_size:, :]


In [211]:
class Regressor(nn.Module):
    """
    用於迴歸任務的神經網絡模型 Regressor。
    
    結構:
    - 兩層隱藏層，並使用 ReLU 激活函數
    - 最後一層為線性層，不使用激活函數（適用於迴歸）
    """
    def __init__(self, input_size=2, hidden_size=32, output_size=1):
        super().__init__()
        # 定義三層全連接層
        self.fc1 = nn.Linear(input_size, hidden_size)     # 第一層：輸入層到隱藏層
        self.fc2 = nn.Linear(hidden_size, hidden_size)    # 第二層：隱藏層到隱藏層
        self.fc3 = nn.Linear(hidden_size, output_size)    # 第三層：隱藏層到輸出層

        # 初始化權重和偏置
        nn.init.normal_(self.fc1.weight, std=0.02)
        nn.init.constant_(self.fc1.bias, 0)
        nn.init.normal_(self.fc2.weight, std=0.02)
        nn.init.constant_(self.fc2.bias, 0)
        nn.init.normal_(self.fc3.weight, std=0.02)
        nn.init.constant_(self.fc3.bias, 0)
        
    def forward(self, input):
        # 前向傳播過程
        output = F.relu(self.fc1(input))  # 第一層 + ReLU 激活
        output = F.relu(self.fc2(output)) # 第二層 + ReLU 激活
        output = self.fc3(output)         # 第三層（不使用激活函數）
        return output

In [212]:
window_size = 1

def train_reg(spatial_data, 
              s_net,
              s_net_optim, 
              window_size, iter_num=5000):
    """
    訓練 s_net  網絡來預測 spatial_data 中的數據。
    
    參數:
    - spatial_data: numpy array，包含訓練數據
    - s_net: 神經網絡模型
    - s_net_optim: 優化器
    - window_size: 每次迭代的隨機取樣大小
    - iter_num: 訓練迭代次數
    
    返回:
    - loss_array: 每次迭代的損失值
    - t_fea_array, s_fea_array: 用於存儲特徵的暫時性陣列（目前未使用）
    """

    loss_array = []     # 儲存每次迭代的損失
    t_fea_array = []    # 預留用於儲存暫時性特徵的空列表
    s_fea_array = []    # 預留用於儲存暫時性特徵的空列表

    for _ in tqdm(range(iter_num)):
        
        # 隨機選擇一組數據索引
        h3_l7_id = np.random.choice(spatial_data.shape[0] - 1, window_size)

        # s_net 輸入特徵向量並計算輸出
        # s_fea = s_net(torch.autograd.Variable(torch.FloatTensor(spatial_data[h3_l7_id, :-1])))

        # 提取目標變數（即輸入的最後一列數據）並轉為 Tensor
        ohca = spatial_data[h3_l7_id, -1].reshape(-1, 1)
        ohca = torch.autograd.Variable(torch.FloatTensor(ohca))

        # p_pred 用於預測目標變數
        p_pred = s_net(torch.autograd.Variable(torch.FloatTensor(spatial_data[h3_l7_id, :-1]))).reshape(-1, 1)

        # 定義均方誤差損失
        mseloss = torch.nn.MSELoss(reduction='sum')
        loss = mseloss(p_pred, ohca)
        
        # 清空前一次計算的梯度
        s_net_optim.zero_grad()
        
        
        # 計算損失的梯度
        # loss.backward()
        autograd.backward(loss)

        # 更新神經網絡參數
        s_net_optim.step()
        
        # 儲存損失值
        loss_array.append(loss.detach().cpu().numpy())

    return loss_array, t_fea_array, s_fea_array

# 初始化模型和優化器
s_net = Regressor(input_size=spatial_data.shape[1] - 1, hidden_size=spatial_data.shape[1] * 2, output_size=1)
s_net_optim = optim.Adam(s_net.parameters(), lr=1e-3, weight_decay=1e-5)

iter_num=30000
# 執行訓練過程
loss_array, t_fea_array, s_fea_array = train_reg(train_spatial_data, s_net,
                                                 s_net_optim,
                                                 window_size, iter_num)

100%|██████████| 30000/30000 [00:43<00:00, 690.84it/s]


In [213]:
y_head_train = s_net(torch.autograd.Variable(torch.FloatTensor(train_spatial_data[:, :-1]))).detach().numpy()*ohca_reguli_inverse
y_train = train_spatial_data[:, -1]*ohca_reguli_inverse
y_head_test = s_net(torch.autograd.Variable(torch.FloatTensor(test_spatial_data[:, :-1]))).detach().numpy()*ohca_reguli_inverse
y_test = test_spatial_data[:, -1].reshape(-1, 1)*ohca_reguli_inverse

In [214]:
mae = np.abs(y_head_test-y_test)
ans_mae = mae.sum()/mae.shape[0]

print('MAE of test set= ',ans_mae)

# 計算殘差變異
ss_residual = np.sum((y_test - y_head_test) ** 2)

# 計算總變異量
ss_total = np.sum((y_test - np.mean(y_test)) ** 2)

# 計算 R²
r_squared = 1 - (ss_residual / ss_total)

n = mae.shape[0]          # Number of data points
p = train_spatial_data.shape[1]            # Number of predictors

# Adjusted R-squared calculation
r_squared_adj = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)

print("R² of test set= ", r_squared)
print("ADJ R² of test set= ", r_squared_adj)

MAE of test set=  10.548381258334432
R² of test set=  0.17095083605589612
ADJ R² of test set=  4.090092338337115


# SHAP

In [215]:
# 假設 `spatial_data` 包含背景數據，用於 SHAP 的解釋
background_data = torch.FloatTensor(train_spatial_data[:, :-1])  
test_data = torch.FloatTensor(test_spatial_data[:, :-1])  

In [216]:
# 建立 SHAP 解釋器，使用背景數據
explainer = shap.GradientExplainer(s_net, background_data)
shap_values_test = explainer.shap_values(test_data)*ohca_reguli_inverse
# Get the shap values from my test data

test_features_df = shuffled_spatial_data.iloc[:, :-1]
feature_names = test_features_df.columns

In [217]:
feature_names

Index(['school', 'parking', 'grave_yard', 'college', 'hospital', 'fuel',
       'place_of_worship', 'fast_food', 'stage', 'restaurant', 'university',
       'theatre', 'library', 'post_office', 'pharmacy', 'fire_station',
       'toilets', 'childcare', 'car_wash', 'bank', 'ice_cream',
       'community_centre', 'cafe', 'shelter', 'ranger_station', 'shower',
       'veterinary', 'pub', 'cinema', 'animal_shelter', 'clinic',
       'exhibition_centre', 'nightclub', 'arts_centre', 'kindergarten',
       'bus_station', 'townhall', 'prison', 'doctors', 'dentist', 'studio',
       'marketplace', 'car_rental', 'driving_school', 'payment_centre', 'dojo',
       'gambling', 'bar', 'bicycle_parking', 'bbq', 'social_facility'],
      dtype='object')

In [218]:
#把SHAP 換成壞圖之格式
shap_col = shap_values_test.shape[0]
shap_row = shap_values_test.shape[1]
shap_values_test_2D = shap_values_test.reshape(shap_col,shap_row)
# shap.summary_plot(shap_values_test_2D, test_data,feature_names)

In [219]:
feature_names_w_SHAP = [f'shap {col}' for col in feature_names] # 在每個列名前加上 'shap'
SHAP_df = pd.DataFrame(shap_values_test_2D, columns=feature_names_w_SHAP) #換成 DF

In [220]:
df1 = test_h3_l7_df.reset_index(drop=True)
df2 = SHAP_df.reset_index(drop=True)
test_h3_l7_df_S = pd.concat([df1, df2], axis=1) #合併SHAP值到test_h3_l7_df

In [221]:
feature_names.shape[0]+3

54

In [222]:
# 初始化結果 DataFrame
spatial_data_score = pd.DataFrame()
spatial_data_score['id'] = test_h3_l7_df_S['id']

# 循環處理每一列
for col in range(feature_names.shape[0]):
    col_result = []  # 用於存儲當前列的計算結果
    
    # 遍歷每一行
    for row in range(test_h3_l7_df_S.shape[0]):
        denominator = test_h3_l7_df_S.iloc[row, col + 1]  # 分母
        numerator = test_h3_l7_df_S.iloc[row, col + feature_names.shape[0] + 3]  # 分子
        
        # 如果分母為 0，直接使用原分子數據
        if denominator == 0:
            col_result.append(numerator)
        else:
            col_result.append(numerator / denominator)  # 正常執行除法
    
    # 將當前列結果存入結果 DataFrame
    spatial_data_score[feature_names[col]] = col_result


In [223]:
spatial_data_score

Unnamed: 0,id,school,parking,grave_yard,college,hospital,fuel,place_of_worship,fast_food,stage,...,marketplace,car_rental,driving_school,payment_centre,dojo,gambling,bar,bicycle_parking,bbq,social_facility
0,872af6343ffffff,5.381099,0.812338,0.086109,0.061548,-0.24982,2.54157,3.866589,-1.357674,-0.02609,...,-0.047497,-0.047673,-0.211015,0.0,0.0,0.0,-0.025991,-0.045271,-0.045736,0.0
1,872af0cb6ffffff,5.9723,0.081062,0.118967,-0.015649,-0.114502,0.950483,-3.31961,-0.214094,-0.026089,...,0.0,0.0,-0.161682,0.0,0.0,0.0,-0.02599,-0.055925,-0.052922,0.0
2,872af6364ffffff,-4.453692,0.128532,0.088826,-0.074309,-0.015027,-0.577032,-8.359896,-0.080073,-0.078477,...,0.0,0.0,-0.017128,0.0,0.0,0.0,-0.078179,0.0,0.0,0.0
3,872af6224ffffff,-2.810134,-0.159748,-0.029208,-0.178025,-0.164787,-0.786529,-3.826666,-0.324603,-0.052205,...,0.0,0.0,-0.235869,0.0,0.0,0.0,-0.052007,-0.057474,-0.054651,0.0
4,872af0c85ffffff,-3.481367,-0.18609,-0.021783,-0.083665,-0.173245,-0.836033,1.628884,-0.011669,0.0,...,-0.149761,-0.150354,-0.058888,0.0,0.0,0.0,0.0,-0.111854,-0.105847,0.0
5,872af6265ffffff,-0.226368,0.067376,-0.019026,-0.007675,-0.14437,-0.480818,-0.256585,0.074764,-0.052178,...,-0.080621,-0.080927,-0.056858,0.0,4.703143e-37,0.0,-0.05198,-0.078306,-0.075033,0.0
6,872af6361ffffff,-2.517901,-1.095211,-0.013318,-0.00633,-0.093894,-0.504361,-3.580736,-0.212743,-0.026117,...,-0.099763,-0.100157,-0.143105,0.0,0.0,0.0,-0.026018,-0.088668,-0.084405,0.0
7,872af0c90ffffff,5.306777,-0.193495,0.125395,-0.016651,-0.35979,-2.864004,-1.163465,0.179738,0.0,...,-0.108824,-0.109214,-0.053503,0.0,0.0,0.0,0.0,-0.105745,-0.100438,0.0
8,872af6340ffffff,-2.42468,-0.862507,-0.007827,-0.006284,-0.080036,-0.576472,-2.659049,-0.352222,-0.087071,...,-0.116884,-0.117349,-0.064045,0.0,0.0,0.0,-0.086972,-0.140884,-0.134232,0.0
9,872af0c8dffffff,-6.350654,1.235644,0.005485,-0.081434,-0.190837,-0.428863,-9.535344,0.441911,-0.024531,...,-0.049846,-0.050041,-0.267487,3.3351609999999996e-36,0.0,0.0,-0.024436,-0.027962,-0.02646,0.0


In [263]:
# building_df = pd.read_csv('OpenStreetMap/B_features.csv')
poi_df = pd.read_csv('OpenStreetMap/A_features.csv')
# poi_df = poi_df[['osmid', 'amenity', 'ele', 'denomination', 'brand', 'cuisine',
#                     'healthcare', 'parking', 'geometry', 'height']
poi_df = poi_df[['osmid', 'amenity', 'ele', 'geometry']]

# 初始化列表
lat = []
lon = []
h3_l7 = []

# 只挑選有效的行
valid_indices = []  # 用來記錄有效行的索引
# 遍歷每一行
for i in range(poi_df.shape[0]):
    # 提取經緯度
    coordinates = re.findall("\d+\.?\d*", poi_df.geometry[i])
    if len(coordinates) < 3:
        continue  # 如果座標數據不完整，跳過該行
    else:
        # 提取緯度和經度
        latitude = float(coordinates[1])  # 緯度
        longitude = -float(coordinates[0])  # 經度
        lat.append(latitude)
        lon.append(longitude)
        
        # 轉換為 H3 L7 id
        h3_l7.append(h3.geo_to_h3(latitude, longitude, resolution=7))
        
        # 記錄有效行的索引
        valid_indices.append(i)

# 使用 valid_indices 創建新的 DataFrame
poi_df_valid = poi_df.iloc[valid_indices]

# 新增列到有效的 DataFrame
poi_df_valid['lat'] = lat
poi_df_valid['lon'] = lon
poi_df_valid['h3_l7'] = h3_l7

# 刪除 'geometry' 列
poi_df_valid = poi_df_valid.drop('geometry', axis=1)

  coordinates = re.findall("\d+\.?\d*", poi_df.geometry[i])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_df_valid['lat'] = lat
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_df_valid['lon'] = lon
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_df_valid['h3_l7'] = h3_l7


In [264]:
poi_df_valid

Unnamed: 0,osmid,amenity,ele,lat,lon,h3_l7
1001,37918264,school,3.0,36.850349,-75.986790,872af636cffffff
1002,38028116,school,7.0,36.897203,-76.142903,872af6309ffffff
1003,38028120,school,6.0,36.899387,-76.147563,872af6309ffffff
1004,38054683,school,3.0,36.841675,-75.979026,872af636dffffff
1005,38054722,parking,,36.848191,-76.034993,872af636affffff
...,...,...,...,...,...,...
3478,13825469,place_of_worship,,36.853923,-76.153865,872af6353ffffff
3479,14020613,place_of_worship,3.0,36.757505,-76.050832,872af0c92ffffff
3480,14325039,clinic,,36.872663,-76.139490,872af6350ffffff
3481,14756225,parking,,36.857801,-76.026476,872af636effffff


In [261]:
selected_values = spatial_data_score['id']
test_poi_df = poi_df_valid[poi_df_valid['h3_l7'].isin(selected_values)]

In [None]:

test_poi_df = test_poi_df.reset_index(drop=True)
test_poi_df['score'] = 12  # 初始化 'score' 列為空值

# 循環處理每一行
for i in range(0, test_poi_df.shape[0]):
    poi_id = test_poi_df['h3_l7'].iloc[i]  # 取得當前行的 poi_id
    building_type = test_poi_df['amenity'].iloc[i]  # 取得當前行的 building_type
    
    # 查找 spatial_data_score 中對應的 id 和 amenity
    positions = spatial_data_score.index[spatial_data_score['id'] == poi_id]
 
        # 檢查 building_type 是否在 spatial_data_score 的列中
    if building_type in spatial_data_score.columns:
        building_score = spatial_data_score.loc[positions, building_type]
        
        if not building_score.empty:
            # 如果找到了對應的建築分數，將其轉換為數字並儲存
            test_poi_df.loc[i, 'score'] = pd.to_numeric(building_score.iloc[0])
        else:
            # 如果沒有找到對應的 building_type，可以設為 NaN 或其他預設值
            test_poi_df.loc[i, 'score'] = 'no uilding_type'
    else:
        # 如果 building_type 不存在於 spatial_data_score，設為 NaN 或其他預設值
        test_poi_df.loc[i, 'score'] = 'no uilding_type 2'


# 檢查結果
test_poi_df
# test_poi_df.to_csv('test_poi_df.csv', index=False, sep=',', encoding='utf-8-sig')

  test_poi_df.loc[i, 'score'] = pd.to_numeric(building_score.iloc[0])
  test_poi_df.loc[i, 'score'] = 'no uilding_type 2'


In [None]:
# ohca_df = pd.read_csv('OHCAs.csv')
# h3_l7 = []

# for i in range(ohca_df.shape[0]):
#     h3_l7.append(h3.geo_to_h3(ohca_df.Latitude[i], ohca_df.Longitude[i], resolution=7))

# ohca_df['h3_l7'] = h3_l7

# if min_lat < min(ohca_df['Latitude']): min_lat = min(ohca_df['Latitude'])
# if max_lat > max(ohca_df['Latitude']): max_lat = max(ohca_df['Latitude'])
# if min_lon < min(ohca_df['Longitude']): min_lon = min(ohca_df['Longitude'])
# if max_lon > max(ohca_df['Longitude']): max_lon = max(ohca_df['Longitude'])

# ohca_df = ohca_df.drop_duplicates(subset=['ReceivedTime', 'Latitude', 'Longitude'])
# ohca_df['ReceivedTime'] = pd.to_datetime(ohca_df['ReceivedTime'])
# ohca_df['ReceivedTime'] = ohca_df['ReceivedTime'].apply(lambda x: x.date())

In [None]:
# cols = np.concatenate((
#             poi_df.amenity.unique(),
#         ))
# len(cols)

# h3_l7_df = pd.DataFrame(data={'id': np.unique(np.concatenate((poi_df.h3_l7.unique(), ohca_df.h3_l7.unique())))})
# h3_l7_df[cols] = 0

# for i in range(poi_df.shape[0]):
#     h3_l7_id = poi_df.iloc[i]['h3_l7']
#     h3_l7_df.loc[h3_l7_df['id'] == h3_l7_id, poi_df.iloc[i]['amenity']] += 1
