In [1]:
import h5py
import numpy as np
import pandas as pd
import pickle

# Load the array stored in pems-bay-20.npy and display it.
# NOTE(review): presumably the PeMS-Bay traffic readings this notebook builds
# weather features for — confirm against the data source.
f = np.load('Data/bay/pems-bay-20.npy')
f

array([[69.2, 65. , 67.4, ..., 69.4, 68.4, 65. ],
       [69. , 64.5, 66. , ..., 69.2, 68.2, 64.8],
       [68.9, 64.8, 66.7, ..., 69.1, 68.5, 65.1],
       ...,
       [69. , 65.1, 65.9, ..., 69.2, 68. , 65. ],
       [69. , 64.7, 67. , ..., 69.3, 67.8, 64.8],
       [69. , 65.6, 66.3, ..., 69.3, 67.9, 64.9]])

In [2]:
# Array dimensions; presumably (time steps, sensors) — TODO confirm the axis order.
f.shape

(5760, 319)

In [3]:
# Sensor IDs that must be excluded from the location table
sensor_ids_to_remove = [401495, 402369, 402288, 402368, 402289, 404554]

# Read the sensor locations, then keep only the rows whose sensor_id is not
# in the exclusion list (boolean mask via isin)
df_sensors = pd.read_csv('Data/bay/graph_sensor_locations_bay.csv')
df_sensors = df_sensors.loc[~df_sensors['sensor_id'].isin(sensor_ids_to_remove)]
df_sensors

Unnamed: 0,sensor_id,latitude,longitude,segment,abs
0,400001,37.364085,-121.901149,US101-N,387.90
1,400017,37.253303,-121.945440,SR85-S,10.07
2,400030,37.359087,-121.906538,I880-S,3.64
3,400040,37.294949,-121.873109,SR87-S,2.86
4,400045,37.363402,-121.902233,I880-N,4.02
...,...,...,...,...,...
320,413845,37.422887,-121.925747,SR237-W,8.85
321,413877,37.321613,-121.899642,I280-N,2.97
322,413878,37.324641,-121.888603,I280-N?,2.32
323,414284,37.323066,-121.896538,I280-N,2.77


In [4]:
import numpy as np
import pandas as pd
from scipy.spatial import distance

def process_weather_data(df_sensors, weather_stations_data, station_file,
                         station_names=None, num_steps=5760):
    """Label each sensor with its nearest weather station and build a 5-minute weather-code series.

    Parameters
    ----------
    df_sensors : pandas.DataFrame
        Sensor table with ``latitude`` and ``longitude`` columns. The input is
        not modified; a copy carrying the extra label column is returned.
    weather_stations_data : array-like, shape (n_stations, 2)
        ``[latitude, longitude]`` of every weather station.
    station_file : str or path-like
        CSV with the observations of one station. Must provide the columns
        ``Station_ID``, ``Date_Time`` (format ``%m/%d/%Y %H:%M %Z``) and
        ``weather_summary_set_1d``.
    station_names : sequence of str, optional
        Label of each row of ``weather_stations_data``, in the same order.
        Defaults to ``['KSJC']`` (the single station this notebook uses).
    num_steps : int or None, default 5760
        Number of leading 5-minute rows to return. ``None`` returns the
        whole series.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        A copy of ``df_sensors`` with the column ``最接近的天气站点`` (label of
        the nearest station), and a frame with the columns ``Date_Time`` and
        ``weather_code``. Time slots before the first observation, and
        summaries missing from the code table, have ``NaN`` as their code.

    Raises
    ------
    ValueError
        If ``station_names`` and ``weather_stations_data`` differ in length,
        or if the file holds no observation on the 5-minute grid.
    """
    # Weather station coordinates; atleast_2d also accepts a single [lat, lon] pair.
    weather_stations = np.atleast_2d(np.asarray(weather_stations_data, dtype=float))

    if station_names is None:
        station_names = ['KSJC']
    station_names = list(station_names)
    if len(station_names) != len(weather_stations):
        raise ValueError(
            f"station_names has {len(station_names)} entries but "
            f"weather_stations_data has {len(weather_stations)} stations"
        )

    # Nearest station per sensor. The distance is Euclidean on raw degrees,
    # i.e. an approximation rather than a geodesic distance.
    sensors_coords = df_sensors[['latitude', 'longitude']].to_numpy(dtype=float)
    distances = distance.cdist(sensors_coords, weather_stations, metric='euclidean')
    closest_station_indices = distances.argmin(axis=1)

    # Work on a copy so the caller's frame is left untouched.
    df_sensors = df_sensors.copy()
    df_sensors['最接近的天气站点'] = [station_names[i] for i in closest_station_indices]

    # Load the station observations; .copy() detaches the column selection so
    # the assignment below writes to an independent frame.
    weather_df = pd.read_csv(station_file)
    weather_df = weather_df[['Station_ID', 'Date_Time', 'weather_summary_set_1d']].copy()

    # Parse timestamps and keep only the observations that fall on the 5-minute grid.
    weather_df['Date_Time'] = pd.to_datetime(weather_df['Date_Time'], format='%m/%d/%Y %H:%M %Z')
    weather_df = weather_df[weather_df['Date_Time'].dt.minute % 5 == 0]
    if weather_df.empty:
        raise ValueError(f"no observations on the 5-minute grid in {station_file!r}")

    # Build the complete 5-minute axis ('5min' replaces the deprecated '5T' alias)
    # and left-join the observations onto it.
    time_range = pd.date_range(weather_df['Date_Time'].min(),
                               weather_df['Date_Time'].max(),
                               freq='5min')
    complete_df = pd.DataFrame({'Date_Time': time_range})
    merged_df = pd.merge(complete_df, weather_df, on='Date_Time', how='left')

    # Carry the last known summary forward over the gaps
    # (.ffill() replaces the deprecated fillna(method='ffill')).
    merged_df['weather_summary_set_1d'] = merged_df['weather_summary_set_1d'].ffill()

    # Weather description -> integer code
    weather_codes = {
        "broken": 1,
        "clear": 2,
        "haze": 3,
        "light rain": 4,
        "mist": 5,
        "overcast": 6,
        "scattered": 7,
        "thin scattered": 8
    }
    merged_df['weather_code'] = merged_df['weather_summary_set_1d'].map(weather_codes)

    # Keep only the requested leading window of the series.
    weather_data = merged_df[['Date_Time', 'weather_code']]
    if num_steps is not None:
        weather_data = weather_data.iloc[:num_steps]

    return df_sensors, weather_data

# Example usage: the only weather station passed in here is KSJC,
# given as a [latitude, longitude] pair.
weather_stations_data = [[37.35917, -121.92417]]

df_sensors_KSJC, weather_data_KSJC = process_weather_data(
    df_sensors, weather_stations_data, 'Data/bay/KSJC.csv'
)
print('weather_data_KSJC:',weather_data_KSJC)

# df_sensors_KLAX, weather_data_KLAX = process_weather_data(df_sensors, 
#                                               weather_stations_data, 
#                                               'Data/I405/KLAX_values.csv')
# print('weather_data_KLAX:',weather_data_KLAX)

# df_sensors_KLGB, weather_data_KLGB = process_weather_data(df_sensors, 
#                                               weather_stations_data, 
#                                               'Data/I405/KLGB_values.csv')
# print('weather_data_KLGB:',weather_data_KLGB)

# df_sensors_KSMO, weather_data_KSMO = process_weather_data(df_sensors, 
#                                               weather_stations_data, 
#                                               'Data/I405/KSMO_values.csv')
# print('weather_data_KSMO:',weather_data_KSMO)



weather_data_KSJC:                      Date_Time  weather_code
0    2022-01-01 00:00:00+00:00           2.0
1    2022-01-01 00:05:00+00:00           2.0
2    2022-01-01 00:10:00+00:00           2.0
3    2022-01-01 00:15:00+00:00           2.0
4    2022-01-01 00:20:00+00:00           2.0
...                        ...           ...
5755 2022-01-20 23:35:00+00:00           2.0
5756 2022-01-20 23:40:00+00:00           2.0
5757 2022-01-20 23:45:00+00:00           2.0
5758 2022-01-20 23:50:00+00:00           2.0
5759 2022-01-20 23:55:00+00:00           2.0

[5760 rows x 2 columns]


  time_range = pd.date_range(weather_df['Date_Time'].min(),
  merged_df[column] = merged_df[column].fillna(method='ffill')


In [5]:
# Weather station coordinates as [latitude, longitude]; station_names holds
# the label of each entry of weather_stations, in the same order.
station_names = ['KSJC']
weather_stations = [
    np.array([37.35917, -121.92417])  # KSJC
]

# Sensor coordinates as an (n_sensors, 2) array
sensors_coords = df_sensors[['latitude', 'longitude']].values

# Distance from every sensor to every weather station
# (Euclidean on raw degrees, i.e. an approximation)
distances = distance.cdist(sensors_coords, weather_stations, metric='euclidean')

# Index of the nearest weather station for each sensor
closest_station_indices = distances.argmin(axis=1)

# Translate each index into its station label. The index must actually be
# used here, otherwise every sensor is labelled with the same station no
# matter how many stations are listed.
closest_stations = [station_names[i] for i in closest_station_indices]

# Store the label on the sensor table
df_sensors['最接近的天气站点'] = closest_stations

print(df_sensors)

     sensor_id   latitude   longitude  segment     abs 最接近的天气站点
0       400001  37.364085 -121.901149  US101-N  387.90     KSJC
1       400017  37.253303 -121.945440   SR85-S   10.07     KSJC
2       400030  37.359087 -121.906538   I880-S    3.64     KSJC
3       400040  37.294949 -121.873109   SR87-S    2.86     KSJC
4       400045  37.363402 -121.902233   I880-N    4.02     KSJC
..         ...        ...         ...      ...     ...      ...
320     413845  37.422887 -121.925747  SR237-W    8.85     KSJC
321     413877  37.321613 -121.899642   I280-N    2.97     KSJC
322     413878  37.324641 -121.888603  I280-N?    2.32     KSJC
323     414284  37.323066 -121.896538   I280-N    2.77     KSJC
324     414694  37.315051 -121.913497   I280-N    3.91     KSJC

[319 rows x 6 columns]


In [6]:
# Weather series available per station label. Register further stations here
# once their data has been loaded.
station_weather = {
    'KSJC': weather_data_KSJC,
    # 'KLAX': weather_data_KLAX,
    # 'KLGB': weather_data_KLGB,
    # 'KSMO': weather_data_KSMO,
}

# Weather-code series of every sensor (VDS), keyed by sensor id
time_series_dict = {}

# Allocate the (sensors, time steps) matrix up front, initialised with NaN.
# The number of time steps is taken from a registered series rather than from
# a variable left over by the loop, so it is defined even if no sensor matches.
# All station series are assumed to share the same time axis.
num_vds = len(df_sensors)
num_time_steps = len(next(iter(station_weather.values())))
weather_matrix = np.full((num_vds, num_time_steps), np.nan)

# Fill one row per sensor, in the row order of df_sensors
for vds_index, (vds, closest_station) in enumerate(
        zip(df_sensors['sensor_id'], df_sensors['最接近的天气站点'])):
    weather_data = station_weather.get(closest_station)
    if weather_data is None:
        print('没有对应的气象站点')
        continue  # no data for this station: the row stays NaN

    time_series = weather_data['weather_code'].values
    time_series_dict[vds] = time_series
    weather_matrix[vds_index, :] = time_series

# Show the resulting matrix
print(weather_matrix)

[[2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 ...
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]]


In [7]:
# Matrix dimensions: (number of sensors, number of time steps).
weather_matrix.shape

(319, 5760)

In [8]:
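# Disabled: transpose the matrix to (time steps, sensors).
# NOTE(review): the traffic array `f` loaded in the first cell is (5760, 319) while
# weather_matrix is (319, 5760) — confirm which layout downstream consumers expect.
# weather_matrix = weather_matrix.T
# weather_matrix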

In [9]:
# Locate the invalid entries (NaN or negative) and print their positions.
# NaN never compares equal to anything, so `!= np.nan` is always True and
# cannot detect it; np.isnan has to be used instead.
invalid_mask = np.isnan(weather_matrix) | (weather_matrix < 0)
nan_or_negative_indices = np.where(invalid_mask)
print("\nNaN值和负值的位置:",nan_or_negative_indices)
for pos in zip(*nan_or_negative_indices):
    print(f"行 {pos[0] + 1}, 列 {pos[1] + 1}")

# Replace every invalid entry with the nearest valid value to its left in the
# same row; entries with no valid value on their left take the nearest valid
# value on their right. Done as a vectorised forward fill followed by a
# backward fill along each row.
filled = (
    pd.DataFrame(weather_matrix)
    .mask(invalid_mask)
    .ffill(axis=1)
    .bfill(axis=1)
    .to_numpy()
)

# Rows without a single valid value cannot be filled and keep their original content.
weather_matrix = np.where(np.isnan(filled), weather_matrix, filled)

# Show the matrix after filling
print("\n填充后的数组:")
print(weather_matrix)


NaN值和负值的位置: (array([], dtype=int64), array([], dtype=int64))

填充后的数组:
[[2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 ...
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]]


In [10]:
# Display the matrix after the fill step.
weather_matrix

array([[2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.],
       ...,
       [2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.]])

In [11]:
# Re-check the dimensions (sensors, time steps) before saving.
weather_matrix.shape

(319, 5760)

In [12]:
# Persist the weather-code matrix. np.save appends the ".npy" extension, so
# the file written is Data/bay/weather-20.npy.
# NOTE(review): the save runs before the NaN/negative check in the next cell;
# consider validating first.
np.save('Data/bay/weather-20',weather_matrix)

In [13]:
# Does the matrix still contain any NaN entry?
has_nan = np.any(np.isnan(weather_matrix))

# Does the matrix contain any negative entry?
has_negative = np.any(weather_matrix < 0)

print("数组中是否有NaN值:", has_nan)
print("数组中是否有负值:", has_negative)

数组中是否有NaN值: False
数组中是否有负值: False


In [14]:
# Dimensions (sensors, time steps), checked again ahead of the slicing cells below.
weather_matrix.shape

(319, 5760)

In [15]:
# NOTE(review): weather_matrix has only 319 rows, so rows 974-976 do not exist
# and this slice is empty — looks like a leftover from a larger dataset;
# confirm and remove.
weather_matrix[974:977,:]

array([], shape=(0, 5760), dtype=float64)

In [16]:
# Peek at the first 324 time steps (columns 0-323) of every sensor row.
weather_matrix[:,0:324]

array([[2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.],
       ...,
       [2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.]])

In [17]:
# Row and column indices of every NaN entry still present in the matrix
nan_positions = np.isnan(weather_matrix).nonzero()
nan_positions

(array([], dtype=int64), array([], dtype=int64))

In [18]:
import numpy as np
# Reload the saved file and report where (if anywhere) it contains NaN
d1 = np.load('Data/bay/weather-20.npy')
nan_indices = np.isnan(d1)
nan_positions = nan_indices.nonzero()
print("\nNaN值的位置:",nan_positions)


NaN值的位置: (array([], dtype=int64), array([], dtype=int64))
