In [15]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import networkx as nx
import ast


In [16]:
PARENT = "../Sweep/"
DATA_FOLDER = PARENT + "data_WESCO/"
DATA_FOLDER_CACHE = DATA_FOLDER + 'cache/'
VENUE_CONFIG_FILE = DATA_FOLDER_CACHE + "WESCO_config.json"
VENUE = "WESCO"

In [17]:
def read_json_config(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

config = read_json_config(VENUE_CONFIG_FILE)

In [18]:
sensors_unparsed = config[VENUE]["placements"]
sensors = {int(key): value for key, value in sensors_unparsed.items()}
sensors

{100: {'row': 3, 'column': 3},
 101: {'row': 6, 'column': 5},
 102: {'row': 6, 'column': 11},
 103: {'row': 0, 'column': 24}}

In [19]:
df = pd.read_csv(DATA_FOLDER_CACHE + VENUE + '.csv', index_col=0)
#df = df.dropna()

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 109 entries, 0 to 108
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           109 non-null    object 
 1   pm2_5_avg_100  109 non-null    float64
 2   pm2_5_avg_101  109 non-null    float64
 3   pm2_5_avg_102  109 non-null    float64
 4   pm2_5_avg_103  109 non-null    float64
 5   status_100     109 non-null    int64  
 6   status_101     109 non-null    int64  
 7   status_102     109 non-null    int64  
 8   status_103     109 non-null    int64  
dtypes: float64(4), int64(4), object(1)
memory usage: 8.5+ KB


In [21]:
#new features from Date -> minute, hour, month, year, dayofyear (1-365)

In [22]:
df['Date'] = pd.to_datetime(df['Date'])
df['hour'] = df['Date'].dt.hour
df['minute'] = df['Date'].dt.minute
df['dayoftheyear'] = df['Date'].dt.dayofyear
df['hour'] = df['hour'].astype('int')
df['minute'] = df['minute'].astype('int')
df['dayoftheyear'] = df['dayoftheyear'].astype('int')
df = df.sort_values(['Date']) #already sorted
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 109 entries, 0 to 108
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           109 non-null    datetime64[ns]
 1   pm2_5_avg_100  109 non-null    float64       
 2   pm2_5_avg_101  109 non-null    float64       
 3   pm2_5_avg_102  109 non-null    float64       
 4   pm2_5_avg_103  109 non-null    float64       
 5   status_100     109 non-null    int64         
 6   status_101     109 non-null    int64         
 7   status_102     109 non-null    int64         
 8   status_103     109 non-null    int64         
 9   hour           109 non-null    int64         
 10  minute         109 non-null    int64         
 11  dayoftheyear   109 non-null    int64         
dtypes: datetime64[ns](1), float64(4), int64(7)
memory usage: 11.1 KB


In [23]:
dataStartDate = sorted(df['Date'])[0]
dataEndDate = sorted(df['Date'])[-1]
print(f"Dataset has data from {dataStartDate} to {dataEndDate}")

Dataset has data from 2024-07-26 00:15:00 to 2024-07-29 23:00:00


In [24]:
h, w = config[VENUE]["height"], config[VENUE]["width"]
vents = config[VENUE]["vents"]


In [25]:
mask_hvac = np.zeros((h, w), dtype=int)

for v in vents:
    mask_hvac[vents[v].get("row"), vents[v].get("column")] = 1 if vents[v].get('direction') == "down" else 2

In [26]:
mask_hvac

array([[1, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 2],
       [0, 1, 1, 2, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0],
       [1, 0, 0, 0, 0, 2, 1, 0, 2, 0, 1, 0, 1, 2, 0, 1, 0, 1, 2, 1, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0],
       [1, 1, 1, 2, 0, 0, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 0, 1, 2, 1, 0, 0,
        0, 0, 0],
       [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 1, 2],
       [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 2, 0, 0,
        1, 0, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
        1, 0, 1]])

In [27]:
mapCells = []

for i in range(h):
    for j in range(w):
        mapCells.append(
            {
                "row": i,
                "column": j
            }
        )

In [28]:
def add_walls(grid_graph):
    with open(DATA_FOLDER_CACHE + 'walls.txt', 'r') as file:
        lines = file.readlines()
        filtered_lines = [l for l in lines if "#" not in l] 
        edges_to_remove = [ast.literal_eval(line.strip()) for line in filtered_lines]

    for n1, n2 in edges_to_remove:
        grid_graph.remove_edge(n1, n2)

grid_graph = nx.grid_2d_graph(8, 25)  

for u, v in grid_graph.edges():
        grid_graph[u][v]['weight'] = 1

add_walls(grid_graph)

In [29]:
def get_distance(source, target):
    
    try:
        distance = nx.shortest_path_length(grid_graph, source=source, target=target, weight='weight')
    except nx.NetworkXNoPath:
        distance = 100

    return distance

In [30]:
sensorDistanceMap = { }

for i in [1,2,3,4]:
    sensorDistanceMap[i] = {
        'sensorID': np.zeros((h, w), dtype='int'),
        'distance': np.zeros((h, w), dtype='int'),
    }

for cell in mapCells:
    distances = []
    for currentSensorID, sensor in sensors.items():
        distances.append((
                get_distance(
                    (sensor['row'], sensor['column']),
                    (cell['row'], cell['column'])
                ),
                currentSensorID
            ))
    distances.sort()        
    k = 1
    for currentSensorDistance, currentSensorID in distances:
        sensorDistanceMap[k]['sensorID'][cell['row']][cell['column']] = currentSensorID
        sensorDistanceMap[k]['distance'][cell['row']][cell['column']] = currentSensorDistance
        k += 1

In [31]:
sensorDistanceMap

{1: {'sensorID': array([[100, 100, 100, 100, 100, 100, 100, 100, 100, 102, 102, 102, 102,
          102, 102, 102, 102, 103, 103, 103, 103, 103, 103, 103, 103],
         [100, 100, 100, 100, 100, 100, 100, 100, 100, 102, 102, 102, 102,
          102, 102, 102, 102, 103, 103, 103, 103, 103, 103, 103, 103],
         [100, 100, 100, 100, 100, 100, 100, 100, 100, 102, 102, 102, 102,
          102, 102, 102, 102, 103, 103, 103, 103, 103, 103, 103, 103],
         [100, 100, 100, 100, 101, 101, 101, 101, 101, 102, 102, 102, 102,
          102, 102, 102, 102, 102, 103, 103, 103, 103, 103, 103, 103],
         [100, 100, 100, 100, 101, 101, 101, 101, 101, 102, 102, 102, 102,
          102, 102, 102, 102, 102, 102, 103, 103, 103, 103, 103, 103],
         [100, 100, 100, 100, 101, 101, 101, 101, 101, 102, 102, 102, 102,
          102, 102, 102, 102, 102, 102, 102, 102, 102, 103, 103, 103],
         [100, 100, 100, 100, 101, 101, 101, 101, 101, 102, 102, 102, 102,
          102, 102, 102, 102, 102,

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 109 entries, 0 to 108
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           109 non-null    datetime64[ns]
 1   pm2_5_avg_100  109 non-null    float64       
 2   pm2_5_avg_101  109 non-null    float64       
 3   pm2_5_avg_102  109 non-null    float64       
 4   pm2_5_avg_103  109 non-null    float64       
 5   status_100     109 non-null    int64         
 6   status_101     109 non-null    int64         
 7   status_102     109 non-null    int64         
 8   status_103     109 non-null    int64         
 9   hour           109 non-null    int64         
 10  minute         109 non-null    int64         
 11  dayoftheyear   109 non-null    int64         
dtypes: datetime64[ns](1), float64(4), int64(7)
memory usage: 11.1 KB


In [33]:
y = df[['pm2_5_avg_100', 'pm2_5_avg_101', 'pm2_5_avg_102','pm2_5_avg_103']].to_numpy()


In [34]:
X = df.drop(columns=['Date'])
X_columns = X.columns 
X = X.to_numpy() 


In [35]:
X

array([[  2.08333333,   3.41666667,   4.3       , ...,   0.        ,
         15.        , 208.        ],
       [  2.        ,   3.75      ,   4.23333333, ...,   0.        ,
         30.        , 208.        ],
       [  2.        ,   3.88333333,   4.18333333, ...,   0.        ,
         45.        , 208.        ],
       ...,
       [  1.10714286,   2.78846154,   2.9       , ...,  22.        ,
         30.        , 211.        ],
       [  1.0625    ,   2.73333333,   2.83333333, ...,  22.        ,
         45.        , 211.        ],
       [  1.05357143,   2.8       ,   2.83333333, ...,  23.        ,
          0.        , 211.        ]])

In [36]:
X.shape, y.shape

((109, 11), (109, 4))

In [37]:
X_columns

Index(['pm2_5_avg_100', 'pm2_5_avg_101', 'pm2_5_avg_102', 'pm2_5_avg_103',
       'status_100', 'status_101', 'status_102', 'status_103', 'hour',
       'minute', 'dayoftheyear'],
      dtype='object')

In [38]:
cubeFeatures = ['hour', 'minute', 'day_of_the_year', 
            'pm25_1_closest_concentration', 'pm25_1_closest_distance', 'status_closest_1',
            'pm25_2_closest_concentration', 'pm25_2_closest_distance', 'status_closest_2', 
            'pm25_3_closest_concentration', 'pm25_3_closest_distance', 'status_closest_3',
            'pm25_4_closest_concentration', 'pm25_4_closest_distance', 'status_closest_4',
            'hvac_locations']

def indexOfFeatureInCube(feature): 
    return cubeFeatures.index(feature)

def indexOfFeature(feature):
    return list(X_columns).index(feature)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 109 entries, 0 to 108
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           109 non-null    datetime64[ns]
 1   pm2_5_avg_100  109 non-null    float64       
 2   pm2_5_avg_101  109 non-null    float64       
 3   pm2_5_avg_102  109 non-null    float64       
 4   pm2_5_avg_103  109 non-null    float64       
 5   status_100     109 non-null    int64         
 6   status_101     109 non-null    int64         
 7   status_102     109 non-null    int64         
 8   status_103     109 non-null    int64         
 9   hour           109 non-null    int64         
 10  minute         109 non-null    int64         
 11  dayoftheyear   109 non-null    int64         
dtypes: datetime64[ns](1), float64(4), int64(7)
memory usage: 11.1 KB


In [40]:

def oneDimensionToSpace(space_X, sampleIndex, reading):
        
    for featureIndex, feature in enumerate(cubeFeatures):

        if feature == 'hour':
            space_X[sampleIndex, featureIndex] = np.ones((h,w)) * reading[indexOfFeature('hour')]
        elif feature == 'minute':
            space_X[sampleIndex, featureIndex] = np.ones((h,w)) * reading[indexOfFeature('minute')]
        elif feature == 'day_of_the_year':
            space_X[sampleIndex, featureIndex] = np.ones((h,w)) * reading[indexOfFeature('dayoftheyear')]

        elif feature in ['pm25_1_closest_concentration', 'pm25_2_closest_concentration', 'pm25_3_closest_concentration', 'pm25_4_closest_concentration']:
            
            nthClosest = int(feature[5]) # 5th char of 'feature' is '1', '2', (nth closest)
            nthClosestSensorIDMap = sensorDistanceMap[nthClosest]['sensorID']
            statusFeatureIndex = cubeFeatures.index(f"status_closest_{nthClosest}")

            for cell in mapCells:
                row, col = cell['row'], cell['column']
                
                cellSensorID = nthClosestSensorIDMap[row][col]
                rowX = X[sampleIndex]
                readingPM25 = rowX[indexOfFeature('pm2_5_avg_' + str(cellSensorID))]
                statusSensor = rowX[indexOfFeature('status_' + str(cellSensorID))]
                space_X[sampleIndex, featureIndex, row, col] = readingPM25
                space_X[sampleIndex, statusFeatureIndex, row, col] = statusSensor

        elif feature in ["status_closest_1", "status_closest_2", "status_closest_3", "status_closest_4" ]:
            continue # 

        

In [41]:
space_X_filepath = DATA_FOLDER_CACHE + '/space_X.npy'

n = df.shape[0]
c = len(cubeFeatures) # channels == FEATURES

try:
    space_X = np.load(space_X_filepath)
    print('File loaded from cache')
except FileNotFoundError:
    space_X = np.empty((n, c, h, w))
    space_X[:] = np.nan

    # Add HVAC vents locations
    hvacFeatureIndex = cubeFeatures.index(f"hvac_locations")
    space_X[:, hvacFeatureIndex] = mask_hvac
    
    # Add the PM2.5 closest distance features
    for k in [1, 2, 3, 4]:
        featureIndex = cubeFeatures.index(f"pm25_{k}_closest_distance")
        space_X[:, featureIndex] = sensorDistanceMap[k]['distance']

    for i, reading in tqdm(enumerate(X), total=len(X)):
        oneDimensionToSpace(space_X, i, reading)
    
    np.save(space_X_filepath, space_X)

 40%|████      | 44/109 [00:00<00:00, 217.67it/s]

100%|██████████| 109/109 [00:00<00:00, 184.40it/s]


In [42]:
assert(X.shape[0] == y.shape[0])

In [43]:
space_X

array([[[[  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         ...,
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.],
         [  0.,   0.,   0., ...,   0.,   0.,   0.]],

        [[ 15.,  15.,  15., ...,  15.,  15.,  15.],
         [ 15.,  15.,  15., ...,  15.,  15.,  15.],
         [ 15.,  15.,  15., ...,  15.,  15.,  15.],
         ...,
         [ 15.,  15.,  15., ...,  15.,  15.,  15.],
         [ 15.,  15.,  15., ...,  15.,  15.,  15.],
         [ 15.,  15.,  15., ...,  15.,  15.,  15.]],

        [[208., 208., 208., ..., 208., 208., 208.],
         [208., 208., 208., ..., 208., 208., 208.],
         [208., 208., 208., ..., 208., 208., 208.],
         ...,
         [208., 208., 208., ..., 208., 208., 208.],
         [208., 208., 208., ..., 208., 208., 208.],
         [208., 208., 208., ..., 208., 208., 208.]],

        ...,

  

In [44]:
space_X.shape

(109, 16, 8, 25)

In [45]:
t = 5 # 1 hour if sampling is 15 min

CHUNK_SIZE = space_X.shape[0]  
i = 0
    
new_X_filepath = DATA_FOLDER_CACHE + f"/new_X.npy"
new_X = np.empty((CHUNK_SIZE, t, c, h, w))

countPreviousReadings = 0
new_index = 0

with tqdm(total=CHUNK_SIZE) as progress_bar:
    while i < CHUNK_SIZE and new_index < CHUNK_SIZE:
        #countPreviousReadings = 0  # Reset every iteration? Might be an issue
        print(f"i: {i}, new_index: {new_index}, countPreviousReadings: {countPreviousReadings}, t-1: {t-1}")

        if countPreviousReadings >= t - 1:
            print("I executed this")
            new_X[new_index] = np.array([space_X[i-t+1 : i+1]])
            new_index += 1
            progress_bar.update(1)

        countPreviousReadings += 1
        print(f"Updated countPreviousReadings: {countPreviousReadings}")
        i += 1

np.save(new_X_filepath, new_X)

print("Files saved")


 96%|█████████▋| 105/109 [00:00<00:00, 11095.76it/s]


i: 0, new_index: 0, countPreviousReadings: 0, t-1: 4
Updated countPreviousReadings: 1
i: 1, new_index: 0, countPreviousReadings: 1, t-1: 4
Updated countPreviousReadings: 2
i: 2, new_index: 0, countPreviousReadings: 2, t-1: 4
Updated countPreviousReadings: 3
i: 3, new_index: 0, countPreviousReadings: 3, t-1: 4
Updated countPreviousReadings: 4
i: 4, new_index: 0, countPreviousReadings: 4, t-1: 4
I executed this
Updated countPreviousReadings: 5
i: 5, new_index: 1, countPreviousReadings: 5, t-1: 4
I executed this
Updated countPreviousReadings: 6
i: 6, new_index: 2, countPreviousReadings: 6, t-1: 4
I executed this
Updated countPreviousReadings: 7
i: 7, new_index: 3, countPreviousReadings: 7, t-1: 4
I executed this
Updated countPreviousReadings: 8
i: 8, new_index: 4, countPreviousReadings: 8, t-1: 4
I executed this
Updated countPreviousReadings: 9
i: 9, new_index: 5, countPreviousReadings: 9, t-1: 4
I executed this
Updated countPreviousReadings: 10
i: 10, new_index: 6, countPreviousReadings:

In [46]:
new_X

array([[[[[  0.,   0.,   0., ...,   0.,   0.,   0.],
          [  0.,   0.,   0., ...,   0.,   0.,   0.],
          [  0.,   0.,   0., ...,   0.,   0.,   0.],
          ...,
          [  0.,   0.,   0., ...,   0.,   0.,   0.],
          [  0.,   0.,   0., ...,   0.,   0.,   0.],
          [  0.,   0.,   0., ...,   0.,   0.,   0.]],

         [[ 15.,  15.,  15., ...,  15.,  15.,  15.],
          [ 15.,  15.,  15., ...,  15.,  15.,  15.],
          [ 15.,  15.,  15., ...,  15.,  15.,  15.],
          ...,
          [ 15.,  15.,  15., ...,  15.,  15.,  15.],
          [ 15.,  15.,  15., ...,  15.,  15.,  15.],
          [ 15.,  15.,  15., ...,  15.,  15.,  15.]],

         [[208., 208., 208., ..., 208., 208., 208.],
          [208., 208., 208., ..., 208., 208., 208.],
          [208., 208., 208., ..., 208., 208., 208.],
          ...,
          [208., 208., 208., ..., 208., 208., 208.],
          [208., 208., 208., ..., 208., 208., 208.],
          [208., 208., 208., ..., 208., 208., 208.

In [47]:
new_y_filepath = DATA_FOLDER_CACHE + f"/new_y.npy"
np.save(new_y_filepath, y)

In [48]:
y.shape

(109, 4)

In [49]:
new_X.shape

(109, 5, 16, 8, 25)

In [50]:
len(new_X), len(y)

(109, 109)

In [51]:
new_X

array([[[[[  0.,   0.,   0., ...,   0.,   0.,   0.],
          [  0.,   0.,   0., ...,   0.,   0.,   0.],
          [  0.,   0.,   0., ...,   0.,   0.,   0.],
          ...,
          [  0.,   0.,   0., ...,   0.,   0.,   0.],
          [  0.,   0.,   0., ...,   0.,   0.,   0.],
          [  0.,   0.,   0., ...,   0.,   0.,   0.]],

         [[ 15.,  15.,  15., ...,  15.,  15.,  15.],
          [ 15.,  15.,  15., ...,  15.,  15.,  15.],
          [ 15.,  15.,  15., ...,  15.,  15.,  15.],
          ...,
          [ 15.,  15.,  15., ...,  15.,  15.,  15.],
          [ 15.,  15.,  15., ...,  15.,  15.,  15.],
          [ 15.,  15.,  15., ...,  15.,  15.,  15.]],

         [[208., 208., 208., ..., 208., 208., 208.],
          [208., 208., 208., ..., 208., 208., 208.],
          [208., 208., 208., ..., 208., 208., 208.],
          ...,
          [208., 208., 208., ..., 208., 208., 208.],
          [208., 208., 208., ..., 208., 208., 208.],
          [208., 208., 208., ..., 208., 208., 208.

In [52]:
y

array([[2.08333333, 3.41666667, 4.3       , 7.81666667],
       [2.        , 3.75      , 4.23333333, 7.86666667],
       [2.        , 3.88333333, 4.18333333, 7.81666667],
       [2.        , 3.6       , 4.21666667, 7.96666667],
       [2.        , 3.76666667, 4.11666667, 7.78333333],
       [2.        , 3.7       , 4.08928571, 7.63333333],
       [2.        , 3.73333333, 4.1       , 7.6       ],
       [2.        , 3.78571429, 4.25      , 7.69230769],
       [2.        , 3.76666667, 4.2       , 7.7       ],
       [2.        , 3.73333333, 4.09615385, 7.78333333],
       [2.        , 3.9       , 4.325     , 7.98214286],
       [2.125     , 3.93333333, 4.31666667, 8.26666667],
       [2.125     , 3.8       , 4.31818182, 8.26666667],
       [2.125     , 3.86666667, 4.4       , 8.25      ],
       [2.52777778, 4.05      , 4.61363636, 8.55      ],
       [2.3125    , 3.93333333, 4.61363636, 8.46666667],
       [2.25      , 3.85      , 4.5625    , 8.31666667],
       [2.        , 3.8       ,