## **README**

This notebook takes in training/prediction csv data from the rhino-grasshopper simulation, processes it, and outputs a Pytorch graph data object.

## **Import Libraries**

In [1]:
# import libraries
import pandas as pd
import torch                                 # pytorch
from torch_geometric.data import Data        # to work with graph data
import datetime

print('loaded')

loaded


## **Set File Path**

In [2]:
# Input file path here
run_num = 'run_3'
file_path = rf'data\csv_training\{run_num}'

print('loaded')

loaded


## **Import Dataset**

In [3]:
# import csv 
sensor_df = pd.read_csv(f'{file_path}\\sensor.csv', header=None)
building_df= pd.read_csv(f'{file_path}\\building.csv', header = None)
label_df= pd.read_csv(f'{file_path}\\label.csv', header = None)
distance_df = pd.read_csv(f'{file_path}\\distance.csv', header= None)
sensor_length_df = pd.read_csv(f'{file_path}\\sensor_length.csv', header = None)
vertex_length_df = pd.read_csv(f'{file_path}\\vertex_length.csv', header = None)

print('imported')

imported


In [4]:
sensor_df

Unnamed: 0,0,1,2
0,477.06308,461.141174,5.142857
1,477.06308,461.141174,15.428571
2,477.06308,461.141174,25.714286
3,477.06308,461.141174,36.000000
4,477.06308,461.141174,46.285713
...,...,...,...
587,483.29660,465.890640,97.714283
588,483.29660,465.890640,108.000000
589,483.29660,465.890640,118.285717
590,483.29660,465.890640,128.571426


In [5]:
building_df

Unnamed: 0,0,1,2
0,461.530842,526.137223,0
1,461.483806,525.266190,0
2,461.483806,525.266190,144
3,461.530842,526.137223,144
4,467.275519,525.031849,0
...,...,...,...
299,461.090481,441.052050,81
300,458.656271,441.127307,0
301,458.656271,441.127307,81
302,458.438724,434.557430,0


In [6]:
label_df

Unnamed: 0,0
0,596.414648
1,592.142129
2,594.472556
3,588.271100
4,597.765451
...,...
587,75.272699
588,75.938974
589,77.658386
590,84.252479


In [7]:
distance_df

Unnamed: 0,0
0,67.023770
1,66.190485
2,153.740162
3,154.100756
4,64.840293
...,...
179963,66.764631
179964,143.184055
179965,67.585628
179966,144.502558


In [8]:
sensor_length_df

Unnamed: 0,0
0,592


In [9]:
vertex_length_df

Unnamed: 0,0
0,304


## **Prepare Dataset - edge.df**

In [10]:
#extract out the length values and convert to numerical from both dataframes

sensor_length = int(sensor_length_df.iloc[0,0])
print(sensor_length)
print(type(sensor_length))

vertex_length = int(vertex_length_df.iloc[0,0])
print(vertex_length)
print(type(vertex_length))

592
<class 'int'>
304
<class 'int'>


In [11]:
# Function to map sensor id onto vertex id for distance df

def create_temp_df(n, m):
    # n is the number of sensors, m is the number of vertices
    data = []  # List to hold the data
    for sensor_length in range(1, n+1):  # Iterate through each sensor
        for vertex_length in range(1, m + 1):  # Iterate through each vertex for the sensor
            # Append a tuple with the sensor ID and vertex ID to the list
            data.append((f'sensor_id_{sensor_length}', f'vertex_{vertex_length}'))
    
    # Create DataFrame from the data list
    temp_df = pd.DataFrame(data, columns=['sensor_id', 'vertex_id'])
    
    return temp_df

In [12]:
# Call function to create sensor id and vertex id dataframe

temp_df = create_temp_df(sensor_length, vertex_length)
temp_df

Unnamed: 0,sensor_id,vertex_id
0,sensor_id_1,vertex_1
1,sensor_id_1,vertex_2
2,sensor_id_1,vertex_3
3,sensor_id_1,vertex_4
4,sensor_id_1,vertex_5
...,...,...
179963,sensor_id_592,vertex_300
179964,sensor_id_592,vertex_301
179965,sensor_id_592,vertex_302
179966,sensor_id_592,vertex_303


In [13]:
# Add new column containing the distance values

temp_df['distance'] = distance_df[0]
edge_df = temp_df

In [14]:
# print check
edge_df

Unnamed: 0,sensor_id,vertex_id,distance
0,sensor_id_1,vertex_1,67.023770
1,sensor_id_1,vertex_2,66.190485
2,sensor_id_1,vertex_3,153.740162
3,sensor_id_1,vertex_4,154.100756
4,sensor_id_1,vertex_5,64.840293
...,...,...,...
179963,sensor_id_592,vertex_300,66.764631
179964,sensor_id_592,vertex_301,143.184055
179965,sensor_id_592,vertex_302,67.585628
179966,sensor_id_592,vertex_303,144.502558


## **Prepare Dataset - Null Check**

In [15]:
# function for null values check

def null_check(df):
    for column in df.columns:
        null_count = df[column].isnull().sum()
        if null_count > 0: 
            print('_'*20)
            print(f'{column}')
            print('_'*20)
            print(f'Null count: {null_count}')
        else:
            print('Nil')

In [16]:
null_check(sensor_df)
null_check(building_df)
null_check(label_df)
null_check(distance_df)

Nil
Nil
Nil
Nil
Nil
Nil
Nil
Nil


## **Preprocess Node Index and Naming**

In [17]:
# Resetting the index so 'sensor_id' is no longer the index column
sensor_df.reset_index(drop=True, inplace=True)

# Creating a new 'sensor_id' column with formatted values
sensor_df['sensor_id'] = ['sensor_id_' + str(i + 1) for i in sensor_df.index]

# Insert the 'sensor_id_formatted' column as the first column
sensor_df.insert(0, 'sensor_id', sensor_df.pop('sensor_id'))

In [18]:
# Resetting the index so 'sensor_id' is no longer the index column
building_df.reset_index(drop=True, inplace=True)

# Creating a new 'sensor_id' column with formatted values
building_df['vertex_id'] = ['vertex_id_' + str(i + 1) for i in building_df.index]

# Insert the 'sensor_id_formatted' column as the first column
building_df.insert(0, 'vertex_id', building_df.pop('vertex_id'))

In [19]:
# Resetting the index so 'sensor_id' is no longer the index column
label_df.reset_index(drop=True, inplace=True)

# Creating a new 'sensor_id' column with formatted values
label_df['sensor_id'] = ['sensor_id_' + str(i + 1) for i in label_df.index]

# Insert the 'sensor_id_formatted' column as the first column
label_df.insert(0, 'sensor_id', label_df.pop('sensor_id'))

## **Add Column Headers**

In [20]:
sensor_df.rename(columns={0: 'sensor_x_coordinate', 1: 'sensor_y_coordinate', 2: 'sensor_z_coordinate'}, inplace=True)

In [21]:
label_df.columns = ['sensor_id', 'hb_solar_radiation']

## **Final Check**

In [22]:
print('sensor_df')
print(sensor_df.head())
print('building_df')
print(building_df.head())
print('label_df')
print(label_df.head())
print('edge_df')
print(edge_df.head())

sensor_df
     sensor_id  sensor_x_coordinate  sensor_y_coordinate  sensor_z_coordinate
0  sensor_id_1            477.06308           461.141174             5.142857
1  sensor_id_2            477.06308           461.141174            15.428571
2  sensor_id_3            477.06308           461.141174            25.714286
3  sensor_id_4            477.06308           461.141174            36.000000
4  sensor_id_5            477.06308           461.141174            46.285713
building_df
     vertex_id           0           1    2
0  vertex_id_1  461.530842  526.137223    0
1  vertex_id_2  461.483806  525.266190    0
2  vertex_id_3  461.483806  525.266190  144
3  vertex_id_4  461.530842  526.137223  144
4  vertex_id_5  467.275519  525.031849    0
label_df
     sensor_id  hb_solar_radiation
0  sensor_id_1          596.414648
1  sensor_id_2          592.142129
2  sensor_id_3          594.472556
3  sensor_id_4          588.271100
4  sensor_id_5          597.765451
edge_df
     sensor_id vert

## **Overall Graph Object**

- Node Features
- Edge Index
- Edge Attributes
- Combine to form data object

## **Prepare Node Features**

Combine sensor and building information to create a unified node feature matrix. 

Combine sensor_df and building_df into a single dataframe, ensuring each has a unique identifier across sensors and vertices

In [23]:
# Add a column to distinguish between sensors and vertices
sensor_df['type'] = 'sensor'
building_df['type'] = 'vertex'

In [24]:
# Combine dataframes
all_nodes_df = pd.concat([sensor_df.assign(index=range(0, len(sensor_df))),
                          building_df.assign(index=range(len(sensor_df), len(sensor_df)+len(building_df)))])

In [25]:
# Main table to use for graph dataset
all_nodes_df

Unnamed: 0,sensor_id,sensor_x_coordinate,sensor_y_coordinate,sensor_z_coordinate,type,index,vertex_id,0,1,2
0,sensor_id_1,477.06308,461.141174,5.142857,sensor,0,,,,
1,sensor_id_2,477.06308,461.141174,15.428571,sensor,1,,,,
2,sensor_id_3,477.06308,461.141174,25.714286,sensor,2,,,,
3,sensor_id_4,477.06308,461.141174,36.000000,sensor,3,,,,
4,sensor_id_5,477.06308,461.141174,46.285713,sensor,4,,,,
...,...,...,...,...,...,...,...,...,...,...
299,,,,,vertex,891,vertex_id_300,461.090481,441.052050,81.0
300,,,,,vertex,892,vertex_id_301,458.656271,441.127307,0.0
301,,,,,vertex,893,vertex_id_302,458.656271,441.127307,81.0
302,,,,,vertex,894,vertex_id_303,458.438724,434.557430,0.0


In [26]:
# Prepare node features - example: using coordinates and a type flag (sensor=1, vertex=0)
all_nodes_df['type_flag'] = all_nodes_df['type'].apply(lambda x: 1 if x == 'sensor' else 0)
node_features = all_nodes_df[['sensor_x_coordinate', 'sensor_y_coordinate', 'sensor_z_coordinate', 'type_flag']].fillna(0).values
x = torch.tensor(node_features, dtype=torch.float)

# Print Check
all_nodes_df

Unnamed: 0,sensor_id,sensor_x_coordinate,sensor_y_coordinate,sensor_z_coordinate,type,index,vertex_id,0,1,2,type_flag
0,sensor_id_1,477.06308,461.141174,5.142857,sensor,0,,,,,1
1,sensor_id_2,477.06308,461.141174,15.428571,sensor,1,,,,,1
2,sensor_id_3,477.06308,461.141174,25.714286,sensor,2,,,,,1
3,sensor_id_4,477.06308,461.141174,36.000000,sensor,3,,,,,1
4,sensor_id_5,477.06308,461.141174,46.285713,sensor,4,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...
299,,,,,vertex,891,vertex_id_300,461.090481,441.052050,81.0,0
300,,,,,vertex,892,vertex_id_301,458.656271,441.127307,0.0,0
301,,,,,vertex,893,vertex_id_302,458.656271,441.127307,81.0,0
302,,,,,vertex,894,vertex_id_303,458.438724,434.557430,0.0,0


In [27]:
# Print the tensor to check its contents
print("Node Features Tensor:")
print(x)

Node Features Tensor:
tensor([[477.0631, 461.1412,   5.1429,   1.0000],
        [477.0631, 461.1412,  15.4286,   1.0000],
        [477.0631, 461.1412,  25.7143,   1.0000],
        ...,
        [  0.0000,   0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000]])


## **Create Edge Index**

1. Map each sensor_id and vertex_id to a unique index.2. 
Use these mappings to create the edge_index tensor from edge_df.

Step 1: Prepare Mappings
First, we'll create dictionaries to map sensor_id and vertex_id to unique indices. We'll concatenate the indices of sensors and vertices to ensure uniqueness across the graph.

In [28]:
sensor_ids = sensor_df['sensor_id'].unique()
vertex_ids = building_df['vertex_id'].unique()

# Create a continuous index for sensors and vertices
sensor_index = {sensor_id: i for i, sensor_id in enumerate(sensor_ids)}
vertex_index = {vertex_id: i + len(sensor_index) for i, vertex_id in enumerate(vertex_ids)}

Step 2: Create Edge Index
Next, we'll use these mappings to create the edge_index tensor. Note that the vertex_id in edge_df appears to have a slight discrepancy (missing the "id_" part based on the example provided), so we'll adjust for that in our mapping.

In [29]:
# Adjust the vertex_id in edge_df to match the format in buildings_df
edge_df['adjusted_vertex_id'] = edge_df['vertex_id'].apply(lambda x: 'vertex_id_' + x.split('_')[-1])

# Map sensor_id and vertex_id to their respective indices
edge_index_list = edge_df.apply(lambda row: [sensor_index.get(row['sensor_id'], -1),
                                             vertex_index.get(row['adjusted_vertex_id'], -1)], axis=1)

# Filter out any edges that couldn't be mapped (-1 indicates a mapping failure)
filtered_edge_index_list = [pair for pair in edge_index_list if -1 not in pair]

# Convert to torch tensor
edge_index = torch.tensor(filtered_edge_index_list, dtype=torch.long).t().contiguous()


In [30]:
edge_index

tensor([[  0,   0,   0,  ..., 591, 591, 591],
        [592, 593, 594,  ..., 893, 894, 895]])

## **Edge Attributes**

This section extracts edge attributes from the edge_df and converts them into a torch tensor.

In [31]:
edge_attr = torch.tensor(edge_df[['distance']].values, dtype=torch.float)

In [32]:
edge_attr

tensor([[ 67.0238],
        [ 66.1905],
        [153.7402],
        ...,
        [ 67.5856],
        [144.5026],
        [ 70.3359]])

## **Target Labels**

This sections prepares the labels for sensors by:
1. Aligning with their respective indices
2. Converting to a torch tensor

In [33]:
# Ensure data type compatibility
label_df['hb_solar_radiation'] = label_df['hb_solar_radiation'].astype(float)

In [34]:
# Update labels for sensors with their radiation values
label_df['index'] = label_df['sensor_id'].map(sensor_index)

In [35]:
# Create torch tensor with compatible data type
labels = torch.zeros(len(label_df), dtype=torch.float)
labels[label_df['index']] = torch.tensor(label_df['hb_solar_radiation'].values, dtype=torch.float)

## **Create a PyTorch Geometric Data Object**

**Training Data Code**

In [36]:
# use this chunk for TRAINING data
data_training = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=labels)

In [37]:
# Print check for the Data object
print("Data Object:")
print(data_training)

Data Object:
Data(x=[896, 4], edge_index=[2, 179968], edge_attr=[179968, 1], y=[592])


**Prediction Data Code**

In [38]:
# use this chunk for PREDICTION data
data_predict = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

In [39]:
# Print check for the Data object
print("Data Object:")
print(data_predict)

Data Object:
Data(x=[896, 4], edge_index=[2, 179968], edge_attr=[179968, 1])


## **Saving the Data Object**

In [40]:
# Add timestamp to file name 
now = datetime.datetime.now()
formatted_now = now.strftime('%Y%m%d_%H%M%S')

**Training**

In [41]:
# UNCOMMENT to save
filename = rf'data/torch_data_object_training/{run_num}.pt'
torch.save(data_training, filename)

print('saved')

saved


**Prediction**

In [42]:
# UNCOMMENT to save

'''
filename = rf'data/torch_data_object_prediction/{run_num}.pt'
torch.save(data_predict, filename)

print('saved')
'''

"\nfilename = rf'data/torch_data_object_prediction/{run_num}.pt'\ntorch.save(data_predict, filename)\n\nprint('saved')\n"