## **Import Libraries**

In [1]:
# import libraries

import pandas as pd
import torch                                 # pytorch
from torch_geometric.data import Data        # to work with graph data

## **Import Dataset**

In [2]:
# import csv 

sensor_df_raw = pd.read_csv(r"C:\Users\colin\OneDrive\Desktop\Thesis Part 2\thesis - simulation\data\sensor.csv")
buildings_df_raw = pd.read_csv(r"C:\Users\colin\OneDrive\Desktop\Thesis Part 2\thesis - simulation\data\buildings.csv")
labels_df_raw = pd.read_csv(r"C:\Users\colin\OneDrive\Desktop\Thesis Part 2\thesis - simulation\data\labels.csv")
distance_df = pd.read_csv(r"C:\Users\colin\OneDrive\Desktop\Thesis Part 2\thesis - simulation\data\distance.csv")
sensor_length_df = pd.read_csv(r"C:\Users\colin\OneDrive\Desktop\Thesis Part 2\thesis - simulation\data\sensor_length.csv")
vertex_length_df = pd.read_csv(r"C:\Users\colin\OneDrive\Desktop\Thesis Part 2\thesis - simulation\data\vertex_length.csv")
print('imported')

imported


## **Prepare Dataset - sensor, building, labels.csv**

In [3]:
# transpose the data

sensor_df= sensor_df_raw.transpose()
buildings_df= buildings_df_raw.transpose()
labels_df= labels_df_raw.transpose()

In [4]:
# function to drop first row (integer indices)

def drop_indice(df):
    df.columns = df.iloc[0]   # Set the first row as column names
    df = df.iloc[1:]  # Remove the first row
    df.reset_index(drop=True, inplace=True)  # Reset the index
    return df

In [5]:
# drop indices

sensor_df = drop_indice(sensor_df)
buildings_df = drop_indice(buildings_df)
labels_df = drop_indice(labels_df)

In [6]:
sensor_df.head()

sensor_id,sensor_x_coordinate,sensor_y_coordinate,sensor_z_coordinate
0,477.047607,460.741455,5.142857
1,477.047607,460.741455,15.428571
2,477.047607,460.741455,25.714286
3,477.047607,460.741455,36.0
4,477.047607,460.741455,46.285713


In [55]:
sensor_df.shape

(592, 5)

In [7]:
buildings_df.head()

vertex_id,x_coordinate,y_coordinate,z_coordinate
0,461.530842,526.137223,0.0
1,461.483806,525.26619,0.0
2,461.483806,525.26619,144.0
3,461.530842,526.137223,144.0
4,467.275519,525.031849,0.0


In [56]:
buildings_df.shape

(304, 5)

In [8]:
labels_df.head()

sensor_id,hb_solar_radiation
0,614.938511
1,605.714347
2,601.056937
3,606.311304
4,607.236071


In [9]:
labels_df.shape

(592, 1)

## **Prepare Dataset - edge.csv**

In [10]:
distance_df = pd.read_csv(r"C:\Users\colin\OneDrive\Desktop\Thesis Part 2\thesis - simulation\data\distance.csv")
sensor_length_df = pd.read_csv(r"C:\Users\colin\OneDrive\Desktop\Thesis Part 2\thesis - simulation\data\sensor_length.csv")
vertex_length_df = pd.read_csv(r"C:\Users\colin\OneDrive\Desktop\Thesis Part 2\thesis - simulation\data\vertex_length.csv")
print('successful')

successful


In [11]:
#check
distance_df.head()

Unnamed: 0,distance
0,67.407903
1,66.574186
2,153.905749
3,154.268216
4,65.231875


In [12]:
distance_df.shape

(179968, 1)

In [13]:
#check
sensor_length_df.head()

Unnamed: 0,592


In [14]:
#check
vertex_length_df.head()

Unnamed: 0,304


In [15]:
#extract out the length values from both dataframes

sensor_length = sensor_length_df.columns[0]
print(sensor_length)
vertex_length = vertex_length_df.columns[0]
print(vertex_length)

592
304


In [16]:
# convert length to numerical from string values 

sensor_length = int(sensor_length)
print(type(sensor_length))

vertex_length = int(vertex_length)
print(type(vertex_length))

<class 'int'>
<class 'int'>


In [17]:
def create_temp_df(n, m):
    # n is the number of sensors, m is the number of vertices
    data = []  # List to hold the data
    for sensor_length in range(1, n+1):  # Iterate through each sensor
        for vertex_length in range(1, m + 1):  # Iterate through each vertex for the sensor
            # Append a tuple with the sensor ID and vertex ID to the list
            data.append((f'sensor_id_{sensor_length}', f'vertex_{vertex_length}'))
    
    # Create DataFrame from the data list
    temp_df = pd.DataFrame(data, columns=['sensor_id', 'vertex_id'])
    
    return temp_df

In [18]:
# call function to create sensor id and vertex id dataframe

temp_df = create_temp_df(sensor_length, vertex_length)
temp_df

Unnamed: 0,sensor_id,vertex_id
0,sensor_id_1,vertex_1
1,sensor_id_1,vertex_2
2,sensor_id_1,vertex_3
3,sensor_id_1,vertex_4
4,sensor_id_1,vertex_5
...,...,...
179963,sensor_id_592,vertex_300
179964,sensor_id_592,vertex_301
179965,sensor_id_592,vertex_302
179966,sensor_id_592,vertex_303


In [19]:
temp_df['distance'] = distance_df['distance']
edge_df = temp_df

In [20]:
edge_df

Unnamed: 0,sensor_id,vertex_id,distance
0,sensor_id_1,vertex_1,67.407903
1,sensor_id_1,vertex_2,66.574186
2,sensor_id_1,vertex_3,153.905749
3,sensor_id_1,vertex_4,154.268216
4,sensor_id_1,vertex_5,65.231875
...,...,...,...
179963,sensor_id_592,vertex_300,66.892615
179964,sensor_id_592,vertex_301,143.250578
179965,sensor_id_592,vertex_302,67.726446
179966,sensor_id_592,vertex_303,144.568339


## **Prepare Dataset - Null Check**

In [21]:
# function for null values check

def null_check(df):
    total_rows = len(df)
    for column in df.columns:
        null_count = df[column].isnull().sum()
        null_percentage = round((null_count/total_rows)*100,2)
        print('_'*20)
        print(f'{column}')
        print('_'*20)
        print(f'Null count: {null_count}')
        print(f'Percentage : {null_percentage}')

In [22]:
null_check(sensor_df)
null_check(buildings_df)
null_check(labels_df)
null_check(distance_df)

____________________
sensor_x_coordinate
____________________
Null count: 0
Percentage : 0.0
____________________
sensor_y_coordinate
____________________
Null count: 0
Percentage : 0.0
____________________
sensor_z_coordinate
____________________
Null count: 0
Percentage : 0.0
____________________
x_coordinate
____________________
Null count: 0
Percentage : 0.0
____________________
y_coordinate
____________________
Null count: 0
Percentage : 0.0
____________________
z_coordinate
____________________
Null count: 0
Percentage : 0.0
____________________
hb_solar_radiation
____________________
Null count: 0
Percentage : 0.0
____________________
distance
____________________
Null count: 0
Percentage : 0.0


## **Preprocess Node Index and Naming**

In [23]:
# Resetting the index so 'sensor_id' is no longer the index column
sensor_df.reset_index(drop=True, inplace=True)

# Creating a new 'sensor_id' column with formatted values
sensor_df['sensor_id'] = ['sensor_id_' + str(i + 1) for i in sensor_df.index]

# Insert the 'sensor_id_formatted' column as the first column
sensor_df.insert(0, 'sensor_id', sensor_df.pop('sensor_id'))

In [24]:
# Resetting the index so 'sensor_id' is no longer the index column
buildings_df.reset_index(drop=True, inplace=True)

# Creating a new 'sensor_id' column with formatted values
buildings_df['vertex_id'] = ['vertex_id_' + str(i + 1) for i in buildings_df.index]

# Insert the 'sensor_id_formatted' column as the first column
buildings_df.insert(0, 'vertex_id', buildings_df.pop('vertex_id'))

In [25]:
# Resetting the index so 'sensor_id' is no longer the index column
labels_df.reset_index(drop=True, inplace=True)

# Creating a new 'sensor_id' column with formatted values
labels_df['sensor_id'] = ['sensor_id_' + str(i + 1) for i in labels_df.index]

# Insert the 'sensor_id_formatted' column as the first column
labels_df.insert(0, 'sensor_id', labels_df.pop('sensor_id'))

## **Final Check**

In [26]:
print('sensor_df')
print(sensor_df.head())
print('buildings_df')
print(buildings_df.head())
print('labels_df')
print(labels_df.head())
print('edge_df')
print(edge_df.head())

sensor_df
sensor_id    sensor_id sensor_x_coordinate sensor_y_coordinate  \
0          sensor_id_1          477.047607          460.741455   
1          sensor_id_2          477.047607          460.741455   
2          sensor_id_3          477.047607          460.741455   
3          sensor_id_4          477.047607          460.741455   
4          sensor_id_5          477.047607          460.741455   

sensor_id sensor_z_coordinate  
0                    5.142857  
1                   15.428571  
2                   25.714286  
3                        36.0  
4                   46.285713  
buildings_df
vertex_id    vertex_id x_coordinate y_coordinate z_coordinate
0          vertex_id_1   461.530842   526.137223          0.0
1          vertex_id_2   461.483806    525.26619          0.0
2          vertex_id_3   461.483806    525.26619        144.0
3          vertex_id_4   461.530842   526.137223        144.0
4          vertex_id_5   467.275519   525.031849          0.0
labels_df
sensor

## **Overall Graph Object**

- Node Features
- Edge Index
- Edge Attributes
- Combine to form data object

## **Prepare Node Features**

Combine sensor and building information to create a unified node feature matrix. 

Combine sensor_df and buildings_df into a single dataframe, ensuring each has a unique identifier across sensors and vertices

In [27]:
# Add a column to distinguish between sensors and vertices
sensor_df['type'] = 'sensor'
buildings_df['type'] = 'vertex'

In [28]:
# Combine dataframes
all_nodes_df = pd.concat([sensor_df.assign(index=range(0, len(sensor_df))),
                          buildings_df.assign(index=range(len(sensor_df), len(sensor_df)+len(buildings_df)))])


In [29]:
all_nodes_df

Unnamed: 0,sensor_id,sensor_x_coordinate,sensor_y_coordinate,sensor_z_coordinate,type,index,vertex_id,x_coordinate,y_coordinate,z_coordinate
0,sensor_id_1,477.047607,460.741455,5.142857,sensor,0,,,,
1,sensor_id_2,477.047607,460.741455,15.428571,sensor,1,,,,
2,sensor_id_3,477.047607,460.741455,25.714286,sensor,2,,,,
3,sensor_id_4,477.047607,460.741455,36.0,sensor,3,,,,
4,sensor_id_5,477.047607,460.741455,46.285713,sensor,4,,,,
...,...,...,...,...,...,...,...,...,...,...
299,,,,,vertex,891,vertex_id_300,461.090481,441.05205,81.0
300,,,,,vertex,892,vertex_id_301,458.656271,441.127307,0.0
301,,,,,vertex,893,vertex_id_302,458.656271,441.127307,81.0
302,,,,,vertex,894,vertex_id_303,458.438724,434.55743,0.0


In [30]:
# Prepare node features - example: using coordinates and a type flag (sensor=1, vertex=0)
all_nodes_df['type_flag'] = all_nodes_df['type'].apply(lambda x: 1 if x == 'sensor' else 0)
node_features = all_nodes_df[['sensor_x_coordinate', 'sensor_y_coordinate', 'sensor_z_coordinate', 'type_flag']].fillna(0).values
x = torch.tensor(node_features, dtype=torch.float)

In [31]:
# Print the tensor to check its contents
print("Node Features Tensor:")
print(x)

Node Features Tensor:
tensor([[477.0476, 460.7415,   5.1429,   1.0000],
        [477.0476, 460.7415,  15.4286,   1.0000],
        [477.0476, 460.7415,  25.7143,   1.0000],
        ...,
        [  0.0000,   0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000]])


## **Create Edge Index**

1. Map each sensor_id and vertex_id to a unique index.2. 
Use these mappings to create the edge_index tensor from edge_df.

Step 1: Prepare Mappings
First, we'll create dictionaries to map sensor_id and vertex_id to unique indices. We'll concatenate the indices of sensors and vertices to ensure uniqueness across the graph.

In [32]:
sensor_ids = sensor_df['sensor_id'].unique()
vertex_ids = buildings_df['vertex_id'].unique()

# Create a continuous index for sensors and vertices
sensor_index = {sensor_id: i for i, sensor_id in enumerate(sensor_ids)}
vertex_index = {vertex_id: i + len(sensor_index) for i, vertex_id in enumerate(vertex_ids)}

Step 2: Create Edge Index
Next, we'll use these mappings to create the edge_index tensor. Note that the vertex_id in edge_df appears to have a slight discrepancy (missing the "id_" part based on the example provided), so we'll adjust for that in our mapping.

In [33]:
# Adjust the vertex_id in edge_df to match the format in buildings_df
edge_df['adjusted_vertex_id'] = edge_df['vertex_id'].apply(lambda x: 'vertex_id_' + x.split('_')[-1])

# Map sensor_id and vertex_id to their respective indices
edge_index_list = edge_df.apply(lambda row: [sensor_index.get(row['sensor_id'], -1),
                                             vertex_index.get(row['adjusted_vertex_id'], -1)], axis=1)

# Filter out any edges that couldn't be mapped (-1 indicates a mapping failure)
filtered_edge_index_list = [pair for pair in edge_index_list if -1 not in pair]

# Convert to torch tensor
edge_index = torch.tensor(filtered_edge_index_list, dtype=torch.long).t().contiguous()


In [34]:
edge_index

tensor([[  0,   0,   0,  ..., 591, 591, 591],
        [592, 593, 594,  ..., 893, 894, 895]])

## **Edge Attributes**

In [35]:
edge_attr = torch.tensor(edge_df[['distance']].values, dtype=torch.float)

In [36]:
edge_attr

tensor([[ 67.4079],
        [ 66.5742],
        [153.9057],
        ...,
        [ 67.7264],
        [144.5683],
        [ 70.4709]])

## **Target Labels**

In [37]:
# Ensure data type compatibility
labels_df['hb_solar_radiation'] = labels_df['hb_solar_radiation'].astype(float)

In [44]:
# Update labels for sensors with their radiation values
labels_df['index'] = labels_df['sensor_id'].map(sensor_index)

In [45]:
# Create torch tensor with compatible data type
labels = torch.zeros(len(labels_df), dtype=torch.float)
labels[labels_df['index']] = torch.tensor(labels_df['hb_solar_radiation'].values, dtype=torch.float)

## **Create a PyTorch Geometric Data Object**

In [49]:
# use this chunk for TRAINING data
data_training = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=labels)

In [50]:
# Print check for the Data object
print("Data Object:")
print(data_training)

Data Object:
Data(x=[896, 4], edge_index=[2, 179968], edge_attr=[179968, 1], y=[592])


In [51]:
# use this chunk for PREDICTION data
data_predict = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

In [52]:
# Print check for the Data object
print("Data Object:")
print(data_predict)

Data Object:
Data(x=[896, 4], edge_index=[2, 179968], edge_attr=[179968, 1])


## **Saving the Data Object**

In [None]:
# UNCOMMENT to save

#torch.save(data_training, 'data/graph_data_training.pt')
print('saved')

In [54]:
# UNCOMMENT to save

torch.save(data_predict, 'data/graph_data_predict.pt')
print('saved')

saved
