In [4]:
import os
import numpy as np
import pandas as pd

# Define the folder containing the data files
data_folder = 'data'

# Define the percentages and distances
percentages = [2.5, 5, 7.5]
distances = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]

# Define the number of rows to use from each file
NUM_ROWS = 200000

# Function to load additional input data from a text file
def load_additional_data(file_path, num_rows):
    data = []
    with open(file_path, 'r') as file:
        # Read only up to num_rows + 1 lines (including header)
        for _ in range(num_rows + 1):
            line = file.readline()
            if not line:
                break
            if _ == 0:
                continue  # Skip the header row
            try:
                data.append(list(map(float, line.strip().split(',')[1:])))  # Split on commas, skip the first column, convert to float
            except ValueError:
                continue  # Skip lines that can't be converted to float
    data = np.array(data)
    return data[:, -3:]  # Last 3 columns are pressure, x-velocity, and y-velocity

# Function to extract percentage and distance from file path
def extract_percentage_distance(file_path):
    filename = os.path.basename(file_path)
    percentage, distance = filename.split('_')
    percentage = float(percentage[:-1])
    distance = float(distance[:-5])
    return percentage, distance

# Collect all file paths
file_paths = []
for percentage in percentages:
    for distance in distances:
        file_path = os.path.join(data_folder, f'{percentage}%_{distance}m.txt')
        if os.path.exists(file_path):
            file_paths.append(file_path)

# Load output data from the Excel file
output_data = pd.read_excel('pipeline.xlsx')
output_data_dict = {(row['percentage'], row['distance']): (row['pressure_diff'], row['leak_mass_flow']) 
                    for _, row in output_data.iterrows()}

# Load all input data and corresponding output data
all_input = []
all_output = []

for file_path in file_paths:
    additional_data = load_additional_data(file_path, NUM_ROWS)
    percentage, distance = extract_percentage_distance(file_path)
    pressure_diff, leak_mass_flow = output_data_dict[(percentage, distance)]
    
    # Concatenate the columns to form a single vector of length 600000
    concatenated_data = additional_data.flatten()
    
    all_input.append(concatenated_data)
    output_values = np.array([pressure_diff, leak_mass_flow])
    all_output.append(output_values)

all_input = np.array(all_input)
all_output = np.array(all_output)

print(f'Shape of the input data: {all_input.shape}')
print(f'Shape of the output data: {all_output.shape}')


Shape of the input data: (60, 600000)
Shape of the output data: (60, 2)


In [8]:
# Check if there are any None values in the list form of all_input
has_none_values = any(sample is None for sample in all_input)

if has_none_values:
    print("Input data contains None values.")
else:
    print("Input data does not contain None values.")


Input data does not contain None values.


In [9]:
# Check if there are any None values in the list form of all_output
has_none_values = any(sample is None for sample in all_output)

if has_none_values:
    print("Output data contains None values.")
else:
    print("Output data does not contain None values.")


Output data does not contain None values.


In [5]:
# Check for None values in the dataset
assert all_input is not None and all_output is not None, "Input or output data contains None values."
assert not np.any(np.isnan(all_input)), "Input data contains NaN values."
assert not np.any(np.isnan(all_output)), "Output data contains NaN values."

# Ensure the input data shape is correct (number_of_files, 600000)
X_train = X_train.reshape(-1, 600000)
X_test = X_test.reshape(-1, 600000)

# Check shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (48, 600000)
X_test shape: (12, 600000)
y_train shape: (48, 2)
y_test shape: (12, 2)


In [6]:
# Define the model architecture
def build_model(input_shape, output_shape):
    input_layer = Input(shape=input_shape, name='input_layer')
    x = Dense(512, activation='relu')(input_layer)
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    output_layer = Dense(output_shape, activation='linear')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

# Build and compile the model
model = build_model((600000,), 2)
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mse', 'mae'])
model.summary()


In [7]:
# Train the model
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)]
)

# Evaluate the model
loss, mse, mae = model.evaluate(X_test)
print(f'Loss: {loss}, MSE: {mse}, MAE: {mae}')

# Predict and calculate R^2 score
predictions_scaled = model.predict(X_test)
predictions = output_scaler.inverse_transform(predictions_scaled)
y_test_original = output_scaler.inverse_transform(y_test)

r2 = r2_score(y_test_original, predictions, multioutput='raw_values')
mse = mean_squared_error(y_test_original, predictions, multioutput='raw_values')
mae = mean_absolute_error(y_test_original, predictions, multioutput='raw_values')

print(f'R^2: {r2}, MSE: {mse}, MAE: {mae}')


Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - loss: 4611.8149 - mae: 24.2862 - mse: 4611.8149 - val_loss: 6716.5898 - val_mae: 67.8652 - val_mse: 6716.5898
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2s/step - loss: 4528.0825 - mae: 52.6810 - mse: 4528.0825 - val_loss: 137916.7031 - val_mae: 127.7544 - val_mse: 137916.7031
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2s/step - loss: 41968.0430 - mae: 62.3205 - mse: 41968.0430 - val_loss: 19411.7246 - val_mae: 50.7439 - val_mse: 19411.7246
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2s/step - loss: 5764.0146 - mae: 26.5355 - mse: 5764.0146 - val_loss: 35360.3828 - val_mae: 67.7491 - val_mse: 35360.3828
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2s/step - loss: 20308.9609 - mae: 34.9380 - mse: 20308.9609 - val_loss: 8846.9805 - val_mae: 35.6196 - val_mse: 8846.9805
Epoch 6/100
[1m

ValueError: None values not supported.

In [10]:
import numpy as np
import os
import pandas as pd

# Define the folder containing the data files
data_folder = 'data'

# Define the percentages and distances
percentages = [2.5, 5, 7.5]
distances = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]

# Define the number of rows to use from each file
NUM_ROWS = 200000

# Function to load additional input data from a text file
def load_additional_data(file_path, num_rows):
    data = []
    with open(file_path, 'r') as file:
        # Read only up to num_rows + 1 lines (including header)
        for _ in range(num_rows + 1):
            line = file.readline()
            if not line:
                break
            if _ == 0:
                continue  # Skip the header row
            try:
                data.append(list(map(float, line.strip().split(',')[1:])))  # Split on commas, skip the first column, convert to float
            except ValueError:
                continue  # Skip lines that can't be converted to float
    data = np.array(data)
    return data[:, -3:]  # Last 3 columns are pressure, x-velocity, and y-velocity

# Function to extract percentage and distance from file path
def extract_percentage_distance(file_path):
    filename = os.path.basename(file_path)
    percentage, distance = filename.split('_')
    percentage = float(percentage[:-1])
    distance = float(distance[:-5])
    return percentage, distance

# Collect all file paths
file_paths = []
for percentage in percentages:
    for distance in distances:
        file_path = os.path.join(data_folder, f'{percentage}%_{distance}m.txt')
        if os.path.exists(file_path):
            file_paths.append(file_path)

# Load all input data
all_input = []

for file_path in file_paths:
    additional_data = load_additional_data(file_path, NUM_ROWS)
    # Concatenate the columns to form a single vector of length 600000
    concatenated_data = additional_data.flatten()
    all_input.append(concatenated_data)

all_input = np.array(all_input)
print("Loaded input data:")
print(all_input.shape)
print(all_input[:3])  # Print first 3 rows of input data


Loaded input data:
(60, 600000)
[[ 1.17422735e-01  1.15993694e-02 -1.25489488e-05 ...  7.07894856e-02
   4.00098533e-03 -2.77209255e-09]
 [ 1.17422735e-01  1.15993694e-02 -1.25489488e-05 ...  7.07894856e-02
   4.00098533e-03 -2.77209255e-09]
 [ 1.55928723e-01  0.00000000e+00  0.00000000e+00 ...  3.24692202e-02
   7.49515345e-03 -4.04375338e-10]]


In [11]:
# Load output data from the Excel file
output_data = pd.read_excel('pipeline.xlsx')
output_data_dict = {(row['percentage'], row['distance']): (row['pressure_diff'], row['leak_mass_flow']) 
                    for _, row in output_data.iterrows()}

# Load all output data
all_output = []

for file_path in file_paths:
    percentage, distance = extract_percentage_distance(file_path)
    pressure_diff, leak_mass_flow = output_data_dict[(percentage, distance)]
    output_values = np.array([pressure_diff, leak_mass_flow])
    all_output.append(output_values)

all_output = np.array(all_output)
print("Loaded output data:")
print(all_output.shape)
print(all_output[:3])  # Print first 3 rows of output data


Loaded output data:
(60, 2)
[[ 0.14015002 -0.31060652]
 [ 0.14072789 -0.30094531]
 [ 0.14139264 -0.29158252]]


In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Normalize the input data
input_scaler = StandardScaler()
all_input_scaled = input_scaler.fit_transform(all_input)

# Normalize the output data
output_scaler = StandardScaler()
all_output_scaled = output_scaler.fit_transform(all_output)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_input_scaled, all_output_scaled, test_size=0.2, random_state=42)

# Ensure the input data shape is correct (number_of_files, 600000)
X_train = X_train.reshape(-1, 600000)
X_test = X_test.reshape(-1, 600000)

# Check for any None values in the dataset
assert X_train is not None and X_test is not None and y_train is not None and y_test is not None, "Data contains None values."

# Print the shapes of the training and testing sets
print("Shapes of training and testing sets:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

# Print the first few rows of the training and testing sets
print("First 3 rows of X_train:")
print(X_train[:3])
print("First 3 rows of y_train:")
print(y_train[:3])
print("First 3 rows of X_test:")
print(X_test[:3])
print("First 3 rows of y_test:")
print(y_test[:3])


Shapes of training and testing sets:
X_train: (48, 600000)
y_train: (48, 2)
X_test: (12, 600000)
y_test: (12, 2)
First 3 rows of X_train:
[[ 0.28020182 -0.18569534  0.18569534 ...  0.05834721  0.13167767
   0.04014273]
 [ 0.16991526 -0.18569534  0.18569534 ... -0.15649352  0.16899244
   0.1124291 ]
 [ 0.10569302 -0.18569534  0.18569534 ...  0.37610725  0.15676856
  -0.26102782]]
First 3 rows of y_train:
[[ 0.21604547  0.0067316 ]
 [ 0.06307197  0.33434222]
 [-0.02726671 -0.48066648]]
First 3 rows of X_test:
[[-3.73700907  5.38516481 -5.38516481 ...  4.5478124  -5.07433027
   0.11173591]
 [ 0.28964205 -0.18569534  0.18569534 ... -0.27722138  0.19294067
   0.11200101]
 [ 0.77196897 -0.18569534  0.18569534 ... -0.17996722  0.51218037
   0.11266411]]
First 3 rows of y_test:
[[-0.20128112  0.20244017]
 [ 0.23904631  0.42489637]
 [ 0.89707506  0.78542062]]


In [13]:
# Define the model architecture
def build_model(input_shape, output_shape):
    input_layer = Input(shape=input_shape, name='input_layer')
    x = Dense(512, activation='relu')(input_layer)
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    output_layer = Dense(output_shape, activation='linear')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

# Build and compile the model
model = build_model((600000,), 2)
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mse', 'mae'])
model.summary()


In [14]:
# Train the model
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)]
)

# Evaluate the model
loss, mse, mae = model.evaluate(X_test)
print(f'Loss: {loss}, MSE: {mse}, MAE: {mae}')

# Predict and calculate R^2 score
predictions_scaled = model.predict(X_test)
predictions = output_scaler.inverse_transform(predictions_scaled)
y_test_original = output_scaler.inverse_transform(y_test)

r2 = r2_score(y_test_original, predictions, multioutput='raw_values')
mse = mean_squared_error(y_test_original, predictions, multioutput='raw_values')
mae = mean_absolute_error(y_test_original, predictions, multioutput='raw_values')

print(f'R^2: {r2}, MSE: {mse}, MAE: {mae}')


Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 6s/step - loss: 1112.3588 - mae: 14.4017 - mse: 1112.3588 - val_loss: 49900.2773 - val_mae: 68.0892 - val_mse: 49900.2773
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3s/step - loss: 14686.4561 - mae: 30.6721 - mse: 14686.4561 - val_loss: 14033.7998 - val_mae: 57.4574 - val_mse: 14033.7998
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3s/step - loss: 5665.9712 - mae: 40.0021 - mse: 5665.9712 - val_loss: 8409.9189 - val_mae: 43.5325 - val_mse: 8409.9189
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3s/step - loss: 5596.0786 - mae: 43.4574 - mse: 5596.0786 - val_loss: 38878.4453 - val_mae: 72.6705 - val_mse: 38878.4453
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3s/step - loss: 12241.7178 - mae: 38.2623 - mse: 12241.7178 - val_loss: 24773.6328 - val_mae: 63.2885 - val_mse: 24773.6328
Epoch 6/100
[1m2

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3s/step - loss: 8.8053 - mae: 1.8688 - mse: 8.8053 - val_loss: 118.8872 - val_mae: 8.7861 - val_mse: 118.8872
Epoch 86/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3s/step - loss: 5.3143 - mae: 1.5049 - mse: 5.3143 - val_loss: 104.1404 - val_mae: 8.2292 - val_mse: 104.1404
Epoch 87/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3s/step - loss: 9.5630 - mae: 1.7925 - mse: 9.5630 - val_loss: 114.3404 - val_mae: 8.4671 - val_mse: 114.3404
Epoch 88/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3s/step - loss: 6.7874 - mae: 1.6933 - mse: 6.7874 - val_loss: 124.5257 - val_mae: 9.1583 - val_mse: 124.5257
Epoch 89/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3s/step - loss: 9.4071 - mae: 1.7148 - mse: 9.4071 - val_loss: 118.3660 - val_mae: 8.8458 - val_mse: 118.3660
Epoch 90/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3s/step - l

ValueError: None values not supported.

In [1]:
import os
import numpy as np
import pandas as pd

# Define the folder containing the data files
data_folder = 'data'

# Define the percentages and distances
percentages = [2.5, 5, 7.5]
distances = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]

# Define the number of rows to use from each file
NUM_ROWS = 250000

# Function to load additional input data from a text file
def load_additional_data(file_path, num_rows):
    data = []
    with open(file_path, 'r') as file:
        # Read only up to num_rows + 1 lines (including header)
        for _ in range(num_rows + 1):
            line = file.readline()
            if not line:
                break
            if _ == 0:
                continue  # Skip the header row
            try:
                data.append(list(map(float, line.strip().split(',')[1:])))  # Split on commas, skip the first column, convert to float
            except ValueError:
                continue  # Skip lines that can't be converted to float
    data = np.array(data)
    return data[:, -3:]  # Last 3 columns are pressure, x-velocity, and y-velocity

# Function to extract percentage and distance from file path
def extract_percentage_distance(file_path):
    filename = os.path.basename(file_path)
    percentage, distance = filename.split('_')
    percentage = float(percentage[:-1])
    distance = float(distance[:-5])
    return percentage, distance

# Collect all file paths
file_paths = []
for percentage in percentages:
    for distance in distances:
        file_path = os.path.join(data_folder, f'{percentage}%_{distance}m.txt')
        if os.path.exists(file_path):
            file_paths.append(file_path)

# Load all input data
all_input = []

for file_path in file_paths:
    additional_data = load_additional_data(file_path, NUM_ROWS)
    # Concatenate the columns to form a single vector of length 750000
    concatenated_data = additional_data.flatten()
    all_input.append(concatenated_data)

all_input = np.array(all_input)
print("Loaded input data:")
print(all_input.shape)
print(all_input[:3])  # Print first 3 rows of input data

# Check for None values in input data
if np.any(all_input == None):
    print("Input data contains None values.")
else:
    print("Input data does not contain any None values.")

# Load output data from the Excel file
output_data = pd.read_excel('pipeline.xlsx')
output_data_dict = {(row['percentage'], row['distance']): (row['pressure_diff'], row['leak_mass_flow']) 
                    for _, row in output_data.iterrows()}

# Load all output data
all_output = []

for file_path in file_paths:
    percentage, distance = extract_percentage_distance(file_path)
    pressure_diff, leak_mass_flow = output_data_dict[(percentage, distance)]
    output_values = np.array([pressure_diff, leak_mass_flow])
    all_output.append(output_values)

all_output = np.array(all_output)
print("Loaded output data:")
print(all_output.shape)
print(all_output[:3])  # Print first 3 rows of output data

# Check for None values in output data
if np.any(all_output == None):
    print("Output data contains None values.")
else:
    print("Output data does not contain any None values.")

# Normalize the input data
from sklearn.preprocessing import StandardScaler
input_scaler = StandardScaler()
all_input_scaled = input_scaler.fit_transform(all_input)

# Normalize the output data
output_scaler = StandardScaler()
all_output_scaled = output_scaler.fit_transform(all_output)

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(all_input_scaled, all_output_scaled, test_size=0.2, random_state=42)

# Ensure the input data shape is correct (number_of_files, 750000)
X_train = X_train.reshape(-1, 750000)
X_test = X_test.reshape(-1, 750000)

# Check for any None values in the dataset
assert X_train is not None and X_test is not None and y_train is not None and y_test is not None, "Data contains None values."

# Print the shapes of the training and testing sets
print("Shapes of training and testing sets:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

# Print the first few rows of the training and testing sets
print("First 3 rows of X_train:")
print(X_train[:3])
print("First 3 rows of y_train:")
print(y_train[:3])
print("First 3 rows of X_test:")
print(X_test[:3])
print("First 3 rows of y_test:")
print(y_test[:3])


Loaded input data:
(60, 750000)
[[ 1.17422735e-01  1.15993694e-02 -1.25489488e-05 ...  5.83271294e-02
   7.68093359e-03 -3.71882971e-09]
 [ 1.17422735e-01  1.15993694e-02 -1.25489488e-05 ...  5.83271294e-02
   7.68093359e-03 -3.71882971e-09]
 [ 1.55928723e-01  0.00000000e+00  0.00000000e+00 ...  4.06258912e-02
   1.61717300e-03 -5.00160172e-10]]
Input data does not contain any None values.
Loaded output data:
(60, 2)
[[ 0.14015002 -0.31060652]
 [ 0.14072789 -0.30094531]
 [ 0.14139264 -0.29158252]]
Output data does not contain any None values.
Shapes of training and testing sets:
X_train: (48, 750000)
y_train: (48, 2)
X_test: (12, 750000)
y_test: (12, 2)
First 3 rows of X_train:
[[ 0.28020182 -0.18569534  0.18569534 ...  0.17858205 -0.21895742
   0.17967591]
 [ 0.16991526 -0.18569534  0.18569534 ... -0.14175091 -0.22212558
   0.18215694]
 [ 0.10569302 -0.18569534  0.18569534 ...  3.10540157  1.29466878
  -6.47320482]]
First 3 rows of y_train:
[[ 0.21604547  0.0067316 ]
 [ 0.06307197  0.

In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

# Define the folder containing the data files
data_folder = 'data'

# Define the percentages and distances
percentages = [2.5, 5, 7.5]
distances = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]

# Define the number of rows to use from each file
NUM_ROWS = 250000

# Function to load additional input data from a text file
def load_additional_data(file_path, num_rows):
    data = []
    with open(file_path, 'r') as file:
        # Read only up to num_rows + 1 lines (including header)
        for _ in range(num_rows + 1):
            line = file.readline()
            if not line:
                break
            if _ == 0:
                continue  # Skip the header row
            try:
                data.append(list(map(float, line.strip().split(',')[1:])))  # Split on commas, skip the first column, convert to float
            except ValueError:
                continue  # Skip lines that can't be converted to float
    data = np.array(data)
    return data[:, -3:]  # Last 3 columns are pressure, x-velocity, and y-velocity

# Function to extract percentage and distance from file path
def extract_percentage_distance(file_path):
    filename = os.path.basename(file_path)
    percentage, distance = filename.split('_')
    percentage = float(percentage[:-1])
    distance = float(distance[:-5])
    return percentage, distance

# Collect all file paths
file_paths = []
for percentage in percentages:
    for distance in distances:
        file_path = os.path.join(data_folder, f'{percentage}%_{distance}m.txt')
        if os.path.exists(file_path):
            file_paths.append(file_path)

# Load all input data
all_input = []

for file_path in file_paths:
    additional_data = load_additional_data(file_path, NUM_ROWS)
    # Concatenate the columns to form a single vector of length 750000
    concatenated_data = additional_data.flatten()
    all_input.append(concatenated_data)

all_input = np.array(all_input)
print("Loaded input data:")
print(all_input.shape)
print(all_input[:3])  # Print first 3 rows of input data

# Check for None values in input data
if np.any(all_input == None):
    print("Input data contains None values.")
else:
    print("Input data does not contain any None values.")

# Load output data from the Excel file
output_data = pd.read_excel('pipeline.xlsx')
output_data_dict = {(row['percentage'], row['distance']): (row['pressure_diff'], row['leak_mass_flow']) 
                    for _, row in output_data.iterrows()}

# Load all output data
all_output = []

for file_path in file_paths:
    percentage, distance = extract_percentage_distance(file_path)
    pressure_diff, leak_mass_flow = output_data_dict[(percentage, distance)]
    output_values = np.array([pressure_diff, leak_mass_flow])
    all_output.append(output_values)

all_output = np.array(all_output)
print("Loaded output data:")
print(all_output.shape)
print(all_output[:3])  # Print first 3 rows of output data

# Check for None values in output data
if np.any(all_output == None):
    print("Output data contains None values.")
else:
    print("Output data does not contain any None values.")

# Normalize the input data
input_scaler = StandardScaler()
all_input_scaled = input_scaler.fit_transform(all_input)

# Normalize the output data
output_scaler = StandardScaler()
all_output_scaled = output_scaler.fit_transform(all_output)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_input_scaled, all_output_scaled, test_size=0.2, random_state=42)

# Ensure the input data shape is correct (number_of_files, 750000)
X_train = X_train.reshape(-1, 750000)
X_test = X_test.reshape(-1, 750000)

# Check for any None values in the dataset
assert X_train is not None and X_test is not None and y_train is not None and y_test is not None, "Data contains None values."

# Define the model architecture
def build_model(input_shape, output_shape):
    input_layer = Input(shape=input_shape, name='input_layer')
    x = Dense(512, activation='relu')(input_layer)
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    output_layer = Dense(output_shape, activation='linear')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

# Build and compile the model
model = build_model((750000,), 2)
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mse', 'mae'])

# Train the model
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)]
)

# Evaluate the model
loss, mse, mae = model.evaluate(X_test)

# Predict and calculate R^2 score
predictions_scaled = model.predict(X_test)
predictions = output_scaler.inverse_transform(predictions_scaled)
y_test_original = output_scaler.inverse_transform(y_test)

r2 = r2_score(y_test_original, predictions, multioutput='raw_values')
mse = mean_squared_error(y_test_original, predictions, multioutput='raw_values')
mae = mean_absolute_error(y_test_original, predictions, multioutput='raw_values')

print(f'Loss: {loss}, MSE: {mse}, MAE: {mae}, R^2: {r2}')

# Plot training history
plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Training History')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


Loaded input data:
(60, 750000)
[[ 1.17422735e-01  1.15993694e-02 -1.25489488e-05 ...  5.83271294e-02
   7.68093359e-03 -3.71882971e-09]
 [ 1.17422735e-01  1.15993694e-02 -1.25489488e-05 ...  5.83271294e-02
   7.68093359e-03 -3.71882971e-09]
 [ 1.55928723e-01  0.00000000e+00  0.00000000e+00 ...  4.06258912e-02
   1.61717300e-03 -5.00160172e-10]]
Input data does not contain any None values.
Loaded output data:
(60, 2)
[[ 0.14015002 -0.31060652]
 [ 0.14072789 -0.30094531]
 [ 0.14139264 -0.29158252]]
Output data does not contain any None values.
Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 6s/step - loss: 2769.2478 - mae: 19.4723 - mse: 2769.2478 - val_loss: 20469.1426 - val_mae: 103.9380 - val_mse: 20469.1426
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - loss: 10276.2158 - mae: 66.4005 - mse: 10276.2158 - val_loss: 355480.7812 - val_mae: 148.4969 - val_mse: 355480.7812
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2s/step - loss: 4966.4512 - mae: 11.7049 - mse: 4966.4512 - val_loss: 9481.0166 - val_mae: 38.9463 - val_mse: 9481.0166
Epoch 83/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2s/step - loss: 2693.6997 - mae: 11.6512 - mse: 2693.6997 - val_loss: 9612.5752 - val_mae: 40.3830 - val_mse: 9612.5752
Epoch 84/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2s/step - loss: 5081.2285 - mae: 12.1913 - mse: 5081.2285 - val_loss: 13650.1982 - val_mae: 38.9637 - val_mse: 13650.1982
Epoch 85/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2s/step - loss: 3936.0715 - mae: 12.3988 - mse: 3936.0715 - val_loss: 4412.2324 - val_mae: 28.8636 - val_mse: 4412.2324


ValueError: None values not supported.