In [1]:
import pandas as pd
import time
import joblib
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
import os

# Batch configuration: one model is trained per (main folder, subfolder) pair.
# Update the ranges below with your actual main folder numbers.
main_folder_numbers = list(range(500, 2501, 500))  # 500, 1000, ..., 2500
subfolder_numbers = list(range(1, 6))  # subfolders labeled 1 to 5
# Directory holding the X_train_*/y_train_* CSV files; the summary CSV is written here too.
base_path = r'C:\Users\path\TRAIN'

# Helper that builds the feature column names, followed by the function that
# trains, saves and evaluates one model for a single dataset.
def _build_feature_columns(n_sets=20, n_elements=564):
    """Return the ordered list of feature names assigned to the X_train columns.

    For every set index ``s`` in ``1..n_sets`` the list contains ``Force_x_s`` and
    ``Force_y_s`` followed, for every element ``e`` in ``1..n_elements``, by
    ``Strain_x_e_s``, ``Strain_y_e_s`` and ``Strain_xy_e_s``.

    The defaults (20 sets, 564 elements) reproduce the 33,880 names used so far.
    """
    columns = []
    for set_idx in range(1, n_sets + 1):
        columns.append(f"Force_x_{set_idx}")
        columns.append(f"Force_y_{set_idx}")
        for element in range(1, n_elements + 1):
            columns.append(f"Strain_x_{element}_{set_idx}")
            columns.append(f"Strain_y_{element}_{set_idx}")
            columns.append(f"Strain_xy_{element}_{set_idx}")
    return columns


def train_and_evaluate(main_folder_number, subfolder_number):
    """Train, save and score one multi-output XGBoost model for a single dataset.

    Reads ``X_train_<main>_<sub>.csv`` and ``y_train_<main>_<sub>.csv`` from
    ``base_path``, fits a ``MultiOutputRegressor`` wrapping ``xgb.XGBRegressor``,
    dumps the fitted model to ``modelo_xgboost_<main>_<sub>.joblib`` in the current
    working directory and computes R2 / MAE / MAPE on the training data itself.

    Parameters
    ----------
    main_folder_number, subfolder_number
        Values interpolated into the dataset and model file names.

    Returns
    -------
    dict or None
        A dict with keys ``main_folder_number``, ``subfolder_number``, ``r2``,
        ``mae``, ``mape`` and ``training_duration`` (seconds spent in ``fit``).
        ``None`` when any stage fails; the error is printed and the dataset is
        skipped so that a batch run can continue with the next one.
    """
    # Construct the paths to the training files
    X_train_path = os.path.join(base_path, f'X_train_{main_folder_number}_{subfolder_number}.csv')
    y_train_path = os.path.join(base_path, f'y_train_{main_folder_number}_{subfolder_number}.csv')

    # Load the feature and target data
    try:
        print(f"Loading data from {X_train_path} and {y_train_path}")
        X_train = pd.read_csv(X_train_path)
        y_train = pd.read_csv(y_train_path)
    except FileNotFoundError as e:
        print(f"Error loading files: {e}")
        return None

    # Assign the expected feature names. A column-count mismatch would make the
    # assignment raise ValueError and abort the whole batch, so it is reported
    # and skipped like every other failure in this function.
    feature_columns = _build_feature_columns()
    if X_train.shape[1] != len(feature_columns):
        print(
            f"Error assigning column names: {X_train_path} has {X_train.shape[1]} columns, "
            f"expected {len(feature_columns)}"
        )
        return None
    X_train.columns = feature_columns
    print(f"X_train shape: {X_train.shape}")
    # Only a preview is printed: the full list has tens of thousands of names.
    print(f"X_train columns (first 5 of {len(feature_columns)}): {feature_columns[:5]}")

    # Time only the fit call
    start_time_training = time.monotonic()

    # Train the model on the training data
    try:
        modelo = MultiOutputRegressor(xgb.XGBRegressor(learning_rate=0.02, max_depth=15, n_estimators=1000)).fit(X_train, y_train)
    except Exception as e:
        print(f"Error training model: {e}")
        return None

    end_time_training = time.monotonic()
    training_duration = end_time_training - start_time_training
    print(f"Training duration for model {main_folder_number}_{subfolder_number}: {training_duration} seconds")

    # Save the trained model (relative name, so it lands in the current working directory)
    model_filename = f'modelo_xgboost_{main_folder_number}_{subfolder_number}.joblib'
    try:
        joblib.dump(modelo, model_filename)
        print(f"Model saved as {model_filename}")
    except Exception as e:
        print(f"Error saving model: {e}")
        return None

    # Predict on the training data
    try:
        y_train_pred = modelo.predict(X_train)
    except Exception as e:
        print(f"Error predicting on training data: {e}")
        return None

    # Performance on training data (these are in-sample metrics)
    try:
        r2_train = r2_score(y_train, y_train_pred)
        mae_train = mean_absolute_error(y_train, y_train_pred)
        mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
        print(f'R-squared on Train Data for {main_folder_number}_{subfolder_number}: {r2_train}')
        print(f'MAE on Train Data for {main_folder_number}_{subfolder_number}: {mae_train}')
        print(f'MAPE on Train Data for {main_folder_number}_{subfolder_number}: {mape_train}')
    except Exception as e:
        print(f"Error calculating performance metrics: {e}")
        return None

    return {
        'main_folder_number': main_folder_number,
        'subfolder_number': subfolder_number,
        'r2': r2_train,
        'mae': mae_train,
        'mape': mape_train,
        'training_duration': training_duration
    }

# Train one model per (main folder, subfolder) dataset and collect the metrics of
# every run that completed; train_and_evaluate returns None for a failed run.
results = []
for main_folder_number in main_folder_numbers:
    for subfolder_number in subfolder_numbers:
        result = train_and_evaluate(main_folder_number, subfolder_number)
        if result is not None:
            results.append(result)

# Save overall results. The column list is pinned so the CSV keeps its header
# (and a stable schema) even when no run succeeded and `results` is empty.
result_columns = ['main_folder_number', 'subfolder_number', 'r2', 'mae', 'mape', 'training_duration']
results_df = pd.DataFrame(results, columns=result_columns)
if results_df.empty:
    print("Warning: no model was trained successfully; the metrics file will contain only the header")
overall_results_path = os.path.join(base_path, 'overall_performance_metrics.csv')
results_df.to_csv(overall_results_path, index=False)
print(f"Overall performance metrics saved to {overall_results_path}")


# IMPORT LIBRARIES

In [1]:
import pandas as pd;
import numpy as np;
import random;
import matplotlib.pyplot as plt;
from sklearn.preprocessing import StandardScaler,MinMaxScaler;
from sklearn.model_selection import train_test_split;
from sklearn.linear_model import LinearRegression;
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error,mean_absolute_percentage_error; 
from sklearn.ensemble import RandomForestRegressor;
from sklearn.svm import SVR;
import xgboost as xgb;
from IPython.core.interactiveshell import InteractiveShell;
from IPython.display import display;
import time;
import joblib;
from sklearn.multioutput import MultiOutputRegressor;

# CHOOSE DATASET FOR TRAINING 

In [2]:
# Load the filtered feature data used for training.
#  - The data should already be normalized.
#  - The data should only contain useful simulations (Fxy20>Fxy19).
#  - Copy the training set from "(...)\Datasets\Datatrain\XXXX\N\" to "(...)\TRAIN".
#  - Change the numbers in the training file name to match the desired model.
# NOTE(review): absolute, machine-specific path; it has to be edited on any other machine.

X_train = pd.read_csv(r'C:\Users\gambo\Drive\Universidade de Aveiro\OP_DPE_2023 24_João_Marques_(Dissertação_ML) - General\Machine Learning\TRAIN\X_train_500_1.csv')

# Build the feature names: for each index x in 1..20, two force columns followed
# by three strain columns (x, y, xy) for each of the 564 elements.
l = []
for x in range(1, 21):
    l.extend([f"Force_x_{x}", f"Force_y_{x}"])
    for p in range(1, 565):  # element number
        l.extend(f"Strain_{axis}_{p}_{x}" for axis in ("x", "y", "xy"))
X_train.columns = l

display(X_train)

Unnamed: 0,Force_x_1,Force_y_1,Strain_x_1_1,Strain_y_1_1,Strain_xy_1_1,Strain_x_2_1,Strain_y_2_1,Strain_xy_2_1,Strain_x_3_1,Strain_y_3_1,...,Strain_xy_561_20,Strain_x_562_20,Strain_y_562_20,Strain_xy_562_20,Strain_x_563_20,Strain_y_563_20,Strain_xy_563_20,Strain_x_564_20,Strain_y_564_20,Strain_xy_564_20
0,0.266128,1.415468,-1.211796,1.257280,-1.245770,-1.218439,1.263649,-1.188465,-1.218013,1.270970,...,1.593195,0.857547,-0.978087,1.795889,0.857243,-1.006064,1.457292,0.701631,-0.914467,1.607979
1,0.830771,0.653952,0.760128,-0.866536,1.024421,0.761036,-0.869146,1.069098,0.771046,-0.880929,...,0.686807,-0.121648,0.035280,0.554327,-0.172618,0.011162,0.436012,-0.251603,0.040870,0.508650
2,-0.350623,-0.055952,0.322190,-0.163163,-0.264205,0.358506,-0.198379,-0.262932,0.433006,-0.271895,...,1.312366,0.820798,-0.859548,1.006821,0.852927,-0.906549,0.634729,0.762834,-0.852245,0.887156
3,-0.364417,0.078782,0.103307,0.005920,-0.783919,0.123094,-0.013018,-0.915795,0.079401,0.015082,...,-1.303052,-1.475684,1.524507,-0.957024,-1.373788,1.687353,-0.262947,-1.080997,1.663870,-0.580444
4,-1.379887,-1.741839,-1.511295,1.025989,-0.776234,-1.489363,0.996659,-0.556148,-1.433747,0.923846,...,-1.272260,0.821150,-0.712408,-1.436564,0.846180,-0.752843,-1.651461,0.929614,-0.865884,-1.579886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-0.216837,-0.412654,1.119981,-1.184624,0.007469,1.138961,-1.209989,-0.043813,1.174267,-1.256442,...,0.280512,-0.535680,0.637398,0.399367,-0.569065,0.691112,0.628324,-0.569780,0.727459,0.566152
496,1.554957,1.311537,0.719898,-0.890625,1.554692,0.693305,-0.865197,1.567418,0.657041,-0.832633,...,1.289375,-0.493874,0.311714,1.073459,-0.559700,0.243683,0.927584,-0.733722,0.304338,1.056625
497,-0.400090,-0.702910,1.854105,-1.828024,0.372227,1.856847,-1.839275,0.182578,1.841955,-1.842775,...,-0.538502,-1.960070,2.275618,-0.197442,-1.760458,2.153065,0.531134,-1.498058,2.003490,0.285301
498,-0.543275,-0.785037,-0.729541,0.622320,0.326139,-0.678021,0.579784,0.501560,-0.574660,0.493106,...,0.190884,0.882813,-0.841402,0.310078,0.880372,-0.826808,0.100622,0.817618,-0.783249,0.275287


In [3]:
# Load the filtered target data used for training.
#  - The data should already be normalized.
#  - The data should only contain useful simulations (Fxy20>Fxy19).
#  - Copy the training set from "(...)\Datasets\Datatrain\XXXX\N\" to "(...)\TRAIN".
#  - Change the numbers in the training file name to match the desired model.
# NOTE(review): absolute, machine-specific path; it has to be edited on any other machine.

y_train = pd.read_csv(
    filepath_or_buffer=r'C:\Users\gambo\Drive\Universidade de Aveiro\OP_DPE_2023 24_João_Marques_(Dissertação_ML) - General\Machine Learning\TRAIN\y_train_500_1.csv'
)
display(y_train)

Unnamed: 0,F,G,N,sigma0,k,n
0,0.0937,0.4466,3.3757,234.50,669.26,0.292
1,0.1946,0.1872,1.9544,210.79,574.39,0.184
2,0.2113,0.3307,3.1871,138.41,624.65,0.209
3,0.1448,0.3008,0.4990,164.55,428.35,0.234
4,0.5553,0.1859,3.2725,150.87,359.98,0.235
...,...,...,...,...,...,...
495,0.2694,0.1906,1.3964,132.32,620.95,0.229
496,0.1516,0.1544,1.8873,248.88,605.59,0.197
497,0.2846,0.1459,0.7405,132.76,513.60,0.230
498,0.4051,0.2620,3.9609,185.48,613.00,0.290


# TRAIN MODEL

In [4]:
# Start the timer
start_time_training = time.monotonic()

# Train the model on the training data
modelo = MultiOutputRegressor(xgb.XGBRegressor(learning_rate=0.02, max_depth=15, n_estimators=1000)).fit(X_train, y_train)

# Stop the timer right after fit, so the reported duration covers training only
# and not the time spent serializing the model to disk.
end_time_training = time.monotonic()

# Model file name: set it as "modelo_xgboost_xxxx_y"
caminho_do_arquivo_modelo = 'modelo_xgboost_500_1.joblib'
joblib.dump(modelo, caminho_do_arquivo_modelo)

# Report the time spent in the training session
training_duration = end_time_training - start_time_training
print(f"Training duration: {training_duration}")

KeyboardInterrupt: 

In [12]:
# Predict on the same data the model was trained on.
# NOTE(review): this cell uses `modelo`, `X_train` and `y_train` from earlier cells;
# re-run them first so the metrics below describe the model that was just trained.
y_train_pred = modelo.predict(X_train)

# In-sample performance: R2, MAE and MAPE, computed in that order.
r2_train, mae_train, mape_train = (
    metric(y_train, y_train_pred)
    for metric in (r2_score, mean_absolute_error, mean_absolute_percentage_error)
)
print(
    f'R-squared on Train Data: {r2_train}',
    f'MAE on Train Data: {mae_train}',
    f'MAPE on Train Data: {mape_train}',
    sep='\n',
)

R-squared on Train Data: 0.014857205253932118
MAE on Train Data: 20.708174498345887
MAPE on Train Data: 0.3503880170571771
