In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Build all paths for this directory first, then emit them one per line.
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries

In [None]:
import joblib
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

In [None]:
# Read the CHF dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/critical-heat-flux-prediction/Data_CHF_Zhao_2020_ATE.csv",
)

# (number of rows, number of columns)
df.shape

In [None]:
# Preview the first three rows of the raw frame.
df.head(3)

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

## Checking missing values

In [None]:
# Count the missing entries in each column.
df.isna().sum()

## Preprocessing

In [None]:
def preprocess_inputs(df, target="xe_out", id_columns=("id", "author")):
    """Clean the raw CHF frame and split it into features and target.

    Steps, in order: drop the identifier columns, remove duplicate rows,
    rename the unit-annotated columns to short snake_case names, then
    separate the target column from the remaining feature columns.

    The input frame is not modified; every step returns a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data using the original column headers (e.g. ``'x_e_out [-]'``).
    target : str, default "xe_out"
        Name of the target column *after* renaming.
    id_columns : iterable of str, default ("id", "author")
        Columns dropped before duplicate detection, so that rows differing
        only in these columns are treated as duplicates.

    Returns
    -------
    X : pandas.DataFrame
        All remaining columns except ``target``.
    y : pandas.Series
        The ``target`` column.

    Raises
    ------
    KeyError
        If any of ``id_columns`` or ``target`` is not present in the frame.
    """
    # Original header -> short name used throughout the rest of the notebook.
    short_names = {
        'pressure [MPa]': 'pressure_mp',
        'mass_flux [kg/m2-s]': 'mass_flux',
        'x_e_out [-]': 'xe_out',
        'D_e [mm]': 'de_mm',
        'D_h [mm]': 'dh_mm',
        'length [mm]': 'length',
        'chf_exp [MW/m2]': 'chf_exp'
    }

    cleaned = (
        df.drop(columns=list(id_columns))  # identifiers carry no predictive signal
        .drop_duplicates()                 # must run after the identifier drop
        .rename(columns=short_names)
    )

    # Split the dataset into X and y
    X = cleaned.drop(columns=[target])
    y = cleaned[target]

    return X, y

In [None]:
# Build the feature frame X and the target series y from the raw frame.
X, y = preprocess_inputs(df)

In [None]:
# Display the feature frame.
X

In [None]:
# Display the target series.
y

In [None]:
# List the distinct values of the categorical "geometry" feature.
X.loc[:, "geometry"].unique()

In [None]:
# Feature groups consumed by the column transformer:
# the explicitly listed categorical columns, and every other column of X
# (kept in X's column order) treated as numeric.

categorical_columns = ['geometry']

numeric_columns = X.columns[~X.columns.isin(categorical_columns)].tolist()

In [None]:
# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# handle_unknown='ignore' makes a category that was not seen during fit encode
# as an all-zero row instead of raising. Without it, the fitted (and saved)
# pipeline errors at predict time whenever the test split or later inference
# data contains a geometry value absent from the training split.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Create an ANN model
ann_model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

# Create a pipeline that combines data preprocessing and the model, so the
# scaler and encoder are fitted on the training split only
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', ann_model)
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Calculate the Root Mean Squared Error (RMSE) and R-squared (R2) score to evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared (R2) Score: {r2}")

# Save the trained model (the whole pipeline: preprocessing + regressor)
joblib.dump(pipeline, 'ann_model.pkl')