# Machine Learning Pipeline with Reactor data

**Course: Applied AI in Chemical and Process Engineering**

This notebook will build a pipeline to prepare the data and develop an ML algorithm.

In [34]:
# Load all the libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, GridSearchCV
from scipy.optimize import minimize
from sklearn.impute import SimpleImputer

# Data Preparation


In [35]:
# Read the reactor dataset directly from the course repository (needs network access).
reactor_csv_url = 'https://raw.githubusercontent.com/dissabnd/Applied-AI-in-Chemical-and-Process-Engineering/refs/heads/main/data/reactor.csv'
df = pd.read_csv(reactor_csv_url)

In [36]:
# List the column names of the loaded table

df.columns

Index(['Temperature (°C)', 'Pressure (atm)', 'Catalyst Concentration (wt%)',
       'Feed Flow Rate (L/min)', 'Yield (%)', 'Recorded By'],
      dtype='object')

In [37]:
# Preview the first 10 rows of the data table
df.head(10)

Unnamed: 0,Temperature (°C),Pressure (atm),Catalyst Concentration (wt%),Feed Flow Rate (L/min),Yield (%),Recorded By
0,205.0,5.0,2.4,9.5,70.0,Nuwan
1,200.0,5.0,2.1,9.0,70.0,Mali
2,225.0,6.0,3.0,10.25,75.75,Nuwan
3,200.0,6.0,2.4,10.5,70.0,Kavi
4,220.0,6.5,3.0,9.5,95.0,Nuwan
5,210.0,4.0,4.0,11.75,73.42,Janith
6,180.0,5.0,3.1,11.25,70.0,Mali
7,190.0,4.75,4.0,9.25,82.51,Janith
8,220.0,5.5,2.2,10.25,71.12,Janith
9,195.0,6.25,2.9,11.5,70.9,Kavi


In [38]:
# Size of the data table as (number of rows, number of columns)

df.shape

(80, 6)

# Data Quality Check

In [30]:
# Keep only the numeric columns; the free-text 'Recorded By' column is
# left out because it is not used by the models.
numeric_columns = [
    'Temperature (°C)',
    'Pressure (atm)',
    'Catalyst Concentration (wt%)',
    'Feed Flow Rate (L/min)',
    'Yield (%)',
]
df = df.loc[:, numeric_columns]

df

Unnamed: 0,Temperature (°C),Pressure (atm),Catalyst Concentration (wt%),Feed Flow Rate (L/min),Yield (%)
0,205.0,5.00,2.4,9.50,70.00
1,200.0,5.00,2.1,9.00,70.00
2,225.0,6.00,3.0,10.25,75.75
3,200.0,6.00,2.4,10.50,70.00
4,220.0,6.50,3.0,9.50,95.00
...,...,...,...,...,...
75,400.0,4.50,3.0,9.00,
76,150.0,6.00,12.0,13.00,
77,250.0,6.00,2.2,10.50,95.00
78,215.0,4.00,2.6,10.25,95.00


## Check for missing data

In [31]:
# Count the missing (NaN) entries in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

Temperature (°C)                1
Pressure (atm)                  1
Catalyst Concentration (wt%)    0
Feed Flow Rate (L/min)          0
Yield (%)                       3
dtype: int64


## Fill the missing data with the column median using an imputer

In [32]:
# Replace every missing value with the median of its own column.
# NOTE(review): this also fills the missing 'Yield (%)' values (presumably the
# prediction target) with the median — confirm this is intended rather than
# dropping those rows.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(df)

# Wrap the imputer's output in a DataFrame that carries the original column names.
df2 = pd.DataFrame(imputed_values, columns=df.columns)

# Verify that no gaps remain and that the table size is unchanged.
print(df2.isna().sum())
print(df2.shape)

Temperature (°C)                0
Pressure (atm)                  0
Catalyst Concentration (wt%)    0
Feed Flow Rate (L/min)          0
Yield (%)                       0
dtype: int64
(80, 5)


In [33]:
# Display the imputed table to confirm the gaps were filled
df2

Unnamed: 0,Temperature (°C),Pressure (atm),Catalyst Concentration (wt%),Feed Flow Rate (L/min),Yield (%)
0,205.0,5.00,2.4,9.50,70.00
1,200.0,5.00,2.1,9.00,70.00
2,225.0,6.00,3.0,10.25,75.75
3,200.0,6.00,2.4,10.50,70.00
4,220.0,6.50,3.0,9.50,95.00
...,...,...,...,...,...
75,400.0,4.50,3.0,9.00,76.68
76,150.0,6.00,12.0,13.00,76.68
77,250.0,6.00,2.2,10.50,95.00
78,215.0,4.00,2.6,10.25,95.00


## Check for duplicates

In [39]:
# Count the rows that exactly repeat an earlier row, then report the
# table size before any duplicates are removed.
duplicate_count = df2.duplicated().sum()
print("Number of duplicate rows:", duplicate_count)
print("with duplciates:", df2.shape)

Number of duplicate rows: 3
with duplciates: (80, 5)
without duplciates: (80, 5)


In [40]:
# Drop exact duplicate rows (keeping the first occurrence of each) into a
# new frame, leaving df2 unchanged, and report the resulting size.
df3 = df2.drop_duplicates(keep='first')
print("without duplciates:", df3.shape)

without duplciates: (77, 5)
