# Data analysis using ML models with OPTIMEO

Let's create an `experimental_data(temp, cA, cB, cC)` function that simulates the yield of a chemical reaction based on the temperature and the concentrations of A, B and C.

In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

def experimental_data(temp, cA, cB, cC):
    """
    Simulate the noisy response of a fictitious chemical reaction.

    The function is not based on any real experimental data and is purely
    for demonstration purposes. The response is a deterministic expression
    of the inputs (including a ``temp*cA`` interaction and a quadratic term
    in ``cB``) plus Gaussian noise with mean 0 and standard deviation 0.2.

    Parameters
    ----------
    temp, cA, cB, cC : scalar or numpy array
        Temperature and concentrations of A, B and C. Scalars and arrays
        may be mixed as long as their shapes broadcast together.

    Returns
    -------
    The simulated response, with the broadcast shape of the inputs.
    """
    # Size the noise from the broadcast shape of all inputs instead of
    # len(temp): scalars (which have no len) and multi-dimensional inputs
    # then work too, while 1-D arrays still get the same (N,) noise vector.
    noise_shape = np.broadcast(temp, cA, cB, cC).shape
    noise = np.random.normal(0, 0.2, noise_shape)
    signal = .2*temp + .5*temp*cA + (cA)/3 + (1 - cB)**2/2 + (3 - cC)/1.5
    return signal + noise

def generate_data(N=100):
    """
    Build a DataFrame of N simulated experiments.

    Temperatures are drawn uniformly from [0, 100) and the three
    concentrations uniformly from [0, 1); the 'response' column is the
    output of `experimental_data` for those inputs.
    """
    # Draw the factors in a fixed order (temp, cA, cB, cC) so that results
    # are reproducible under a given NumPy global seed.
    columns = {'temp': np.random.uniform(0, 100, N)}
    for name in ('cA', 'cB', 'cC'):
        columns[name] = np.random.uniform(0, 1, N)
    columns['response'] = experimental_data(
        columns['temp'], columns['cA'], columns['cB'], columns['cC']
    )
    # Dict insertion order fixes the column order of the resulting frame.
    return pd.DataFrame(columns)

# Simulate 50 experiments, save them to 'dataML.csv' (without the index
# column), and preview the first rows.
df = generate_data(50)
df.to_csv('dataML.csv', index=False)
df.head()

Unnamed: 0,temp,cA,cB,cC,response
0,99.559442,0.508816,0.562837,0.305008,47.281963
1,64.676725,0.60764,0.596785,0.230884,34.33543
2,5.247533,0.600887,0.222615,0.767632,4.197434
3,8.110967,0.556264,0.156656,0.174742,6.482757
4,81.261297,0.969534,0.584825,0.096255,57.762612


Now, we will use the OPTIMEO package to analyse the data.

In [None]:
# Import only the class used here rather than a wildcard import, so the
# notebook namespace is not silently filled with unrelated names.
from optimeo.analysis import DataAnalysis

# Load the dataset from the CSV file.
data = pd.read_csv('dataML.csv')
# Every column except the last is treated as a factor; the last one is the response.
factors = data.columns[:-1]
response = data.columns[-1]
analysis = DataAnalysis(data, factors, response)
# Trailing expression: displays the object's repr in the notebook.
analysis

DataAnalysis(data=(50, 5), factors=Index(['temp', 'cA', 'cB', 'cC'], dtype='object'), response=response, model_type=None, split_size=0.2, encoders={})

First, let's look at a simple linear model:

In [3]:
# Compute the linear model, then display its summary table.
analysis.compute_linear_model()
analysis.linear_model.summary()

0,1,2,3
Dep. Variable:,response,R-squared:,0.943
Model:,OLS,Adj. R-squared:,0.938
Method:,Least Squares,F-statistic:,184.9
Date:,"Wed, 16 Apr 2025",Prob (F-statistic):,2.6e-27
Time:,17:45:10,Log-Likelihood:,-141.9
No. Observations:,50,AIC:,293.8
Df Residuals:,45,BIC:,303.4
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-8.4730,2.279,-3.718,0.001,-13.063,-3.883
temp,0.4827,0.020,24.109,0.000,0.442,0.523
cA,22.4716,2.200,10.215,0.000,18.041,26.902
cB,-0.5527,2.216,-0.249,0.804,-5.016,3.911
cC,-3.5063,2.288,-1.533,0.132,-8.114,1.101

0,1,2,3
Omnibus:,1.971,Durbin-Watson:,1.779
Prob(Omnibus):,0.373,Jarque-Bera (JB):,1.416
Skew:,-0.41,Prob(JB):,0.493
Kurtosis:,3.092,Cond. No.,287.0


In [4]:
# plot_linear_model() returns an iterable of figures; show each one in turn.
figs = analysis.plot_linear_model()
for fig in figs:
    fig.show()

This is the equation used for the fit. You can change it if you want, e.g. [to add interaction terms or other polynomial terms](https://www.statsmodels.org/dev/examples/notebooks/generated/formulas.html):

In [5]:
# Display the formula string currently used for the linear fit.
analysis.write_equation()

'response ~ temp + cA + cB + cC '

In [6]:
# Override the formula to add the temp:cA interaction term, then recompute
# the linear model and display the new summary.
analysis.equation = 'response ~ temp+ temp:cA + cA + cB + cC'
analysis.compute_linear_model()
analysis.linear_model.summary()

0,1,2,3
Dep. Variable:,response,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,53540.0
Date:,"Wed, 16 Apr 2025",Prob (F-statistic):,4.6999999999999995e-82
Time:,17:45:10,Log-Likelihood:,4.4787
No. Observations:,50,AIC:,3.043
Df Residuals:,44,BIC:,14.51
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.5067,0.152,16.497,0.000,2.200,2.813
temp,0.1980,0.003,77.868,0.000,0.193,0.203
temp:cA,0.5030,0.004,123.747,0.000,0.495,0.511
cA,0.1542,0.216,0.713,0.479,-0.281,0.590
cB,-0.4540,0.120,-3.785,0.000,-0.696,-0.212
cC,-0.6750,0.126,-5.360,0.000,-0.929,-0.421

0,1,2,3
Omnibus:,0.482,Durbin-Watson:,2.165
Prob(Omnibus):,0.786,Jarque-Bera (JB):,0.605
Skew:,-0.004,Prob(JB):,0.739
Kurtosis:,2.461,Cond. No.,481.0


In [7]:
# Plot the linear model again after the formula change; show each returned figure.
figs = analysis.plot_linear_model()
for fig in figs:
    fig.show()

Now let's build an ML model to predict the yield based on the temperature and the concentrations of A, B and C.

In [8]:
# Select the model type to use (here ElasticNetCV).
analysis.model_type = "ElasticNetCV"
# Alternative model_type values to try instead (uncomment one):
# analysis.model_type = "RidgeCV"
# analysis.model_type = "LinearRegression"
# analysis.model_type = "RandomForest"
# analysis.model_type = "GaussianProcess"
# analysis.model_type = "GradientBoosting"
# Compute the selected ML model, then show each figure returned by plot_ML_model().
MLmodel = analysis.compute_ML_model()
figs = analysis.plot_ML_model()
for fig in figs:
    fig.show()

And if we want to make a prediction:

In [9]:
# A single new condition to predict; the column names match the factor columns.
new_value = pd.DataFrame({'temp': [50], 
                          'cA': [0.35], 
                          'cB': [0.5], 
                          'cC': [0.5]})
# Display the predictions for this condition (the output has one row per model).
analysis.predict(new_value)

Unnamed: 0,prediction,model
0,21.269013,ElasticNetCV
1,20.698001,Linear Model
