In [1]:
import sklearn
import pandas as pd
from scipy.stats import entropy
import numpy as np

## 1. Investigate the data, perform feature engineering

In [2]:
heo_data = pd.read_csv('data.csv')
heo_data.head()

Unnamed: 0,Al,Ca,Fe,Mg,Mn,Ni,overpotential
0,0.123204,0.106073,0.528617,0.203194,0.003486,0.035425,1.7122
1,0.036817,0.044191,0.743975,0.107923,0.001843,0.065252,1.7164
2,0.067023,0.150251,0.470664,0.291139,0.007421,0.013501,1.7228
3,0.059822,0.088446,0.430872,0.396597,0.005039,0.019225,1.725
4,0.060222,0.101457,0.405555,0.414243,0.005944,0.012578,1.7264


In [3]:
# investigate the data distirbution column-by-column:
for column in heo_data.columns:
    avg = heo_data[column].mean()
    std = heo_data[column].std()
    print(f'{column} - avg: {avg}, std: {std}')

Al - avg: 0.10798809831275721, std: 0.06905093371796822
Ca - avg: 0.13385606113580248, std: 0.03550491768900502
Fe - avg: 0.40952377399588474, std: 0.0837796759656596
Mg - avg: 0.32925771282716054, std: 0.09825194929898094
Mn - avg: 0.005817977823045268, std: 0.0015632724903564198
Ni - avg: 0.013556375851851852, std: 0.010000855740236058
overpotential - avg: 1.8514679012345678, std: 0.14000893403650996


In [4]:
# investigate the mutual information between the columns:
X = heo_data.copy()
y = X.pop('overpotential')


In [11]:
# Function to calculate entropy for a row
def calculate_row_entropy(row):
    values = row.values  # Get row values
    probabilities = values / np.sum(values)  # Normalize to probabilities
    return entropy(probabilities)  # Calculate entropy

# Apply the entropy function to each row and add as a new column
X["entropy"] = X.apply(calculate_row_entropy, axis=1)

# Print the updated DataFrame
print(X.head())

         Al        Ca        Fe        Mg        Mn        Ni   entropy
0  0.123204  0.106073  0.528617  0.203194  0.003486  0.035425  1.294826
1  0.036817  0.044191  0.743975  0.107923  0.001843  0.065252  0.909414
2  0.067023  0.150251  0.470664  0.291139  0.007421  0.013501  1.274399
3  0.059822  0.088446  0.430872  0.396597  0.005039  0.019225  1.215177
4  0.060222  0.101457  0.405555  0.414243  0.005944  0.012578  1.217944


In [5]:
mi = sklearn.feature_selection.mutual_info_regression(X, y)

for col, score in zip(X.columns, mi):
    # Calculate correlation coefficient
    correlation = X[col].corr(y)  # Pandas corr() method
    print(f"Column: {col}")
    print(f"  Mutual Information: {score:.4f}")
    print(f"  Correlation Coefficient: {correlation:.4f}")

Column: Al
  Mutual Information: 0.0705
  Correlation Coefficient: 0.3758
Column: Ca
  Mutual Information: 0.3176
  Correlation Coefficient: 0.3911
Column: Fe
  Mutual Information: 0.0632
  Correlation Coefficient: -0.3814
Column: Mg
  Mutual Information: 0.2197
  Correlation Coefficient: -0.0429
Column: Mn
  Mutual Information: 0.0747
  Correlation Coefficient: 0.0409
Column: Ni
  Mutual Information: 0.3077
  Correlation Coefficient: -0.3731


## 2. Perform regression with classical ML

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import root_mean_squared_error

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models to compare
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "SVR": SVR()
}

# Train, predict, and evaluate each model
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)  # Train
    y_pred = model.predict(X_test)  # Predict
    rmse = root_mean_squared_error(y_test, y_pred)  # Evaluate
    results[name] = rmse  # Store results
    print(f"{name}: R Mean Squared Error = {rmse:.4f}")

# Find the best model
best_model = min(results, key=results.get)
print(f"\nBest Model: {best_model} with RMSE = {results[best_model]:.4f}")

Linear Regression: R Mean Squared Error = 0.1312
Ridge Regression: R Mean Squared Error = 0.1337
Lasso Regression: R Mean Squared Error = 0.1511
Random Forest: R Mean Squared Error = 0.0634
SVR: R Mean Squared Error = 0.1025

Best Model: Random Forest with RMSE = 0.0634


## 3. Perform regression with deep learning

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import numpy as np

In [8]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.values)
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Define the MLP model
class MLP(nn.Module):
    def __init__(self, input_size):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # Single output for regression
        )

    def forward(self, x):
        return self.model(x)

# Initialize the model, loss, and optimizer
input_size = X_train.shape[1]
model = MLP(input_size)
criterion = nn.MSELoss()  # Mean Squared Error loss for regression
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# Evaluate the model
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    test_loss = criterion(predictions, y_test_tensor)
    print(f"Test Loss: {test_loss.item():.4f}")

[[0.0679073  0.12260057 0.4573748  0.32948194 0.00635819 0.0162772 ]
 [0.18714545 0.19900462 0.37287375 0.2305398  0.00712121 0.00331517]
 [0.13584441 0.12849829 0.40620097 0.30771921 0.00486585 0.01687127]
 ...
 [0.0567545  0.08583702 0.37087341 0.46916684 0.00528214 0.01208609]
 [0.06316047 0.12986363 0.45819846 0.32804983 0.00694488 0.01378274]
 [0.04166139 0.14171875 0.48040747 0.32049812 0.00888877 0.00682551]]
Epoch 10/100, Loss: 3.2002
Epoch 20/100, Loss: 2.6387
Epoch 30/100, Loss: 2.0874
Epoch 40/100, Loss: 1.4870
Epoch 50/100, Loss: 0.8663
Epoch 60/100, Loss: 0.3430
Epoch 70/100, Loss: 0.0622
Epoch 80/100, Loss: 0.0243
Epoch 90/100, Loss: 0.0344
Epoch 100/100, Loss: 0.0232
Test Loss: 0.0259


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


In [9]:
#save the model to use in the environment
torch.save(model.state_dict(), 'regression_heo.pth')

In [17]:
test_a = 0.1*torch.ones(6)
model(test_a)

tensor([1.5983], grad_fn=<ViewBackward0>)