# Exfoliation Energy Prediction

This notebook demonstrates a simple ML model for predicting exfoliation energy.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

## Step 1: Load the dataset

In [None]:
# Location of the CSV to load. NOTE(review): this is a hard-coded absolute
# path; adjust it to wherever the file lives in your environment.
file_path = '/data/db(in).csv'

# Read the CSV into a DataFrame and preview the first rows as the cell output.
data = pd.read_csv(filepath_or_buffer=file_path)
data.head()

## Step 2: Data Cleaning and Preprocessing

In [None]:
# Keep only the columns used by the model (three features + the target), then
# drop every row that has a missing value in ANY of those columns -- not only
# rows where the target (exfoliation_energy_per_atom) is missing.
data_cleaned = data.loc[
    :,
    ['energy_per_atom', 'total_magnetization', 'decomposition_energy', 'exfoliation_energy_per_atom'],
].dropna(how='any')


## Step 3: Feature Selection — this is only an example; you should select more features. Hint: featurize the composition using matminer and include space groups.

In [None]:
# YouTube video on featurization: https://youtu.be/nFQZLn8VFsQ?si=X3NmGDnU-XimfcKG

# X holds the three feature columns; y holds the regression target column.
# Both are taken from the cleaned frame, so they share the same row index.
X = data_cleaned.loc[:, ['energy_per_atom', 'total_magnetization', 'decomposition_energy']]
y = data_cleaned.loc[:, 'exfoliation_energy_per_atom']


## Step 4: Split the data into training and testing sets

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Step 5: Model Training - Linear Regression

In [None]:
# Ordinary least-squares linear regression with default settings, fitted on
# the training split only. The fit call is left as the last expression so the
# fitted estimator is still shown as the cell output.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

## Step 6: Predictions and Evaluation

In [None]:

# Predict on the held-out test features.
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error between the true and
# predicted test targets.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


## Step 7: Plotting Actual vs Predicted

In [None]:

# Scatter of actual vs. predicted test-set values, with a y = x reference
# line: points on the line are perfect predictions.
plt.figure(figsize=(8,6))
plt.scatter(y_test, y_pred, alpha=0.7)

# Dashed red diagonal spanning the full range of the target `y`.
# The line style is given through keywords only: combining the 'k--' format
# string (black) with color='red' defines the colour twice, which makes
# matplotlib emit a "color is redundantly defined" UserWarning (the keyword
# wins, so the line was already drawn red).
plt.plot([y.min(), y.max()], [y.min(), y.max()], linestyle='--', color='red', lw=2)

plt.xlabel("Actual Exfoliation Energy")
plt.ylabel("Predicted Exfoliation Energy")
plt.title("Actual vs Predicted Exfoliation Energy")
plt.show()
