In [69]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
from sympy.combinatorics.fp_groups import fp_group

In [70]:
FP_group_45 = '../../fingerprints/generators/45FP/45_fingerprints_grouped_avg_energy.csv'
FP_group_120 = '../../fingerprints/generators/120FP/120_fingerprints_grouped_avg_energy.csv'

FP_CO2_45 = '../../fingerprints/generators/45FP/45_fingerprints_CO2_grouped_avg_energy.csv'
FP_H2O_45 = '../../fingerprints/generators/45FP/45_fingerprints_H2O_grouped_avg_energy.csv'

FP_CO2_120 = '../../fingerprints/generators/120FP/120_fingerprints_CO2_grouped_avg_energy.csv'
FP_H2O_120 = '../../fingerprints/generators/120FP/120_fingerprints_H2O_grouped_avg_energy.csv'


def load_data(path):
    return pd.read_csv(path)

In [71]:
loaded_data = load_data(FP_group_45)
data = loaded_data.drop(columns=['MOF'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9183 entries, 0 to 9182
Data columns (total 46 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   atomic_num_mean                       9183 non-null   float64
 1   atomic_num_geometric_mean             9183 non-null   float64
 2   atomic_num_standard_deviation         9183 non-null   float64
 3   atomic_num_max                        9183 non-null   float64
 4   atomic_num_min                        9183 non-null   float64
 5   group_mean                            9183 non-null   float64
 6   group_geometric_mean                  9183 non-null   float64
 7   group_standard_deviation              9183 non-null   float64
 8   group_max                             9183 non-null   float64
 9   group_min                             9183 non-null   float64
 10  period_mean                           9183 non-null   float64
 11  period_geometric_

In [72]:
X = data.drop(columns=['energy'])
y = data['energy']

In [73]:
print(f"Mean value: {np.mean(y):.4f}")
print(f"Average value: {np.average(y):.4f}")
print(f"Maximum value: {np.max(y):.4f}")
print(f"Minimum value: {np.min(y):.4f}")
print(f"Standard deviation: {np.std(y):.4f}")
print(f"Median value: {np.median(y):.4f}")
print(f"Number of samples: {len(y)}")

Mean value: -0.1486
Average value: -0.1486
Maximum value: 29.0126
Minimum value: -1.9534
Standard deviation: 0.9527
Median value: -0.2776
Number of samples: 9183


In [74]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

In [75]:
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "ExtraTrees": ExtraTreesRegressor(n_estimators=100, random_state=42),
    "Bagging": BaggingRegressor(n_estimators=100, random_state=42),
}
pipeline = Pipeline([
    # ('imputer', SimpleImputer(strategy='constant', fill_value=0, keep_empty_features=True)),
    ('imputer', SimpleImputer(strategy='mean', keep_empty_features=True)),
    ('scaler', StandardScaler()),
])


In [76]:
def print_error(name, y_valid, pred):
    print('---------------------------------------------------------\n')
    print(f'Model: {name}')

    mae = mean_absolute_error(y_valid, pred)
    mse = mean_squared_error(y_valid, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_valid, pred)

    random_indices = np.random.choice(len(y_valid), 5, replace=False)
    print("\nRandom 5 predictions vs actual values:")
    for idx in random_indices:
        print(f"Predicted: {pred[idx]:.4f}, Actual: {y_valid.iloc[idx]:.4f}")

    print(f"\nMean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R² Score: {r2 * 100:.2f}%")


In [77]:
pipeline_reg = Pipeline([
    # ('imputer', SimpleImputer(strategy='mean', keep_empty_features=True)),
    # ('scaler', StandardScaler()),
    ('regressor', None)
])

param_grid = {'regressor': [
    RandomForestRegressor(n_estimators=100, random_state=42),
    ExtraTreesRegressor(n_estimators=100, random_state=42),
    BaggingRegressor(n_estimators=100, random_state=42)
]
}


In [78]:
X_train_transformed = pipeline.fit_transform(X_train)
X_valid_transformed = pipeline.transform(X_valid)

In [79]:
grid_search = GridSearchCV(pipeline_reg,
                           param_grid,
                           cv=5,
                           scoring='r2',
                           return_train_score=True
                           )
grid_search.fit(X_train_transformed, y_train)

0,1,2
,estimator,"Pipeline(step...ssor', None)])"
,param_grid,"{'regressor': [RandomForestR...ndom_state=42), ExtraTreesReg...ndom_state=42), ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [80]:
best_params = grid_search.best_params_
print(f'Best parameters found: {best_params}')

Best parameters found: {'regressor': ExtraTreesRegressor(random_state=42)}


In [81]:
y_pred = grid_search.best_estimator_.predict(X_valid_transformed)

In [82]:
print_error('Grid', y_valid, y_pred)

---------------------------------------------------------

Model: Grid

Random 5 predictions vs actual values:
Predicted: 1.6015, Actual: 0.1554
Predicted: 0.0375, Actual: 0.0542
Predicted: -0.3159, Actual: -0.1932
Predicted: -0.0996, Actual: -0.5448
Predicted: -0.3890, Actual: -0.3224

Mean Absolute Error (MAE): 0.29532785297948566
Mean Squared Error (MSE): 0.4570315636437234
Root Mean Squared Error (RMSE): 0.6760410961204381
R² Score: 13.90%


In [83]:
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean', keep_empty_features=True)),
    ('pca', PCA(n_components=10)),

    ('scaler', StandardScaler()),
    ('reg', ExtraTreesRegressor(n_estimators=100, random_state=42))
])

# param_grid_pca = {
#     'pca__n_components': [5, 8, 10, 12],
#     'reg__n_estimators': [100, 200, 300],
#     'reg__max_depth': [None, 10, 20, 30],
#     'reg__min_samples_split': [2, 5, 10],
#     'reg__min_samples_leaf': [1, 2, 4],
#     'reg__max_features': ['auto', 'sqrt', 0.5],
#     'reg__bootstrap': [True, False],
#     'reg__max_samples': [None, 0.7],
# }


param_grid_pca = {'reg': [
    RandomForestRegressor(n_estimators=100, random_state=42),
    ExtraTreesRegressor(n_estimators=100, random_state=42),
]
}

grid_search_pca = GridSearchCV(pipe, param_grid_pca, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search_pca.fit(X_train, y_train)

best_params = grid_search_pca.best_params_
print(f'Best parameters found: {best_params}')

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters found: {'reg': ExtraTreesRegressor(random_state=42)}


In [84]:
best_pca_model = grid_search_pca.best_estimator_

pred = best_pca_model.predict(X_valid)

In [85]:
print_error('PCA', y_valid, pred)

---------------------------------------------------------

Model: PCA

Random 5 predictions vs actual values:
Predicted: -0.2840, Actual: -0.2232
Predicted: -0.0143, Actual: -0.1931
Predicted: -0.0724, Actual: -0.1624
Predicted: -0.0673, Actual: -0.4096
Predicted: -0.3581, Actual: -0.3820

Mean Absolute Error (MAE): 0.2870651588940267
Mean Squared Error (MSE): 0.4397554323742455
Root Mean Squared Error (RMSE): 0.6631405826627152
R² Score: 17.15%
