In [None]:
import json
import pickle
import pandas as pd  # 1.5.0
import numpy as np  # 1.23.5
import seaborn as sns  # 0.12.0
import matplotlib.pyplot as plt  # 3.6.2
import plotly.express as px  # 5.10.0
import umap  # 0.5.3
import shap  # 0.43.0
shap.initjs()

from rdkit import Chem  # 2023.03.3
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

from mordred import Calculator, descriptors  # 1.2.0

from sklearn.preprocessing import StandardScaler  # 1.2.2
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_absolute_error
from sklearn.cross_decomposition import PLSRegression
from scipy.stats import pearsonr  # 1.9.1
from lightgbm import LGBMRegressor  # 4.0.0

from tqdm.auto import tqdm

#### Content
1) Feature generation  
2) First feature selection  
3) Second feature selection  
4) Dimensionality reduction with PLS and UMAP  
5) LGBMRegressor  
6) SHAP analysis  
7) Analysis of the BCUTpe-1l descriptor  
  
*Each chapter can be executed independently from the others.*

# 1) Feature generation

##### 1a) Read data

In [None]:
# Load the FIA49k table from the gzip-compressed csv file.
# NOTE(review): absolute local path; adjust it to your environment.

df = pd.read_csv(
    filepath_or_buffer=r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k.csv.gz"
)
df.shape

In [None]:
# Keep only the compound name and the SMILES column, indexed by compound name.

descriptor_df = df.loc[:, ["Compound", "la_smiles"]].set_index("Compound")
descriptor_df.shape

##### 1b) Calculate features

In [None]:
# Set up results dictionary.
# One entry per compound (the index of descriptor_df); each entry is a dict that
# initially only holds the "la_smiles" value and is filled with descriptors below.

df_dict = descriptor_df.to_dict(orient="index")

In [None]:
# Run the feature calculation with mordred and RDKit.
# Each SMILES string is converted to a mol object and explicit hydrogen atoms are
# added before the descriptors are calculated.

# The calculators do not depend on the molecule, so they are created once and
# reused for every compound instead of being rebuilt in each loop iteration.
calc_mordred = Calculator(descriptors, ignore_3D=True)
mordred_descriptor_names = [f"mordred__{descriptor}" for descriptor in calc_mordred.descriptors]

calc_rdkit = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
rdkit_descriptor_names = calc_rdkit.GetDescriptorNames()

for compound in tqdm(df_dict):
    smiles = df_dict[compound]["la_smiles"]
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for SMILES it cannot parse; fail with a message
        # that names the offending compound instead of an opaque error in AddHs.
        raise ValueError(f"Could not parse SMILES of compound {compound!r}: {smiles!r}")
    mol = Chem.AddHs(mol)

    # mordred: results are returned in the same order as calc_mordred.descriptors
    for name, value in zip(mordred_descriptor_names, calc_mordred(mol)):
        df_dict[compound][name] = value

    # rdkit
    rdkit_descriptors = calc_rdkit.CalcDescriptors(mol)
    for name, value in zip(rdkit_descriptor_names, rdkit_descriptors):
        df_dict[compound][f"rdkit__{name}"] = value

##### 1c) Save data

In [None]:
# Remove the SMILES string from every compound entry so that only descriptors remain.

for descriptor_values in df_dict.values():
    descriptor_values.pop("la_smiles")

In [None]:
# Build the results data frame: compounds as rows, descriptors as columns.

final_df = pd.DataFrame(df_dict).transpose()
final_df.shape

In [None]:
# Move the compound names from the index into a regular "Compound" column.

final_df = final_df.reset_index()
final_df = final_df.rename(columns={"index": "Compound"})
final_df

In [None]:
# Write the descriptor table to a csv file (without the row index).

final_df.to_csv(
    "FIA_DATA_49015_mol_descriptors.csv",
    index=False,
)

# 2) First feature selection

##### 2a) Read data

In [None]:
# Load the 2D descriptor table (df) and the FIA49k table (df2).
# NOTE(review): absolute local paths; adjust them to your environment.

df = pd.read_csv(
    r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k_2d_descriptors.csv.gz",
    low_memory=False,
)
df2 = pd.read_csv(
    r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k.csv.gz",
)

(df.shape, df2.shape)

In [None]:
# Attach the set assignment of every compound to the descriptor table.

df = pd.merge(df, df2[["Compound", "set_assignment"]], on="Compound")
df.shape

In [None]:
# Select the rows that are assigned to the train set.

train_df = df.loc[df["set_assignment"].eq("train")]
train_df.shape

In [None]:
# Columns that are always retained by the column filter in 2f), even if they were
# flagged for removal.

keep = [
    "Compound",
    "set_assignment",
]

##### 2b) Get columns which have strings in them which cannot be converted to floats, done with full df

In [None]:
# Check every cell of every column for being an int or float instance.
# Despite its name, unwanted_dict maps column name -> True if ALL values of the
# column are numbers (i.e. True marks a usable column).

unwanted_dict = dict(
    df.applymap(lambda value: isinstance(value, (int, float))).all()
)

In [None]:
# Set up results dictionary (column name -> reason); these columns should be removed.

unwanted_columns = {
    column: "does not contain numbers only"
    for column, only_numbers in unwanted_dict.items()
    if not only_numbers
}
len(unwanted_columns)

##### 2c) Get columns which have inf values, done with full df

In [None]:
# Get columns which should be removed because they contain infinite values.
# Only the purely numeric columns are checked. They are cast to float32 first, so
# values that overflow float32 are turned into +/-inf by the cast and are flagged too.

numeric_columns = [col for col, only_numbers in unwanted_dict.items() if only_numbers]
helper_df = df[numeric_columns].astype(np.float32)

# np.isinf detects both +inf and -inf (checking against [np.inf] alone misses -inf).
has_inf = np.isinf(helper_df).any()

for col in has_inf[has_inf].index:
    # Keep an already recorded reason for this column, if there is one.
    unwanted_columns.setdefault(col, "has inf values")
len(unwanted_columns)

##### 2d) Get columns which have NaN values, done with full df

In [None]:
# Flag every column of the full data frame that contains at least one NaN value.

has_nan = df.isna().any()

for col in has_nan[has_nan].index:
    # Columns that were already flagged keep their first reason.
    unwanted_columns.setdefault(col, "has NaN values")
len(unwanted_columns)

##### 2e) Get columns which only have one unique value, done with train_df

In [None]:
# Flag every column that has exactly one unique value within the train set.

unique_counts = train_df.nunique()

for col in unique_counts[unique_counts == 1].index:
    # Columns that were already flagged keep their first reason.
    unwanted_columns.setdefault(col, "nunique()==1")
len(unwanted_columns)

##### 2f) Remove columns which were so far selected for elimination

In [None]:
# Drop all flagged columns, but always retain the columns listed in keep.

remaining_columns = [
    col for col in df.columns
    if col in keep or col not in unwanted_columns
]
df = df.filter(items=remaining_columns, axis=1)
df.shape

In [None]:
# Rebuild the train data frame from the filtered df so that it has the same columns.

train_df = df.loc[df["set_assignment"].eq("train")]
train_df.shape

##### 2g) Check linear correlation between features

In [None]:
# Reduce the train data frame to the descriptor columns, indexed by compound name.
# A column is dropped if it is neither a mordred/RDKit feature nor "Compound".

to_be_droped = [
    col for col in train_df.columns
    if not ("mordred__" in col or "rdkit__" in col or col == "Compound")
]

df_2 = train_df.drop(columns=to_be_droped).set_index("Compound")
df_2.shape

In [None]:
# Pairwise Pearson correlation (the pandas default) between all feature columns.

df_corr = df_2.corr(method="pearson")
df_corr.shape

In [None]:
# Greedy correlation filter: walk through the features in column order; every
# feature that is still kept eliminates all later-or-earlier features whose absolute
# correlation with it exceeds CUTOFF and which are not eliminated yet.

CUTOFF = 0.8

unwanted = []

for f1 in df_corr.columns:
    if f1 in unwanted:
        # Already eliminated features do not eliminate others.
        continue
    for f2, value in zip(df_corr.columns, df_corr[f1]):
        if f2 == f1 or f2 in unwanted:
            continue
        if abs(value) > CUTOFF:
            unwanted.append(f2)
len(unwanted)

##### 2h) Remove columns which were selected for elimination

In [None]:
# Drop the correlated features and the set assignment; "Compound" is always kept.
# Note: this cell appends to unwanted, so it is meant to be run once per pass.

keep = ["Compound"]
unwanted.append("set_assignment")

remaining_columns = [col for col in df.columns if col in keep or col not in unwanted]
df = df.filter(items=remaining_columns, axis=1)
df.shape

##### 2i) Save data

In [None]:
# Store the remaining column names under the key "first_feature_selection".
with open("FIA49k_2d_descriptors_selection.json", "w") as f:
    json.dump({"first_feature_selection": list(df.columns)}, f, indent=4)

# 3) Second feature selection

##### 3a) Read and prepare data

In [None]:
# Load the first feature selection, the descriptor table, and the FIA49k table.
# NOTE(review): absolute local paths; adjust them to your environment.

with open(r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k_2d_descriptors_selection.json", "r") as f:
    first_selection = json.load(f)["first_feature_selection"]

final_df = pd.read_csv(
    r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k_2d_descriptors.csv.gz",
    low_memory=False,
)
# Keep only the columns that survived the first feature selection.
final_df = final_df.drop(
    columns=[col for col in final_df.columns if col not in first_selection]
)

df2 = pd.read_csv(r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k.csv.gz")

(final_df.shape, df2.shape, len(first_selection))

In [None]:
# Attach set assignment and both FIA targets, then shuffle the rows reproducibly.

final_df = (
    final_df
    .merge(
        df2[["Compound", "set_assignment", "fia_gas-DSDBLYP", "fia_solv-DSDBLYP"]],
        on="Compound",
    )
    .sample(frac=1, random_state=100)
)
final_df.shape

In [None]:
# Split into train and test rows and extract targets and compound names.

X_train = final_df[final_df["set_assignment"].eq("train")]
X_test = final_df[final_df["set_assignment"].eq("test")]

# Targets: FIA in the gas phase and in solution.
y_train_gas, y_train_solv = X_train["fia_gas-DSDBLYP"], X_train["fia_solv-DSDBLYP"]
y_test_gas, y_test_solv = X_test["fia_gas-DSDBLYP"], X_test["fia_solv-DSDBLYP"]

compounds_train = X_train["Compound"].tolist()
compounds_test = X_test["Compound"].tolist()

# Every column that is not a mordred or RDKit descriptor is dropped in the next cell.
to_be_droped = [
    col for col in final_df.columns
    if "mordred__" not in col and "rdkit__" not in col
]

In [None]:
# Reduce both sets to the feature columns.

X_train = X_train.drop(columns=to_be_droped)
X_test = X_test.drop(columns=to_be_droped)

(X_train.shape, X_test.shape)

##### 3b) Standard-scale features

In [None]:
# Standard-scale the features; the scaler is fitted on the train set only.

# Remember the column names, because transform() returns plain arrays.
feature_names = X_train.columns.tolist()

scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

##### 3c) Do LassoCV

In [None]:
# 5-fold cross-validated Lasso over 500 alpha values, fitted on the solvation FIA.

lasso = LassoCV(
    cv=5,
    n_alphas=500,
    max_iter=10000,
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

lasso.fit(X_train, y_train_solv)

In [None]:
# Read the alpha found by LassoCV and fit a final Lasso model with it (not required).
# NOTE(review): max_iter is left at the Lasso default here, whereas LassoCV was run
# with max_iter=10000 - confirm that this model converges.

best_alpha = lasso.alpha_
print(f"Found alpha value: {best_alpha}")

model = Lasso(alpha=best_alpha)
model.fit(X=X_train, y=y_train_solv)

In [None]:
# Get and visualize prediction results (not relevant).

predictions = model.predict(X_test)

# The Lasso model was fitted on the solvation FIA, so both metrics must be
# evaluated against y_test_solv (r2 was previously computed against y_test_gas).
print(f"MAE: {round(mean_absolute_error(y_test_solv, predictions), 2)} kJ/mol")
print(f"r2: {round(pearsonr(y_test_solv, predictions)[0]**2, 4)}")

# Scatter plot of predicted vs. DFT-calculated values.
fig = sns.scatterplot(
    x=y_test_solv,
    y=predictions
)

fig.set(
    xlabel="FIA calculated by DFT [kJ/mol]",
    ylabel="FIA prediction [kJ/mol]",
    title="Predicted FIA vs. DFT FIA"
)
fig

##### 3d) Do feature selection

In [None]:
# Feature importance = absolute Lasso coefficient, sorted from largest to smallest.

importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": np.abs(model.coef_)}
).sort_values(by="Importance", ascending=False)

print(f"n features:                  {len(feature_names)}")
print(f"n features with coef <0.1:   {(importance_df['Importance'] < 0.1).sum()}")

In [None]:
# Print every feature with its importance (already sorted in descending order).

for importance, feature in zip(importance_df["Importance"], importance_df["Feature"]):
    print(importance, "\t", feature)

In [None]:
# Columns to remove: features with an importance below 0.1 plus the set assignment
# and both target columns.

unwanted = importance_df.loc[importance_df["Importance"] < 0.1, "Feature"].tolist()

unwanted.extend(["set_assignment", "fia_gas-DSDBLYP", "fia_solv-DSDBLYP"])

In [None]:
# Drop the unwanted columns and restore the ordering by compound name.

remaining_columns = [col for col in final_df.columns if col not in unwanted]
final_df = final_df.filter(items=remaining_columns, axis=1).sort_values(by="Compound")
final_df.shape

##### 3e) Save data

In [None]:
# Save data to json file.
# The file already holds "first_feature_selection", which is read at the start of
# this chapter. The existing content is therefore loaded and extended; writing a
# fresh dictionary would drop that key and break a re-run of chapter 3.

selection_file = r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k_2d_descriptors_selection.json"

try:
    with open(selection_file, "r") as f:
        selections = json.load(f)
except FileNotFoundError:
    # No selection file yet: start with an empty dictionary.
    selections = {}

selections["second_feature_selection"] = list(final_df.columns)

with open(selection_file, "w") as f:
    json.dump(selections, f, indent=4)

# 4) Dimensionality reduction with PLS and UMAP

##### 4a) Load data

In [None]:
# Load the second feature selection, the descriptor table, and the FIA49k table.
# NOTE(review): absolute local paths; adjust them to your environment.

with open(r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k_2d_descriptors_selection.json", "r") as f:
    selected_features = json.load(f)["second_feature_selection"]

df = pd.read_csv(
    r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k_2d_descriptors.csv.gz",
    low_memory=False,
)
# Keep only the selected columns.
df = df.drop(columns=[col for col in df.columns if col not in selected_features])

df2 = pd.read_csv(r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k.csv.gz")

(len(selected_features), df.shape, df2.shape)

In [None]:
# Keep only compounds of the train and test sets in both data frames.

df2 = df2.loc[df2["set_assignment"].isin(["train", "test"])]
df = df.loc[df["Compound"].isin(df2["Compound"])]

(df.shape, df2.shape)

In [None]:
# Formatting: bring both data frames into the same compound order.

df = df.sort_values(by="Compound")
df2 = df2.sort_values(by="Compound")

# The embeddings computed from df are later written into df2 row by row, so both
# frames must contain exactly the same compounds in the same order. Comparing the
# full lists also catches a length mismatch, which a zip-based loop would silently
# truncate; stop here instead of continuing with misaligned data.
if list(df["Compound"]) != list(df2["Compound"]):
    raise ValueError("Error: df and df2 do not contain the same compounds in the same order.")

df = df.reset_index(drop=True)
df2 = df2.reset_index(drop=True)

df.shape, df2.shape

In [None]:
# Select the train rows of df2 and the matching compounds of df.

df2_train = df2[df2["set_assignment"].eq("train")]
df_train = df.loc[df["Compound"].isin(df2_train["Compound"])]

(df_train.shape, df2_train.shape)

In [None]:
# Check if data frames have same order.

df_train = df_train.sort_values(by="Compound")
df2_train = df2_train.sort_values(by="Compound")

# Features (df_train) and targets (df2_train) are paired by row position when the
# PLS models are fitted, so both frames must list exactly the same compounds in the
# same order. Comparing the full lists also catches a length mismatch, which a
# zip-based loop would silently truncate.
if list(df_train["Compound"]) != list(df2_train["Compound"]):
    raise ValueError("Error: df_train and df2_train do not contain the same compounds in the same order.")

df_train = df_train.reset_index(drop=True)
df2_train = df2_train.reset_index(drop=True)

df_train.shape, df2_train.shape

##### 4b) Do supervised PLS regression

In [None]:
# Fit one 25-component PLS reducer per target (gas-phase and solvation FIA) on all
# feature columns of the train set.

train_features = df_train.drop(columns=["Compound"])

pls_reducer_gas = PLSRegression(n_components=25)
pls_reducer_gas.fit(train_features, df2_train["fia_gas-DSDBLYP"])

pls_reducer_solv = PLSRegression(n_components=25)
pls_reducer_solv.fit(train_features, df2_train["fia_solv-DSDBLYP"])

In [None]:
# Project the train set features with both fitted PLS reducers.

pls_embedding_train_gas = pls_reducer_gas.transform(df_train.drop(columns=["Compound"]))
pls_embedding_train_solv = pls_reducer_solv.transform(df_train.drop(columns=["Compound"]))

##### 4c) Do UMAP

In [None]:
# Fit one UMAP reducer (fixed random_state) per target on the train set PLS embeddings.

umap_reducer_gas = umap.UMAP(random_state=42).fit(pls_embedding_train_gas)
print("Reducer_gas done.")

umap_reducer_solv = umap.UMAP(random_state=42).fit(pls_embedding_train_solv)
print("Reducer_solv done.")

In [None]:
# Embed all rows of df: first the PLS projection, then the UMAP projection.

pls_embedding_gas = pls_reducer_gas.transform(df.drop(columns=["Compound"]))
umap_embedding_gas = umap_reducer_gas.transform(pls_embedding_gas)

pls_embedding_solv = pls_reducer_solv.transform(df.drop(columns=["Compound"]))
umap_embedding_solv = umap_reducer_solv.transform(pls_embedding_solv)

In [None]:
# Store both UMAP coordinates of each embedding as columns of df2
# ("PLS-UMAP_1_gas", "PLS-UMAP_2_gas", "PLS-UMAP_1_solv", "PLS-UMAP_2_solv").

for phase, embedding in (("gas", umap_embedding_gas), ("solv", umap_embedding_solv)):
    for axis in (0, 1):
        df2[f"PLS-UMAP_{axis + 1}_{phase}"] = list(embedding[:, axis])

In [None]:
# Scatter plot of the gas-phase PLS-UMAP embedding, colored by central atom class.

plt.rcParams["figure.dpi"] = 300

# Order in which the central atom classes are mapped to the palette colors.
order = (
    ["B(III)", "Al(III)", "Ga(III)", "In(III)"]
    + ["Si(II)", "Ge(II)", "Sn(II)", "Pb(II)"]
    + ["Si(IV)", "Ge(IV)", "Sn(IV)", "Pb(IV)"]
    + ["P(III)", "As(III)", "Sb(III)", "Bi(III)"]
    + ["P(V)", "As(V)", "Sb(V)", "Bi(V)"]
    + ["Te(IV)"]
)

ax = sns.scatterplot(
    x="PLS-UMAP_1_gas",
    y="PLS-UMAP_2_gas",
    hue="ca_class",
    hue_order=order,
    palette=px.colors.qualitative.Alphabet,
    alpha=0.6,
    data=df2,
)

# This panel is shown without a legend.
ax.legend_.remove()

ax.set_xlabel("UMAP 1", size=14)
ax.set_ylabel("UMAP 2", size=14)

# Hide tick labels and tick marks on both axes.
ax.set(xticklabels=[], yticklabels=[])
ax.tick_params(left=False, bottom=False)

# Marker size of the scatter points.
ax.collections[0].set_sizes([10])

ax

In [None]:
# Make figure (solv): scatter plot of the solvation PLS-UMAP embedding, colored by
# central atom class.

plt.rcParams["figure.dpi"] = 300

# Order in which the central atom classes are mapped to the palette colors.
order = (
    ["B(III)", "Al(III)", "Ga(III)", "In(III)"]
    + ["Si(II)", "Ge(II)", "Sn(II)", "Pb(II)"]
    + ["Si(IV)", "Ge(IV)", "Sn(IV)", "Pb(IV)"]
    + ["P(III)", "As(III)", "Sb(III)", "Bi(III)"]
    + ["P(V)", "As(V)", "Sb(V)", "Bi(V)"]
    + ["Te(IV)"]
)

ax = sns.scatterplot(
    x="PLS-UMAP_1_solv",
    y="PLS-UMAP_2_solv",
    hue="ca_class",
    hue_order=order,
    palette=px.colors.qualitative.Alphabet,
    alpha=0.6,
    data=df2,
)

# Place the legend at the bottom of the axes, arranged in seven columns.
sns.move_legend(
    ax,
    "lower center",
    ncol=7,
    title="Central atom class",
    fontsize="small",
    handletextpad=0,
    columnspacing=1,
)

ax.set_xlabel("UMAP 1", size=14)
ax.set_ylabel("UMAP 2", size=14)

# Hide tick labels and tick marks on both axes.
ax.set(xticklabels=[], yticklabels=[])
ax.tick_params(left=False, bottom=False)

# Marker size of the scatter points.
ax.collections[0].set_sizes([10])

# Fixed y-range of the plot.
plt.ylim(-10, 15)

ax

In [None]:
# Interactive version of the gas-phase embedding plot, colored by central atom class.
px.scatter(
    data_frame=df2,
    x="PLS-UMAP_1_gas",
    y="PLS-UMAP_2_gas",
    color="ca_class",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)

# 5) LGBMRegressor

##### 5a) Load and prepare data

In [None]:
# Load the second feature selection, the descriptor table, and the FIA49k table.
# NOTE(review): absolute local paths; adjust them to your environment.

with open(r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k_2d_descriptors_selection.json", "r") as f:
    selected_features = json.load(f)["second_feature_selection"]

final_df = pd.read_csv(
    r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k_2d_descriptors.csv.gz",
    low_memory=False,
)
# Keep only the selected columns.
final_df = final_df.drop(
    columns=[col for col in final_df.columns if col not in selected_features]
)

df2 = pd.read_csv(r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k.csv.gz")

(len(selected_features), final_df.shape, df2.shape)

In [None]:
# Attach set assignment and both FIA targets to the feature table.

final_df = pd.merge(
    final_df,
    df2[["Compound", "set_assignment", "fia_gas-DSDBLYP", "fia_solv-DSDBLYP"]],
    on="Compound",
)
final_df.shape

In [None]:
# Split into train, validation, and test rows and extract targets and compound names.

X_train = final_df[final_df["set_assignment"].eq("train")]
X_validate = final_df[final_df["set_assignment"].eq("validate")]
X_test = final_df[final_df["set_assignment"].eq("test")]

# Targets: FIA in the gas phase and in solution.
y_train_gas, y_train_solv = X_train["fia_gas-DSDBLYP"], X_train["fia_solv-DSDBLYP"]
y_validate_gas, y_validate_solv = X_validate["fia_gas-DSDBLYP"], X_validate["fia_solv-DSDBLYP"]
y_test_gas, y_test_solv = X_test["fia_gas-DSDBLYP"], X_test["fia_solv-DSDBLYP"]

compounds_train = X_train["Compound"].tolist()
compounds_validate = X_validate["Compound"].tolist()
compounds_test = X_test["Compound"].tolist()

# Every column that is not a mordred or RDKit descriptor is dropped in the next cell.
to_be_droped = [
    col for col in final_df.columns
    if "mordred__" not in col and "rdkit__" not in col
]

In [None]:
# Reduce all three sets to the feature columns and report the resulting shapes.

X_train = X_train.drop(columns=to_be_droped)
X_validate = X_validate.drop(columns=to_be_droped)
X_test = X_test.drop(columns=to_be_droped)

print("Data shapes")
print("train:      ", X_train.shape, y_train_gas.shape, y_train_solv.shape)
print("validate:   ", X_validate.shape, y_validate_gas.shape, y_validate_solv.shape)
print("test:       ", X_test.shape, y_test_gas.shape, y_test_solv.shape)

In [None]:
# List the names of all features that go into the model, one per line.

for column_name in list(X_train.columns):
    print(column_name)

##### 5b) FIA_gas model

In [None]:
# Train LGBMRegressor with FIA_gas; the validation set is passed as eval_set and
# early stopping is configured with 15 rounds.
# NOTE(review): "r2" does not appear to be one of LightGBM's built-in metric names -
# confirm which metric actually drives early stopping.

lgbm_gas = LGBMRegressor(
    verbose=2,
    n_estimators=2000,
    early_stopping_round=15,
)

validation_sets_gas = [(X_validate, y_validate_gas)]

lgbm_gas.fit(X_train, y_train_gas, eval_set=validation_sets_gas, eval_metric="r2")

In [None]:
# Predict the gas-phase FIA of the test set, report MAE and squared Pearson r, and
# plot predictions against the DFT values.

predictions_gas = lgbm_gas.predict(X_test)

print(f"MAE_gas: {round(mean_absolute_error(y_test_gas, predictions_gas), 3)} kJ/mol")
print(f"r2_gas: {round(pearsonr(y_test_gas, predictions_gas)[0]**2, 4)}")

fig = sns.scatterplot(x=y_test_gas, y=predictions_gas)

fig.set(
    title="Predicted FIA vs. DFT FIA",
    xlabel="FIA calculated by DFT [kJ/mol]",
    ylabel="FIA prediction [kJ/mol]",
)
fig

In [None]:
# Pickle the fitted gas-phase model to the working directory.

with open("lgbm_regressor_mol_desc_fia_gas.pkl", mode="wb") as f:
    pickle.dump(lgbm_gas, f)

##### 5c) FIA_solv model

In [None]:
# Train LGBMRegressor with FIA_solv; the validation set is passed as eval_set and
# early stopping is configured with 15 rounds.
# NOTE(review): "r2" does not appear to be one of LightGBM's built-in metric names -
# confirm which metric actually drives early stopping.

lgbm_solv = LGBMRegressor(
    verbose=2,
    n_estimators=2000,
    early_stopping_round=15,
)

validation_sets_solv = [(X_validate, y_validate_solv)]

lgbm_solv.fit(X_train, y_train_solv, eval_set=validation_sets_solv, eval_metric="r2")

In [None]:
# Predict the solvation FIA of the test set, report MAE and squared Pearson r, and
# plot predictions against the DFT values.

predictions_solv = lgbm_solv.predict(X_test)

print(f"MAE_solv: {round(mean_absolute_error(y_test_solv, predictions_solv), 3)} kJ/mol")
print(f"r2_solv: {round(pearsonr(y_test_solv, predictions_solv)[0]**2, 4)}")

fig = sns.scatterplot(x=y_test_solv, y=predictions_solv)

fig.set(
    title="Predicted FIA vs. DFT FIA",
    xlabel="FIA calculated by DFT [kJ/mol]",
    ylabel="FIA prediction [kJ/mol]",
)
fig

In [None]:
# Pickle the fitted solvation model to the working directory.

with open("lgbm_regressor_mol_desc_fia_solv.pkl", mode="wb") as f:
    pickle.dump(lgbm_solv, f)

# 6) SHAP analysis

##### 6a) Load and prepare data

In [None]:
# Execute all cells given above under 5a) first.
# They define X_test, which is passed to the SHAP explainer in 6c).

##### 6b) Load trained model

In [None]:
# Load the pickled solvation model and display its fitted flag.
# NOTE: pickle.load executes arbitrary code from the file; only load model files
# that you created yourself or otherwise trust.

with open("lgbm_regressor_mol_desc_fia_solv.pkl", mode="rb") as f:
    loaded_model = pickle.load(f)

loaded_model.fitted_

##### 6c) Get Shapley values

In [None]:
# Build a TreeExplainer for the loaded model and compute SHAP values for the test set.

explainer = shap.TreeExplainer(model=loaded_model)
shap_values = explainer(X_test)

In [None]:
# Bar-type SHAP summary plot limited to 50 features.

shap.summary_plot(
    shap_values,
    max_display=50,
    plot_type="bar",
    plot_size=(10, 15),
)

# 7) Analysis of the BCUTpe-1l descriptor

##### 7a) Load data

In [None]:
# Load the second feature selection, the descriptor table, and the FIA49k table.
# NOTE(review): absolute local paths; adjust them to your environment.

with open(r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k_2d_descriptors_selection.json", "r") as f:
    selected_features = json.load(f)["second_feature_selection"]

df = pd.read_csv(
    r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k_2d_descriptors.csv.gz",
    low_memory=False,
)
# Keep only the selected columns.
df = df.drop(columns=[col for col in df.columns if col not in selected_features])

df2 = pd.read_csv(r"F:\FIA_GENERATION\for_publication\FINAL\FIA49k.csv.gz")

(len(selected_features), df.shape, df2.shape)

In [None]:
# Attach the BCUTpe-1l descriptor of every compound to the FIA49k table.
df2 = pd.merge(df2, df[["Compound", "mordred__BCUTpe-1l"]], on="Compound")
df2.shape

##### 7b) Correlate the BCUTpe-1l descriptor with the DFT-calculated Mulliken charge of the central atom of the Lewis acid

In [None]:
# Squared Pearson correlation between the BCUTpe-1l descriptor and the Mulliken
# charge column, computed separately for every central atom class and sorted from
# highest to lowest.

all_r2 = {
    ca_class: pearsonr(
        group["mordred__BCUTpe-1l"],
        group["la-ca_mulliken_charge-PBEh3c"],
    )[0] ** 2
    for ca_class, group in df2.groupby("ca_class")
}

all_r2 = dict(sorted(all_r2.items(), key=lambda item: item[1], reverse=True))
all_r2

In [None]:
# Mean of the class-wise squared correlations.

np.mean(tuple(all_r2.values()))

In [None]:
# Make plot I: one regression panel per central atom class (three per row), with
# panels ordered by decreasing squared correlation and independent axes.

g = sns.FacetGrid(
    data=df2,
    col="ca_class",
    col_order=list(all_r2),
    col_wrap=3,
    sharex=False,
    sharey=False,
)

g.map(
    sns.regplot,
    "mordred__BCUTpe-1l",
    "la-ca_mulliken_charge-PBEh3c",
    line_kws={"color": "darkred"},
)

g.set_xlabels("BCUTpe-1l descriptor")
g.set_ylabels("Central atom Mulliken charge / e")
g.set_titles(col_template="{col_name}", row_template="{row_name}", size=16)

In [None]:
# Make plot II: regression plot for a single central atom class.

CA_CLASS = "Bi(III)"

fig = sns.regplot(
    data=df2.loc[df2["ca_class"] == CA_CLASS],
    x="mordred__BCUTpe-1l",
    y="la-ca_mulliken_charge-PBEh3c",
    line_kws={"color": "darkred"},
)

fig.set(
    title=f"Correlation plot {CA_CLASS}-based molecules ",
    xlabel="BCUTpe-1l descriptor",
    ylabel="Central atom Mulliken charge / e",
)
fig