# Installation

For BO environment  
- pip install -r requirement.txt

For magpie descriptor  
- install magpie java version: https://wolverton.bitbucket.io/tutorial.html
- put magpie-latest folder in the same folder of src
- mv compound.in into magpie-latest folder

# Data preprocessing

## Composition information preprocessing

In [1]:
import re, os, shutil, json
import numpy as np
import pandas as pd

In [None]:
def separate_atoms(molecule):
    split_molecule = re.findall(r'[A-Z][a-z]?(?:\d+\.\d+|\d+(?:\.\d+)?)?', molecule)
    #print(split_molecule)
    result = dict()
    eles = []
    for split in split_molecule:
        m = re.match('([A-Z][a-z]?)(\d+(\.\d*)?)?', split)
        element = m.group(1)
        count = float(m.group(2)) if m.group(2) else 1
        eles.append(element)
        #result.append([element, str(count)])
        result[element] = count
    return [eles,result]

def write_magpie(select_set, proplist, trianset=True):
    if trianset:
        file_name = 'chen_composition_train.txt'
    else:
        file_name = 'chen_composition_pred.txt'
    with open(f'./Dataprocessing/{file_name}', 'w') as f:
        f.write(f'name   {"   ".join([prop for prop in proplist])}\n')
        for num in range(len(select_set)):
            # print(select_set['Composition'].iloc[num])
            ele, res = separate_atoms(select_set['composition'].iloc[num])
            for j in ele:
                f.write(f'{j},{res[j]},')
            if trianset:
                f.write(f'    {"    ".join([str(list(select_set[prop])[num] if prop in select_set.columns else "None") for prop in proplist ])}\n')
            else:
                f.write(f'    {"    ".join(["None" for prop in proplist])}\n')

def magpie_to_csv(root, sets, fileN='chen_composition'):

    for train in sets:
        if train:
            file_name = f'{fileN}_train'
        else:
            file_name = f'{fileN}_pred'
            
        if os.path.exists(f'{root}/Dataprocessing/{file_name}.txt'):
            shutil.copyfile(f'{root}/Dataprocessing/{file_name}.txt', f'{root}/Dataprocessing/processing_data.txt')
            os.chdir('./magpie-latest')
            os.system('java -jar dist/Magpie.jar compound.in')
            shutil.copyfile(f'{root}/Dataprocessing/processing_data.json', f'{root}/Dataprocessing/{file_name}.json')
            os.chdir(root)
    
    ### json data --> csv
    
    for train in sets:
        if train:
            file_name = f'{fileN}_train'
        else:
            file_name = f'{fileN}_pred'
    
        if os.path.exists(f'{root}/Dataprocessing/{file_name}.json'):
        
            with open(f'{root}/Dataprocessing/{file_name}.json') as f:
               jsdata = json.load(f)
        
            all_info = []
            for i in jsdata['entries']:
                if train:
                    info = [i['composition']]+i['attributes']+[i['properties'][num]['measured'] if 'measured' in i['properties'][num].keys() else "None" for num in range(len(i['properties']))]
                else:
                    info = [i['composition']]+i['attributes']+['None'  for num in range(len(i['properties']))]
                all_info.append(info)
            print(all_info[0])
            print(len(all_info))
            
            infodf = pd.DataFrame(all_info, columns = ['composition']+jsdata['attribute-names']+[jsdata['properties'][num]['name'] for  num in range(len(jsdata['properties']))])
            infodf.to_csv(f'{root}/Dataprocessing/{file_name}.csv', index=False)

In [None]:
### file should contain at least a column of composition information and a column of properties. If for candidate data, no property is available, each line in the column can be filled with 'None'.

### specify the data path and name of properties
root = os.getcwd()
database_name = os.getcwd()+'/Dataprocessing/chen_data.csv'
alldata = pd.read_csv(database_name) #, sep='\s+')
_prop = ['formation_enthalpy(eV/atom)']
os.chdir(root)

### write magpie file for train and predict (if exist) set
predict_set = alldata[alldata[_prop].isnull().any(axis=1)]
if not predict_set.empty:
    write_magpie(predict_set, _prop, trianset=False)
    sets = [True, False]
else:
    sets = [True]
    
train_set = alldata[alldata[_prop].notnull().all(axis=1)]

write_magpie(train_set, _prop, trianset=True)

magpie_to_csv(root, sets)

## Candidates generation

In [None]:
import numpy as np
from tqdm import tqdm
import pickle

### Candidate ratio generation
step_size = 0.05
values = np.arange(0, 1 + step_size, step_size)
num_elements = 11

def find_combinations(num_elements, target_sum, current_combination, results):
    if num_elements == 1:
        if np.isclose(target_sum, round(target_sum, 2)) and target_sum >= 0 and target_sum <= 1:
            results.append(current_combination + [round(target_sum, 2)])
        return

    for value in values:
        if value > target_sum:
            break
        find_combinations(num_elements - 1, target_sum - value, current_combination + [value], results)

results = []
find_combinations(num_elements, 1.0, [], results)

with open('combinations_11.pickle', 'wb') as f:
    pickle.dump(results, f)

In [None]:
def generate_combinations(n, total, combination):
    if n == 1:
        yield combination + [total]
    else:
        for i in range(total + 1):
            yield from generate_combinations(n - 1, total - i, combination + [i])

combinations_list = list(generate_combinations(11, 20, []))

print(len(combinations_list))

In [None]:
### Prepare the files for Magpie of candidates
import os, pickle
import numpy as np

#[LiCl, NaCl, KCl, RbCl, CsCl, MgCl2, CaCl2, SrCl2, BaCl2, ZnCl2, ZrCl4]
comp_list = ['Li', 'Na', 'K', 'Rb', 'Cs', 'Mg', 'Ca', 'Sr', 'Ba', 'Zn', 'Zr']
coord_list = np.array([1,1,1,1,1,2,2,2,2,2,4])

### set the temp to be 1000
def write_data(npdf, name, comp_list=comp_list, coord_list=coord_list):
    with open(name+'.txt', 'w') as f:
        f.write('name   Temp   denstiy\n')
        for i in npdf:
            f.write(''.join([f'{comp_list[num]},{round(i[num],3)},' for num in range(len(comp_list))]))
            f.write(f"Cl,{round(np.sum(np.array(i)*coord_list),3)},   ")
            f.write(f"1000   None\n")
    with open('finish.txt', 'w') as f:
        f.write('Finished!')

In [None]:
with open('combinations_11.pickle', 'rb') as f:
    combines = pickle.load(f)

write_data(combines, 'chen_magpie_candidates_train')
sets = [True]
magpie_to_csv(root, sets=sets, fileN='chen_magpie_candidates')

In [None]:
### Candidate separation code for those very large candidate file
import pandas as pd
import os

data = pd.read_csv('./Dataprocessing/chen_magpie_candidates_train.csv')

sepreate_num = 20
sep_grid = int(len(data)/sepreate_num)+1
for i in range(20):
    next = min((i+1)*sep_grid, len(data))
    new_data = data[i*sep_grid:next]
    new_data.to_csv(f'{os.getcwd()}/sep_cand_{i+1}.csv', index=False)

Then move the data from Dataprocessing to Data folder

# Closed pool test

In [1]:
import os, time
from src.bayesian_optimization import BayesianOptimization
import pandas as pd
import matplotlib.pyplot as plt

root = os.getcwd()
os.popen(f'rm -r {root}/performance_record.txt {root}/model_weights')

target_props = ['B0', 'Ms']
feature_props = ['NComp', 'Comp_L3Norm', 'Comp_L5Norm', 'Comp_L7Norm', 'Comp_L10Norm', 'mean_Number', 'maxdiff_Number', 'dev_Number', 'max_Number', 'min_Number', 'most_Number', 'mean_MendeleevNumber', 'maxdiff_MendeleevNumber', 'dev_MendeleevNumber', 'max_MendeleevNumber', 'min_MendeleevNumber', 'most_MendeleevNumber', 'mean_AtomicWeight', 'maxdiff_AtomicWeight', 'dev_AtomicWeight', 'max_AtomicWeight', 'min_AtomicWeight', 'most_AtomicWeight', 'mean_MeltingT', 'maxdiff_MeltingT', 'dev_MeltingT', 'max_MeltingT', 'min_MeltingT', 'most_MeltingT', 'mean_Column', 'maxdiff_Column', 'dev_Column', 'max_Column', 'min_Column', 'most_Column', 'mean_Row', 'maxdiff_Row', 'dev_Row', 'max_Row', 'min_Row', 'most_Row', 'mean_CovalentRadius', 'maxdiff_CovalentRadius', 'dev_CovalentRadius', 'max_CovalentRadius', 'min_CovalentRadius', 'most_CovalentRadius', 'mean_Electronegativity', 'maxdiff_Electronegativity', 'dev_Electronegativity', 'max_Electronegativity', 'min_Electronegativity', 'most_Electronegativity', 'mean_NsValence', 'maxdiff_NsValence', 'dev_NsValence', 'max_NsValence', 'min_NsValence', 'most_NsValence', 'mean_NpValence', 'maxdiff_NpValence', 'dev_NpValence', 'max_NpValence', 'min_NpValence', 'most_NpValence', 'mean_NdValence', 'maxdiff_NdValence', 'dev_NdValence', 'max_NdValence', 'min_NdValence', 'most_NdValence', 'mean_NfValence', 'maxdiff_NfValence', 'dev_NfValence', 'max_NfValence', 'min_NfValence', 'most_NfValence', 'mean_NValance', 'maxdiff_NValance', 'dev_NValance', 'max_NValance', 'min_NValance', 'most_NValance', 'mean_NsUnfilled', 'maxdiff_NsUnfilled', 'dev_NsUnfilled', 'max_NsUnfilled', 'min_NsUnfilled', 'most_NsUnfilled', 'mean_NpUnfilled', 'maxdiff_NpUnfilled', 'dev_NpUnfilled', 'max_NpUnfilled', 'min_NpUnfilled', 'most_NpUnfilled', 'mean_NdUnfilled', 'maxdiff_NdUnfilled', 'dev_NdUnfilled', 'max_NdUnfilled', 'min_NdUnfilled', 'most_NdUnfilled', 'mean_NfUnfilled', 'maxdiff_NfUnfilled', 'dev_NfUnfilled', 'max_NfUnfilled', 'min_NfUnfilled', 'most_NfUnfilled', 'mean_NUnfilled', 'maxdiff_NUnfilled', 'dev_NUnfilled', 'max_NUnfilled', 'min_NUnfilled', 'most_NUnfilled', 'mean_GSvolume_pa', 'maxdiff_GSvolume_pa', 'dev_GSvolume_pa', 'max_GSvolume_pa', 'min_GSvolume_pa', 'most_GSvolume_pa', 'mean_GSbandgap', 'maxdiff_GSbandgap', 'dev_GSbandgap', 'max_GSbandgap', 'min_GSbandgap', 'most_GSbandgap', 'mean_GSmagmom', 'maxdiff_GSmagmom', 'dev_GSmagmom', 'max_GSmagmom', 'min_GSmagmom', 'most_GSmagmom', 'mean_SpaceGroupNumber', 'maxdiff_SpaceGroupNumber', 'dev_SpaceGroupNumber', 'max_SpaceGroupNumber', 'min_SpaceGroupNumber', 'most_SpaceGroupNumber', 'frac_sValence', 'frac_pValence', 'frac_dValence', 'frac_fValence', 'CanFormIonic', 'MaxIonicChar', 'MeanIonicChar']
# data_file = 'Data/chen_comp_train_20.csv'
data_file = 'Data/quat_4magpie.csv'

# available models: 
# ['Ridge', 'Lasso', 'ElasticNet', 'KNeighborsRegressor', 'DecisionTreeRegressor', 'RandomForest', 'SVR', 'MLPRegressor', 'GradientBoostingRegressor', 'AdaBoostRegressor', 'ExtraTreesRegressor', 'XGBoost', 'LightGBM', 'GaussianProcess', 'FastKAN'] 

model_list = ['Ridge', 'Lasso', 'ElasticNet', 'DecisionTreeRegressor', 'RandomForest', 'SVR', 'MLPRegressor']
candi_data_file = 'Data/quat_4magpie.csv'
BO = BayesianOptimization(target_props, data_file=data_file, feature_props=feature_props, model_list=model_list, acq_method='ucb', stacking=True, close_pool=True)#, select_region=[[2.200],[2.201]])
# BO.close_pooling_test(n_bootstrap_sample_nums=20, n_iter=100, batch_size=10, hpar=0.1)

rm: cannot remove '/mnt/Database/Yixuan/TMM_BO_project_Chen/performance_record.txt': No such file or directory
rm: cannot remove '/mnt/Database/Yixuan/TMM_BO_project_Chen/model_weights': No such file or directory
2024-11-26 14:28:28,933	INFO worker.py:1772 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


In [None]:
for i in range(1,2):
    # BO.close_pooling_test(n_bootstrap_sample_nums=10, n_iter=100, batch_size=10, hpar=0.2)
    
    file_path = f'{root}/performance_record.txt'
    data = pd.read_csv(file_path, sep=r'\t', engine='python')
    
    # 绘制图形
    fig, ax = plt.subplots(figsize=(10, 6))
    
    # 绘制 current_max_value 折线
    ax.plot(data['number_of_samples'], data['current_best_value'], label='Max Value in train set', marker='o', lw=3, markersize=10)
    
    ax.plot(data['number_of_samples'], data['best_value_of_this_iteration'], label='Max value found in each iteration', marker='o', lw=3, markersize=10)
    
    # 绘制 mean_value_of_this_iteration 折线及其置信区间
    ax.errorbar(data['number_of_samples'], data['mean_value_of_this_iteration'],
                yerr=data['std_of_this_iteration'], label='Mean Value of each iteration with Confidence Interval', marker='o', lw=3, capsize=5, markersize=10)
    
    # 添加标签和标题
    ax.set_xlabel('Number of Samples', size=20)
    ax.set_ylabel('Target value', size=20)
    ax.set_title('Algorithm Performance Over Iterations', size=20)
    ax.legend(prop=dict(size=14))
    ax.tick_params(axis='both', which='major', labelsize=16)
    #ax.xaxis.set_major_locator(plt.MaxNLocator(integer=True))
    ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%d'))
    #ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))
    ax.yaxis.set_major_formatter(plt.FormatStrFormatter('%.1f'))
    
    # 显示图形
    plt.grid(True)
    plt.savefig(f'{root}/performance_record.png', bbox_inches='tight', dpi=150)
    plt.show()

    os.popen(f'mv model_weights model_weights_{i}')
    os.popen(f'mv performance_record* model_weights_{i}')
    time.sleep(1)

# Training and prediction

## Calling TMMBO, training, loading model and predicting candidate set

In [2]:
import os
from src.bayesian_optimization import BayesianOptimization
from src.io import IOManager

root = os.getcwd()

target_props = ['density']
data_file = 'Data/chen_comp_train.csv'
# data_file = 'Data/chen_comp_region_train_10.csv'
model_list = ['Ridge', 'Lasso', 'ElasticNet', 'DecisionTreeRegressor', 'RandomForest', 'SVR', 'MLPRegressor', 'GradientBoostingRegressor', 'ExtraTreesRegressor', 'XGBoost', 'LightGBM']#, 'GaussianProcess']
candi_data_file = 'Data/chen_comp_train.csv'
BO = BayesianOptimization(data_file, target_props, model_list=model_list, acq_method='ucb', candidate_file=candi_data_file)#, select_region=[[2.200],[2.201]])
BO.optimize(batch_size=10, n_bootstrap_sample_nums=81, sampling_method='gaussian', num_candidate=100, n_samples=1000, iterations=5, hpar=0.1)

drop feature which is non numeric: composition
length of cleaned data: 400
used feature set: ['NComp', 'Comp_L2Norm', 'Comp_L3Norm', 'Comp_L5Norm', 'Comp_L7Norm', 'Comp_L10Norm', 'mean_Number', 'maxdiff_Number', 'dev_Number', 'max_Number', 'min_Number', 'most_Number', 'mean_MendeleevNumber', 'maxdiff_MendeleevNumber', 'dev_MendeleevNumber', 'max_MendeleevNumber', 'min_MendeleevNumber', 'most_MendeleevNumber', 'mean_AtomicWeight', 'maxdiff_AtomicWeight', 'dev_AtomicWeight', 'max_AtomicWeight', 'min_AtomicWeight', 'most_AtomicWeight', 'mean_MeltingT', 'maxdiff_MeltingT', 'dev_MeltingT', 'max_MeltingT', 'min_MeltingT', 'most_MeltingT', 'mean_Column', 'maxdiff_Column', 'dev_Column', 'max_Column', 'min_Column', 'most_Column', 'mean_Row', 'maxdiff_Row', 'dev_Row', 'max_Row', 'min_Row', 'most_Row', 'mean_CovalentRadius', 'maxdiff_CovalentRadius', 'dev_CovalentRadius', 'max_CovalentRadius', 'min_CovalentRadius', 'most_CovalentRadius', 'mean_Electronegativity', 'maxdiff_Electronegativity', 'dev

(array([[2.00000000e+00, 7.45355992e-01, 6.93361274e-01, ...,
         7.24240177e-01, 3.21884523e-01, 1.25000000e+03],
        [3.00000000e+00, 7.07815297e-01, 6.62063797e-01, ...,
         7.24240177e-01, 3.31226139e-01, 1.00000000e+03],
        [3.00000000e+00, 7.18688679e-01, 6.71639139e-01, ...,
         7.45613369e-01, 3.30599916e-01, 1.00000000e+03],
        ...,
        [4.00000000e+00, 7.00749920e-01, 6.72338177e-01, ...,
         7.24240177e-01, 2.93058729e-01, 1.10000000e+03],
        [2.00000000e+00, 7.45355992e-01, 6.93361274e-01, ...,
         4.34488699e-01, 1.93106089e-01, 7.63000000e+02],
        [3.00000000e+00, 6.84653197e-01, 6.39653743e-01, ...,
         7.45613369e-01, 3.42903660e-01, 1.15000000e+03]]),
 array([395, 377, 378, 376, 398, 396, 387, 375, 399, 374]))

## Fitting results check

In [None]:
import pandas as pd
import pickle, os
from src.bayesian_optimization import BayesianOptimization
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error


root = os.getcwd()
target_props = ['density']
data_file = 'Data/chen_comp_train.csv'
# data_file = 'Data/chen_comp_train_22r6.csv'
model_list = ['Ridge', 'Lasso', 'ElasticNet', 'DecisionTreeRegressor', 'RandomForest', 'SVR', 'MLPRegressor', 'GradientBoostingRegressor', 'ExtraTreesRegressor', 'XGBoost', 'LightGBM']
model_list = ['Lasso', 'ElasticNet']
candi_data_file = 'Data/chen_comp_train.csv'

BO = BayesianOptimization(data_file, target_props, model_list=model_list, acq_method='ucb', candidate_file=candi_data_file)#, select_region=[[2.200],[2.201]])
X_train, y_train = BO.io_manager.read_data(data_file, target_props=BO.target_props, feature_props=BO.feature_props, handle_null=True, drop_non_numeric=True)
# X_candi = io_manager.read_candidate_data(candi_data_file, target_props=BO.target_props, feature_props=BO.feature_props, drop_non_numeric=True)
# X_scaled, y_scaled, candidate_X_scaled = io_manager.standardize_data(X_train, y_train, X_candi)
X_candi, y_candi = BO.io_manager.read_data(candi_data_file, target_props=BO.target_props, feature_props=BO.feature_props, handle_null=True, drop_non_numeric=True)
X_scaled, y_scaled, candidate_X_scaled, candidate_y_scaled = BO.io_manager.standardize_data(X_train, y_train, X_candi, y_candi)
# candidate_X_scaled, candidate_y_scaled = io_manager.standardize_data(X_candi, y_candi)
# X_scaled, y_scaled = io_manager.standardize_data(X_train, y_train)

model_path = f'model_weights'
models = [i for i in os.listdir(f'{model_path}') if i.endswith('.pkl')]
# models = [i for i in os.listdir(f'{model_path}') if i.endswith('.pkl') and i.split('0')[0][:-1] in model_list]

for m in models:
    with open(f'{model_path}/{m}', 'rb') as f:
        model = pickle.load(f)
    if 'bootstrap' in m:
        bs_models = model['models']
        pred_y = np.mean([m.predict(candidate_X_scaled) for m in bs_models], axis=0)
    else:
        bs_models = model['stacking_model']
        pred_y = bs_models.predict(candidate_X_scaled)

    # print(pred_y.reshape(-1,1))
    # io_manager.scaler_y.fit(y_train)
    y_prediction = BO.io_manager.scaler_y.inverse_transform(pred_y.reshape(-1,1))

    # print(y_prediction)

    r2 = r2_score(y_prediction, y_candi)
    mse = mean_squared_error(y_prediction, y_candi)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(y_candi, y_prediction.reshape(-1))
    miny, maxy = min(np.min(y_prediction), np.min(y_candi)), max(np.max(y_prediction), np.max(y_candi))
    X_axis = np.linspace(miny, maxy, num=50)
    ax.plot(X_axis, X_axis, c='r')
    ax.set_title(f'Models {m}', size=20)
    ax.text(miny+0.05, maxy-0.1, s=f"MSE: {round(mse,3)}", fontsize=18, color='black')
    ax.text(miny+0.05, maxy-0.2, s=f"R2: {round(r2,3)}", fontsize=18, color='black')
    plt.savefig(f'{model_path}/{m[:-4]}.png', bbox_inches='tight', dpi=150)
    plt.show()
    
    csv_file = pd.read_csv(candi_data_file)
    csv_file['density'] = y_prediction
    csv_file.to_csv(f'{model_path}/{m[:-4]}.csv', index=False)
    

## Candidates selection

### Subset screening

In [None]:
import os
from src.bayesian_optimization import BayesianOptimization
from src.io import IOManager

root = os.getcwd()
# os.popen(f'rm -r {root}/model_weights')

target_props = ['density']
# data_file = 'Data/chen_comp_train_20.csv'
data_file = 'Data/chen_comp_train_maxr3.csv'
model_list = ['Ridge', 'Lasso', 'ElasticNet', 'DecisionTreeRegressor', 'RandomForest', 'SVR', 'MLPRegressor', 'GradientBoostingRegressor', 'ExtraTreesRegressor', 'XGBoost', 'LightGBM']#, 'GaussianProcess']

for count in range(1,21):
    candi_data_file = f'Chen_candidate/sep_candidate_{count}.csv'
    # candi_data_file = f'Chen_candidate/filtered_compositions_{count}.csv'
    
    BO = BayesianOptimization(data_file, target_props, model_list=model_list, acq_method='ucb', candidate_file=candi_data_file)#, select_region=[[2.200],[2.201]])
    # print(BO.model_list)
    BO.optimize(batch_size=10000, n_bootstrap_sample_nums=81, sampling_method='gaussian', num_candidate=1000, n_samples=1000, iterations=5, hpar=0.1, if_train=False)  
    os.popen(f'mv {root}/suggested_samples_original.csv {root}/Chen_candidate/suggested_samples_original_{count}.csv')

### merge screened out candidates set

In [None]:
import pandas as pd
import os

# trainset_path = 'Data/chen_comp_train_20.csv'
trainset_path = 'Data/chen_comp_train_maxr3.csv'
train_df = pd.read_csv(trainset_path)

feature_columns = train_df.columns.drop('density')
# feature_columns = train_df.columns

combined_test_df = pd.DataFrame()

testset_folder = 'Chen_candidate'

for file_name in os.listdir(testset_folder):
    if file_name.endswith('.csv') and file_name.startswith('suggested_samples_original_'):
        testset_path = os.path.join(testset_folder, file_name)
        
        test_df = pd.read_csv(testset_path)
        
        test_df.columns = feature_columns
        
        test_df['density'] = None
        
        combined_test_df = pd.concat([combined_test_df, test_df], ignore_index=True)

combined_test_df.to_csv(f'Chen_candidate/combined_testset.csv', index=False)

In [None]:
import os
from src.bayesian_optimization import BayesianOptimization
from src.io import IOManager

root = os.getcwd()

target_props = ['density']
# data_file = 'Data/chen_comp_train.csv'
data_file = 'Data/chen_comp_train_maxr3.csv'
model_list = ['Ridge', 'Lasso', 'ElasticNet', 'DecisionTreeRegressor', 'RandomForest', 'SVR', 'MLPRegressor', 'GradientBoostingRegressor', 'ExtraTreesRegressor', 'XGBoost', 'LightGBM']#, 'GaussianProcess']
candi_data_file = 'Chen_candidate/combined_testset.csv'
BO = BayesianOptimization(data_file, target_props, model_list=model_list, acq_method='ucb', candidate_file=candi_data_file)#, select_region=[[2.200],[2.201]])
BO.optimize(batch_size=200, n_bootstrap_sample_nums=81, sampling_method='gaussian', num_candidate=100, n_samples=1000, iterations=5, hpar=0.1, if_train=False)

testset_path = 'Chen_candidate/suggested_samples_final.csv'
os.popen(f'mv suggested_samples_original.csv {testset_path}')
os.popen(f'rm suggested_samples_indexes.csv suggested_samples.csv')

train_df = pd.read_csv(data_file)
feature_columns = train_df.columns.drop('density')
test_df = pd.read_csv(testset_path)
test_df.columns = feature_columns
test_df['density'] = None

test_df.to_csv(testset_path, index=False)

# Other side functions

### check model, plotting and predicting candidates

In [None]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error

scaler_method='standard'
io_manager = IOManager(method=scaler_method)
X_train, y_train = io_manager.read_data(data_file, target_props=BO.target_props, feature_props=BO.feature_props, handle_null=True, drop_non_numeric=True)
# X_candi = io_manager.read_candidate_data(candi_data_file, target_props=BO.target_props, feature_props=BO.feature_props, drop_non_numeric=True)
# X_scaled, y_scaled, candidate_X_scaled = io_manager.standardize_data(X_train, y_train, X_candi)
X_candi, y_candi = io_manager.read_data(candi_data_file, target_props=BO.target_props, feature_props=BO.feature_props, handle_null=True, drop_non_numeric=True)
X_scaled, y_scaled, candidate_X_scaled, candidate_y_scaled = io_manager.standardize_data(X_train, y_train, X_candi, y_candi)
# candidate_X_scaled, candidate_y_scaled = io_manager.standardize_data(X_candi, y_candi)
# X_scaled, y_scaled = io_manager.standardize_data(X_train, y_train)

model_path = f'model_weights'
models = [i for i in os.listdir(f'{model_path}') if i.endswith('.pkl')]

for m in models:
    with open(f'{model_path}/{m}', 'rb') as f:
        model = pickle.load(f)
    if 'bootstrap' in m:
        bs_models = model['models']
        pred_y = np.mean([m.predict(candidate_X_scaled) for m in bs_models], axis=0)
    else:
        bs_models = model['stacking_model']
        pred_y = bs_models.predict(candidate_X_scaled)

    # print(pred_y.reshape(-1,1))
    # io_manager.scaler_y.fit(y_train)
    y_prediction = io_manager.scaler_y.inverse_transform(pred_y.reshape(-1,1))

    print(y_prediction)

    r2 = r2_score(y_prediction, y_candi)
    mse = mean_squared_error(y_prediction, y_candi)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(y_candi, y_prediction.reshape(-1))
    miny, maxy = min(np.min(y_prediction), np.min(y_candi)), max(np.max(y_prediction), np.max(y_candi))
    X_axis = np.linspace(miny, maxy, num=50)
    ax.plot(X_axis, X_axis, c='r')
    ax.set_title(f'Models {m}', size=20)
    ax.text(miny+0.05, maxy-0.1, s=f"MSE: {round(mse,3)}", fontsize=18, color='black')
    ax.text(miny+0.05, maxy-0.2, s=f"R2: {round(r2,3)}", fontsize=18, color='black')
    plt.savefig(f'{model_path}/{m[:-4]}.png', bbox_inches='tight', dpi=150)
    plt.show()
    
    csv_file = pd.read_csv(candi_data_file)
    csv_file['density'] = y_prediction
    csv_file.to_csv(f'{model_path}/{m[:-4]}.csv', index=False)
    
    
#     fig, ax = plt.subplots(figsize=(10, 10))
#     ax.scatter(pred_y, y_scaled.reshape(-1))
#     ax.set_title(f'Models {m}', size=20)
#     ax.set_xlim(-3, 3)
#     ax.set_ylim(-3, 3)
#     plt.savefig(f'{root}/{data_path}/{m[:-4]}.png', bbox_inches='tight', dpi=150)
#     plt.show()  

### select specific number of element

In [None]:
import pandas as pd
import re

for num in range(1,21):
    df = pd.read_csv(f'Chen_candidate/sep_candidate_{num}.csv')
    
    element_pattern = re.compile(r'([A-Z][a-z]?)(\d*\.?\d*)')
    
    def get_elements(composition):
        elements = element_pattern.findall(composition)
        element_dict = {elem: float(count) if count else 1.0 for elem, count in elements}
        return element_dict
    
    def filter_compositions(row):
        elements = get_elements(row['composition'])
        elements_filtered = {k: v for k, v in elements.items() if k not in 'Cl'}
        if len(elements_filtered) == 5:
            if 'Zr' not in elements_filtered.keys() and 'Zn' not in elements_filtered.keys():
                return True
            else:
                return False
        else:
            return False
    
    df_filtered = df[df.apply(filter_compositions, axis=1)]
    df_filtered.to_csv(f'Chen_candidate/filtered_5_compositions_{num}.csv', index=False)
    
    print(f"Finish filtering，saveing results to filtered_compositions_{num}.csv")

### position transition

In [None]:
import re
import numpy as np
import pandas as pd

element_to_cl_ratio = {
    'Li': 1, 'Na': 1, 'K': 1, 'Rb': 1, 'Cs': 1,
    'Mg': 2, 'Ca': 2, 'Sr': 2, 'Ba': 2, 'Zn': 2,
    'Zr': 4
}

element_pattern = re.compile(r'([A-Z][a-z]?)(\d*\.?\d*)')

def get_elements(composition):
    elements = element_pattern.findall(composition)
    element_dict = {elem: float(count) if count else 1.0 for elem, count in elements}
    return element_dict

def round_to_grid(value, grid=0.05):
    return round(value / grid) * grid

def normalize_composition(composition, grid=0.05):
    elements = get_elements(composition)
    
    non_cl_elements = {k: v for k, v in elements.items() if k != 'Cl'}
    total_non_cl = sum(non_cl_elements.values())

    normalized_elements = {k: v / total_non_cl for k, v in non_cl_elements.items()}

    total_cl = sum(normalized_elements[k] * element_to_cl_ratio[k] for k in normalized_elements)

    normalized_elements = {k: round_to_grid(v, grid) for k, v in normalized_elements.items()}
    total_cl_rounded = round_to_grid(total_cl, grid)
    
    # normalized_composition = ''.join([f"{k}{v:.2f}" for k, v in normalized_elements.items()])
    # normalized_composition += f"Cl{total_cl_rounded:.2f}"

    normalized_composition = ''.join([f"{k}{round(v, 2)}" for k, v in normalized_elements.items()])
    normalized_composition += f"Cl{round(total_cl_rounded, 3)}"
    
    return normalized_composition

### Tests
file = 'Dataprocessing/collection_4.csv'
data = pd.read_csv(file)
compositions = data['composition']
normalized_compositions = [normalize_composition(composition) for composition in compositions]
data['norm_composition'] = normalized_compositions
data.to_csv(file, index=False)