In [5]:
import pickle
import numpy as np

# Load the training records, check their "angles" field, and add a per-zeolite
# standardized HOA value ('norm_hoa') to every record.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open('combined_train.pickle', 'rb') as file:
    crystal_dict_list = pickle.load(file)

# The file is only needed for loading, so everything below runs after it is closed.
# Assert that each record in the list has an "angles" property with three entries
assert all('angles' in record for record in crystal_dict_list)
assert all(len(record['angles']) == 3 for record in crystal_dict_list)

# Extract HOA and zeolite codes as parallel arrays (same order as the records)
hoa = np.array([entry['hoa'] for entry in crystal_dict_list])
zeo_code = np.array([entry['zeolite_code'] for entry in crystal_dict_list])

# Find unique zeolite codes (np.unique returns them sorted)
unique_zeo_codes = np.unique(zeo_code)

# Find the mean and standard deviation of HOA for each zeolite type
# (np.std defaults to ddof=0, i.e. the population standard deviation)
mean_hoa_per_zeo_code = {}
std_hoa_per_zeo_code = {}
for code in unique_zeo_codes:
    # Select this code's HOA values with a boolean mask over the parallel
    # arrays instead of rescanning the whole record list for every code
    hoa_values_for_code = hoa[zeo_code == code]
    mean_hoa_per_zeo_code[code] = np.mean(hoa_values_for_code)
    std_hoa_per_zeo_code[code] = np.std(hoa_values_for_code)

    print(code, mean_hoa_per_zeo_code[code], std_hoa_per_zeo_code[code])


# Add normalized HOA: (hoa - mean) / std within the record's own zeolite code
for entry in crystal_dict_list:
    mean_hoa = mean_hoa_per_zeo_code[entry['zeolite_code']]
    std_hoa = std_hoa_per_zeo_code[entry['zeolite_code']]
    if std_hoa == 0:
        # Every HOA value for this code equals the mean (e.g. a single sample),
        # so the plain formula would be 0/0 and store NaN; the deviation is 0.
        entry['norm_hoa'] = np.float64(0.0)
    else:
        entry['norm_hoa'] = (entry['hoa'] - mean_hoa) / std_hoa

BEC 36.34289794238683 5.210157699183983
CHA 35.303938008130075 4.343490673437941
DDRch1 37.183385534591196 4.656251908625993
DDRch2 37.26874822006472 5.036733631742368
ERI 36.76764695121951 4.309114916361526
FAU 34.0741245416079 7.580728704024696
FAUch 33.30132694736842 4.5424740464003985
FER 45.683843089430894 3.893401202750922
HEU 42.717769715447155 7.357330367124597
ITW 44.31117683544304 5.289644607023737
LTA 33.09635591182364 3.622782797880845
LTL 32.04783272357724 2.7392199632077143
MEL 42.499840151515144 8.887492609181827
MELch 43.91289302325581 6.875302079594195
MER 38.1302890946502 4.386876942672151
MFI 42.736209705215416 5.5731885347370405
MOR 37.64077196666667 4.403738000748621
MTW 44.40935183673469 6.198169121189254
NAT 45.10882409638554 3.454827055421614
RHO 29.490237241379308 2.9173342488517533
TON 43.63866666666666 8.046479578714326
TON2 43.22047142857143 6.547443840148456
TON3 43.48229890710383 7.410828179503221
TON4 44.19614354838709 7.850420383209014
TONch 49.551264935

In [6]:
import pickle

# Read the training records from disk; the file handle is released as soon as
# the pickle has been loaded.
with open('combined_train.pickle', 'rb') as file:
    data = pickle.load(file)

# Map each zeolite code to the 'lengths' of the last record seen with that
# code. Keys keep the order in which each code first appears in the data.
unique_zeolite_codes = {}
for record in data:
    unique_zeolite_codes[record["zeolite_code"]] = record['lengths']

# Report how many distinct codes were found, show the code -> lengths table,
# then report the count once more.
print(len(unique_zeolite_codes))
print(unique_zeolite_codes)
print(len(unique_zeolite_codes))

# Give every code a consecutive integer index following the dict's key order.
unique_zeolite_codes_mapping = dict(
    zip(unique_zeolite_codes, range(len(unique_zeolite_codes)))
)
print(unique_zeolite_codes_mapping)

26
{'DDRch1': [13.795, 13.795, 40.75], 'DDRch2': [13.795, 13.795, 40.75], 'FAU': [24.345, 24.345, 24.345], 'FAUch': [24.345, 24.345, 24.345], 'ITW': [10.45, 8.954, 8.954], 'MEL': [20.27, 20.27, 13.459], 'MELch': [20.27, 20.27, 13.459], 'MFI': [20.09, 19.738, 13.142], 'MOR': [18.256, 20.534, 7.542], 'RHO': [15.031, 15.031, 15.031], 'TON': [14.1, 17.84, 5.25], 'TON2': [14.105, 17.842, 5.256], 'TON3': [14.105, 17.842, 5.256], 'TON4': [14.105, 17.842, 5.256], 'TONch': [14.105, 17.842, 5.256], 'BEC': [12.77, 12.77, 12.977], 'CHA': [13.675, 13.675, 14.767], 'ERI': [13.054, 13.054, 15.175], 'FER': [19.018, 14.303, 7.541], 'HEU': [17.523, 17.644, 7.401], 'LTA': [11.919, 11.919, 11.919], 'LTL': [18.126, 18.126, 7.567], 'MER': [14.012, 14.012, 9.954], 'MTW': [25.552, 5.256, 12.117], 'NAT': [13.85, 13.85, 6.42], 'YFI': [18.181, 31.841, 12.641]}
26
{'DDRch1': 0, 'DDRch2': 1, 'FAU': 2, 'FAUch': 3, 'ITW': 4, 'MEL': 5, 'MELch': 6, 'MFI': 7, 'MOR': 8, 'RHO': 9, 'TON': 10, 'TON2': 11, 'TON3': 12, 'TON4