In [1]:
%reset -f
%reload_ext autoreload
%autoreload 2

import os
import sys
import importlib
import numpy as np
import pandas as pd

# Project root is hard-coded to a local checkout (it is not computed dynamically);
# edit ROOT_DIR when running this notebook on another machine.
ROOT_DIR = r'C:\Users\edens\Documents\GitHub\LabCode\MolFeatures'
sys.path.append(ROOT_DIR)
os.chdir(ROOT_DIR)
# These two entries are relative paths, so they only resolve while the working
# directory is ROOT_DIR (or a sibling reached through the chdir calls below).
sys.path.append('M3_modeler')
sys.path.append('M2_data_extractor')
# Remove old module from cache
sys.modules.pop('data_extractor', None)
# Import the project modules. The working directory is moved into each package
# folder before importing from it; no explicit importlib.reload is issued here.
try:
    os.chdir('M2_data_extractor')
    from data_extractor import Molecules, extract_connectivity
    from feather_extractor import logs_to_feather
    import data_extractor
    import feather_extractor
    import gaussian_handler
    from gaussian_handler import feather_file_handler
    # 'utils' is not appended to sys.path above, so these imports presumably rely
    # on the current directory being importable - TODO confirm.
    os.chdir('../utils')
    from help_functions import dict_to_horizontal_df, get_df_from_file
    from visualize import show_single_molecule
    import help_functions
    os.chdir('../M3_modeler')
    from modeling import (
        ClassificationModel,
        LinearRegressionModel,
        fit_and_evaluate_single_combination_regression
    )
    import plot
    import modeling


    
    
    
# Only ModuleNotFoundError is handled: the message is printed and execution
# continues, so every name imported after the failing line stays undefined and
# the working directory is left wherever the last successful chdir put it.
except ModuleNotFoundError as e:
    print(f"Error: {e}")
    print("Make sure the module is in the correct directory and the Python environment is set up properly.")

# Ensure Pandas displays all columns
pd.set_option('display.max_columns', None)

# Define paths
# Optional conversion step: when log_files_path is set to a folder, logs_to_feather
# is run on it; with the default None the step is skipped.
log_files_path = None  # Example: r'C:\Users\edens\Documents\GitHub\LabCode\MolFeatures\Blackwell_logs'
if log_files_path:
    logs_to_feather(log_files_path)

# Folder holding the .feather files to load. The commented-out assignments are
# alternative datasets; exactly one feather_path should be active.
feather_path = r'C:\Users\edens\Documents\GitHub\lucas_project\new_logs_hirshfeld\new_feather'
# feather_path=r'C:\Users\edens\Documents\GitHub\lucas_project\Secondary_Sphere\feather_benzal'
# feather_path=r'C:\Users\edens\Documents\GitHub\lucas_project\Doyle2021\logfiles\feather_files'
# Changes the process-wide working directory; relative paths used afterwards
# resolve against feather_path until another cell calls os.chdir.
os.chdir(feather_path)

# Load molecular data
# `mols` is the notebook-wide Molecules collection used by the feature cells below.
mols = data_extractor.Molecules(feather_path)

# Define dictionary of answers
# Keys name a feature family; values are space-separated groups of comma-separated
# atom indices. NOTE(review): this dict is not passed on in this notebook (the
# feature-building call uses answers_dict=None), and some entries differ from
# list_answers below (e.g. "Bond length", "Bond Angle") - confirm which is current.
answers_dict = {
    "Ring Vibration atoms": "9",
    "Strech Vibration atoms": "1,2 4,5",
    "Bending Vibration atoms": "22,23",
    "Dipole atoms": "11,6 4 10,11,12 11,12 5 4,5,1 4,3,2",
    "NPA manipulation atoms": "11,6 4 10,11,12 11,12,5 4,5,1 4,3,2",
    "Sub-Atoms": "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25",
    "Charge values": "1,3,5,11,8,12,14",
    "Charge difference": "11,5 3,5 11,12 12,5",
    "Sterimol atoms": "4,6 6,4 4,3 11,6",
    "Bond length": "11,12 6,4 12,13 12,5 12,14 4,3",
    "Bond Angle": "11,6 4,6 4,5 1,5,4,3 11,6,4,5"
}

# Positional answers consumed by mols.get_molecules_comp_set_app(answers_list_load=...).
# The meaning of each position is defined by that method; the per-line notes below
# are inferred from the column names of the resulting frame - TODO confirm.
list_answers = [
    # [22, 23] shows up as the *_Bending_22-23 columns; the meaning of [9,8],
    # [1600], [] and [1200] is not visible in this notebook.
    [9,8], [1600], [], [1200], [22, 23],
    # Atom triples that appear in the dip_*_NPA_<a>-<b>-<c> columns.
    [[11, 6, 4], [10, 11, 12], [11, 12, 5], [4, 5, 1], [4, 3, 2]],
    # Same atom list as the "Sub-Atoms" entry of answers_dict.
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25],
    # Atom triples that appear in the dipole_*_<a>-<b>-<c> columns.
    [[5,1,25], [10, 11, 12], [11, 12, 5]],
    # Atoms that appear in the nbo_atom_*, hirshfeld_atom_* and cm5_atom_* columns.
    [1, 3, 5, 11, 8, 12, 14],
    # Same pairs as the "Charge difference" entry of answers_dict.
    [[11, 5], [3, 5], [11, 12], [12, 5]],
    # Atom pairs that appear in the B1_/B5_/L_ (sterimol) columns.
    [[4, 6], [6, 4], [4, 3], [11, 6]],
    # Atom pairs that appear in the bond_length_<a>-<b> columns.
    [[11, 12],[3,11],[3,12],[5,11], [12, 14],[4,12]],
    # Three-atom entries appear as angle_[...] columns, four-atom entries as dihedral_[...].
    [[11, 6, 4], [6, 4, 5], [1, 5, 4, 3], [11, 6, 4, 5]]

]


Molecules Loaded: ['LS1621.feather', 'LS1622.feather', 'LS1630.feather', 'LS1641.feather', 'LS1642.feather', 'LS1643.feather', 'LS1661.feather', 'LS1663.feather', 'LS1685.feather', 'LS1687.feather', 'LS1688.feather', 'LS1689.feather', 'LS1690.feather', 'LS1695.feather', 'LS1696.feather', 'LS1714.feather', 'LS1715.feather', 'LS1716.feather', 'LS1717.feather', 'LS1721.feather', 'LS1723.feather', 'LS1726.feather', 'LS1734.feather', 'LS1736.feather', 'LS2000.feather', 'LS2001.feather', 'LS2002.feather', 'LS2003.feather', 'LS2004.feather', 'LS2005.feather', 'LS2006.feather', 'LS2007.feather', 'LS2008.feather'] Failed Molecules: []


In [3]:
# Sterimol values for four atom pairs (the same pairs as the sterimol entry of
# list_answers), requested with radii='CPK'.
steric=mols.get_sterimol_dict([[4, 6], [6, 4], [4, 3], [11, 6]],radii='CPK')

In [3]:
# Convert the sterimol result with dict_to_horizontal_df (presumably into a wide DataFrame).
# NOTE(review): `steric` is rebound to the converted object, so running this cell
# a second time feeds the converted object back in - consider a separate name.
steric=dict_to_horizontal_df(steric)

In [2]:
# Build the per-molecule feature table from the positional list_answers;
# answers_dict is explicitly not used (None is passed).
df=mols.get_molecules_comp_set_app(answers_dict=None,answers_list_load=list_answers)
# Bare last expression so the notebook renders the table.
df

None
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene ring.
Benzene ring found: [9, 8, 7, 6, 11, 10]
Second atom is in the benzene 

Unnamed: 0,cross,cross_angle,para,para_angle,Frequency_Bending_22-23,Cross_mag_Bending_22-23,dip_x_NPA_11-6-4,dip_y_NPA_11-6-4,dip_z_NPA_11-6-4,total_dipole_NPA_11-6-4,dip_x_NPA_10-11-12,dip_y_NPA_10-11-12,dip_z_NPA_10-11-12,total_dipole_NPA_10-11-12,dip_x_NPA_11-12-5,dip_y_NPA_11-12-5,dip_z_NPA_11-12-5,total_dipole_NPA_11-12-5,dip_x_NPA_4-5-1,dip_y_NPA_4-5-1,dip_z_NPA_4-5-1,total_dipole_NPA_4-5-1,dip_x_NPA_4-3-2,dip_y_NPA_4-3-2,dip_z_NPA_4-3-2,total_dipole_NPA_4-3-2,dipole_x_5-1-25,dipole_y_5-1-25,dipole_z_5-1-25,total_dipole_5-1-25,dipole_x_10-11-12,dipole_y_10-11-12,dipole_z_10-11-12,total_dipole_10-11-12,dipole_x_11-12-5,dipole_y_11-12-5,dipole_z_11-12-5,total_dipole_11-12-5,nbo_atom_1,nbo_atom_3,nbo_atom_5,nbo_atom_11,nbo_atom_8,nbo_atom_12,nbo_atom_14,hirshfeld_atom_1,hirshfeld_atom_3,hirshfeld_atom_5,hirshfeld_atom_11,hirshfeld_atom_8,hirshfeld_atom_12,hirshfeld_atom_14,cm5_atom_1,cm5_atom_3,cm5_atom_5,cm5_atom_11,cm5_atom_8,cm5_atom_12,cm5_atom_14,B1_4-6,B5_4-6,L_4-6,loc_B5_4-6,B1_B5_angle_4-6,B1_6-4,B5_6-4,L_6-4,loc_B5_6-4,B1_B5_angle_6-4,B1_4-3,B5_4-3,L_4-3,loc_B5_4-3,B1_B5_angle_4-3,B1_11-6,B5_11-6,L_11-6,loc_B5_11-6,B1_B5_angle_11-6,bond_length_11-12,bond_length_3-11,bond_length_3-12,bond_length_5-11,bond_length_12-14,bond_length_4-12,"angle_[11, 6, 4]","angle_[6, 4, 5]","dihedral_[1, 5, 4, 3]","dihedral_[11, 6, 4, 5]"
LS1621,1674.6111,39.227103,1652.6065,53.790361,3176.5085,0.341497,1.160731,4.299914,-0.409019,4.472567,-3.054492,3.064794,-0.171324,4.330385,4.325054,-0.650147,-0.077242,4.374328,4.430421,-1.231978,0.38239,4.614393,0.915549,4.46103,-0.808884,4.62529,-1.863747,-1.024725,-10.519571,10.7325,-10.180303,-3.377346,-0.374115,10.7325,3.3602,-10.191187,0.183729,10.7325,-0.08843,-0.47523,-0.44749,-0.37993,-0.15272,0.38106,-0.70421,0.025547,-0.131285,-0.108202,-0.053677,-0.004202,0.164793,-0.144169,0.022678,-0.207793,-0.353321,-0.314159,-0.057741,0.397356,-0.340934,1.6998,3.2852,6.2555,3.7923,91.0482,1.9316,4.859,7.193,4.5669,119.2581,1.6991,6.144,3.5204,1.3677,144.753,1.9574,7.3621,5.4352,3.2715,119.8702,2.178381,3.594038,4.262064,2.745039,1.973706,2.963223,115.22588,123.183505,3.866926,4.982389
LS1622,1688.819,7.836688,1660.1501,84.348787,1530.3972,0.359935,0.272959,1.839751,-0.989847,2.106891,-3.199226,1.077458,-0.910373,3.496389,2.476554,-1.778912,0.796034,3.151431,0.554543,-0.58057,1.047279,1.319611,-0.380821,0.612844,-0.99482,1.22893,-1.79245,1.775714,-9.693686,10.0166,-9.997078,-0.459818,-0.424948,10.0166,5.574201,-8.321639,0.110408,10.0166,-0.08915,-0.47427,-0.44016,-0.37463,-0.12868,0.37809,-0.70028,0.026314,-0.130423,-0.105224,-0.048963,0.000883,0.16509,-0.141094,0.023349,-0.206961,-0.349665,-0.308815,-0.048492,0.396865,-0.337926,2.0629,3.2858,7.6548,3.7996,109.0228,1.9312,4.8565,7.1931,4.5734,119.1972,1.6991,6.1445,3.5161,1.3591,144.8043,2.3398,7.3589,5.4389,3.28,111.8759,2.181839,3.594216,4.268282,2.74547,1.973025,2.969747,115.242874,123.122975,3.817904,5.056583
LS1630,1643.0843,30.852374,1678.5785,60.148641,1531.9099,0.376626,2.667708,4.898217,-0.305293,5.58591,-2.316098,4.650567,-0.077794,5.195972,5.269708,0.872177,-0.204786,5.345321,6.159097,-0.30225,0.142644,6.168158,2.549087,5.582991,-0.915997,6.205375,-2.42366,-0.080908,-11.988257,12.2311,-11.86804,-2.951194,-0.197577,12.2311,4.792779,-11.252821,-0.047437,12.2311,-0.08857,-0.47873,-0.44479,-0.40645,0.35094,0.38062,-0.70331,0.025554,-0.132824,-0.107156,-0.068794,0.101392,0.163014,-0.144128,0.022714,-0.209683,-0.351706,-0.328876,0.121196,0.394774,-0.340945,1.7537,5.1704,6.2538,4.6347,93.1665,1.9315,4.8603,7.1936,4.564,119.3033,1.6991,6.1418,3.5222,1.3708,144.6567,2.0843,7.3651,5.4272,3.2544,116.8778,2.177188,3.59168,4.267622,2.742675,1.972927,2.966716,115.180219,123.174901,3.88654,4.792056
LS1641,1647.4172,29.770644,1680.0379,59.709925,3173.2922,0.364401,2.046429,1.754159,-2.318574,3.555381,0.902531,2.610897,-2.381469,3.647292,2.192409,2.223128,1.961964,3.687582,4.096519,0.93648,2.318399,4.799316,2.357352,3.199425,-2.842995,4.886313,-0.358293,-4.037364,-10.465565,11.2231,-9.141267,-6.508608,-0.17892,11.2231,1.142262,-11.164756,-0.013786,11.2231,-0.09182,-0.47461,-0.45328,-0.42328,0.35712,0.38659,-0.70487,0.024283,-0.130683,-0.107141,-0.086512,0.099119,0.172918,-0.147755,0.021639,-0.207058,-0.359136,-0.32589,0.118888,0.396393,-0.344853,1.6998,4.3976,7.1148,3.3635,105.1094,1.9157,4.8849,7.1951,4.5619,119.695,1.6991,6.1833,3.5017,1.3199,132.8759,1.9926,7.4122,5.4308,3.2139,117.3367,2.263051,3.60935,4.191478,2.750317,1.980412,2.925409,115.488508,122.873644,2.315135,5.944809
LS1642,1642.4555,4.440006,1667.7782,76.476774,1532.7426,0.394142,1.766727,2.657969,-1.834042,3.68101,-1.407753,2.665817,-1.754589,3.488113,3.318882,0.592192,1.159044,3.564976,3.561186,0.278986,1.811652,4.005242,1.645234,2.961473,-2.178823,4.027951,-0.026167,-2.789254,-9.003339,9.4256,-8.055489,-4.859899,-0.575514,9.4256,1.532936,-9.295079,0.303879,9.4256,-0.09158,-0.47581,-0.45509,-0.39992,-0.14776,0.38112,-0.6862,0.02429,-0.132222,-0.108067,-0.063147,-0.007035,0.178655,-0.145062,0.021741,-0.208504,-0.35879,-0.303767,-0.060546,0.40131,-0.341222,2.2087,8.2551,7.7595,5.1398,120.4247,1.9,4.8947,7.1913,4.5492,119.8334,1.6991,6.1852,3.499,1.3094,132.7625,2.9663,7.4027,5.43,3.2233,97.6683,2.268091,3.603733,4.204376,2.748761,1.978709,2.936209,115.501921,122.927556,2.373212,6.613158
LS1643,1636.0736,4.780466,1668.2129,76.343265,1533.2747,0.40249,2.09096,2.607707,-1.715628,3.757078,-1.30356,2.957542,-1.527703,3.574941,3.408,0.864,1.003267,3.656159,3.719134,0.503508,1.732801,4.133773,1.974965,3.025479,-2.057148,4.157628,-0.511415,-4.388,-8.653471,9.7159,-7.446441,-6.239694,0.12398,9.7159,0.007991,-9.715069,-0.126561,9.7159,-0.09197,-0.47519,-0.45436,-0.3989,-0.14709,0.37978,-0.68904,0.024157,-0.132262,-0.109256,-0.071207,-0.00784,0.175047,-0.146515,0.021659,-0.208344,-0.359811,-0.308898,-0.061287,0.397066,-0.342716,1.762,8.7601,8.7068,7.0913,99.1564,1.9467,4.8443,7.2284,4.6589,118.6067,1.6991,6.2148,3.4768,1.2793,130.3102,2.0781,7.4734,5.3958,3.1146,112.8403,2.274812,3.603294,4.18414,2.748798,1.979472,2.922278,115.45701,123.038129,1.443926,4.006796
LS1661,1653.0726,10.340303,1674.3443,76.471735,1533.2702,0.389691,0.68856,0.769306,-2.446064,2.655029,0.594216,0.874514,-2.388702,2.612234,1.008261,0.893624,2.191419,2.572445,2.128386,0.346092,2.490508,3.294303,0.919699,1.71192,-2.738471,3.357936,-0.277978,-2.424097,-9.687418,9.9899,-8.82948,-4.660438,-0.346673,9.9899,2.229749,-9.73787,-0.041068,9.9899,-0.09186,-0.47521,-0.45347,-0.39542,-0.14568,0.38581,-0.70316,0.024401,-0.131333,-0.107481,-0.071238,-0.006699,0.174022,-0.146607,0.021757,-0.207715,-0.359614,-0.310404,-0.060201,0.397235,-0.343752,1.6997,4.4108,6.2323,3.3432,102.0138,1.9155,4.8802,7.1982,4.5721,119.5792,1.6991,6.1863,3.499,1.3159,132.6797,1.8882,7.4077,5.436,3.2334,119.3167,2.268127,3.605632,4.187883,2.752791,1.980426,2.922057,115.626706,122.981755,2.208761,6.127085
LS1663,1652.2832,3.849523,1662.9231,89.452627,1536.3359,0.381549,0.99523,0.966063,-1.016553,1.719634,-0.063841,1.400754,-1.031241,1.740587,1.178196,0.843423,1.012033,1.767405,1.511421,0.340994,0.997034,1.842484,0.877222,1.173235,-1.133822,1.852444,-3.514183,-2.974375,-8.474539,9.6444,-8.50809,-4.437991,-0.964796,9.6444,2.185315,-9.348054,0.923297,9.6444,-0.09251,-0.47126,-0.47939,-0.39662,-0.15212,0.21677,-0.85877,0.016741,-0.131975,-0.11306,-0.101082,-0.012914,0.071966,-0.190632,0.015263,-0.206961,-0.373508,-0.310716,-0.066306,0.250657,-0.382677,1.9246,5.5964,7.0254,4.0044,118.5855,2.0946,5.6026,7.3048,3.6952,90.5267,1.6991,6.6032,3.4923,-1.5099,110.6668,2.3022,7.8022,5.3326,0.0168,94.0327,2.664355,3.594834,4.333581,2.727009,2.013282,3.092688,114.729343,122.990867,1.471326,3.01769
LS1685,1693.494,20.903271,1652.0781,71.928016,1526.2269,0.331147,-2.589397,1.267166,-2.289069,3.681104,-2.224748,-1.55546,-2.696773,3.826427,0.899347,-2.694656,2.537138,3.808814,3.088414,-0.908456,1.692423,3.637017,0.099547,1.98965,-3.088254,3.675042,-7.155545,5.593327,4.072631,9.9535,-9.556989,0.697817,-2.692667,9.9535,6.633991,-7.408098,0.428594,9.9535,-0.0372,-0.43155,-0.60912,-0.40897,-0.11941,0.36686,-0.70339,0.023801,-0.074823,-0.087509,-0.05587,-0.000818,0.170249,-0.142555,0.019028,-0.328432,-0.480994,-0.325162,-0.050049,0.406323,-0.339065,2.1128,3.2527,7.6442,3.8971,71.3105,1.8033,8.6841,6.6189,4.8275,114.5387,1.7088,6.8776,7.275,5.9483,103.402,4.1122,7.4074,10.6909,2.3121,165.9953,2.141072,3.658606,4.330893,2.780686,1.975565,2.990188,116.785473,110.294901,30.293878,39.324441
LS1687,1675.3979,32.983838,1643.1653,54.573665,3156.6866,0.274509,2.45013,4.770688,-0.252738,5.36903,-1.74283,4.347877,-0.994602,4.788604,4.884433,0.908996,0.473461,4.990804,6.069935,-0.34349,1.220177,6.200881,2.453067,5.7511,-0.022528,6.252455,-11.399283,-1.991069,-6.751301,13.3973,-12.420068,-5.018041,0.221882,13.3973,2.833491,-13.081945,-0.567547,13.3973,-0.06062,-0.44304,-0.42439,-0.41659,0.34914,0.38972,-0.69268,0.024342,-0.061243,-0.111462,-0.072595,0.09973,0.166376,-0.138327,0.023777,-0.303697,-0.336342,-0.333751,0.119338,0.388345,-0.335498,1.7413,4.3087,7.158,5.9734,94.5633,3.039,8.1954,7.6051,2.2617,93.4312,1.8334,6.5018,6.7036,4.2812,108.5665,3.0604,7.6856,8.3374,3.1737,87.9733,2.172981,3.672568,4.38517,2.771226,1.974005,3.036094,115.897583,122.293488,6.322174,14.179659


In [None]:
# Split the feature table into two row subsets, selected by position.
# The two index lists are disjoint and together cover positions 0 through 32.
first_set_indices = [*range(0, 8), *range(15, 28), *range(30, 33)]
second_set_indices = [*range(8, 15), 28, 29]

first_df = df.iloc[first_set_indices]
second_df = df.iloc[second_set_indices]

# Preview the first rows of both subsets.
print(first_df.head(), second_df.head())

In [2]:
# Switch to the folder that holds combined_features1.csv (read by the modelling cell).
os.chdir(r'C:\Users\edens\Documents\GitHub\lucas_project\new_update_logs')
# NOTE(review): the export below is disabled and `combined_df` is not defined in
# any visible cell - confirm how combined_features1.csv is produced.
# combined_df.to_csv('combined_features1.csv', index=False)

In [4]:
# Feature table used for the regression model.
csv_path=r'C:\Users\edens\Documents\GitHub\lucas_project\new_update_logs\combined_features1.csv'


# Only the features path is filled in; with process_method='one csv' the target is
# presumably read from the same file (column named by output_name) - TODO confirm.
csv_filepaths = {
'features_csv_filepath': csv_path,
'target_csv_filepath': ''
}

# ('B1_B5_angle_4-3', 'L_11-6', 'bond_length_11-12')
# leave_out=[15,16] holds two samples out; the saved run log reports X shape (31, 4).
Regression_model = LinearRegressionModel(csv_filepaths, process_method='one csv', output_name='output', leave_out=[15,16], min_features_num=2, max_features_num=None, metrics=None, return_coefficients=False)
# NOTE(review): features_combination is only referenced by commented-out code in this cell.
features_combination=['B1_B5_angle_4-3', 'L_11-6', 'bond_length_11-12']
features_combination_2=['dipole_z_11-12-5', 'nbo_atom_14', 'bond_length_11-12', 'bond_length_12-14']
# ('B1_B5_angle_4-3', 'L_11-6', 'bond_length_11-12')
# Fit the four-feature combination and show its Q2 scatter plot.
plot.generate_and_display_q2_scatter_plot(Regression_model,features_combination_2)
# Disabled alternative: evaluate the three-feature combination directly.
# fit_and_evaluate_single_combination_regression(Regression_model,['B1_B5_angle_4-3', 'L_11-6', 'bond_length_11-12'],r2_threshold=0,bool_parallel=False)
# Other feature combinations kept for reference:
# ('Frequency_Bending_22-23', 'angle_[11, 6, 4].1', 'bite_angle_inverted', 'L_[11, 6]')
# ('para', 'angle_[11, 6, 4].1', 'bite_angle_inverted', 'L_[11, 6]')
# ('cross_angle', 'angle_[11, 6, 4].1', 'bite_angle_inverted', 'L_[11, 6]')


Creating table at location: results_combined_features1.db
Table has been created successfully at location: results_combined_features1.db 
Created flag: True
linear model selected
leave_out: [15, 16]
Starting generate_and_display_q2_scatter_plot...
Extracting features from model.features_df...
Extraction complete. X shape: (31, 4), y shape: (31,)
Fitting the model...
Model fitted successfully.
Generating predictions with covariance calculation...
Predictions generated. pred shape: (31,)
Retrieving coefficient estimates...
Coefficient estimates retrieved:
                   Estimate  Std. Error    t value       p value
(Intercept)        1.061225    0.070488  15.055424  2.375877e-14
dipole_z_11-12-5  -0.222085    0.071449  -3.108315  4.517450e-03
nbo_atom_14        1.447149    0.180271   8.027624  1.658101e-08
bond_length_11-12 -1.683193    0.384669  -4.375686  1.745721e-04
bond_length_12-14  3.258770    0.478050   6.816798  3.098546e-07
Calculating cross-validation metrics for 3-fold CV

In [None]:
# Combination search on a different feature table; csv_path is relative to the
# directory selected by the os.chdir call on the next line.
os.chdir(r'C:\Users\edens\Documents\GitHub\smiles_data')
csv_path=r'xyz_fingerprints.csv'


csv_filepaths = {
'features_csv_filepath': csv_path,
'target_csv_filepath': ''
}

# min_features_num and max_features_num are both 4, which presumably restricts the
# search to four-feature combinations - TODO confirm in LinearRegressionModel.
Regression_model = LinearRegressionModel(csv_filepaths, process_method='one csv', output_name='output', leave_out=None, min_features_num=4, max_features_num=4, metrics=None, return_coefficients=False)
# NOTE(review): the saved output of this cell mentions results_sterimol_data.db/.csv
# rather than xyz_fingerprints, so it appears to come from a run with another
# csv_path; that run ended with PermissionError while writing results_sterimol_data.csv.
results=Regression_model.fit_and_evaluate_combinations(top_n=None)

Creating table at location: results_sterimol_data.db
Table has been created successfully at location: results_sterimol_data.db 
Created flag: True
linear model selected
leave_out: None
Using 1 jobs for evaluation. found 20 cores


Calculating combos with threshold 0.85 (single-core): 100%|██████████| 46904/46904 [14:39<00:00, 53.33it/s]


All Q2 values are -inf, recalculating with a new R2 threshold...
new threshold 0.2538525482436125


Calculating combos with threshold 0.2538525482436125 (single-core):   6%|▌         | 2875/46904 [00:57<14:44, 49.76it/s]


PermissionError: [Errno 13] Permission denied: 'results_sterimol_data.csv'

In [12]:
# Export the sterimol table as sterimol_new.csv inside the lucas_project folder.
os.chdir(r'C:\Users\edens\Documents\GitHub\lucas_project')
# NOTE(review): `sterimol` is not assigned in any visible cell (an earlier cell
# binds `steric` instead); unless it is defined elsewhere this raises NameError
# on a fresh kernel - confirm which name is intended.
sterimol_df=dict_to_horizontal_df(sterimol)
sterimol_df.to_csv('sterimol_new.csv')

In [8]:
# Visualize the first six loaded molecules (positions 0 through 5).
mols.visualize_molecules([0,1,2,3,4,5])

{frozenset({0, 1}): 1.54, frozenset({1, 2}): 1.45, frozenset({2, 3}): 1.34, frozenset({3, 4}): 1.27, frozenset({5, 6}): 1.38, frozenset({6, 7}): 1.39, frozenset({8, 7}): 1.38, frozenset({8, 9}): 1.39, frozenset({9, 10}): 1.33, frozenset({10, 11}): 2.18, frozenset({11, 12}): 2.32, frozenset({13, 14}): 1.09, frozenset({24, 25}): 1.53, frozenset({28, 29}): 1.09, frozenset({29, 30}): 1.09, frozenset({3, 5}): 1.47, frozenset({11, 13}): 1.97, frozenset({13, 15}): 1.09, frozenset({25, 27}): 1.09, frozenset({16, 13}): 1.09, frozenset({26, 29}): 1.09, frozenset({34, 31}): 1.09, frozenset({0, 4}): 1.47, frozenset({35, 31}): 1.09, frozenset({10, 5}): 1.34, frozenset({24, 29}): 1.53, frozenset({36, 31}): 1.09, frozenset({11, 4}): 2.29, frozenset({24, 31}): 1.53, frozenset({32, 25}): 1.09, frozenset({25, 33}): 1.09, frozenset({17, 6}): 1.08, frozenset({18, 7}): 1.08, frozenset({8, 19}): 1.08, frozenset({9, 20}): 1.08, frozenset({24, 0}): 1.55, frozenset({0, 23}): 1.09, frozenset({1, 22}): 1.09, fro

{frozenset({0, 1}): 1.54, frozenset({1, 2}): 1.45, frozenset({2, 3}): 1.34, frozenset({3, 4}): 1.26, frozenset({5, 6}): 1.38, frozenset({6, 7}): 1.39, frozenset({8, 7}): 1.38, frozenset({8, 9}): 1.39, frozenset({9, 10}): 1.32, frozenset({10, 11}): 2.18, frozenset({11, 12}): 2.32, frozenset({13, 14}): 1.09, frozenset({24, 25}): 1.53, frozenset({3, 5}): 1.47, frozenset({11, 13}): 1.97, frozenset({13, 15}): 1.09, frozenset({25, 27}): 1.09, frozenset({28, 30}): 1.09, frozenset({16, 13}): 1.09, frozenset({33, 30}): 1.09, frozenset({0, 4}): 1.47, frozenset({34, 30}): 1.09, frozenset({10, 5}): 1.34, frozenset({24, 29}): 1.53, frozenset({24, 30}): 1.53, frozenset({25, 31}): 1.09, frozenset({35, 29}): 1.09, frozenset({11, 4}): 2.29, frozenset({26, 19}): 1.33, frozenset({36, 29}): 1.09, frozenset({37, 29}): 1.09, frozenset({17, 6}): 1.08, frozenset({18, 7}): 1.08, frozenset({8, 19}): 1.51, frozenset({9, 20}): 1.08, frozenset({32, 19}): 1.34, frozenset({25, 38}): 1.09, frozenset({24, 0}): 1.55, f

{frozenset({0, 1}): 1.54, frozenset({1, 2}): 1.45, frozenset({2, 3}): 1.34, frozenset({3, 4}): 1.27, frozenset({5, 6}): 1.38, frozenset({6, 7}): 1.39, frozenset({8, 7}): 1.39, frozenset({8, 9}): 1.38, frozenset({9, 10}): 1.33, frozenset({10, 11}): 2.18, frozenset({11, 12}): 2.33, frozenset({13, 14}): 1.09, frozenset({24, 25}): 1.53, frozenset({3, 5}): 1.48, frozenset({11, 13}): 1.97, frozenset({13, 15}): 1.09, frozenset({25, 27}): 1.09, frozenset({26, 28}): 1.09, frozenset({16, 13}): 1.09, frozenset({26, 29}): 1.09, frozenset({33, 30}): 1.09, frozenset({0, 4}): 1.47, frozenset({32, 36}): 1.09, frozenset({10, 5}): 1.34, frozenset({26, 31}): 1.09, frozenset({32, 37}): 1.09, frozenset({32, 38}): 1.09, frozenset({33, 39}): 1.09, frozenset({11, 4}): 2.29, frozenset({40, 33}): 1.09, frozenset({18, 26}): 1.42, frozenset({24, 32}): 1.53, frozenset({24, 33}): 1.53, frozenset({25, 34}): 1.09, frozenset({25, 35}): 1.09, frozenset({17, 6}): 1.08, frozenset({18, 7}): 1.34, frozenset({8, 19}): 1.08,

{frozenset({0, 1}): 1.55, frozenset({1, 2}): 1.45, frozenset({2, 3}): 1.34, frozenset({3, 4}): 1.26, frozenset({5, 6}): 1.37, frozenset({6, 7}): 1.4, frozenset({8, 7}): 1.38, frozenset({8, 9}): 1.4, frozenset({9, 10}): 1.33, frozenset({10, 11}): 2.26, frozenset({11, 12}): 2.33, frozenset({13, 14}): 1.09, frozenset({24, 25}): 1.53, frozenset({3, 5}): 1.48, frozenset({11, 13}): 1.98, frozenset({13, 15}): 1.09, frozenset({16, 13}): 1.09, frozenset({0, 4}): 1.47, frozenset({25, 29}): 1.09, frozenset({26, 30}): 1.09, frozenset({10, 5}): 1.35, frozenset({33, 28}): 1.09, frozenset({32, 37}): 1.09, frozenset({32, 38}): 1.09, frozenset({11, 4}): 2.25, frozenset({27, 20}): 1.09, frozenset({32, 39}): 1.09, frozenset({40, 33}): 1.09, frozenset({18, 26}): 1.42, frozenset({24, 32}): 1.53, frozenset({33, 41}): 1.09, frozenset({24, 33}): 1.53, frozenset({25, 35}): 1.09, frozenset({17, 6}): 1.08, frozenset({18, 7}): 1.34, frozenset({8, 19}): 1.08, frozenset({9, 20}): 1.5, frozenset({20, 31}): 1.09, fro

{frozenset({0, 1}): 1.55, frozenset({1, 2}): 1.45, frozenset({2, 3}): 1.34, frozenset({3, 4}): 1.26, frozenset({5, 6}): 1.38, frozenset({6, 7}): 1.39, frozenset({8, 7}): 1.38, frozenset({8, 9}): 1.4, frozenset({9, 10}): 1.33, frozenset({10, 11}): 2.27, frozenset({11, 12}): 2.33, frozenset({13, 14}): 1.09, frozenset({32, 31}): 1.51, frozenset({33, 34}): 1.5, frozenset({35, 36}): 1.39, frozenset({36, 37}): 1.51, frozenset({3, 5}): 1.47, frozenset({11, 13}): 1.98, frozenset({13, 15}): 1.09, frozenset({29, 31}): 1.39, frozenset({33, 35}): 1.39, frozenset({16, 13}): 1.09, frozenset({24, 27}): 1.53, frozenset({0, 4}): 1.46, frozenset({32, 28}): 1.09, frozenset({33, 29}): 1.39, frozenset({51, 55}): 1.09, frozenset({10, 5}): 1.35, frozenset({50, 55}): 1.09, frozenset({24, 30}): 1.53, frozenset({32, 26}): 1.09, frozenset({49, 55}): 1.09, frozenset({11, 4}): 2.26, frozenset({32, 25}): 1.09, frozenset({34, 42}): 1.09, frozenset({34, 43}): 1.09, frozenset({37, 46}): 1.09, frozenset({34, 44}): 1.09

{frozenset({0, 1}): 1.55, frozenset({1, 2}): 1.45, frozenset({2, 3}): 1.34, frozenset({3, 4}): 1.26, frozenset({5, 6}): 1.38, frozenset({6, 7}): 1.39, frozenset({8, 7}): 1.38, frozenset({8, 9}): 1.4, frozenset({9, 10}): 1.34, frozenset({10, 11}): 2.27, frozenset({11, 12}): 2.33, frozenset({13, 14}): 1.09, frozenset({26, 27}): 1.09, frozenset({29, 30}): 1.09, frozenset({32, 31}): 1.41, frozenset({3, 5}): 1.48, frozenset({11, 13}): 1.98, frozenset({13, 15}): 1.09, frozenset({24, 26}): 1.53, frozenset({16, 13}): 1.09, frozenset({25, 28}): 1.39, frozenset({0, 4}): 1.46, frozenset({10, 5}): 1.35, frozenset({25, 20}): 1.39, frozenset({24, 30}): 1.53, frozenset({11, 4}): 2.26, frozenset({32, 39}): 1.09, frozenset({50, 43}): 1.08, frozenset({32, 40}): 1.09, frozenset({26, 35}): 1.09, frozenset({32, 41}): 1.09, frozenset({33, 42}): 1.08, frozenset({24, 34}): 1.53, frozenset({26, 36}): 1.09, frozenset({28, 38}): 1.08, frozenset({34, 44}): 1.09, frozenset({17, 6}): 1.08, frozenset({18, 7}): 1.08,

In [10]:
import os
import csv
from rdkit import Chem
from rdkit.Chem import rdmolfiles

def xyz_dir_to_csv(directory, output_csv="output.csv", write_csv=False):
    """
    Converts the XYZ files in a directory to CXSMILES strings.

    Files are visited in sorted filename order so the result is reproducible.
    A file that RDKit cannot parse, or that raises while being converted, is
    reported on stdout and skipped. It is added to neither returned list, so
    ``results[i]`` and ``mols[i]`` always describe the same file.

    Args:
        directory (str): Path to directory containing XYZ files
        output_csv (str): Path of the CSV file written when ``write_csv`` is
            True (default: output.csv). Ignored otherwise.
        write_csv (bool): When True, write ``results`` to ``output_csv`` under
            a ``Filename,SMILES`` header. Defaults to False, in which case
            nothing is written to disk.

    Returns:
        tuple: ``(results, mols)`` where ``results`` is a list of
        ``[filename, cxsmiles]`` pairs and ``mols`` holds the matching
        molecule objects in the same order.
    """
    results = []
    mols = []

    # os.listdir returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".xyz"):
            continue
        filepath = os.path.join(directory, filename)

        try:
            # NOTE(review): MolFromXYZFile reads atoms and coordinates only. Unless
            # bonds are perceived afterwards (e.g. rdkit.Chem.rdDetermineBonds) the
            # string below is presumably a list of disconnected atoms - confirm.
            mol = rdmolfiles.MolFromXYZFile(filepath)
            if mol is None:
                # RDKit signals an unparsable file by returning None.
                print(f"Error processing {filename}: could not parse XYZ file")
                continue
            # CXSMILES (extended SMILES), not plain canonical SMILES.
            smiles = rdmolfiles.MolToCXSmiles(mol)
        except Exception as e:
            # Include the exception type: some exceptions carry an empty message.
            print(f"Error processing {filename}: {type(e).__name__}: {e}")
            continue

        # Append to both lists together so they stay index-aligned.
        results.append([filename, smiles])
        mols.append(mol)

    if write_csv:
        with open(output_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["Filename", "SMILES"])
            writer.writerows(results)

    return results, mols

# Convert the XYZ files of the xyz_new folder; no CSV is written by this call.
# The returned molecules get their own name so that the notebook-wide `mols`
# (the Molecules collection loaded in the first cell) is not overwritten.
os.chdir(r'C:\Users\edens\Documents\GitHub\lucas_project\new_update_logs\xyz_new')
res, rdkit_mols = xyz_dir_to_csv(os.getcwd(), output_csv="smiles_lucas.csv")

In [20]:
import os
import csv
from openbabel import pybel  # Requires Open Babel

def xyz_to_cxsmiles_with_metal(directory, output_csv="output.csv"):
    """
    Converts the XYZ files in a directory to SMILES with Open Babel and writes
    one CSV row per file.

    Files are visited in sorted filename order. Each file is read with
    ``pybel.readfile("xyz", ...)`` and written in Open Babel's "smi" format;
    only the first whitespace-separated field (the SMILES itself) is kept.
    No metal-specific handling is done here: bond perception is left entirely
    to Open Babel. A file that fails is reported on stdout and recorded with
    the value "Error".

    Args:
        directory (str): Path to the directory containing XYZ files.
        output_csv (str): Name of the output CSV file (default: "output.csv").

    Returns:
        None. The CSV is written to ``output_csv`` and a confirmation is printed.
    """
    results = []

    # os.listdir returns entries in arbitrary order; sort for stable CSV rows.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".xyz"):  # Only process XYZ files
            continue
        filepath = os.path.join(directory, filename)

        try:
            # A reader that yields nothing would make a bare next() raise
            # StopIteration, whose message is empty; use a sentinel so the
            # failure is reported with an explicit reason.
            mol = next(pybel.readfile("xyz", filepath), None)
            if mol is None:
                raise ValueError("no molecule could be read from the file")

            # "smi" output is "<SMILES> <title>"; keep only the SMILES field.
            fields = mol.write("smi").split()
            if not fields:
                raise ValueError("Open Babel returned an empty SMILES string")
            smiles = fields[0]
        except Exception as e:
            # Include the exception type: some exceptions carry an empty message.
            print(f"Error processing {filename}: {type(e).__name__}: {e}")
            smiles = "Error"

        results.append([filename, smiles])

    # Write results to a CSV file
    with open(output_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        # Header kept as "CXSMILES" so existing readers of this CSV keep working;
        # the values are plain "smi"-format SMILES.
        writer.writerow(["Filename", "CXSMILES"])
        writer.writerows(results)

    print(f"CSV file '{output_csv}' created successfully!")


In [21]:
# Convert the xyz_new folder with Open Babel. output_csv is a bare filename, so
# the CSV is written into the current working directory. In the saved run,
# LS1685.xyz failed with an empty error message.
xyz_to_cxsmiles_with_metal(r'C:\Users\edens\Documents\GitHub\lucas_project\new_update_logs\xyz_new', output_csv="smiles_lucas_metal_new.csv")

Error processing LS1685.xyz: 
CSV file 'smiles_lucas_metal_new.csv' created successfully!
