# Updating Microstate Lists Based on Manual Correction

This Jupyter notebook incorporates the microstates suggested by a participant (Robert Fraczkiewicz) and removes duplicated microstates (resonance structures and cis-trans isomers) that were detected by visual inspection.

Files to be created for 24 molecules:  
* SMX_microstates.csv  
* SMX_microstates_deprecated.csv  
* SMX_microstate_IDs_with_2D_depiction.xlsx

In [1]:
import pandas as pd
from openeye.oechem import *

In [2]:
path_to_correction_files = "corrections/"
path_to_corrected_files = "microstate_lists_after_manual_correction/"

# Iterate over 24 molecules
for j in range(24):
    mol_name = "SM"+str(j+1).zfill(2)
    print(mol_name, "...")

    # Read correction file
    correction_file = path_to_correction_files + mol_name + "_correction.csv"
    df_microstates = pd.read_csv(correction_file)

    # Convert all SMILES to canonical isomeric SMILES
    for i, row in enumerate(df_microstates.iterrows()):
        smiles = df_microstates.loc[i,"canonical isomeric SMILES"]

        mol = OEGraphMol()
        OESmilesToMol(mol, smiles)
        canonical_smiles = OEMolToSmiles(mol)

        df_microstates.loc[i, "canonical isomeric SMILES"] = canonical_smiles
    
    # Check if there is any deprecated microstate

    correction = df_microstates["correction"]
    deprecated_boolean = correction.isin(["deprecated"])

    deprecated_label = False
    for b in deprecated_boolean:
        if b == False:
            continue
        if b == True:
            print("Deprecated microstate found.")
            deprecated_label = True


    # Check if there is any added microstate

    correction = df_microstates["correction"]
    added_boolean = correction.isin(["added"])

    added_label = False
    for b in added_boolean:
        if b == False:
            continue
        if b == True:
            print("Added microstate found.")
            added_label = True


    # Write deprecated microstates to a separate file

    if(deprecated_label):
        df_deprecated = df_microstates.loc[df_microstates["correction"] == "deprecated"]
        print("Number of deprecated microstates of {}: {}".format(mol_name, df_deprecated.shape[0]))

        df_deprecated = df_deprecated.rename(columns = {"correction":"remarks"})

        deprecated_microstates_file_name = path_to_corrected_files + mol_name + "_microstates_deprecated.csv"
        df_deprecated.to_csv(deprecated_microstates_file_name, index=False)
        print("Created:" , deprecated_microstates_file_name)
        print("\n")


    # Write new microstates list with deprecated microstates removed and new microstates added.

    if(deprecated_label and added_label):
        df_remaining = df_microstates.loc[df_microstates["correction"] != "deprecated"]
        df_remaining = df_remaining.loc[df_remaining["correction"] != "added"]
        print("Number of remaining microstates of {}: {}".format(mol_name, df_remaining.shape[0]))

        df_added = df_microstates.loc[df_microstates["correction"] == "added"]
        print("Number of new microstates of {}: {}".format(mol_name, df_added.shape[0]))
        
        df_updated = df_microstates.loc[df_microstates["correction"] != "deprecated"]
        print("Total number of microstates in updated list of {}: {}".format(mol_name, df_updated.shape[0]))

    elif(added_label): # no deprecated
        df_remaining = df_microstates.loc[df_remaining["correction"] != "added"]
        print("Number of remaining microstates of {}: {}".format(mol_name, df_remaining.shape[0]))

        df_added = df_microstates.loc[df_microstates["correction"] == "added"]
        print("Number of new microstates of {}: {}".format(mol_name, df_added.shape[0]))

        df_updated = df_microstates
        print("Total number of microstates in updated list of {}: {}".format(mol_name, df_updated.shape[0]))

    elif(deprecated_label): # no added

        df_updated = df_microstates.loc[df_microstates["correction"] != "deprecated"]
        print("Total number of microstates in updated list of {}: {}".format(mol_name, df_updated.shape[0]))

    else:
        df_updated = df_microstates
        print("No correction to microstate list.")
        print("Total number of microstates in updated list of {}: {}".format(mol_name, df_updated.shape[0]))


    df_updated = df_updated.loc[:,("microstate ID","canonical isomeric SMILES")]

    updated_microstates_file_name = path_to_corrected_files + mol_name + "_microstates.csv"
    df_updated.to_csv(updated_microstates_file_name, index=False)
    print("Created:" , updated_microstates_file_name)
    print("\n")


    # Create Excel file with 2D depiction for updated microstates list

    # Organize colums to create csv input file for csv2xlsx.py script
    df_2D_input = pd.DataFrame()
    df_2D_input["Molecule"] = df_updated["canonical isomeric SMILES"]
    df_2D_input["Microstate ID"] = df_updated["microstate ID"]
    df_2D_input["microstate ID"] = df_updated["microstate ID"]
    df_2D_input["canonical isomeric SMILES"] = df_updated["canonical isomeric SMILES"]

    csv_file_name = path_to_corrected_files + "{}_microstate_IDs_with_2D_depiction.csv".format(mol_name)
    xlsx_file_name = path_to_corrected_files + "{}_microstate_IDs_with_2D_depiction.xlsx".format(mol_name)

    df_2D_input.to_csv(csv_file_name, index=False)

    !python csv2xlsx.py $csv_file_name $xlsx_file_name
    !trash $csv_file_name
    print("Created: ",xlsx_file_name)
    print(mol_name, ": Done!")
    print("\n")

SM01 ...
Deprecated microstate found.
Deprecated microstate found.
Number of deprecated microstates of SM01: 2
Created: microstate_lists_after_manual_correction/SM01_microstates_deprecated.csv


Total number of microstates in updated list of SM01: 8
Created: microstate_lists_after_manual_correction/SM01_microstates.csv


Created:  microstate_lists_after_manual_correction/SM01_microstate_IDs_with_2D_depiction.xlsx
SM01 : Done!


SM02 ...
Deprecated microstate found.
Deprecated microstate found.
Deprecated microstate found.
Added microstate found.
Added microstate found.
Added microstate found.
Number of deprecated microstates of SM02: 3
Created: microstate_lists_after_manual_correction/SM02_microstates_deprecated.csv


Number of remaining microstates of SM02: 8
Number of new microstates of SM02: 3
Total number of microstates in updated list of SM02: 11
Created: microstate_lists_after_manual_correction/SM02_microstates.csv


Created:  microstate_lists_after_manual_correction/SM02_microst

Total number of microstates in updated list of SM15: 4
Created: microstate_lists_after_manual_correction/SM15_microstates.csv


Created:  microstate_lists_after_manual_correction/SM15_microstate_IDs_with_2D_depiction.xlsx
SM15 : Done!


SM16 ...
No correction to microstate list.
Total number of microstates in updated list of SM16: 8
Created: microstate_lists_after_manual_correction/SM16_microstates.csv


Created:  microstate_lists_after_manual_correction/SM16_microstate_IDs_with_2D_depiction.xlsx
SM16 : Done!


SM17 ...
Deprecated microstate found.
Deprecated microstate found.
Deprecated microstate found.
Added microstate found.
Added microstate found.
Added microstate found.
Added microstate found.
Added microstate found.
Added microstate found.
Number of deprecated microstates of SM17: 3
Created: microstate_lists_after_manual_correction/SM17_microstates_deprecated.csv


Number of remaining microstates of SM17: 2
Number of new microstates of SM17: 6
Total number of microstates in upda

## Single-molecule script for testing

In [39]:
# Load the correction table of one molecule and canonicalise its SMILES.
# The variables defined here (mol_name, the two paths, df_microstates) are
# consumed by the next cell.
mol_name = "SM05"
path_to_correction_files = "corrections/"
path_to_corrected_files = "microstate_lists_after_manual_correction/"

# Read correction file
correction_file = path_to_correction_files + mol_name + "_correction.csv"
df_microstates = pd.read_csv(correction_file)

# Convert all SMILES to canonical isomeric SMILES.
# Iterate over the actual index labels so that label-based .loc access stays
# correct even if the frame does not carry a default 0..n-1 index.
for i in df_microstates.index:
    smiles = df_microstates.loc[i, "canonical isomeric SMILES"]

    mol = OEGraphMol()
    # OESmilesToMol reports parse failure through its return value; without
    # this check an unparsable entry would be silently overwritten by the
    # SMILES of whatever (presumably empty) molecule was left behind.
    if not OESmilesToMol(mol, smiles):
        raise ValueError("{}: could not parse SMILES {!r} (row {})".format(mol_name, smiles, i))

    df_microstates.loc[i, "canonical isomeric SMILES"] = OEMolToSmiles(mol)

# Display the canonicalised table as the cell output.
df_microstates

Unnamed: 0,microstate ID,canonical isomeric SMILES,correction
0,SM05_micro001,c1ccc(c(c1)NC(=O)c2ccc(o2)Cl)N3CCCCC3,
1,SM05_micro002,c1ccc(c(c1)/N=C(/c2ccc(o2)Cl)\[O-])N3CCCCC3,
2,SM05_micro003,c1ccc(c(c1)[NH2+]C(=[OH+])c2ccc(o2)Cl)N3CCCCC3,
3,SM05_micro004,c1ccc(c(c1)NC(=C2C=CC(=[O+]2)Cl)[O-])[NH+]3CCCCC3,deprecated
4,SM05_micro005,c1ccc(c(c1)NC(=[OH+])c2ccc(o2)Cl)[NH+]3CCCCC3,
5,SM05_micro006,c1ccc(c(c1)/N=C(/c2ccc(o2)Cl)\[O-])[NH+]3CCCCC3,
6,SM05_micro007,c1ccc(c(c1)NC(=C2C=CC(=[O+]2)Cl)[O-])N3CCCCC3,deprecated
7,SM05_micro008,c1ccc(c(c1)[N-]C(=[OH+])c2ccc(o2)Cl)N3CCCCC3,
8,SM05_micro009,c1ccc(c(c1)NC(=[OH+])c2ccc(o2)Cl)N3CCCCC3,
9,SM05_micro010,c1ccc(c(c1)NC(=O)c2ccc(o2)Cl)[NH+]3CCCCC3,


In [40]:
# Check if there is any deprecated microstate

correction = df_microstates["correction"]
deprecated_boolean = correction.isin(["deprecated"])

deprecated_label = False
for b in deprecated_boolean:
    if b == False:
        continue
    if b == True:
        print("Deprecated microstate found.")
        deprecated_label = True
        
        
# Check if there is any added microstate

correction = df_microstates["correction"]
added_boolean = correction.isin(["added"])

added_label = False
for b in added_boolean:
    if b == False:
        continue
    if b == True:
        print("Added microstate found.")
        added_label = True

        
# Write deprecated microstates to a separate file

if(deprecated_label):
    df_deprecated = df_microstates.loc[df_microstates["correction"] == "deprecated"]
    print("Number of deprecated microstates of {}: {}".format(mol_name, df_deprecated.shape[0]))

    df_deprecated = df_deprecated.rename(columns = {"correction":"remarks"})

    deprecated_microstates_file_name = path_to_corrected_files + mol_name + "_microstates_deprecated.csv"
    df_deprecated.to_csv(deprecated_microstates_file_name, index=False)
    print("Created:" , deprecated_microstates_file_name)
    print("\n")

    
# Write new microstates list with deprecated microstates removed and new microstates added.

if(deprecated_label and added_label):
    df_remaining = df_microstates.loc[df_microstates["correction"] != "deprecated"]
    df_remaining = df_remaining.loc[df_remaining["correction"] != "added"]
    print("Number of remaining microstates of {}: {}".format(mol_name, df_remaining.shape[0]))
    
    df_added = df_microstates.loc[df_microstates["correction"] == "added"]
    print("Number of new microstates of {}: {}".format(mol_name, df_added.shape[0]))
    
    df_updated = df_microstates.loc[df_microstates["correction"] != "deprecated"]
    print("Total number of microstates in updated list of {}: {}".format(mol_name, df_updated.shape[0]))

elif(added_label): # no deprecated
    df_remaining = df_microstates.loc[df_remaining["correction"] != "added"]
    print("Number of remaining microstates of {}: {}".format(mol_name, df_remaining.shape[0]))
    
    df_added = df_microstates.loc[df_microstates["correction"] == "added"]
    print("Number of new microstates of {}: {}".format(mol_name, df_added.shape[0]))
    
    df_updated = df_microstates
    print("Total number of microstates in updated list of {}: {}".format(mol_name, df_updated.shape[0]))

elif(deprecated_label): # no added
    
    df_updated = df_microstates.loc[df_microstates["correction"] != "deprecated"]
    print("Total number of microstates in updated list of {}: {}".format(mol_name, df_updated.shape[0]))
    
else:
    df_updated = df_microstates
    print("No correction to microstate list.")
    print("Total number of microstates in updated list of {}: {}".format(mol_name, df_updated.shape[0]))


df_updated = df_updated.loc[:,("microstate ID","canonical isomeric SMILES")]

updated_microstates_file_name = path_to_corrected_files + mol_name + "_microstates.csv"
df_updated.to_csv(updated_microstates_file_name, index=False)
print("Created:" , updated_microstates_file_name)
print("\n")


# Create Excel file with 2D depiction for updated microstates list

# Organize colums to create csv input file for csv2xlsx.py script
df_2D_input = pd.DataFrame()
df_2D_input["Molecule"] = df_updated["canonical isomeric SMILES"]
df_2D_input["Microstate ID"] = df_updated["microstate ID"]
df_2D_input["microstate ID"] = df_updated["microstate ID"]
df_2D_input["canonical isomeric SMILES"] = df_updated["canonical isomeric SMILES"]

csv_file_name = path_to_corrected_files + "{}_microstate_IDs_with_2D_depiction.csv".format(mol_name)
xlsx_file_name = path_to_corrected_files + "{}_microstate_IDs_with_2D_depiction.xlsx".format(mol_name)

df_2D_input.to_csv(csv_file_name, index=False)

!python csv2xlsx.py $csv_file_name $xlsx_file_name
!trash $csv_file_name
print("Created: ",xlsx_file_name)
print(mol_name, ": Done!")
print("\n")

Deprecated microstate found.
Deprecated microstate found.
Number of deprecated microstates of SM05: 2
Created: microstate_lists_after_manual_correction/SM05_microstates_deprecated.csv


Total number of microstates in updated list of SM05: 9
Created: microstate_lists_after_manual_correction/SM05_microstates.csv


Created:  microstate_lists_after_manual_correction/SM05_microstate_IDs_with_2D_depiction.xlsx
SM05 : Done!


