In [10]:
import numpy as np
import pandas as pd
from datetime import date
from molmass import Formula
from rdkit import Chem
from rdkit.Chem import Descriptors

In [11]:
# ---- run configuration ----------------------------------------------------
# Identifier of the compound library; it ends up in file names and sample IDs.
lib_id = "mce"

# Optional tag placed in front of the sample ID in every raw file name
# (e.g. for a repeated measurement or changed acquisition parameters).
# Leave empty when unused; otherwise it has to end with an underscore.
prefix = "100AGC_60000Res_"
instrument_method = r"C:\Xcalibur\methods\Corinna_Brungs\Library6_100AGC_60000Res_MS5_POS_mz115-2000"

# Library column holding the "<plate>_<well>" IDs (the spelling matches the
# column name in the library file), and the plates to write sequences for.
plate_id_header = "Diltuted Duplicate 1"
plates = ["1D1", "1D2", "1D3"]

# Autosampler compartment the plates are placed in (the blue "B" one).
plate_loc_in_autosampler = "B"


# ---- fixed / derived values -----------------------------------------------
# Names of the columns that get added to the library table.
unique_id_header = "lib_plate_well"
raw_filename = "raw_filename"
well_header = "final_plate_location"

# Tab-separated library table of the selected library.
library_file = f"data/{lib_id}_library.tsv"

# Today's date as YYYYMMDD; it becomes the leading part of raw file names.
current_date = f"{date.today():%Y%m%d}"

## Import library

In [12]:
# Read the tab-delimited library table and preview its first rows.
lib_df = pd.read_csv(library_file, delimiter="\t")
lib_df.head()

Unnamed: 0,RackCode,Original Plate Location,Short Name (A & B Coulumn),Mixture plate name,Diltuted Duplicate 1,Diltuted Duplicate 2,Diltuted Duplicate 3,VialCode,Cat. No.,Product Name,...,Formula,Smiles,Solubility,Solvent,Batch No.,Quantity,URL,Pathway,Research Area,Clinical Information
0,HYCPK16574,A10,574_A10,M1_A1,1D1_A1,2D1_A1,3D1_A1,,HY-13796,IPSU,...,C23H27N5O2,COC1=CC=NC(N(CC2)CCC2(C3=O)CCCN3CC4=CNC5=CC=CC...,DMSO : ≥ 30 mg/mL (73.98 mM),DMSO Solution,24371.0,10mM * 30uL,https://www.medchemexpress.com/IPSU.html,GPCR/G Protein; Neuronal Signaling,Neurological Disease; Endocrinology,No Development Reported
1,HYCPK16574,A11,574_A11,M1_A1,1D1_A1,2D1_A1,3D1_A1,,HY-19831A,(Z)-4EGI-1,...,C18H12Cl2N4O4S,O=C(O)/C(CC1=CC=CC=C1[N+]([O-])=O)=N/NC2=NC(C3...,DMSO : 50 mg/mL (110.80 mM; ultrasonic and war...,DMSO Solution,64669.0,10mM * 30uL,https://www.medchemexpress.com/z-4egi-1.html,Cell Cycle/DNA Damage,Cancer,No Development Reported
2,HYCPK16574,A2,574_A2,M1_A1,1D1_A1,2D1_A1,3D1_A1,,HY-15338,TG003,...,C13H15NO2S,CC(/C=C1SC2=CC=C(C=C2N\1CC)OC)=O.[Z],DMSO : ≥ 31 mg/mL (124.33 mM),DMSO Solution,21769.0,10mM * 30uL,https://www.medchemexpress.com/TG003.html,Cell Cycle/DNA Damage,Cancer,No Development Reported
3,HYCPK16574,A3,574_A3,M1_A1,1D1_A1,2D1_A1,3D1_A1,,HY-15440B,Fostemsavir Tris,...,C29H37N8O11P,O=C(N1CCN(C(C2=CC=CC=C2)=O)CC1)C(C3=CN(COP(O)(...,DMSO : 125 mg/mL (ultrasonic);H2O : 100 mg/mL ...,DMSO Solution,58208.0,10mM * 30uL,https://www.medchemexpress.com/BMS-663068-Tris...,Anti-infection,Infection,Launched
4,HYCPK16574,A4,574_A4,M1_A1,1D1_A1,2D1_A1,3D1_A1,,HY-114315,NQO1 substrate,...,C13H2F2N4O,N#CC1=C(C#N)N=C2C(C(C3=C2C=C(F)C(F)=C3)=O)=N1,DMSO : 125 mg/mL (466.10 mM; Need ultrasonic),DMSO Solution,63377.0,10mM * 30uL,https://www.medchemexpress.com/nqo1-substrate....,Others,Cancer,No Development Reported


## Add unique column with internal ID and well location
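The unique ID is built as `pluskal_<library ID>_<plate ID>`, where the plate ID already contains the well (e.g. `pluskal_mce_1D1_A1`). The well location is the part of the plate ID after the underscore.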

In [13]:
def exact_mass(formula):
    """Return the exact mass of a molecular formula, or NaN if unavailable.

    Only the part of ``formula`` before the first ``"."`` is evaluated, so
    any dot-separated additional components are ignored.

    Parameters
    ----------
    formula : str
        Molecular formula, e.g. ``"C23H27N5O2"``.

    Returns
    -------
    float
        ``Formula(...).isotope.mass`` as reported by molmass for the first
        component, or ``np.nan`` when ``formula`` is not a string (e.g. a
        missing table cell) or molmass cannot evaluate it.
    """
    try:
        clean = formula.split(".")[0]
        return Formula(clean).isotope.mass
    except Exception:
        # Best-effort by design: one bad row must not abort the whole table.
        # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working,
        # and `np.nan` replaces the `np.NAN` alias removed in NumPy 2.0.
        return np.nan

def exact_mass_from_smi(smi):
    """Return the exact mass computed from a SMILES string, or NaN.

    Only the part of ``smi`` before the first ``"."`` is parsed, so further
    dot-separated components do not contribute to the returned mass.

    Parameters
    ----------
    smi : str
        SMILES string, e.g. ``"CN1CCC[C@H]1c2cccnc2.Na"``.

    Returns
    -------
    float
        ``Descriptors.ExactMolWt`` of the first component, or ``np.nan``
        when ``smi`` is not a string (e.g. a missing table cell) or RDKit
        cannot build a molecule from it.
    """
    try:
        clean = smi.split(".")[0]
        mol = Chem.MolFromSmiles(clean)
        if mol is None:
            # RDKit reports an unparsable SMILES by returning None rather
            # than raising; handle it explicitly instead of relying on the
            # descriptor call to fail.
            return np.nan
        return Descriptors.ExactMolWt(mol)
    except Exception:
        # Best-effort by design: one bad row must not abort the whole table.
        # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working,
        # and `np.nan` replaces the `np.NAN` alias removed in NumPy 2.0.
        return np.nan

In [14]:
# Quick check of the SMILES helper: only the text before the first "." is
# parsed, so the trailing ".Na" component does not contribute to the mass.
exact_mass_from_smi("CN1CCC[C@H]1c2cccnc2.Na")

162.115698448

In [15]:
# --- identifiers and file names ---
# Unique sample ID per row: "pluskal_<lib_id>_<plate>_<well>".
lib_df[unique_id_header] = ["pluskal_{}_{}".format(lib_id, plate_id) for plate_id in lib_df[plate_id_header]]
# Raw file name: "<date>_<prefix><unique id>".
lib_df[raw_filename] = ["{}_{}{}".format(current_date, prefix, unique_id) for unique_id in lib_df[unique_id_header]]
# Well position is the part of the plate ID after the underscore, e.g. "1D1_A1" -> "A1".
lib_df[well_header] = [plate_id.split("_")[1] for plate_id in lib_df[plate_id_header]]


# --- adduct mass shifts ---
# Shift from neutral exact mass to the [M+H]+ / [M+Na]+ m/z: the mass of the
# added atom minus the mass of one electron.
electron_mass = 0.00054857
mzh = exact_mass("H") - electron_mass
mzna = exact_mass("Na") - electron_mass

# --- exact mass from the formula column ---
# Keep an exact_mass column if the library file already provides one.
if "exact_mass" not in lib_df:
    lib_df["exact_mass"] = [exact_mass(formula) for formula in lib_df["Formula"]]
# Always derive the adduct m/z values so the columns exist (and agree with
# exact_mass) even when exact_mass came from the library file itself.
lib_df["mz_h"] = lib_df["exact_mass"] + mzh
lib_df["mz_na"] = lib_df["exact_mass"] + mzna

# --- exact mass from the SMILES column ---
lib_df["exact_mass_smiles"] = [exact_mass_from_smi(smi) for smi in lib_df["Smiles"]]
lib_df["mz_h_smiles"] = lib_df["exact_mass_smiles"] + mzh
lib_df["mz_na_smiles"] = lib_df["exact_mass_smiles"] + mzna

# Flag rows whose formula-based and SMILES-based masses agree within 0.01.
# Rows where either mass is NaN compare as False.
lib_df["mass_matches"] = (lib_df["exact_mass_smiles"] - lib_df["exact_mass"]).abs() < 0.01


# NOTE: the file is tab-separated although its name ends in ".csv".
lib_df.to_csv("data/lib_formatted_{}.csv".format(lib_id), sep="\t", index=False)

lib_df.head()

[14:26:33] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[14:26:33] Explicit valence for atom # 3 O, 3, is greater than permitted
[14:26:33] SMILES Parse Error: syntax error while parsing: [FCYWKVCW-NH2(Disulfide
[14:26:33] SMILES Parse Error: Failed parsing SMILES '[FCYWKVCW-NH2(Disulfide' for input: '[FCYWKVCW-NH2(Disulfide'
[14:26:33] SMILES Parse Error: syntax error while parsing: [AC-TGSTQHQ-CG(Disulfide
[14:26:33] SMILES Parse Error: Failed parsing SMILES '[AC-TGSTQHQ-CG(Disulfide' for input: '[AC-TGSTQHQ-CG(Disulfide'
[14:26:33] Explicit valence for atom # 2 O, 3, is greater than permitted
[14:26:33] SMILES Parse Error: syntax error while parsing: [Cyclo(AS-{d-Pro}-PTWI-{Dab}-{Orn}-{d-Dab}-{Dab}-W-{Dab}-{Dab})
[14:26:33] SMILES Parse Error: Failed parsing SMILES '[Cyclo(AS-{d-Pro}-PTWI-{Dab}-{Orn}-{d-Dab}-{Dab}-W-{Dab}-{Dab})' for input: '[Cyclo(AS-{d-Pro}-PTWI-{Dab}-{Orn}-{d-Dab}-{Dab}-W-{Dab}-{Dab})'
[14:26:33] Explicit valence for atom # 2 N, 4, is greater th

Unnamed: 0,RackCode,Original Plate Location,Short Name (A & B Coulumn),Mixture plate name,Diltuted Duplicate 1,Diltuted Duplicate 2,Diltuted Duplicate 3,VialCode,Cat. No.,Product Name,...,lib_plate_well,raw_filename,final_plate_location,exact_mass,mz_h,mz_na,exact_mass_smiles,mz_h_smiles,mz_na_smiles,mass_matches
0,HYCPK16574,A10,574_A10,M1_A1,1D1_A1,2D1_A1,3D1_A1,,HY-13796,IPSU,...,pluskal_mce_1D1_A1,20220613_100AGC_60000Res_pluskal_mce_1D1_A1,A1,405.216475,406.223752,428.205696,405.216475,406.223752,428.205696,True
1,HYCPK16574,A11,574_A11,M1_A1,1D1_A1,2D1_A1,3D1_A1,,HY-19831A,(Z)-4EGI-1,...,pluskal_mce_1D1_A1,20220613_100AGC_60000Res_pluskal_mce_1D1_A1,A1,449.995631,451.002908,472.984852,449.995631,451.002908,472.984852,True
2,HYCPK16574,A2,574_A2,M1_A1,1D1_A1,2D1_A1,3D1_A1,,HY-15338,TG003,...,pluskal_mce_1D1_A1,20220613_100AGC_60000Res_pluskal_mce_1D1_A1,A1,249.08235,250.089626,272.071571,249.08235,250.089626,272.07157,True
3,HYCPK16574,A3,574_A3,M1_A1,1D1_A1,2D1_A1,3D1_A1,,HY-15440B,Fostemsavir Tris,...,pluskal_mce_1D1_A1,20220613_100AGC_60000Res_pluskal_mce_1D1_A1,A1,704.231941,705.239218,727.221162,583.158047,584.165324,606.147268,False
4,HYCPK16574,A4,574_A4,M1_A1,1D1_A1,2D1_A1,3D1_A1,,HY-114315,NQO1 substrate,...,pluskal_mce_1D1_A1,20220613_100AGC_60000Res_pluskal_mce_1D1_A1,A1,268.019667,269.026943,291.008888,268.019667,269.026944,291.008888,True


## Import and create sequence

In [16]:
# Show the first rows of the example sequence file, for comparison with the
# sequence tables created below.
pd.read_csv("data/sequence_example.csv").head()

Unnamed: 0,Unnamed: 1.1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Bracket Type=4,Unnamed: 1
Sample Type,File Name,Sample ID,Path,Instrument Method,Process Method,Calibration File,Position,Inj Vol,Level,Sample Wt,Sample Vol,ISTD Amt,Dil Factor,L1 Study,L2 Client,L3 Laboratory,L4 Company,L5 Phone,Comment,Sample Name
Unknown,20220601_01_ACN_2uL,1,C:\Xcalibur\data\Corinna_Brungs\20220601_Library_test_Sigma_MCE,C:\Xcalibur\methods\Corinna_Brungs\Library6_MS5_POS_mz115-2000,,,R:F9,2,,0,0,0,1,,,,,,,
Unknown,20220601_02_ACN_2uL,1,C:\Xcalibur\data\Corinna_Brungs\20220601_Library_test_Sigma_MCE,C:\Xcalibur\methods\Corinna_Brungs\Library6_MS5_POS_mz115-2000,,,R:F9,2,,0,0,0,1,,,,,,,
Unknown,20220601_03_ACN_2uL,1,C:\Xcalibur\data\Corinna_Brungs\20220601_Library_test_Sigma_MCE,C:\Xcalibur\methods\Corinna_Brungs\Library6_MS5_POS_mz115-2000,,,R:F9,2,,0,0,0,1,,,,,,,
Unknown,20220601_04_ACN_2uL,1,C:\Xcalibur\data\Corinna_Brungs\20220601_Library_test_Sigma_MCE,C:\Xcalibur\methods\Corinna_Brungs\Library6_MS5_POS_mz115-2000,,,R:F9,2,,0,0,0,1,,,,,,,


In [17]:
# Build the sequence table: one row per raw file to acquire.
seq_df = pd.DataFrame()

seq_df["File Name"] = lib_df[raw_filename]
seq_df["Path"] = r"C:\Xcalibur\data\Corinna_Brungs\{}".format(lib_id)
seq_df["Instrument Method"] = instrument_method
seq_df["Position"] = ["{}:{}".format(plate_loc_in_autosampler, well) for well in lib_df[well_header]]
seq_df["Inj Vol"] = 2
seq_df["Dil Factor"] = 1

# Several library rows can share one well (and therefore one raw file);
# keep a single sequence row per file.
seq_df = seq_df.drop_duplicates()

# Write one sequence file per plate.
for plate in plates:
    # File names embed the plate as "..._<plate>_<well>". Match that delimited
    # token literally: a bare substring/regex match on the plate name would
    # also pick up other plates whose name merely contains it (e.g. "1D1"
    # inside "1D10" or "11D1") and would treat the plate name as a regex.
    on_plate = seq_df["File Name"].str.contains("_{}_".format(plate), regex=False)
    filtered_df = seq_df[on_plate]
    csv_file = "data/seq_{}_{}_{}.csv".format(plate_loc_in_autosampler, lib_id, plate)
    filtered_df.to_csv(csv_file, index=False)

    # Prepend the "Bracket Type=4," line in front of the column header row
    # (the example sequence file carries a "Bracket Type=4" entry as well).
    with open(csv_file, 'r') as original:
        data = original.read()
    with open(csv_file, 'w') as modified:
        modified.write("Bracket Type=4,\n" + data)

seq_df.head()

Unnamed: 0,File Name,Path,Instrument Method,Position,Inj Vol,Dil Factor
0,20220613_100AGC_60000Res_pluskal_mce_1D1_A1,C:\Xcalibur\data\Corinna_Brungs\mce,C:\Xcalibur\methods\Corinna_Brungs\Library6_10...,G:A1,2,1
10,20220613_100AGC_60000Res_pluskal_mce_1D1_A2,C:\Xcalibur\data\Corinna_Brungs\mce,C:\Xcalibur\methods\Corinna_Brungs\Library6_10...,G:A2,2,1
20,20220613_100AGC_60000Res_pluskal_mce_1D1_A3,C:\Xcalibur\data\Corinna_Brungs\mce,C:\Xcalibur\methods\Corinna_Brungs\Library6_10...,G:A3,2,1
30,20220613_100AGC_60000Res_pluskal_mce_1D1_A4,C:\Xcalibur\data\Corinna_Brungs\mce,C:\Xcalibur\methods\Corinna_Brungs\Library6_10...,G:A4,2,1
40,20220613_100AGC_60000Res_pluskal_mce_1D1_A5,C:\Xcalibur\data\Corinna_Brungs\mce,C:\Xcalibur\methods\Corinna_Brungs\Library6_10...,G:A5,2,1
