In [None]:
import numpy as np
import pandas as pd
from datetime import date
from molmass import Formula
from rdkit import Chem
from rdkit.Chem import Descriptors

In [None]:
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# Library identifier; it also selects the input table data/<lib_id>_library.tsv.
lib_id = "mce"
library_file = "data/{}_library.tsv".format(lib_id)

# Date stamp (YYYYMMDD) of the day this cell is executed.
current_date = date.today().strftime("%Y%m%d")

# Optional file name prefix, e.g. for a second measurement or other
# acquisition parameters. Usually empty; if set it always ends with "_".
prefix = "100AGC_60000Res_"
instrument_method = r"C:\Xcalibur\methods\Corinna_Brungs\Library6_100AGC_60000Res_MS5_POS_mz115-2000"

# Plates to process and the library column that holds the plate location.
plates = ["1D1", "1D2", "1D3"]
plate_id_header = "mixed_location_plate1"

# Plates are inserted into the BLUE B compartment of the autosampler.
plate_loc_in_autosampler = "B"

# ---------------------------------------------------------------------------
# Final values: names of the columns written to the library table
# ---------------------------------------------------------------------------
unique_id_header = "lib_plate_well"
raw_filename = "raw_filename"
well_header = "final_plate_location"

## Import library

In [None]:
# Load the tab-separated library table and show its first five rows.
lib_df = pd.read_csv(library_file, delimiter="\t")
lib_df.head(n=5)

## Add unique column with internal ID and well location
Use the library ID followed by the internal plate location ID: `pluskal_<library ID>_<plate location>`.

In [None]:
def exact_mass(formula):
    """Return the exact mass for a molecular formula string.

    Only the part before the first "." is evaluated, so any further
    dot-separated components of the formula are ignored.

    Parameters
    ----------
    formula : str
        Molecular formula. Non-string values (e.g. NaN from an empty table
        cell) are treated as missing.

    Returns
    -------
    float
        The value of molmass ``Formula(...).isotope.mass`` for the first
        component, or ``np.nan`` if the input is missing or cannot be
        evaluated.
    """
    # Missing values are not strings and have no formula to parse.
    if not isinstance(formula, str):
        return np.nan
    try:
        clean = formula.split(".")[0]
        return Formula(clean).isotope.mass
    except Exception:
        # Best effort: an unparsable formula yields NaN instead of aborting the
        # whole table. `Exception` (not a bare except) keeps Ctrl-C working, and
        # `np.nan` is used because the `np.NAN` alias was removed in NumPy 2.0.
        return np.nan

def exact_mass_from_smi(smi):
    """Return the exact molecular weight for a SMILES string.

    Only the part before the first "." is evaluated, so any further
    dot-separated fragments of the SMILES are ignored.

    Parameters
    ----------
    smi : str
        SMILES string. Non-string values (e.g. NaN from an empty table cell)
        are treated as missing.

    Returns
    -------
    float
        The value of RDKit ``Descriptors.ExactMolWt`` for the first fragment,
        or ``np.nan`` if the input is missing or cannot be parsed.
    """
    # Missing values are not strings and have no SMILES to parse.
    if not isinstance(smi, str):
        return np.nan
    try:
        clean = smi.split(".")[0]
        mol = Chem.MolFromSmiles(clean)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return np.nan
        return Descriptors.ExactMolWt(mol)
    except Exception:
        # Best effort: any other failure yields NaN instead of aborting the
        # whole table. `np.nan` is used because the `np.NAN` alias was removed
        # in NumPy 2.0.
        return np.nan

In [None]:
# Quick check on a SMILES with a second, dot-separated fragment:
# exact_mass_from_smi only evaluates the part before the first ".",
# so the trailing "Na" fragment does not contribute to the result.
exact_mass_from_smi("CN1CCC[C@H]1c2cccnc2.Na")

In [None]:
# define file names
# unique ID: pluskal_<library ID>_<plate location>
lib_df[unique_id_header] = ["pluskal_{}_{}".format(lib_id, plate_id) for plate_id in lib_df[plate_id_header]]
# raw file name: <date>_<prefix><unique ID>
lib_df[raw_filename] = ["{}_{}{}".format(current_date, prefix, unique_id) for unique_id in lib_df[unique_id_header]]
# the well is the second "_"-separated part of the plate location
lib_df[well_header] = [plate_id.split("_")[1] for plate_id in lib_df[plate_id_header]]


# m/z shift for adding H or Na and removing one electron
electron_mass = 0.00054857
mzh = exact_mass("H") - electron_mass
mzna = exact_mass("Na") - electron_mass

# exact_mass returns NaN on failure; without this check a failure here would
# silently turn every m/z column below into NaN.
if np.isnan(mzh) or np.isnan(mzna):
    raise ValueError("Could not calculate the H/Na adduct masses; check the molmass installation")

# maximum absolute difference for formula- and SMILES-based masses to "match"
mass_tolerance = 0.01

# define exact mass from the formula; columns already present in the library
# are kept, every missing one is derived (previously mz_h/mz_na were skipped
# whenever exact_mass already existed)
if "exact_mass" not in lib_df:
    lib_df["exact_mass"] = [exact_mass(formula) for formula in lib_df["Formula"]]
if "mz_h" not in lib_df:
    lib_df["mz_h"] = lib_df["exact_mass"] + mzh
if "mz_na" not in lib_df:
    lib_df["mz_na"] = lib_df["exact_mass"] + mzna

# from smiles
lib_df["exact_mass_smiles"] = [exact_mass_from_smi(smi) for smi in lib_df["Smiles"]]
lib_df["mz_h_smiles"] = lib_df["exact_mass_smiles"] + mzh
lib_df["mz_na_smiles"] = lib_df["exact_mass_smiles"] + mzna

# NaN on either side compares as False, i.e. "no match"
lib_df["mass_matches"] = abs(lib_df["exact_mass_smiles"] - lib_df["exact_mass"]) < mass_tolerance


# NOTE: the file is tab-separated although it carries a .csv extension; the
# name is kept unchanged so that existing consumers still find it.
lib_df.to_csv("data/lib_formatted_{}.csv".format(lib_id), sep="\t", index=False)

lib_df.head()

## Import and create sequence

In [None]:
# Show the first five rows of the example sequence file.
pd.read_csv("data/sequence_example.csv").head(n=5)

In [None]:
# Build the sequence table with one row per raw file.
seq_df = pd.DataFrame()

seq_df["File Name"] = lib_df[raw_filename]
seq_df["Path"] = r"C:\Xcalibur\data\Corinna_Brungs\{}".format(lib_id)
seq_df["Instrument Method"] = instrument_method
# autosampler position: <compartment>:<well>
seq_df["Position"] = ["{}:{}".format(plate_loc_in_autosampler, well) for well in lib_df[well_header]]
seq_df["Inj Vol"] = 2
seq_df["Dil Factor"] = 1

# several library entries can share one raw file; keep each file only once
seq_df = seq_df.drop_duplicates()

# Write one sequence file per plate.
for plate in plates:
    # Match the plate as a complete "_"-separated token of the file name.
    # A plain substring/regex match (str.contains) would also select other
    # plates whose ID merely contains this one, e.g. "1D1" matching "1D10".
    # Assumes the plate ID is a full "_"-separated part of the plate location
    # column (e.g. "1D1_A1") -- TODO confirm against the library table.
    on_plate = seq_df["File Name"].map(lambda file_name: plate in file_name.split("_")).astype(bool)
    filtered_df = seq_df[on_plate]
    csv_file = "data/seq_{}_{}_{}.csv".format(plate_loc_in_autosampler, lib_id, plate)
    filtered_df.to_csv(csv_file, index=False)

    # Prepend the "Bracket Type=4," line in front of the CSV header
    # (presumably required by the sequence import -- confirm).
    with open(csv_file, 'r') as original:
        data = original.read()
    with open(csv_file, 'w') as modified:
        modified.write("Bracket Type=4,\n" + data)

seq_df.head()