In [None]:
import os
import pandas as pd
# Root of the Stitch output tree: one sub-directory per tool, and inside it
# one sub-directory per entry found in the monoclonal_antibody data folder.
path = '../../../Stitch_Assembly/'
# exist_ok=True keeps this cell re-runnable (plain os.mkdir raises
# FileExistsError on the second run) while still surfacing real failures
# such as permission errors, which a bare `except:` would have hidden.
os.makedirs(path, exist_ok=True)
tools_list = [
    "AdaNovo", "CasanovoV1", "CasanovoV2", "ContraNovo", "DeepNovo",
    "PepNet", "PGPointNovo", "pi-HelixNovo", "pi-PrimeNovo", "pNovo3", "PointNovo", 'SMSNet', 'InstaNovo'
]
mabs = os.listdir('../../../monoclonal_antibody/')
for tool in tools_list:
    path1 = path+tool
    os.makedirs(path1, exist_ok=True)
    for mab in mabs:
        path2 = path1+'/'+mab
        os.makedirs(path2, exist_ok=True)

# Per-tool score threshold, keyed by tool name; the later cells use it both to
# filter the 'Denovo Score' column and to fill the CutoffALC line of the batch file.
df = pd.read_csv('../../data/Tool_Confidence_Threshold_90.csv')
SocreAll = pd.Series(df.Confidence_Threshold.values, index=df.Tool).to_dict()

In [None]:
import numpy as np
import pandas as pd
# Masses (Da) used to compute theoretical peptide masses and ppm errors.
mass_H = 1.0078
mass_N_terminus = 1.0078   # presumably the N-terminal H -- TODO confirm
mass_C_terminus = 17.0027  # presumably the C-terminal OH -- TODO confirm

# Residue masses keyed by the single-character codes the conversion cells
# produce.  Upper-case keys are amino-acid letters; lower-case keys stand for
# the modified tokens that the cells collapse to one character.
mass_AA = dict(
    A=71.03711,
    R=156.10111,
    N=114.04293,
    n=115.02695,    # 'N(+.98)' / 'N[+0.984]'
    D=115.02694,
    C=160.03065,    # 'C(+57.02)' is mapped to plain 'C'; unmodified value was 103.00919
    E=129.04259,
    Q=128.05858,
    q=129.0426,     # 'Q(+.98)' / 'Q[+0.984]'
    G=57.02146,
    H=137.05891,
    I=113.08406,
    L=113.08406,
    K=128.09496,
    M=131.04049,
    m=147.0354,     # 'M(+15.99)' / 'M[+15.995]'
    F=147.06841,
    P=97.05276,
    S=87.03203,
    T=101.04768,
    W=186.07931,
    Y=163.06333,
    V=99.06841,
    d=25.980265,    # '[+43.006-17.027]-' terminal token
    e=-17.026549,   # '[-17.027]-' terminal token
    f=43.005814,    # '[+43.006]-' terminal token
    g=42.010565,    # '[+42.011]-' terminal token
    p=111.032,      # '[-17.027]-Q' (terminal token fused with the following Q)
)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
import re
# Label that appears in the output CSV file name built in process_casanovoV1_Stitch.
quality_cutoff_Stitch = 90
# Key into SocreAll; also the prefix of the output CSV file name.
tool = 'CasanovoV1'
# Per-tool threshold: written into the file name and substituted for '95' on
# the 'CutoffALC: ' line of the Stitch batch file.
quality_cutoff_Stitch_local = SocreAll[tool]
def process_casanovoV1_Stitch(casanovo_path,mgf_file,batchfile_dir, resultdir):
    """Convert a Casanovo mzTab result into a Stitch input CSV and run Stitch.

    Steps:
      1. Read TITLE / PEPMASS / CHARGE / RTINSECONDS for every spectrum of
         ``mgf_file`` (lists are in MGF order).
      2. Load the PSM table of ``casanovo_path`` (header = first line starting
         with "PSH"), drop rows without a sequence, and use the integer
         ``PSM_ID`` column as the 0-based position of the spectrum in the MGF.
      3. Write the rows whose rescaled score is >= ``SocreAll[tool]`` to a CSV
         in ``resultdir``.
      4. Copy the batch file into ``resultdir`` with the input path and the
         CutoffALC value replaced, and run the Stitch binary on the copy.

    Relies on the module-level names ``tool``, ``quality_cutoff_Stitch``,
    ``quality_cutoff_Stitch_local``, ``SocreAll``, ``mass_AA``, ``mass_H``,
    ``mass_N_terminus`` and ``mass_C_terminus``.

    :param casanovo_path: path to the Casanovo mzTab file
    :param mgf_file: path to the MGF file the predictions were made from
    :param batchfile_dir: path to the Stitch batch *file* used as template
    :param resultdir: output directory; must end with '/' because file names
        are appended by plain string concatenation
    :return: None (results are written to disk)
    :raises ValueError: if the mzTab file contains no line starting with "PSH"
    """
    # ---- spectrum metadata, in MGF order ----
    titles = []
    exp_mass_to_charge = []
    charge = []
    RT = []
    with open(mgf_file, 'r') as fr:
        for line in fr:
            if 'TITLE=' in line:
                titles.append(line.strip().split('TITLE=')[-1])
            elif 'PEPMASS=' in line:
                exp_mass_to_charge.append(float(line.strip().split('PEPMASS=')[-1]))
            elif 'CHARGE=' in line:
                # Keep only the leading digits of the charge field (e.g. "2+" -> 2).
                charge.append(int(re.match(r'\d+', line.strip().split('CHARGE=')[-1]).group()))
            elif 'RTINSECONDS=' in line:
                RT.append(line.strip().split('RTINSECONDS=')[-1])

    casanovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'

    # ---- locate the PSM header row of the mzTab file ----
    casanovo_header = None
    with open(casanovo_path) as f:
        for i, line in enumerate(f):
            if line.startswith("PSH"):
                casanovo_header = i
                break
    if casanovo_header is None:
        # Previously this surfaced later as an unrelated NameError.
        raise ValueError('No line starting with "PSH" found in ' + str(casanovo_path))

    df_deno = pd.read_csv(casanovo_path, sep="\t", header=casanovo_header)
    df_denovo = df_deno[df_deno['sequence'].notna()].copy()
    df_denovo['PSM_ID'] = df_denovo['PSM_ID'].astype(int)

    def _pick(values):
        # Indices past the end of the MGF-derived list map to None instead of raising.
        return lambda idx: values[idx] if idx < len(values) else None

    spectrum_index = df_denovo['PSM_ID']
    df_denovo['exp_mass_to_charge'] = spectrum_index.apply(_pick(exp_mass_to_charge))
    df_denovo['charge'] = spectrum_index.apply(_pick(charge))
    df_denovo['RT'] = spectrum_index.apply(_pick(RT))
    df_denovo['PSM_ID'] = spectrum_index.apply(_pick(titles))

    sequence = df_denovo['sequence'].tolist()
    PSM_ID = df_denovo['PSM_ID'].tolist()
    scan = PSM_ID
    # Linear rescale (s*100 + 100) / 2, i.e. -1..1 onto 0..100, truncated to int.
    Score = [int((i * 100 + 100)/2) for i in df_denovo['search_engine_score[1]'].tolist()]
    # Per-residue scores: comma separated fractions -> space separated integer percentages.
    aascore = [
        " ".join(str(int(float(value) * 100)) for value in raw.replace(',', ' ').split())
        for raw in df_denovo['opt_ms_run[1]_aa_scores'].tolist()
    ]
    df_denovo['aaScore'] = aascore

    mode = ['HCD' for x in PSM_ID]
    # Collapse every modified residue to the one-character key used in mass_AA.
    # 'Q(-17.03)' maps to 'p' (111.032 = Q - 17.03); it used to map to 'q',
    # which is the +0.98 form (129.0426) and gave a wrong theoretical mass.
    seq = [
        str(i).replace('M(+15.99)', 'm').replace('Q(+.98)', 'q').replace('N(+.98)', 'n')
              .replace('Q(-17.03)', 'p').replace(' ', '').replace('C(+57.02)', 'C')
        for i in sequence
    ]
    length = [len(x) for x in seq]
    Mass_peptide = [
        sum(mass_AA[aa] for aa in se) + mass_N_terminus + mass_C_terminus
        for se in seq
    ]

    # ppm must compare each prediction with ITS OWN spectrum.  The MGF lists
    # are in file order while the rows here are the filtered mzTab rows, so the
    # per-row m/z and charge columns are used instead of positional MGF lists.
    ppm = []
    for mz, z, theoretical in zip(df_denovo['exp_mass_to_charge'].tolist(),
                                  df_denovo['charge'].tolist(),
                                  Mass_peptide):
        if pd.isna(mz) or pd.isna(z):
            ppm.append(float('nan'))  # spectrum index was outside the MGF
            continue
        observed = mz * z - z * mass_H
        ppm.append(10**6 * (observed - theoretical) / theoretical)

    # ---- assemble the 20-column table expected by Stitch ----
    DF = np.empty((len(PSM_ID), 20),dtype='object')
    DF[:,1] = PSM_ID
    DF[:,3] = sequence
    DF[:,4] = scan
    DF[:,5] = length
    DF[:,6] = Score
    DF[:,7] = Score
    DF[:,8] = length
    DF[:,9] = df_denovo['exp_mass_to_charge']
    DF[:,10] = df_denovo['charge']
    DF[:,11] = df_denovo['RT']
    DF[:,12] = '-'
    DF[:,13] = 1
    DF[:,14] = Mass_peptide
    DF[:,15] = ppm
    DF[:,17] = aascore
    DF[:,18] = sequence
    DF[:,19] = mode
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    df = pd.DataFrame(DF, columns=name)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    df.to_csv(casanovo_out,index=False)

    # ---- batch file copy pointing at the CSV just written ----
    with open(batchfile_dir,'r') as fr:
        lines = fr.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as fw:
        for line in lines:
            if 'Path     : ' in line:
                fw.write(line.replace('../datasets/200305_HER_test_04_DENOVO.csv', casanovo_out))
            elif 'CutoffALC: ' in line:
                fw.write(line.replace('95', str(quality_cutoff_Stitch_local)))
            else:
                fw.write(line)

    completed = subprocess.run(
        ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile], stdout=subprocess.DEVNULL)
    if completed.returncode != 0:
        # Keep going with the remaining samples, but do not fail silently.
        print('Stitch exited with code ' + str(completed.returncode) + ' for ' + newfile)
    
# Sample directory name -> short label used inside that sample's MGF file name.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}
path = '../../../denovo/casanovo3/'
out_path = '../../../Stitch_Assembly/CasanovoV1/'
mgf_path = '../../../monoclonal_antibody/'
# Keep only the entries whose name contains none of the excluded markers.
mabs = [
    entry for entry in os.listdir(path)
    if not any(marker in entry for marker in ('131ab_EThcd', '.yaml', 'Herceptin', 'anti-FLAG-M2'))
]
for mab in mabs:
    print(mab)
    casanovo_path = f'{path}{mab}/casanovo_denovo.mztab'
    resultdir = f'{out_path}{mab}/'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    batchfile_dir = f'../../../stitch-v1.5.0-linux/batchfiles/{mab}.txt'
    process_casanovoV1_Stitch(casanovo_path, mgf_file, batchfile_dir, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
import re
# Label that appears in the output CSV file name built in process_casanovoV2_Stitch.
quality_cutoff_Stitch = 90
# Key into SocreAll; also the prefix of the output CSV file name.
tool = 'CasanovoV2'
# Per-tool threshold: written into the file name and substituted for '95' on
# the 'CutoffALC: ' line of the Stitch batch file.
quality_cutoff_Stitch_local = SocreAll[tool]
def process_casanovoV2_Stitch(casanovo_path,mgf_file,batchfile_dir, resultdir):
    """Convert a Casanovo mzTab result into a Stitch input CSV and run Stitch.

    Steps:
      1. Read TITLE / PEPMASS / CHARGE / RTINSECONDS for every spectrum of
         ``mgf_file`` (lists are in MGF order).
      2. Load the PSM table of ``casanovo_path`` (header = first line starting
         with "PSH"), drop rows without a sequence, and take the integer after
         'ms_run[1]:index=' in ``spectra_ref`` as the 0-based position of the
         spectrum in the MGF.
      3. Write the rows whose rescaled score is >= ``SocreAll[tool]`` to a CSV
         in ``resultdir``.
      4. Copy the batch file into ``resultdir`` with the input path and the
         CutoffALC value replaced, and run the Stitch binary on the copy.

    Relies on the module-level names ``tool``, ``quality_cutoff_Stitch``,
    ``quality_cutoff_Stitch_local``, ``SocreAll``, ``mass_AA``, ``mass_H``,
    ``mass_N_terminus`` and ``mass_C_terminus``.

    :param casanovo_path: path to the Casanovo mzTab file
    :param mgf_file: path to the MGF file the predictions were made from
    :param batchfile_dir: path to the Stitch batch *file* used as template
    :param resultdir: output directory; must end with '/' because file names
        are appended by plain string concatenation
    :return: None (results are written to disk)
    :raises ValueError: if the mzTab file contains no line starting with "PSH"
    """
    # ---- spectrum metadata, in MGF order ----
    titles = []
    exp_mass_to_charge = []
    charge = []
    RT = []
    with open(mgf_file, 'r') as fr:
        for line in fr:
            if 'TITLE=' in line:
                titles.append(line.strip().split('TITLE=')[-1])
            elif 'PEPMASS=' in line:
                exp_mass_to_charge.append(float(line.strip().split('PEPMASS=')[-1]))
            elif 'CHARGE=' in line:
                # Keep only the leading digits of the charge field (e.g. "2+" -> 2).
                charge.append(int(re.match(r'\d+', line.strip().split('CHARGE=')[-1]).group()))
            elif 'RTINSECONDS=' in line:
                RT.append(line.strip().split('RTINSECONDS=')[-1])

    casanovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'

    # ---- locate the PSM header row of the mzTab file ----
    casanovo_header = None
    with open(casanovo_path) as f:
        for i, line in enumerate(f):
            if line.startswith("PSH"):
                casanovo_header = i
                break
    if casanovo_header is None:
        # Previously this surfaced later as an unrelated NameError.
        raise ValueError('No line starting with "PSH" found in ' + str(casanovo_path))

    df_deno = pd.read_csv(casanovo_path, sep="\t", header=casanovo_header)
    df_denovo = df_deno[df_deno['sequence'].notna()].copy()
    # spectra_ref looks like 'ms_run[1]:index=<n>'; <n> is used as the MGF position.
    df_denovo['PSM_ID'] = [int(item.replace('ms_run[1]:index=', '')) for item in df_denovo['spectra_ref']]

    def _pick(values):
        # Indices past the end of the MGF-derived list map to None instead of raising.
        return lambda idx: values[idx] if idx < len(values) else None

    spectrum_index = df_denovo['PSM_ID']
    df_denovo['exp_mass_to_charge'] = spectrum_index.apply(_pick(exp_mass_to_charge))
    df_denovo['charge'] = spectrum_index.apply(_pick(charge))
    df_denovo['RT'] = spectrum_index.apply(_pick(RT))
    df_denovo['PSM_ID'] = spectrum_index.apply(_pick(titles))

    sequence = df_denovo['sequence'].tolist()
    PSM_ID = df_denovo['PSM_ID'].tolist()
    scan = PSM_ID
    # Linear rescale (s*100 + 100) / 2, i.e. -1..1 onto 0..100, truncated to int.
    Score = [int((i * 100 + 100)/2) for i in df_denovo['search_engine_score[1]'].tolist()]
    # Per-residue scores: comma separated fractions -> space separated integer percentages.
    aascore = [
        " ".join(str(int(float(value) * 100)) for value in raw.replace(',', ' ').split())
        for raw in df_denovo['opt_ms_run[1]_aa_scores'].tolist()
    ]
    df_denovo['aaScore'] = aascore

    mode = ['HCD' for x in PSM_ID]
    # Collapse every modified residue to the one-character key used in mass_AA.
    # 'Q(-17.03)' maps to 'p' (111.032 = Q - 17.03); it used to map to 'q',
    # which is the +0.98 form (129.0426) and gave a wrong theoretical mass.
    seq = [
        str(i).replace('M(+15.99)', 'm').replace('Q(+.98)', 'q').replace('N(+.98)', 'n')
              .replace('Q(-17.03)', 'p').replace(' ', '').replace('C(+57.02)', 'C')
        for i in sequence
    ]
    length = [len(x) for x in seq]
    Mass_peptide = [
        sum(mass_AA[aa] for aa in se) + mass_N_terminus + mass_C_terminus
        for se in seq
    ]

    # ppm must compare each prediction with ITS OWN spectrum.  The MGF lists
    # are in file order while the rows here are the filtered mzTab rows, so the
    # per-row m/z and charge columns are used instead of positional MGF lists.
    ppm = []
    for mz, z, theoretical in zip(df_denovo['exp_mass_to_charge'].tolist(),
                                  df_denovo['charge'].tolist(),
                                  Mass_peptide):
        if pd.isna(mz) or pd.isna(z):
            ppm.append(float('nan'))  # spectrum index was outside the MGF
            continue
        observed = mz * z - z * mass_H
        ppm.append(10**6 * (observed - theoretical) / theoretical)

    # ---- assemble the 20-column table expected by Stitch ----
    DF = np.empty((len(PSM_ID), 20),dtype='object')
    DF[:,1] = PSM_ID
    DF[:,3] = sequence
    DF[:,4] = scan
    DF[:,5] = length
    DF[:,6] = Score
    DF[:,7] = Score
    DF[:,8] = length
    DF[:,9] = df_denovo['exp_mass_to_charge']
    DF[:,10] = df_denovo['charge']
    DF[:,11] = df_denovo['RT']
    DF[:,12] = '-'
    DF[:,13] = 1
    DF[:,14] = Mass_peptide
    DF[:,15] = ppm
    DF[:,17] = aascore
    DF[:,18] = sequence
    DF[:,19] = mode
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    df = pd.DataFrame(DF, columns=name)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    df.to_csv(casanovo_out,index=False)

    # ---- batch file copy pointing at the CSV just written ----
    with open(batchfile_dir,'r') as fr:
        lines = fr.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as fw:
        for line in lines:
            if 'Path     : ' in line:
                fw.write(line.replace('../datasets/200305_HER_test_04_DENOVO.csv', casanovo_out))
            elif 'CutoffALC: ' in line:
                fw.write(line.replace('95', str(quality_cutoff_Stitch_local)))
            else:
                fw.write(line)

    completed = subprocess.run(
        ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile], stdout=subprocess.DEVNULL)
    if completed.returncode != 0:
        # Keep going with the remaining samples, but do not fail silently.
        print('Stitch exited with code ' + str(completed.returncode) + ' for ' + newfile)
    
# Sample directory name -> short label used inside that sample's MGF file name.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/casanovo4/'
out_path = '../../../Stitch_Assembly/CasanovoV2/'
mgf_path = '../../../monoclonal_antibody/'
# Keep only the entries whose name contains none of the excluded markers.
mabs = [
    entry for entry in os.listdir(path)
    if not any(marker in entry for marker in ('131ab_EThcd', '.yaml', 'Herceptin', 'anti-FLAG-M2'))
]
for mab in mabs:
    print(mab)
    casanovo_path = f'{path}{mab}/denovo.mztab'
    resultdir = f'{out_path}{mab}/'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    batchfile_dir = f'../../../stitch-v1.5.0-linux/batchfiles/{mab}.txt'
    process_casanovoV2_Stitch(casanovo_path, mgf_file, batchfile_dir, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Label that appears in the output CSV file name.
quality_cutoff_Stitch = 90
# Key into SocreAll; also the prefix of the output CSV file name.
tool = 'PepNet'
# Per-tool threshold: written into the file name and into the batch file's CutoffALC line.
quality_cutoff_Stitch_local = SocreAll[tool]

# Sample directory name -> short label used inside that sample's MGF file name.
Names = {
 '50ugmAb1':'mAb1',
 '100ugmAb2':'mAb2',
 '200ugmAb3':'mAb3',
 '20230210-mAb4':'mAb4',
 '20230707-mAb5':'mAb5',
 '20231203-mAb6':'mAb6',
 '20231221-mAb7':'mAb7',
 '20250415-mAb8':'mAb8'}

path = '../../../denovo/pepnet/'
out_path = '../../../Stitch_Assembly/PepNet/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
mabs = [mab for mab in mabs if '131ab_EThcd' not in mab and '.yaml' not in mab and 'Herceptin' not in mab and 'anti-FLAG-M2' not in mab and 'results' not in mab]
for mab in mabs:
    print(mab)
    mgf_file = mgf_path + mab +'/process1/'+'spectrum_'+ Names[mab] +'_HCD.mgf'
    resultdir= out_path + mab +'/'
    pepnet_path = path + mab +'/'+'denovo.tsv'
    pepnet_df = pd.read_csv(pepnet_path, sep='\t')
    pepnet_title = pepnet_df['TITLE'].tolist()
    pepnet_df['Spectrum Name']=pepnet_title

    # Spectrum titles come from the id table, not from the MGF itself.
    # NOTE(review): this assumes the table lists spectra in the same order as
    # the MGF file parsed below -- confirm.
    dat = pd.read_csv(mgf_path + mab +'/process1/'+'spectrum_id_HCD_old.csv')
    titles = list(dat['TITLE'])

    # One (m/z, charge, RT) triple per spectrum, recorded when the
    # RTINSECONDS line of that spectrum is reached.
    exp_mass_to_charge = []
    charge = []
    RT = []
    with open(mgf_file,'r') as fr:
        for line in fr:
            if 'PEPMASS=' in line:
                mz = line.strip().split('PEPMASS=')[-1]
            elif 'CHARGE=' in line:
                z = line.strip().split('CHARGE=')[-1]
            elif 'RTINSECONDS=' in line:
                rt = line.strip().split('RTINSECONDS=')[-1]
                exp_mass_to_charge.append(mz)
                charge.append(z)
                RT.append(float(rt))

    exp_mass_to_charge = [float(i) for i in exp_mass_to_charge]
    # Keep only the leading digits of the charge field (e.g. "2+" -> 2).
    charge = [int(re.match(r'\d+', i).group()) for i in charge]

    # Position of every predicted spectrum in the title list.  A dict lookup
    # replaces repeated list.index() scans; setdefault keeps the first
    # occurrence, which is what list.index() returned.
    title_to_index = {}
    for position, title in enumerate(titles):
        title_to_index.setdefault(title, position)
    loc = [title_to_index[i] for i in pepnet_df['Spectrum Name']]

    # Re-order the per-spectrum values to match the prediction rows.  Plain
    # lists cannot be indexed with a list (the previous `values[loc]` raised
    # TypeError), so each position is picked explicitly.
    exp_mass_to_charge = [exp_mass_to_charge[i] for i in loc]
    charge = [charge[i] for i in loc]
    RT = [RT[i] for i in loc]

    pepnet_peptide = pepnet_df['DENOVO'].tolist()
    Mass_peptide = [
        sum(mass_AA[aa] for aa in se) + mass_N_terminus + mass_C_terminus
        for se in pepnet_peptide
    ]

    Mass_exp = [mz_i * z_i - z_i * mass_H for mz_i, z_i in zip(exp_mass_to_charge, charge)]
    ppm = [
        10**6 * (Mass_exp[i] - Mass_peptide[i]) / Mass_peptide[i]
        for i in range(len(Mass_peptide))
    ]
    length = [len(x) for x in pepnet_peptide]
    # Expand the one-letter modification codes into the bracketed notation written to the CSV.
    pepnet_df['PepNet Peptide']=[peptide.replace("m", "M(+15.99)").replace("q", "Q(+.98)").replace("n", "N(+.98)").replace("C", "C(+57.02)") for peptide in pepnet_peptide]
    pepnet_score = [int(i*100) for i in pepnet_df['Score'].tolist()]
    # '[0.9, 0.8, ...]' -> '90 80 ...'
    aascore = pepnet_df['Positional Score'].tolist()
    aascore= [i.replace('[', '').replace(']', '').replace(',', ' ') for i in aascore]
    aascore = [i.split() for i in aascore]
    aascore = [[str(int(float(j)*100)) for j in i] for i in aascore]
    pepnet_df['PepNet aaScore'] = [" ".join(i) for i in aascore]

    # ---- assemble the 20-column table expected by Stitch ----
    DF = np.empty((len(pepnet_df), 20),dtype='object')
    DF[:,1] = pepnet_df['Spectrum Name']
    DF[:,3] = pepnet_df['PepNet Peptide']
    DF[:,4] = pepnet_df['Spectrum Name']
    DF[:,5] = length
    DF[:,6] = pepnet_score
    DF[:,7] = pepnet_score
    DF[:,8] = length
    DF[:,9] = exp_mass_to_charge
    DF[:,10] = charge
    DF[:,11] = RT
    DF[:,12] = '-'
    DF[:,13] = 1
    DF[:,14] = Mass_peptide
    DF[:,15] = ppm
    DF[:,17] = pepnet_df['PepNet aaScore']
    DF[:,18] = pepnet_df['PepNet Peptide']
    DF[:,19] = 'HCD'
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    df = pd.DataFrame(DF, columns=name)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    pepnet_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'
    df.to_csv(pepnet_out,index=False)

    # ---- batch file copy pointing at the CSV just written ----
    batchfile_dir = '../../../stitch-v1.5.0-linux/batchfiles/' + mab + '.txt'
    with open(batchfile_dir,'r') as fr:
        lines = fr.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as fw:
        for line in lines:
            if 'Path     : ' in line:
                fw.write(line.replace('../datasets/200305_HER_test_04_DENOVO.csv', pepnet_out))
            elif 'CutoffALC: ' in line:
                fw.write(line.replace('95', str(quality_cutoff_Stitch_local)))
            else:
                fw.write(line)

    completed = subprocess.run(
        ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile], stdout=subprocess.DEVNULL)
    if completed.returncode != 0:
        # Keep going with the remaining samples, but do not fail silently.
        print('Stitch exited with code ' + str(completed.returncode) + ' for ' + newfile)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Label that appears in the output CSV file name.
quality_cutoff_Stitch = 90
# Key into SocreAll; also the prefix of the output CSV file name.
tool = 'pNovo3'
# Per-tool threshold: written into the file name and into the batch file's CutoffALC line.
quality_cutoff_Stitch_local = SocreAll[tool]

# Sample directory name -> short label used inside that sample's MGF file name.
Names = {
 '50ugmAb1':'mAb1',
 '100ugmAb2':'mAb2',
 '200ugmAb3':'mAb3',
 '20230210-mAb4':'mAb4',
 '20230707-mAb5':'mAb5',
 '20231203-mAb6':'mAb6',
 '20231221-mAb7':'mAb7',
 '20250415-mAb8':'mAb8'}

path = '../../../denovo/pNovo3/'
out_path = '../../../Stitch_Assembly/pNovo3/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
mabs = [mab for mab in mabs if '131ab_EThcd' not in mab and '.yaml' not in mab and 'Herceptin' not in mab and 'anti-FLAG-M2' not in mab]
for mab in mabs:
    print(mab)
    resultdir = out_path + mab +'/'
    pnovo_path = path + mab + '/HCD/results.res'
    mgf_file = mgf_path + mab +'/process1/'+'spectrum_'+ Names[mab] +'_HCD.mgf'
    # Tab-separated, no header; columns 0, 1, 4 and 5 are the ones used.
    # "\t" is spelled out so the separator survives editors that expand tabs.
    pnovo_raw = pd.read_csv(pnovo_path, sep="\t", header=None)
    pnovo_raw = pnovo_raw[[0, 1, 4, 5]]
    pnovo_raw.columns = ['Spectrum Name', 'pNovo3 Peptide', 'pNovo3 Score', 'pNovo3 aaScore']

    # Keep only rows that carry exactly one per-residue score per peptide character.
    keep = [
        len(p) == len(a.split(','))
        for p, a in zip(pnovo_raw['pNovo3 Peptide'], pnovo_raw['pNovo3 aaScore'])
    ]
    pnovo_df = pnovo_raw[keep].copy()
    # Rewrite the single-letter codes into the bracketed notation written to the CSV.
    pnovo_peptide = [str(i).replace('I','L').replace('a', 'N(+.98)').replace('b', 'Q(+.98)').replace('B', 'Q(+.98)').replace('c', 'M(+15.99)').replace('C','C(+57.02)') for i in pnovo_df['pNovo3 Peptide'].tolist()]
    pnovo_df['pNovo3 Peptide']=pnovo_peptide
    # One character per residue (keys of mass_AA) for length and mass computation.
    seq = [i.replace('N(+.98)','n').replace('Q(+.98)','q').replace('M(+15.99)','m').replace('C(+57.02)','C') for i in pnovo_df['pNovo3 Peptide']]
    pnovo_df['pNovo3 Score'] = [int(i) for i in pnovo_df['pNovo3 Score'].tolist()]
    # '90.1,80.7,...' -> '90 80 ...'
    pnovo_df['pNovo3 aaScore'] = [
        " ".join(str(int(float(j))) for j in raw.replace(',', ' ').split())
        for raw in pnovo_df['pNovo3 aaScore'].tolist()
    ]

    # Titles are collected per TITLE line; (m/z, charge, RT) are recorded when
    # the RTINSECONDS line of a spectrum is reached.
    titles = []
    exp_mass_to_charge = []
    charge = []
    RT = []
    with open(mgf_file,'r') as fr:
        for line in fr:
            if 'TITLE=' in line:
                titles.append(line.strip().split('TITLE=')[-1])
            elif 'PEPMASS=' in line:
                mz = line.strip().split('PEPMASS=')[-1]
            elif 'CHARGE=' in line:
                z = line.strip().split('CHARGE=')[-1]
            elif 'RTINSECONDS=' in line:
                rt = line.strip().split('RTINSECONDS=')[-1]
                exp_mass_to_charge.append(mz)
                charge.append(z)
                RT.append(float(rt))

    exp_mass_to_charge = [float(i) for i in exp_mass_to_charge]
    # Keep only the leading digits of the charge field (e.g. "2+" -> 2).
    charge = [int(re.match(r'\d+', i).group()) for i in charge]

    # Position of every predicted spectrum in the MGF.  A dict lookup replaces
    # repeated list.index() scans; setdefault keeps the first occurrence,
    # which is what list.index() returned.
    title_to_index = {}
    for position, title in enumerate(titles):
        title_to_index.setdefault(title, position)
    loc = [title_to_index[i] for i in pnovo_df['Spectrum Name']]
    exp_mass_to_charge = [exp_mass_to_charge[i] for i in loc]
    charge = [charge[i] for i in loc]
    RT = [RT[i] for i in loc]

    Mass_peptide = [
        sum(mass_AA[aa] for aa in se) + mass_N_terminus + mass_C_terminus
        for se in seq
    ]
    Mass_exp = [mz_i * z_i - z_i * mass_H for mz_i, z_i in zip(exp_mass_to_charge, charge)]
    ppm = [
        10**6 * (Mass_exp[i] - Mass_peptide[i]) / Mass_peptide[i]
        for i in range(len(Mass_peptide))
    ]
    length = [len(x) for x in seq]

    # ---- assemble the 20-column table expected by Stitch ----
    DF = np.empty((len(pnovo_df['Spectrum Name']), 20),dtype='object')
    DF[:,1] = pnovo_df['Spectrum Name']
    DF[:,3] = pnovo_df['pNovo3 Peptide']
    DF[:,4] = pnovo_df['Spectrum Name']
    DF[:,5] = length
    DF[:,6] = pnovo_df['pNovo3 Score']
    DF[:,7] = pnovo_df['pNovo3 Score']
    DF[:,8] = length
    DF[:,9] = exp_mass_to_charge
    DF[:,10] = charge
    DF[:,11] = RT
    DF[:,12] = '-'
    DF[:,13] = 1
    DF[:,14] = Mass_peptide
    DF[:,15] = ppm
    DF[:,17] = pnovo_df['pNovo3 aaScore']
    DF[:,18] = pnovo_df['pNovo3 Peptide']
    DF[:,19] = 'HCD'
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    df = pd.DataFrame(DF, columns=name)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    pnovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'
    df.to_csv(pnovo_out,index=False)

    # ---- batch file copy pointing at the CSV just written ----
    batchfile_dir = '../../../stitch-v1.5.0-linux/batchfiles/' + mab + '.txt'
    with open(batchfile_dir,'r') as fr:
        lines = fr.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as fw:
        for line in lines:
            if 'Path     : ' in line:
                fw.write(line.replace('../datasets/200305_HER_test_04_DENOVO.csv', pnovo_out))
            elif 'CutoffALC: ' in line:
                fw.write(line.replace('95', str(quality_cutoff_Stitch_local)))
            else:
                fw.write(line)

    completed = subprocess.run(
        ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile], stdout=subprocess.DEVNULL)
    if completed.returncode != 0:
        # Keep going with the remaining samples, but do not fail silently.
        print('Stitch exited with code ' + str(completed.returncode) + ' for ' + newfile)


In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Label that appears in the output CSV file name built in process_pinovo_Stitch.
quality_cutoff_Stitch = 90
# Key into SocreAll; also the prefix of the output CSV file name.
tool = 'pi-HelixNovo'
# Per-tool threshold: written into the file name and substituted for '95' on
# the 'CutoffALC: ' line of the Stitch batch file.
quality_cutoff_Stitch_local = SocreAll[tool]

def process_pinovo_Stitch(pinovo_path,mgf_file,batchfile_dir,resultdir,mab):
    """Convert a pi-HelixNovo result file into a Stitch input CSV and run Stitch.

    The result file is read as a header-less, tab-separated table with the
    columns Title, sequence and Score (rows with missing values are dropped).
    pi-HelixNovo gives no per-residue scores here, so the peptide score is
    repeated once per residue for the 'local confidence (%)' column.

    Relies on the module-level names ``tool``, ``quality_cutoff_Stitch``,
    ``quality_cutoff_Stitch_local``, ``SocreAll``, ``mass_AA``, ``mass_H``,
    ``mass_N_terminus`` and ``mass_C_terminus``.

    :param pinovo_path: path to the pi-HelixNovo result file
    :param mgf_file: path to the MGF file; ``spectrum_id_HCD_old.csv`` is read
        from the same directory
    :param batchfile_dir: path to the Stitch batch *file* used as template
    :param resultdir: output directory; must end with '/' because file names
        are appended by plain string concatenation
    :param mab: sample name; kept for backward compatibility with existing
        callers (all paths are now taken from the other arguments)
    :return: None (results are written to disk)
    """
    import os  # local import so the function does not depend on another cell

    pinovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'
    pinovo_df = pd.read_csv(pinovo_path, sep="\t", header=None)
    pinovo_df = pinovo_df.dropna()
    pinovo_df.columns = ["Title", "sequence", "Score"]
    pinovo_peptide = pinovo_df['sequence'].tolist()
    # One character per residue (keys of mass_AA) for length and mass computation.
    seq = [str(i).replace('M(+15.99)', 'm').replace('Q(+.98)', 'q').replace('N(+.98)', 'n').replace(' ',
                                        '').replace('C(+57.02)', 'C') for i in pinovo_peptide]
    # Scale the peptide score by 100 and truncate to an integer.
    scores = [int(i * 100) for i in pinovo_df['Score']]
    aa_scores = [' '.join([str(score)] * len(peptide)) for score, peptide in zip(scores, seq)]
    pinovo_df = pd.DataFrame({
        'Spectrum Name': pinovo_df['Title'].tolist(),
        'pi-HelixNovo Peptide': pinovo_peptide,
        'pi-HelixNovo Score': scores,
        'pi-HelixNovo aaScore': aa_scores,
    })

    # The id table sits next to the MGF file; deriving its location from
    # mgf_file removes the dependency on a global `mgf_path` that is only
    # defined further down in the notebook cell.
    # NOTE(review): this assumes the table lists spectra in the same order as
    # the MGF file parsed below -- confirm.
    dat = pd.read_csv(os.path.join(os.path.dirname(mgf_file), 'spectrum_id_HCD_old.csv'))
    titles = list(dat['TITLE'])

    # One (m/z, charge, RT) triple per spectrum, recorded when the
    # RTINSECONDS line of that spectrum is reached.
    exp_mass_to_charge = []
    charge = []
    RT = []
    with open(mgf_file,'r') as fr:
        for line in fr:
            if 'PEPMASS=' in line:
                mz = line.strip().split('PEPMASS=')[-1]
            elif 'CHARGE=' in line:
                z = line.strip().split('CHARGE=')[-1]
            elif 'RTINSECONDS=' in line:
                rt = line.strip().split('RTINSECONDS=')[-1]
                exp_mass_to_charge.append(mz)
                charge.append(z)
                RT.append(float(rt))

    exp_mass_to_charge = [float(i) for i in exp_mass_to_charge]
    # Keep only the leading digits of the charge field (e.g. "2+" -> 2).
    charge = [int(re.match(r'\d+', i).group()) for i in charge]

    # Position of every predicted spectrum in the title list.  A dict lookup
    # replaces repeated list.index() scans; setdefault keeps the first
    # occurrence, which is what list.index() returned.
    title_to_index = {}
    for position, title in enumerate(titles):
        title_to_index.setdefault(title, position)
    loc = [title_to_index[i] for i in pinovo_df['Spectrum Name']]
    exp_mass_to_charge = [exp_mass_to_charge[i] for i in loc]
    charge = [charge[i] for i in loc]
    RT = [RT[i] for i in loc]

    Mass_peptide = [
        sum(mass_AA[aa] for aa in se) + mass_N_terminus + mass_C_terminus
        for se in seq
    ]
    Mass_exp = [mz_i * z_i - z_i * mass_H for mz_i, z_i in zip(exp_mass_to_charge, charge)]
    ppm = [
        10**6 * (Mass_exp[i] - Mass_peptide[i]) / Mass_peptide[i]
        for i in range(len(Mass_peptide))
    ]
    length = [len(x) for x in seq]

    # ---- assemble the 20-column table expected by Stitch ----
    DF = np.empty((len(pinovo_df['Spectrum Name']), 20),dtype='object')
    DF[:,1] = pinovo_df['Spectrum Name']
    DF[:,3] = pinovo_df['pi-HelixNovo Peptide']
    DF[:,4] = pinovo_df['Spectrum Name']
    DF[:,5] = length
    DF[:,6] = pinovo_df['pi-HelixNovo Score']
    DF[:,7] = pinovo_df['pi-HelixNovo Score']
    DF[:,8] = length
    DF[:,9] = exp_mass_to_charge
    DF[:,10] = charge
    DF[:,11] = RT
    DF[:,12] = '-'
    DF[:,13] = 1
    DF[:,14] = Mass_peptide
    DF[:,15] = ppm
    DF[:,17] = pinovo_df['pi-HelixNovo aaScore']
    DF[:,18] = pinovo_df['pi-HelixNovo Peptide']
    DF[:,19] = 'HCD'
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    df = pd.DataFrame(DF, columns=name)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    df.to_csv(pinovo_out,index=False)

    # ---- batch file copy pointing at the CSV just written ----
    # The template passed by the caller is used; it used to be silently
    # replaced by a path rebuilt from `mab`.
    with open(batchfile_dir,'r') as fr:
        lines = fr.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as fw:
        for line in lines:
            if 'Path     : ' in line:
                fw.write(line.replace('../datasets/200305_HER_test_04_DENOVO.csv', pinovo_out))
            elif 'CutoffALC: ' in line:
                fw.write(line.replace('95', str(quality_cutoff_Stitch_local)))
            else:
                fw.write(line)

    completed = subprocess.run(
        ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile], stdout=subprocess.DEVNULL)
    if completed.returncode != 0:
        # Keep going with the remaining samples, but do not fail silently.
        print('Stitch exited with code ' + str(completed.returncode) + ' for ' + newfile)

# Sample directory name -> short label used inside that sample's MGF file name.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/pi-helixnovo/'
out_path = '../../../Stitch_Assembly/pi-HelixNovo/'
mgf_path = '../../../monoclonal_antibody/'
# Keep only the entries whose name contains none of the excluded markers.
mabs = [
    entry for entry in os.listdir(path)
    if not any(marker in entry for marker in ('131ab_EThcd', '.yaml', 'Herceptin', 'anti-FLAG-M2'))
]
for mab in mabs:
    print(mab)
    pinovo_path = f'{path}{mab}/denovo_denovo.txt'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    batchfile_dir = f'../../../stitch-v1.5.0-linux/batchfiles/{mab}.txt'
    resultdir = f'{out_path}{mab}/'
    process_pinovo_Stitch(pinovo_path, mgf_file, batchfile_dir, resultdir, mab)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Label that appears in the output CSV file name built in process_primenovo_Stitch.
quality_cutoff_Stitch = 90
# Key into SocreAll; also the prefix of the output CSV file name.
tool = 'pi-PrimeNovo'
# Per-tool threshold looked up from SocreAll; also written into the output file name.
quality_cutoff_Stitch_local = SocreAll[tool]

def process_primenovo_Stitch(primenovo_path,mgf_file,batchfile_dir,resultdir,mab):
    """Convert pi-PrimeNovo predictions into a Stitch input CSV and run Stitch on it.

    The function reads these notebook globals, which must have been defined by
    earlier cells: ``tool``, ``quality_cutoff_Stitch``,
    ``quality_cutoff_Stitch_local``, ``SocreAll``, ``mgf_path``, ``mass_AA``,
    ``mass_H``, ``mass_N_terminus`` and ``mass_C_terminus``.

    :param primenovo_path: tab-separated pi-PrimeNovo result file; the columns
        'prediction', 'score' and 'label' are read
    :param mgf_file: MGF file from which PEPMASS, CHARGE and RTINSECONDS are parsed
    :param batchfile_dir: ignored -- the template path is always rebuilt from
        ``mab`` (kept in the signature so existing callers keep working)
    :param resultdir: output directory prefix; it is concatenated as a string,
        so it has to end with '/'
    :param mab: sample directory name, used to locate
        ``spectrum_id_HCD_old.csv`` and the Stitch batch-file template
    :return: None. Writes the filtered CSV and a patched batch file into
        ``resultdir`` and then launches ``stitch.bin run`` on that batch file.
    :raises ValueError: if a 'label' value does not occur in the TITLE column
        of ``spectrum_id_HCD_old.csv``
    """
    def _substitute(text, pairs):
        # Apply the replacements strictly in the given order; later patterns
        # depend on earlier ones having been rewritten already.
        for old, new in pairs:
            text = text.replace(old, new)
        return text

    # Standalone modification tokens that are removed from the sequence.
    strip_standalone_mods = tuple((token, '') for token in (
        '[+43.006-17.027]-', '+[43.006-17.027]', '-[17.027]', '[17.027]-',
        '[-17.027]-', '[+43.006]-', '+[43.006]', '[+42.011]-', '+[42.011]',
    ))
    # One character per residue, using the lower-case keys of mass_AA.
    to_residue_letters = (
        ('M[+15.995]', 'm'), ('Q[+0.984]', 'q'), ('N[+0.984]', 'n'), (' ', ''),
        ('C[+57.021]', 'C'), ('[-17.027]-Q', 'p'), ('-[17.027]-Q', 'p'),
    ) + strip_standalone_mods
    # Parenthesised mass-shift notation written to the 'Peptide' column.
    to_output_notation = (
        ('M[+15.995]', 'M(+15.99)'), ('Q[+0.984]', 'Q(+.98)'), ('N[+0.984]', 'N(+.98)'), (' ', ''),
        ('C[+57.021]', 'C(+57.02)'), ('[-17.027]-Q', 'Q(-17.03)'), ('-[17.027]-Q', 'Q(-17.03)'),
    ) + strip_standalone_mods

    primenovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'
    primenovo_df = pd.read_table(primenovo_path)
    primenovo_peptide = primenovo_df['prediction'].tolist()

    seq = [_substitute(str(i), to_residue_letters) for i in primenovo_peptide]
    peptides = [_substitute(str(i), to_output_notation) for i in primenovo_peptide]
    scores = [int(i * 100) for i in primenovo_df['score']]
    # pi-PrimeNovo gives one score per peptide; repeat it once per residue.
    aa_scores = [' '.join([str(s)] * len(p)) for s, p in zip(scores, seq)]
    spectrum_names = primenovo_df['label'].tolist()

    # Relies on the notebook-level `mgf_path`, not on `mgf_file`.
    dat = pd.read_csv(mgf_path + mab +'/process1/'+'spectrum_id_HCD_old.csv')
    titles = list(dat['TITLE'])

    # A spectrum is recorded when its RTINSECONDS line is reached, using the
    # most recently seen PEPMASS and CHARGE values.
    exp_mass_to_charge = []
    charge = []
    RT = []
    with open(mgf_file,'r') as fr:
        for line in fr:
            if 'PEPMASS=' in line:
                mz = line.strip().split('PEPMASS=')[-1]
            elif 'CHARGE=' in line:
                z = line.strip().split('CHARGE=')[-1]
            elif 'RTINSECONDS=' in line:
                rt = line.strip().split('RTINSECONDS=')[-1]
                exp_mass_to_charge.append(mz)
                charge.append(z)
                RT.append(float(rt))

    exp_mass_to_charge = [float(i) for i in exp_mass_to_charge]
    charge = [int(re.match(r'\d+', i).group()) for i in charge]

    # Titles and MGF spectra are matched by position, so differing counts mean
    # m/z, charge and RT may be attached to the wrong spectrum.
    if len(titles) != len(exp_mass_to_charge):
        print('error: %d titles but %d spectra parsed from %s'
              % (len(titles), len(exp_mass_to_charge), mgf_file))

    # First occurrence wins, matching list.index(), but with O(1) lookups.
    first_index = {}
    for position, title in enumerate(titles):
        first_index.setdefault(title, position)
    missing = [name for name in spectrum_names if name not in first_index]
    if missing:
        raise ValueError('%d spectrum label(s) not found in TITLE list, e.g. %r'
                         % (len(missing), missing[0]))
    loc = [first_index[name] for name in spectrum_names]

    exp_mass_to_charge = [exp_mass_to_charge[k] for k in loc]
    charge = [charge[k] for k in loc]
    RT = [RT[k] for k in loc]

    # NOTE(review): the mass is computed from the sequence with the standalone
    # modification tokens removed, so their mass shifts are not part of
    # 'Mass' or 'ppm' -- confirm this is intended.
    Mass_peptide = [
        sum(mass_AA[aa] for aa in se) + mass_N_terminus + mass_C_terminus
        for se in seq
    ]
    Mass_exp = [mz_i * z_i - z_i * mass_H for mz_i, z_i in zip(exp_mass_to_charge, charge)]
    ppm = [
        10**6 * (m_exp - m_pep) / m_pep
        for m_exp, m_pep in zip(Mass_exp, Mass_peptide)
    ]
    length = [len(x) for x in seq]

    # Columns 0, 2 and 16 are intentionally left unassigned (None -> empty CSV cell).
    DF = np.empty((len(spectrum_names), 20),dtype='object')
    DF[:,1] = spectrum_names
    DF[:,3] = peptides
    DF[:,4] = spectrum_names
    DF[:,5] = length
    DF[:,6] = scores
    DF[:,7] = scores
    DF[:,8] = length
    DF[:,9] = exp_mass_to_charge
    DF[:,10] = charge
    DF[:,11] = RT
    DF[:,12] = '-'
    DF[:,13] = 1
    DF[:,14] = Mass_peptide
    DF[:,15] = ppm
    DF[:,17] = aa_scores
    DF[:,18] = peptides
    DF[:,19] = 'HCD'
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    df = pd.DataFrame(DF, columns=name)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    df.to_csv(primenovo_out,index=False)

    # The caller-supplied value is deliberately overridden: the existing
    # driver cell does not build a per-sample path before calling.
    batchfile_dir = '../../../stitch-v1.5.0-linux/batchfiles/' + mab + '.txt'
    with open(batchfile_dir,'r') as fr:
        lines = fr.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as fw:
        for line in lines:
            if 'Path     : ' in line:
                # Point the template at the CSV written above.
                fw.write(line.replace('../datasets/200305_HER_test_04_DENOVO.csv', primenovo_out))
            elif 'CutoffALC: ' in line:
                # The template ships with a cutoff of 95.
                fw.write(line.replace('95', str(quality_cutoff_Stitch_local)))
            else:
                fw.write(line)

    subprocess.run(
    ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile], stdout=subprocess.DEVNULL)

# Sample directory name -> antibody label embedded in the MGF file name.
Names = {
 '50ugmAb1':'mAb1',
 '100ugmAb2':'mAb2',
 '200ugmAb3':'mAb3',
 '20230210-mAb4':'mAb4',
 '20230707-mAb5':'mAb5',
 '20231203-mAb6':'mAb6',
 '20231221-mAb7':'mAb7',
 '20250415-mAb8':'mAb8'}

path = '../../../denovo/pi-primenovo/'
out_path = '../../../Stitch_Assembly/pi-PrimeNovo/'
mgf_path = '../../../monoclonal_antibody/'
# Skip config files and the samples whose names contain an excluded substring.
mabs = os.listdir(path)
mabs = [mab for mab in mabs if '131ab_EThcd' not in mab and '.yaml' not in mab and 'Herceptin' not in mab and 'anti-FLAG-M2' not in mab]
for mab in mabs:
    print(mab)
    primenovo_path = path + mab +'/'+'denovo.tsv'
    mgf_file = mgf_path + mab +'/process1/'+'spectrum_'+ Names[mab] +'_HCD.mgf'
    # Built per sample so this cell does not depend on a value left behind by
    # another cell's loop (NameError on a fresh kernel, stale path otherwise).
    batchfile_dir = '../../../stitch-v1.5.0-linux/batchfiles/' + mab + '.txt'
    resultdir = out_path + mab +'/'
    process_primenovo_Stitch(primenovo_path, mgf_file,batchfile_dir,resultdir,mab)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Nominal cutoff; in this cell it only appears in the output CSV file name.
quality_cutoff_Stitch = 90
# Key into SocreAll and prefix of the output file name; process_adanovo_Stitch reads it as a global.
tool = 'AdaNovo'
# Tool-specific threshold from SocreAll; used in the file name and written into the batch file's CutoffALC line.
quality_cutoff_Stitch_local = SocreAll[tool]

def process_adanovo_Stitch(adanovo_path, mgf_file, batchfile_dir, resultdir):
    """Convert an AdaNovo mzTab result into a Stitch input CSV and run Stitch on it.

    The function reads these notebook globals, which must have been defined by
    earlier cells: ``tool``, ``quality_cutoff_Stitch``,
    ``quality_cutoff_Stitch_local``, ``SocreAll``, ``mass_AA``, ``mass_H``,
    ``mass_N_terminus`` and ``mass_C_terminus``.

    :param adanovo_path: mzTab file; the table starting at the first line that
        begins with 'PSH' is read
    :param mgf_file: MGF file from which TITLE, PEPMASS, CHARGE and
        RTINSECONDS are collected
    :param batchfile_dir: path of the Stitch batch-file template
    :param resultdir: output directory prefix; it is concatenated as a string,
        so it has to end with '/'
    :return: None. Writes the filtered CSV and a patched batch file into
        ``resultdir`` and then launches ``stitch.bin run`` on that batch file.
    :raises ValueError: if the mzTab file contains no line starting with 'PSH'
    """
    # Each field is collected independently, so the four lists only line up if
    # every spectrum in the MGF carries all four entries.
    titles=[]
    exp_mass_to_charge = []
    charge = []
    RT = []
    with open(mgf_file,'r') as fr:
        for line in fr:
            if 'TITLE=' in line:
                titles.append( line.strip().split('TITLE=')[-1] )
            elif 'PEPMASS=' in line:
                exp_mass_to_charge.append(line.strip().split('PEPMASS=')[-1])
            elif 'CHARGE=' in line:
                charge.append(line.strip().split('CHARGE=')[-1])
            elif 'RTINSECONDS=' in line:
                RT.append(line.strip().split('RTINSECONDS=')[-1])

    exp_mass_to_charge = [float(i) for i in exp_mass_to_charge]
    charge = [int(re.match(r'\d+', i).group()) for i in charge]

    adanovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'

    # Locate the PSM table header inside the mzTab file.
    adanovo_header = None
    with open(adanovo_path) as f:
        for i, line in enumerate(f):
            if line.startswith("PSH"):
                adanovo_header = i
                break
    if adanovo_header is None:
        raise ValueError('no line starting with "PSH" found in ' + str(adanovo_path))

    # Load the data into a DataFrame
    df_deno = pd.read_csv(adanovo_path, sep="\t", header=adanovo_header)
    df_denovo = df_deno[df_deno['sequence'].notna()].copy()

    # spectra_ref holds the position of the spectrum in the MGF; use it to
    # attach m/z, charge, RT and title to every PSM.
    df_denovo['PSM_ID'] = [int(item.replace('ms_run[1]:index=', '')) for item in df_denovo['spectra_ref']]
    df_denovo['exp_mass_to_charge'] = df_denovo['PSM_ID'].apply(lambda x: exp_mass_to_charge[x] if x < len(exp_mass_to_charge) else None)
    df_denovo['charge'] = df_denovo['PSM_ID'].apply(lambda x: charge[x] if x < len(charge) else None)
    df_denovo['RT'] = df_denovo['PSM_ID'].apply(lambda x: RT[x] if x < len(RT) else None)
    df_denovo['PSM_ID'] = df_denovo['PSM_ID'].apply(lambda x: titles[x] if x < len(titles) else None)
    PSM_ID = df_denovo['PSM_ID'].tolist()
    sequence = df_denovo['sequence'].tolist()
    # Rescale the search-engine score from [-1, 1] to [0, 100].
    Score = [int((i * 100 + 100)/2) for i in df_denovo['search_engine_score[1]'].tolist()]
    # Per-residue scores: comma-separated fractions -> space-separated integer percentages.
    aascore = [
        " ".join(str(int(float(tok) * 100)) for tok in raw.replace(',', ' ').split())
        for raw in df_denovo['opt_ms_run[1]_aa_scores'].tolist()
    ]
    df_denovo['aaScore'] = aascore

    # One character per residue for the mass lookup. 'Q(-17.03)' maps to 'p'
    # (111.032 in mass_AA, i.e. Q minus ammonia); 'q' is the +0.98 form of Q.
    seq = [str(i).replace('M(+15.99)', 'm').replace('Q(+.98)', 'q').replace('N(+.98)', 'n').replace('Q(-17.03)','p').replace(' ','').replace('C(+57.02)', 'C') for i in sequence]
    length = [len(x) for x in seq]
    Mass_peptide = [
        sum(mass_AA[aa] for aa in se) + mass_N_terminus + mass_C_terminus
        for se in seq
    ]

    # Use the per-PSM m/z and charge (not the raw MGF lists) so that every
    # peptide mass is compared with the spectrum it was predicted from.
    psm_mz = df_denovo['exp_mass_to_charge'].tolist()
    psm_charge = df_denovo['charge'].tolist()
    Mass_exp = [
        None if (pd.isna(mz_i) or pd.isna(z_i)) else mz_i * z_i - z_i * mass_H
        for mz_i, z_i in zip(psm_mz, psm_charge)
    ]
    ppm = [
        None if m_exp is None else 10**6 * (m_exp - m_pep) / m_pep
        for m_exp, m_pep in zip(Mass_exp, Mass_peptide)
    ]

    # Columns 0, 2 and 16 are intentionally left unassigned (None -> empty CSV cell).
    DF = np.empty((len(PSM_ID), 20),dtype='object')
    DF[:,1] = PSM_ID
    DF[:,3] = sequence
    DF[:,4] = PSM_ID
    DF[:,5] = length
    DF[:,6] = Score
    DF[:,7] = Score
    DF[:,8] = length
    DF[:,9] = df_denovo['exp_mass_to_charge']
    DF[:,10] = df_denovo['charge']
    DF[:,11] = df_denovo['RT']
    DF[:,12] = '-'
    DF[:,13] = 1
    DF[:,14] = Mass_peptide
    DF[:,15] = ppm
    DF[:,17] = aascore
    DF[:,18] = sequence
    DF[:,19] = 'HCD'
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    df = pd.DataFrame(DF, columns=name)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    df.to_csv(adanovo_out,index=False)

    with open(batchfile_dir,'r') as fr:
        lines = fr.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as fw:
        for line in lines:
            if 'Path     : ' in line:
                # Point the template at the CSV written above.
                fw.write(line.replace('../datasets/200305_HER_test_04_DENOVO.csv', adanovo_out))
            elif 'CutoffALC: ' in line:
                # The template ships with a cutoff of 95.
                fw.write(line.replace('95', str(quality_cutoff_Stitch_local)))
            else:
                fw.write(line)

    subprocess.run(
    ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile], stdout=subprocess.DEVNULL)

# Sample directory name -> antibody label embedded in the MGF file name.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/adanovo/'
out_path = '../../../Stitch_Assembly/AdaNovo/'
mgf_path = '../../../monoclonal_antibody/'

# Keep every entry of the result directory except config files and the
# samples whose names contain one of the excluded substrings.
mabs = [
    entry
    for entry in os.listdir(path)
    if not any(
        excluded in entry
        for excluded in ('131ab_EThcd', '.yaml', 'Herceptin', 'anti-FLAG-M2')
    )
]

# Convert and assemble one sample at a time.
for mab in mabs:
    print(mab)
    adanovo_path = f'{path}{mab}/denovo.mztab'
    resultdir = f'{out_path}{mab}/'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    batchfile_dir = f'../../../stitch-v1.5.0-linux/batchfiles/{mab}.txt'
    process_adanovo_Stitch(adanovo_path, mgf_file, batchfile_dir, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Nominal cutoff; in this cell it only appears in the output CSV file name.
quality_cutoff_Stitch = 90
# Key into SocreAll and prefix of the output file name; process_contranovo_Stitch reads it as a global.
tool = 'ContraNovo'
# Tool-specific threshold from SocreAll; used in the file name and written into the batch file's CutoffALC line.
quality_cutoff_Stitch_local = SocreAll[tool]

def process_contranovo_Stitch(contranovo_path,mgf_file,batchfile_dir, resultdir, mab):
    """Convert a ContraNovo mzTab result into a Stitch input CSV and run Stitch on it.

    The function reads these notebook globals, which must have been defined by
    earlier cells: ``tool``, ``quality_cutoff_Stitch``,
    ``quality_cutoff_Stitch_local``, ``SocreAll``, ``mgf_path``, ``mass_AA``,
    ``mass_H``, ``mass_N_terminus`` and ``mass_C_terminus``.

    :param contranovo_path: mzTab file; the table starting at the first line
        that begins with 'PSH' is read
    :param mgf_file: MGF file from which PEPMASS, CHARGE and RTINSECONDS are parsed
    :param batchfile_dir: ignored -- the template path is always rebuilt from
        ``mab`` (kept in the signature so existing callers keep working)
    :param resultdir: output directory prefix; it is concatenated as a string,
        so it has to end with '/'
    :param mab: sample directory name, used to locate
        ``spectrum_id_HCD_old.csv`` and the Stitch batch-file template
    :return: None. Writes the filtered CSV and a patched batch file into
        ``resultdir`` and then launches ``stitch.bin run`` on that batch file.
    :raises ValueError: if the mzTab file has no 'PSH' line, if a sequence and
        its per-residue scores differ in length, or if a spectrum reference
        does not occur in the TITLE column of ``spectrum_id_HCD_old.csv``
    """
    contranovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'

    # Locate the PSM table header inside the mzTab file.
    contranovo_header = None
    with open(contranovo_path) as f:
        for i, line in enumerate(f):
            if line.startswith("PSH"):
                contranovo_header = i
                break
    if contranovo_header is None:
        raise ValueError('no line starting with "PSH" found in ' + str(contranovo_path))

    # Load the data into a DataFrame
    df_deno = pd.read_csv(contranovo_path, sep="\t", header=contranovo_header)
    df_denovo = df_deno[df_deno['sequence'].notna()].copy()

    # e.g. 'ms_run[1]:index=5' -> '1:index=5'; matched against the TITLE list below.
    scans = [i.replace('ms_run[','').replace(']','') for i in df_denovo['spectra_ref']]
    sequence = df_denovo['sequence'].tolist()
    # Rescale the search-engine score from [-1, 1] to [0, 100].
    Score = [int((i * 100 + 100)/2) for i in df_denovo['search_engine_score[1]'].tolist()]
    # Per-residue scores: comma-separated fractions -> space-separated integer percentages.
    aascore = [
        " ".join(str(int(float(tok) * 100)) for tok in raw.replace(',', ' ').split())
        for raw in df_denovo['opt_ms_run[1]_aa_scores'].tolist()
    ]

    seq = []
    for s, aa in zip(sequence, aascore):
        # Collapse every modified residue / modification token to one
        # character. The order matters: the combined '+43.006-17.027' token
        # must be handled before its two components.
        s_modified = s.replace('M+15.995', 'm') \
                  .replace('Q+0.984', 'q') \
                  .replace('N+0.984', 'n') \
                  .replace('C+57.021', 'C') \
                  .replace('+42.011', 'g') \
                  .replace('+43.006-17.027', 'd') \
                  .replace('-17.027','e') \
                  .replace('+43.006','f')
        # If the counts disagree, treat the combined token as two scored
        # positions ('f' followed by 'e') instead of one.
        if len(s_modified) != len(aa.split()):
            s_modified = s_modified.replace('d', 'fe')
        seq.append(s_modified)

    # Drop the modification-only characters (p, d, e, f, g) together with the
    # score at the same position, so sequence and scores stay aligned.
    filtered_sequences = []
    filtered_aascores = []
    excluded_chars = {'p', 'd', 'e', 'f', 'g'}
    for s, aa in zip(seq, aascore):
        aa_list = aa.split()
        # Sequence length and number of scores must agree at this point.
        if len(s) != len(aa_list):
            raise ValueError("序列长度与分数数量不匹配: 序列长度 %d, 分数数量 %d" % (len(s), len(aa_list)))
        kept = [(char, score) for char, score in zip(s, aa_list) if char not in excluded_chars]
        filtered_sequences.append(''.join(char for char, _ in kept))
        filtered_aascores.append(' '.join(score for _, score in kept))

    # `seq` keeps one character per residue for mass/length; `peptides` is the
    # parenthesised mass-shift notation written to the CSV.
    seq = list(filtered_sequences)
    peptides = [seqe.replace('C','C(+57.02)').replace('m','M(+15.99)').replace('q','Q(+.98)').replace('n','N(+.98)')  for seqe in filtered_sequences]

    # Relies on the notebook-level `mgf_path`, not on `mgf_file`.
    dat = pd.read_csv(mgf_path + mab +'/process1/'+'spectrum_id_HCD_old.csv')
    titles = list(dat['TITLE'])

    # A spectrum is recorded when its RTINSECONDS line is reached, using the
    # most recently seen PEPMASS and CHARGE values.
    exp_mass_to_charge = []
    charge = []
    RT = []
    with open(mgf_file,'r') as fr:
        for line in fr:
            if 'PEPMASS=' in line:
                mz = line.strip().split('PEPMASS=')[-1]
            elif 'CHARGE=' in line:
                z = line.strip().split('CHARGE=')[-1]
            elif 'RTINSECONDS=' in line:
                rt = line.strip().split('RTINSECONDS=')[-1]
                exp_mass_to_charge.append(mz)
                charge.append(z)
                RT.append(float(rt))

    exp_mass_to_charge = [float(i) for i in exp_mass_to_charge]
    charge = [int(re.match(r'\d+', i).group()) for i in charge]

    # Titles and MGF spectra are matched by position, so differing counts mean
    # m/z, charge and RT may be attached to the wrong spectrum.
    if len(titles) != len(exp_mass_to_charge):
        print('error: %d titles but %d spectra parsed from %s'
              % (len(titles), len(exp_mass_to_charge), mgf_file))

    # First occurrence wins, matching list.index(), but with O(1) lookups.
    first_index = {}
    for position, title in enumerate(titles):
        first_index.setdefault(title, position)
    missing = [scan for scan in scans if scan not in first_index]
    if missing:
        raise ValueError('%d spectrum reference(s) not found in TITLE list, e.g. %r'
                         % (len(missing), missing[0]))
    loc = [first_index[scan] for scan in scans]

    exp_mass_to_charge = [exp_mass_to_charge[k] for k in loc]
    charge = [charge[k] for k in loc]
    RT = [RT[k] for k in loc]

    # NOTE(review): the mass is computed after the modification-only
    # characters were removed, so their mass shifts are not part of 'Mass'
    # or 'ppm' -- confirm this is intended.
    Mass_peptide = [
        sum(mass_AA[aa] for aa in se) + mass_N_terminus + mass_C_terminus
        for se in seq
    ]
    Mass_exp = [mz_i * z_i - z_i * mass_H for mz_i, z_i in zip(exp_mass_to_charge, charge)]
    ppm = [
        10**6 * (m_exp - m_pep) / m_pep
        for m_exp, m_pep in zip(Mass_exp, Mass_peptide)
    ]
    length = [len(x) for x in seq]

    # Columns 0, 2 and 16 are intentionally left unassigned (None -> empty CSV cell).
    DF = np.empty((len(scans), 20),dtype='object')
    DF[:,1] = scans
    DF[:,3] = peptides
    DF[:,4] = scans
    DF[:,5] = length
    DF[:,6] = Score
    DF[:,7] = Score
    DF[:,8] = length
    DF[:,9] = exp_mass_to_charge
    DF[:,10] = charge
    DF[:,11] = RT
    DF[:,12] = '-'
    DF[:,13] = 1
    DF[:,14] = Mass_peptide
    DF[:,15] = ppm
    DF[:,17] = filtered_aascores
    DF[:,18] = peptides
    DF[:,19] = 'HCD'
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    df = pd.DataFrame(DF, columns=name)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    df.to_csv(contranovo_out,index=False)

    # The caller-supplied value is deliberately overridden: the existing
    # driver cell does not build a per-sample path before calling.
    batchfile_dir = '../../../stitch-v1.5.0-linux/batchfiles/' + mab + '.txt'
    with open(batchfile_dir,'r') as fr:
        lines = fr.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as fw:
        for line in lines:
            if 'Path     : ' in line:
                # Point the template at the CSV written above.
                fw.write(line.replace('../datasets/200305_HER_test_04_DENOVO.csv', contranovo_out))
            elif 'CutoffALC: ' in line:
                # The template ships with a cutoff of 95.
                fw.write(line.replace('95', str(quality_cutoff_Stitch_local)))
            else:
                fw.write(line)

    subprocess.run(
    ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile], stdout=subprocess.DEVNULL)

# Sample directory name -> antibody label embedded in the MGF file name.
Names = {
 '50ugmAb1':'mAb1',
 '100ugmAb2':'mAb2',
 '200ugmAb3':'mAb3',
 '20230210-mAb4':'mAb4',
 '20230707-mAb5':'mAb5',
 '20231203-mAb6':'mAb6',
 '20231221-mAb7':'mAb7',
 '20250415-mAb8':'mAb8'}

path = '../../../denovo/contranovo/'
out_path = '../../../Stitch_Assembly/ContraNovo/'
mgf_path = '../../../monoclonal_antibody/'
# Skip config files and the samples whose names contain an excluded substring.
mabs = os.listdir(path)
mabs = [mab for mab in mabs if '131ab_EThcd' not in mab and '.yaml' not in mab and 'Herceptin' not in mab and 'anti-FLAG-M2' not in mab]
for mab in mabs:
    print(mab)
    contranovo_path = path + mab +'/'+'denovo.mztab'
    mgf_file = mgf_path + mab +'/process1/'+'spectrum_'+ Names[mab] +'_HCD.mgf'
    # Built per sample so this cell does not depend on a value left behind by
    # another cell's loop (NameError on a fresh kernel, stale path otherwise).
    batchfile_dir = '../../../stitch-v1.5.0-linux/batchfiles/' + mab + '.txt'
    resultdir = out_path + mab +'/'
    process_contranovo_Stitch(contranovo_path,mgf_file,batchfile_dir,resultdir,mab)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Nominal cutoff; in this cell it only appears in the output CSV file name.
quality_cutoff_Stitch = 90
# Key into SocreAll and prefix of the output file name; process_deepnovo_Stitch reads it as a global.
tool = 'DeepNovo'
# Tool-specific threshold from SocreAll; used in the file name and written into the batch file's CutoffALC line.
quality_cutoff_Stitch_local = SocreAll[tool]

def process_deepnovo_Stitch(deepnovo_path, mgf_file,batchfile_dir,resultdir):
    """Convert a DeepNovo result table into a Stitch input CSV and run Stitch on it.

    Reads the notebook globals ``tool``, ``quality_cutoff_Stitch``,
    ``quality_cutoff_Stitch_local``, ``SocreAll``, ``mass_AA``, ``mass_H``,
    ``mass_N_terminus`` and ``mass_C_terminus``.

    :param deepnovo_path: tab-separated DeepNovo result file; the columns
        'scan', 'predicted_score', 'predicted_sequence' and
        'predicted_position_score' are used
    :param mgf_file: MGF file from which TITLE, PEPMASS, CHARGE and
        RTINSECONDS are collected
    :param batchfile_dir: path of the Stitch batch-file template
    :param resultdir: output directory prefix (string-concatenated, so it has
        to end with '/')
    :return: None. Writes the filtered CSV and a patched batch file into
        ``resultdir`` and then launches ``stitch.bin run`` on that batch file.
    """
    # ---- spectrum metadata from the MGF ---------------------------------
    # The first marker found on a line decides which list receives the value.
    titles, spectrum_mz, spectrum_z, spectrum_rt = [], [], [], []
    mgf_fields = (
        ('TITLE=', titles),
        ('PEPMASS=', spectrum_mz),
        ('CHARGE=', spectrum_z),
        ('RTINSECONDS=', spectrum_rt),
    )
    with open(mgf_file,'r') as mgf_handle:
        for mgf_line in mgf_handle.readlines():
            for marker, bucket in mgf_fields:
                if marker in mgf_line:
                    bucket.append(mgf_line.strip().split(marker)[-1])
                    break

    spectrum_mz = [float(value) for value in spectrum_mz]
    spectrum_z = [int(re.match(r'\d+', value).group()) for value in spectrum_z]

    deepnovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'

    # ---- DeepNovo predictions -------------------------------------------
    raw_table = pd.read_csv(deepnovo_path, sep="\t", header=0)
    raw_table = raw_table[['scan', 'predicted_score', 'predicted_sequence',  'predicted_position_score']]
    # Rows without a predicted sequence are discarded (the original author
    # notes a DeepNovo output quirk that leaves such partially empty rows).
    deepnovo_df = raw_table[raw_table['predicted_sequence'].notna()].copy()

    # 'P,E,P,T,I,D' -> 'PEPTLD', with the *mod residues in mass-shift notation.
    deepnovo_df['predicted_sequence'] = [
        str(pep).replace(",", "").replace("I", "L").replace("Cmod",
            "C(+57.02)").replace("Mmod", "M(+15.99)").replace("Nmod", "N(+.98)").replace("Qmod", "Q(+.98)")
        for pep in deepnovo_df['predicted_sequence'].tolist()
    ]
    deepnovo_df.columns = ['Spectrum Name','DeepNovo Score', 'DeepNovo Peptide', 'DeepNovo aaScore']

    # Peptide score: exp(log-score) scaled to 0..100, 0 when missing.
    deepnovo_df['DeepNovo Score'] = [
        0 if pd.isna(log_score) else int(np.exp(log_score) * 100)
        for log_score in deepnovo_df['DeepNovo Score']
    ]
    deepnovo_df['DeepNovo Score'] = deepnovo_df['DeepNovo Score'].astype(int)

    # Per-residue scores: same scaling, written as a space-separated string.
    missing_tokens = ['nan', 'NaN', '', None]
    deepnovo_df['DeepNovo aaScore'] = [
        " ".join(
            '' if token in missing_tokens else str(int(np.exp(float(token)) * 100))
            for token in str(cell).replace(",", " ").split()
        )
        for cell in deepnovo_df['DeepNovo aaScore'].tolist()
    ]

    # ---- attach spectrum metadata ---------------------------------------
    # 'scan' is used as a 1-based position into the MGF lists.
    deepnovo_df['PSM_ID'] = [int(item)-1 for item in deepnovo_df['Spectrum Name']]
    deepnovo_df['exp_mass_to_charge'] = deepnovo_df['PSM_ID'].apply(lambda x: spectrum_mz[x] if x < len(spectrum_mz) else None)
    deepnovo_df['charge'] = deepnovo_df['PSM_ID'].apply(lambda x: spectrum_z[x] if x < len(spectrum_z) else None)
    deepnovo_df['RT'] = deepnovo_df['PSM_ID'].apply(lambda x: spectrum_rt[x] if x < len(spectrum_rt) else None)
    deepnovo_df['PSM_ID'] = deepnovo_df['PSM_ID'].apply(lambda x: titles[x] if x < len(titles) else None)
    exp_mass_to_charge = list(deepnovo_df['exp_mass_to_charge'])
    charge = [int(value) for value in deepnovo_df['charge']]

    # ---- masses ----------------------------------------------------------
    # One character per residue, using the lower-case keys of mass_AA.
    seq = [
        pep.replace('C(+57.02)','C').replace('N(+.98)','n').replace('Q(+.98)','q').replace('M(+15.99)','m')
        for pep in deepnovo_df['DeepNovo Peptide']
    ]
    length = [len(residues) for residues in seq]
    Mass_peptide = [
        sum(mass_AA[aa] for aa in residues) + mass_N_terminus + mass_C_terminus
        for residues in seq
    ]
    Mass_exp = [mz_i * z_i - z_i * mass_H for mz_i, z_i in zip(exp_mass_to_charge, charge)]
    ppm = [
        10**6 * (m_exp - m_pep) / m_pep
        for m_exp, m_pep in zip(Mass_exp, Mass_peptide)
    ]

    # ---- output table ----------------------------------------------------
    # Columns 0, 2 and 16 stay unassigned (None -> empty CSV cell).
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    DF = np.empty((len(deepnovo_df['PSM_ID']), 20),dtype='object')
    for column_index, column_values in (
        (1, deepnovo_df['PSM_ID']),
        (3, deepnovo_df['DeepNovo Peptide']),
        (4, deepnovo_df['PSM_ID']),
        (5, length),
        (6, deepnovo_df['DeepNovo Score']),
        (7, deepnovo_df['DeepNovo Score']),
        (8, length),
        (9, exp_mass_to_charge),
        (10, charge),
        (11, deepnovo_df['RT']),
        (12, '-'),
        (13, 1),
        (14, Mass_peptide),
        (15, ppm),
        (17, deepnovo_df['DeepNovo aaScore']),
        (18, deepnovo_df['DeepNovo Peptide']),
        (19, 'HCD'),
    ):
        DF[:, column_index] = column_values
    df = pd.DataFrame(DF, columns=name)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    df.to_csv(deepnovo_out,index=False)

    # ---- Stitch batch file ----------------------------------------------
    with open(batchfile_dir,'r') as template:
        template_lines = template.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as patched:
        for template_line in template_lines:
            if 'Path     : ' in template_line:
                template_line = template_line.replace('../datasets/200305_HER_test_04_DENOVO.csv', deepnovo_out)
            elif 'CutoffALC: ' in template_line:
                template_line = template_line.replace('95', str(quality_cutoff_Stitch_local))
            patched.write(template_line)

    subprocess.run(
        ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile],
        stdout=subprocess.DEVNULL,
    )
    
# Defined here as in the other driver cells, so this cell does not depend on
# `Names` and `mgf_path` having been left behind by a different cell.
# Sample directory name -> antibody label embedded in the MGF file name.
Names = {
 '50ugmAb1':'mAb1',
 '100ugmAb2':'mAb2',
 '200ugmAb3':'mAb3',
 '20230210-mAb4':'mAb4',
 '20230707-mAb5':'mAb5',
 '20231203-mAb6':'mAb6',
 '20231221-mAb7':'mAb7',
 '20250415-mAb8':'mAb8'}

path = '../../../denovo/deepnovo2/'
out_path = '../../../Stitch_Assembly/DeepNovo/'
mgf_path = '../../../monoclonal_antibody/'
# Skip config files and the samples whose names contain an excluded substring.
mabs = os.listdir(path)
mabs = [mab for mab in mabs if '131ab_EThcd' not in mab and '.yaml' not in mab and 'Herceptin' not in mab and 'anti-FLAG-M2' not in mab]
for mab in mabs:
    print(mab)
    deepnovo_path = path + mab +'/'+mab+'.deepnovo_denovo'
    resultdir = out_path + mab +'/'
    mgf_file = mgf_path + mab +'/process1/'+'spectrum_'+ Names[mab] +'_HCD.mgf'
    batchfile_dir = '../../../stitch-v1.5.0-linux/batchfiles/' + mab + '.txt'
    process_deepnovo_Stitch(deepnovo_path, mgf_file,batchfile_dir,resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Nominal cutoff; in this cell it only appears in the output CSV file name.
quality_cutoff_Stitch = 90
# Key into SocreAll and prefix of the output file name; process_instanovo_Stitch reads it as a global.
tool = 'InstaNovo'
# Tool-specific threshold from SocreAll; used in the file name and written into the batch file's CutoffALC line.
quality_cutoff_Stitch_local = SocreAll[tool]

def process_instanovo_Stitch(instanovo_path,mgf_file,batchfile_dir,resultdir):
    """Convert an InstaNovo prediction CSV into a Stitch input CSV and run Stitch on it.

    Reads the notebook globals ``tool``, ``quality_cutoff_Stitch``,
    ``quality_cutoff_Stitch_local``, ``SocreAll``, ``mass_AA``, ``mass_H``,
    ``mass_N_terminus`` and ``mass_C_terminus``.

    :param instanovo_path: InstaNovo CSV; the columns 'predictions',
        'scan_number', 'log_probabilities' and 'token_log_probabilities' are used
    :param mgf_file: MGF file from which TITLE, PEPMASS, CHARGE and
        RTINSECONDS are collected
    :param batchfile_dir: path of the Stitch batch-file template
    :param resultdir: output directory prefix (string-concatenated, so it has
        to end with '/')
    :return: None. Writes the filtered CSV and a patched batch file into
        ``resultdir`` and then launches ``stitch.bin run`` on that batch file.
    """
    # ---- spectrum metadata from the MGF ---------------------------------
    # The first marker found on a line decides which list receives the value.
    titles, spectrum_mz, spectrum_z, spectrum_rt = [], [], [], []
    mgf_fields = (
        ('TITLE=', titles),
        ('PEPMASS=', spectrum_mz),
        ('CHARGE=', spectrum_z),
        ('RTINSECONDS=', spectrum_rt),
    )
    with open(mgf_file,'r') as mgf_handle:
        for mgf_line in mgf_handle.readlines():
            for marker, bucket in mgf_fields:
                if marker in mgf_line:
                    bucket.append(mgf_line.strip().split(marker)[-1])
                    break

    spectrum_mz = [float(value) for value in spectrum_mz]
    spectrum_z = [int(re.match(r'\d+', value).group()) for value in spectrum_z]

    instanovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'

    # ---- InstaNovo predictions ------------------------------------------
    predictions = pd.read_csv(instanovo_path)
    df_denovo = predictions[predictions['predictions'].notna()].copy()

    # NOTE(review): scan_number is used directly as a position into the MGF
    # lists (no offset applied) -- confirm it is 0-based.
    df_denovo['PSM_ID'] = [int(item) for item in df_denovo['scan_number']]
    df_denovo['exp_mass_to_charge'] = df_denovo['PSM_ID'].apply(lambda x: spectrum_mz[x] if x < len(spectrum_mz) else None)
    df_denovo['charge'] = df_denovo['PSM_ID'].apply(lambda x: spectrum_z[x] if x < len(spectrum_z) else None)
    df_denovo['RT'] = df_denovo['PSM_ID'].apply(lambda x: spectrum_rt[x] if x < len(spectrum_rt) else None)
    df_denovo['PSM_ID'] = df_denovo['PSM_ID'].apply(lambda x: titles[x] if x < len(titles) else None)

    sequence = df_denovo['predictions'].tolist()
    scan = df_denovo['PSM_ID'].tolist()

    # Peptide score: exp(log-probability) scaled to 0..100.
    df_denovo['Score'] = [int(np.exp(log_prob) * 100) for log_prob in df_denovo['log_probabilities'].tolist()]

    # Per-token scores: strip the list brackets and commas, scale each value
    # like the peptide score, and leave out the final token of every row
    # (presumably an end-of-sequence entry -- TODO confirm).
    token_scores = []
    for raw_tokens in df_denovo['token_log_probabilities'].tolist():
        tokens = raw_tokens.replace(',', ' ').replace('[','').replace(']','').split()
        token_scores.append(" ".join(str(int(np.exp(float(tok)) * 100)) for tok in tokens[:-1]))
    df_denovo['aaScore'] = token_scores

    # UNIMOD tags -> mass-shift notation; isoleucine is written as leucine.
    df_denovo['Peptide'] = [
        str(pep).replace('M[UNIMOD:35]', 'M(+15.99)').replace('Q[UNIMOD:7]', 'Q(+.98)').replace('N[UNIMOD:7]', 'N(+.98)').replace('C[UNIMOD:4]', 'C(+57.02)').replace('I','L')
        for pep in sequence
    ]

    # ---- masses ----------------------------------------------------------
    # One character per residue, using the lower-case keys of mass_AA.
    seq = [
        str(pep).replace('M(+15.99)', 'm').replace('Q(+.98)', 'q').replace('N(+.98)', 'n').replace('Q(-17.03)','q').replace(' ','').replace('C(+57.02)', 'C')
        for pep in df_denovo['Peptide']
    ]
    length = [len(residues) for residues in seq]
    Mass_peptide = [
        sum(mass_AA[aa] for aa in residues) + mass_N_terminus + mass_C_terminus
        for residues in seq
    ]
    psm_mz = list(df_denovo['exp_mass_to_charge'])
    psm_charge = list(df_denovo['charge'])
    Mass_exp = [mz_i * z_i - z_i * mass_H for mz_i, z_i in zip(psm_mz, psm_charge)]
    ppm = [
        10**6 * (Mass_exp[k] - Mass_peptide[k]) / Mass_peptide[k]
        for k in range(len(Mass_peptide))
    ]

    # ---- output table ----------------------------------------------------
    # Columns 0, 2 and 16 stay unassigned (None -> empty CSV cell).
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    DF = np.empty((len(scan), 20),dtype='object')
    for column_index, column_values in (
        (1, scan),
        (3, df_denovo['Peptide']),
        (4, scan),
        (5, length),
        (6, df_denovo['Score']),
        (7, df_denovo['Score']),
        (8, length),
        (9, df_denovo['exp_mass_to_charge']),
        (10, df_denovo['charge'].astype(int)),
        (11, df_denovo['RT']),
        (12, '-'),
        (13, 1),
        (14, Mass_peptide),
        (15, ppm),
        (17, df_denovo['aaScore']),
        (18, df_denovo['Peptide']),
        (19, 'HCD'),
    ):
        DF[:, column_index] = column_values
    df = pd.DataFrame(DF, columns=name)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    df.to_csv(instanovo_out,index=False)

    # ---- Stitch batch file ----------------------------------------------
    with open(batchfile_dir,'r') as template:
        template_lines = template.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as patched:
        for template_line in template_lines:
            if 'Path     : ' in template_line:
                template_line = template_line.replace('../datasets/200305_HER_test_04_DENOVO.csv', instanovo_out)
            elif 'CutoffALC: ' in template_line:
                template_line = template_line.replace('95', str(quality_cutoff_Stitch_local))
            patched.write(template_line)

    subprocess.run(
        ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile],
        stdout=subprocess.DEVNULL,
    )

# Sample directory name -> antibody label embedded in the MGF file name.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/instanovo_new/'
out_path = '../../../Stitch_Assembly/InstaNovo/'
mgf_path = '../../../monoclonal_antibody/'

# Keep every entry of the result directory except config files and the
# samples whose names contain one of the excluded substrings.
mabs = [
    entry
    for entry in os.listdir(path)
    if not any(
        excluded in entry
        for excluded in ('131ab_EThcd', '.yaml', 'Herceptin', 'anti-FLAG-M2')
    )
]

# Convert and assemble one sample at a time.
for mab in mabs:
    print(mab)
    instanovo_path = f'{path}{mab}/predictions_before_refinement.csv'
    resultdir = f'{out_path}{mab}/'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    batchfile_dir = f'../../../stitch-v1.5.0-linux/batchfiles/{mab}.txt'
    process_instanovo_Stitch(instanovo_path, mgf_file, batchfile_dir, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# The three names below are notebook globals that process_pointnovo_Stitch reads
# when it is called; they are not passed as arguments.
# Only used as a label inside the output CSV file name ('_confScoreThreshold_<value>').
quality_cutoff_Stitch = 90
# Key into SocreAll and prefix of the output CSV file name.
tool = 'PointNovo'
# Per-tool threshold looked up in SocreAll (Tool -> Confidence_Threshold, loaded in the
# first cell); written into the batch file's CutoffALC line and into the CSV file name.
quality_cutoff_Stitch_local = SocreAll[tool]

def process_pointnovo_Stitch(pointnovo_path, mgf_file, batchfile_dir, resultdir):
    """Convert PointNovo predictions into a Stitch input CSV and run Stitch on it.

    Steps:
      1. Read TITLE / PEPMASS / CHARGE / RTINSECONDS of every spectrum in ``mgf_file``.
      2. Read the tab-separated PointNovo result file; each prediction is linked to
         the spectrum at position ``feature_id - 1`` in the MGF.
      3. Rescale peptide and per-residue scores with ``int(exp(score) * 100)``.
      4. Write the rows whose peptide score is >= ``SocreAll[tool]`` to a CSV in
         ``resultdir``.
      5. Copy the batch file into ``resultdir`` with the input path and the
         CutoffALC value substituted.
      6. Run the Stitch binary on the copied batch file.

    Reads the notebook globals ``tool``, ``quality_cutoff_Stitch``,
    ``quality_cutoff_Stitch_local``, ``SocreAll``, ``mass_AA``, ``mass_H``,
    ``mass_N_terminus`` and ``mass_C_terminus`` at call time.

    :param pointnovo_path: path to the tab-separated PointNovo result file
    :param mgf_file: path to the MGF file the predictions were made from
    :param batchfile_dir: path to the Stitch batch file used as template
    :param resultdir: output directory, must end with '/' and already exist
    :return: None; the results are the files written to ``resultdir``
    """
    # --- 1. spectrum headers -------------------------------------------------
    # One record per BEGIN IONS ... END IONS block, so a spectrum that lacks
    # CHARGE or RTINSECONDS cannot shift the metadata of the spectra after it.
    spectra = []
    current = None
    with open(mgf_file, 'r') as fr:
        for raw_line in fr:
            line = raw_line.strip()
            if line == 'BEGIN IONS':
                if current is not None:  # previous block was never closed
                    spectra.append(current)
                current = {'title': None, 'mz': None, 'charge': None, 'rt': None}
            elif line == 'END IONS':
                if current is not None:
                    spectra.append(current)
                current = None
            elif current is None:
                continue  # header lines outside of any spectrum block
            elif line.startswith('TITLE='):
                current['title'] = line[len('TITLE='):]
            elif line.startswith('PEPMASS='):
                # PEPMASS may carry an optional intensity after the m/z value.
                fields = line[len('PEPMASS='):].split()
                current['mz'] = float(fields[0]) if fields else None
            elif line.startswith('CHARGE='):
                digits = re.search(r'\d+', line[len('CHARGE='):])
                current['charge'] = int(digits.group()) if digits else None
            elif line.startswith('RTINSECONDS='):
                current['rt'] = line[len('RTINSECONDS='):]
    if current is not None:  # file ended inside an unclosed block
        spectra.append(current)

    pointnovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'

    # --- 2./3. predictions ---------------------------------------------------
    predictions = pd.read_csv(pointnovo_path, sep="\t", header=0)
    predictions = predictions[predictions['predicted_sequence'].notna()]

    records = []
    unmatched = 0
    for feature_id, raw_peptide, raw_score, raw_aascore in zip(
            predictions['feature_id'], predictions['predicted_sequence'],
            predictions['predicted_score'], predictions['predicted_position_score']):
        spectrum_index = int(feature_id) - 1
        # Reject negative indices too: they would silently pick a spectrum from the end.
        if not 0 <= spectrum_index < len(spectra):
            unmatched += 1
            continue
        spectrum = spectra[spectrum_index]

        # change peptide from P,E,P,T,I,D to PEPTLD and rename the modifications
        peptide = (str(raw_peptide).replace(",", "").replace("I", "L")
                   .replace("N(Deamidation)", "N(+.98)")
                   .replace("Q(Deamidation)", "Q(+.98)")
                   .replace("C(Carbamidomethylation)", "C(+57.02)")
                   .replace("M(Oxidation)", "M(+15.99)"))

        # scale peptide score from 0 to 100
        score = 0 if pd.isna(raw_score) else int(np.exp(raw_score) * 100)

        # scale AAscore from 0 to 100
        aa_tokens = str(raw_aascore).replace(",", " ").split()
        aa_scores = " ".join(
            '' if token in ('nan', 'NaN') else str(int(np.exp(float(token)) * 100))
            for token in aa_tokens)

        # One character per residue (modified residues in lower case) for mass_AA lookup.
        residues = peptide.replace('C(+57.02)','C').replace('N(+.98)','n').replace('Q(+.98)','q').replace('M(+15.99)','m')
        length = len(residues)
        mass_peptide = sum(mass_AA[aa] for aa in residues)+mass_N_terminus + mass_C_terminus

        mz = spectrum['mz']
        z = spectrum['charge']
        if mz is None or z is None:
            ppm = None  # precursor information missing for this spectrum
        else:
            mass_exp = mz * z - z * mass_H
            ppm = 10**6 * (mass_exp - mass_peptide) / mass_peptide

        title = spectrum['title']
        records.append([
            None, title, None, peptide, title, length, score, score, length,
            mz, z, spectrum['rt'], '-', 1, mass_peptide, ppm, None,
            aa_scores, peptide, 'HCD'])

    if unmatched:
        print(str(unmatched) + ' predictions skipped: feature_id has no spectrum in ' + mgf_file)

    # --- 4. Stitch input table -----------------------------------------------
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    # dtype=object keeps every cell as the Python value it was built with.
    df = pd.DataFrame(records, columns=name, dtype=object)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    df.to_csv(pointnovo_out,index=False)

    # --- 5. batch file ---------------------------------------------------------
    with open(batchfile_dir,'r') as fr:
        lines = fr.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as fw:
        for line in lines:
            if 'Path     : ' in line:
                fw.write(line.replace('../datasets/200305_HER_test_04_DENOVO.csv', pointnovo_out))
            elif 'CutoffALC: ' in line:
                fw.write(line.replace('95', str(quality_cutoff_Stitch_local)))
            else:
                fw.write(line)

    # --- 6. run Stitch ---------------------------------------------------------
    completed = subprocess.run(
        ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile], stdout=subprocess.DEVNULL)
    if completed.returncode != 0:
        # stdout is discarded, so a failed run would otherwise go unnoticed.
        print('Stitch exited with code ' + str(completed.returncode) + ' for ' + newfile)
   
# Maps each sample directory name to the short antibody label used in the MGF file name.
Names = {
 '50ugmAb1':'mAb1',
 '100ugmAb2':'mAb2',
 '200ugmAb3':'mAb3',
 '20230210-mAb4':'mAb4',
 '20230707-mAb5':'mAb5',
 '20231203-mAb6':'mAb6',
 '20231221-mAb7':'mAb7',
 '20250415-mAb8':'mAb8'}

path = '../../../denovo/pointnovo/'
out_path = '../../../Stitch_Assembly/PointNovo/'
mgf_path = '../../../monoclonal_antibody/'
# os.listdir returns entries in arbitrary order; sort so every run processes the
# samples (and prints them) in the same order.
mabs = sorted(os.listdir(path))
mabs = [mab for mab in mabs if '131ab_EThcd' not in mab and '.yaml' not in mab and 'Herceptin' not in mab and 'anti-FLAG-M2' not in mab]
for mab in mabs:
    # A stray entry without a label would otherwise abort the whole loop with a KeyError.
    if mab not in Names:
        print(mab + ' skipped: no entry in Names')
        continue
    print(mab)
    pointnovo_path = path + mab +'/'+mab+'.deepnovo_denovo'
    resultdir = out_path + mab +'/'
    mgf_file = mgf_path + mab +'/process1/'+'spectrum_'+ Names[mab] +'_HCD.mgf'
    batchfile_dir = '../../../stitch-v1.5.0-linux/batchfiles/' + mab + '.txt'
    process_pointnovo_Stitch(pointnovo_path,mgf_file,batchfile_dir,resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# The three names below are notebook globals that process_pgpointnovo_Stitch reads
# when it is called; they are not passed as arguments.
# Only used as a label inside the output CSV file name ('_confScoreThreshold_<value>').
quality_cutoff_Stitch = 90
# Key into SocreAll and prefix of the output CSV file name.
tool = 'PGPointNovo'
# Per-tool threshold looked up in SocreAll (Tool -> Confidence_Threshold, loaded in the
# first cell); written into the batch file's CutoffALC line and into the CSV file name.
quality_cutoff_Stitch_local = SocreAll[tool]

def process_pgpointnovo_Stitch(pgpointnovo_path,mgf_file,batchfile_dir,resultdir):
    """Convert PGPointNovo predictions into a Stitch input CSV and run Stitch on it.

    Steps:
      1. Read TITLE / PEPMASS / CHARGE / RTINSECONDS of every spectrum in ``mgf_file``.
      2. Read the tab-separated PGPointNovo result file; each prediction is linked to
         the spectrum at position ``feature_id - 1`` in the MGF.
      3. Rescale peptide and per-residue scores with ``int(exp(score) * 100)``.
      4. Write the rows whose peptide score is >= ``SocreAll[tool]`` to a CSV in
         ``resultdir``.
      5. Copy the batch file into ``resultdir`` with the input path and the
         CutoffALC value substituted.
      6. Run the Stitch binary on the copied batch file.

    Reads the notebook globals ``tool``, ``quality_cutoff_Stitch``,
    ``quality_cutoff_Stitch_local``, ``SocreAll``, ``mass_AA``, ``mass_H``,
    ``mass_N_terminus`` and ``mass_C_terminus`` at call time.

    :param pgpointnovo_path: path to the tab-separated PGPointNovo result file
    :param mgf_file: path to the MGF file the predictions were made from
    :param batchfile_dir: path to the Stitch batch file used as template
    :param resultdir: output directory, must end with '/' and already exist
    :return: None; the results are the files written to ``resultdir``
    """
    # --- 1. spectrum headers -------------------------------------------------
    # One record per BEGIN IONS ... END IONS block, so a spectrum that lacks
    # CHARGE or RTINSECONDS cannot shift the metadata of the spectra after it.
    spectra = []
    current = None
    with open(mgf_file, 'r') as fr:
        for raw_line in fr:
            line = raw_line.strip()
            if line == 'BEGIN IONS':
                if current is not None:  # previous block was never closed
                    spectra.append(current)
                current = {'title': None, 'mz': None, 'charge': None, 'rt': None}
            elif line == 'END IONS':
                if current is not None:
                    spectra.append(current)
                current = None
            elif current is None:
                continue  # header lines outside of any spectrum block
            elif line.startswith('TITLE='):
                current['title'] = line[len('TITLE='):]
            elif line.startswith('PEPMASS='):
                # PEPMASS may carry an optional intensity after the m/z value.
                fields = line[len('PEPMASS='):].split()
                current['mz'] = float(fields[0]) if fields else None
            elif line.startswith('CHARGE='):
                digits = re.search(r'\d+', line[len('CHARGE='):])
                current['charge'] = int(digits.group()) if digits else None
            elif line.startswith('RTINSECONDS='):
                current['rt'] = line[len('RTINSECONDS='):]
    if current is not None:  # file ended inside an unclosed block
        spectra.append(current)

    pgpointnovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'

    # --- 2./3. predictions ---------------------------------------------------
    predictions = pd.read_csv(pgpointnovo_path, sep="\t", header=0)
    predictions = predictions[predictions['predicted_sequence'].notna()]

    records = []
    unmatched = 0
    for feature_id, raw_peptide, raw_score, raw_aascore in zip(
            predictions['feature_id'], predictions['predicted_sequence'],
            predictions['predicted_score'], predictions['predicted_position_score']):
        spectrum_index = int(feature_id) - 1
        # Reject negative indices too: they would silently pick a spectrum from the end.
        if not 0 <= spectrum_index < len(spectra):
            unmatched += 1
            continue
        spectrum = spectra[spectrum_index]

        # change peptide from P,E,P,T,I,D to PEPTLD and rename the modifications
        peptide = (str(raw_peptide).replace(",", "").replace("I", "L")
                   .replace("N(Deamidation)", "N(+.98)")
                   .replace("Q(Deamidation)", "Q(+.98)")
                   .replace("C(Carbamidomethylation)", "C(+57.02)")
                   .replace("M(Oxidation)", "M(+15.99)"))

        # scale peptide score from 0 to 100
        score = 0 if pd.isna(raw_score) else int(np.exp(raw_score) * 100)

        # scale AAscore from 0 to 100
        aa_tokens = str(raw_aascore).replace(",", " ").split()
        aa_scores = " ".join(
            '' if token in ('nan', 'NaN') else str(int(np.exp(float(token)) * 100))
            for token in aa_tokens)

        # One character per residue (modified residues in lower case) for mass_AA lookup.
        residues = peptide.replace('C(+57.02)','C').replace('N(+.98)','n').replace('Q(+.98)','q').replace('M(+15.99)','m')
        length = len(residues)
        mass_peptide = sum(mass_AA[aa] for aa in residues)+mass_N_terminus + mass_C_terminus

        mz = spectrum['mz']
        z = spectrum['charge']
        if mz is None or z is None:
            ppm = None  # precursor information missing for this spectrum
        else:
            mass_exp = mz * z - z * mass_H
            ppm = 10**6 * (mass_exp - mass_peptide) / mass_peptide

        title = spectrum['title']
        records.append([
            None, title, None, peptide, title, length, score, score, length,
            mz, z, spectrum['rt'], '-', 1, mass_peptide, ppm, None,
            aa_scores, peptide, 'HCD'])

    if unmatched:
        print(str(unmatched) + ' predictions skipped: feature_id has no spectrum in ' + mgf_file)

    # --- 4. Stitch input table -----------------------------------------------
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    # dtype=object keeps every cell as the Python value it was built with.
    df = pd.DataFrame(records, columns=name, dtype=object)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    df.to_csv(pgpointnovo_out,index=False)

    # --- 5. batch file ---------------------------------------------------------
    with open(batchfile_dir,'r') as fr:
        lines = fr.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as fw:
        for line in lines:
            if 'Path     : ' in line:
                fw.write(line.replace('../datasets/200305_HER_test_04_DENOVO.csv', pgpointnovo_out))
            elif 'CutoffALC: ' in line:
                fw.write(line.replace('95', str(quality_cutoff_Stitch_local)))
            else:
                fw.write(line)

    # --- 6. run Stitch ---------------------------------------------------------
    completed = subprocess.run(
        ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile], stdout=subprocess.DEVNULL)
    if completed.returncode != 0:
        # stdout is discarded, so a failed run would otherwise go unnoticed.
        print('Stitch exited with code ' + str(completed.returncode) + ' for ' + newfile)

# Maps each sample directory name to the short antibody label used in the MGF file name.
Names = {
 '50ugmAb1':'mAb1',
 '100ugmAb2':'mAb2',
 '200ugmAb3':'mAb3',
 '20230210-mAb4':'mAb4',
 '20230707-mAb5':'mAb5',
 '20231203-mAb6':'mAb6',
 '20231221-mAb7':'mAb7',
 '20250415-mAb8':'mAb8'}

path = '../../../denovo/pgpointnovo/'
out_path = '../../../Stitch_Assembly/PGPointNovo/'
mgf_path = '../../../monoclonal_antibody/'
# os.listdir returns entries in arbitrary order; sort so every run processes the
# samples (and prints them) in the same order.
mabs = sorted(os.listdir(path))
mabs = [mab for mab in mabs if '131ab_EThcd' not in mab and '.yaml' not in mab and 'Herceptin' not in mab and 'anti-FLAG-M2' not in mab]
for mab in mabs:
    # A stray entry without a label would otherwise abort the whole loop with a KeyError.
    if mab not in Names:
        print(mab + ' skipped: no entry in Names')
        continue
    print(mab)
    pgpointnovo_path = path + mab +'/'+mab+'.deepnovo_denovo'
    resultdir = out_path + mab +'/'
    mgf_file = mgf_path + mab +'/process1/'+'spectrum_'+ Names[mab] +'_HCD.mgf'
    batchfile_dir = '../../../stitch-v1.5.0-linux/batchfiles/' + mab + '.txt'
    process_pgpointnovo_Stitch(pgpointnovo_path,mgf_file,batchfile_dir,resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
import statistics

# The three names below are notebook globals that process_smsnet_Stitch reads
# when it is called; they are not passed as arguments.
# Only used as a label inside the output CSV file name ('_confScoreThreshold_<value>').
quality_cutoff_Stitch = 90
# Key into SocreAll and prefix of the output CSV file name.
tool = 'SMSNet'
# Per-tool threshold looked up in SocreAll (Tool -> Confidence_Threshold, loaded in the
# first cell); written into the batch file's CutoffALC line and into the CSV file name.
quality_cutoff_Stitch_local = SocreAll[tool]

def process_smsnet_Stitch(smsnet_path, mgf_file, batchfile_dir, resultdir):
    """Convert SMSNet predictions into a Stitch input CSV and run Stitch on it.

    Steps:
      1. Read TITLE / PEPMASS / CHARGE / RTINSECONDS of every spectrum in ``mgf_file``.
      2. Read the peptide file (``smsnet_path``) and the per-residue score file
         (``smsnet_path + '_prob'``); line *i* of both files is paired with
         spectrum *i* of the MGF.
      3. Rescale per-residue scores with ``int(exp(score) * 100)``; the peptide
         score is the integer mean of the rescaled residue scores.
      4. Drop empty predictions and predictions containing ``<unk>`` or ``<s>``,
         then write the rows whose peptide score is >= ``SocreAll[tool]`` to a
         CSV in ``resultdir``.
      5. Copy the batch file into ``resultdir`` with the input path and the
         CutoffALC value substituted.
      6. Run the Stitch binary on the copied batch file.

    Reads the notebook globals ``tool``, ``quality_cutoff_Stitch``,
    ``quality_cutoff_Stitch_local``, ``SocreAll``, ``mass_AA``, ``mass_H``,
    ``mass_N_terminus`` and ``mass_C_terminus`` at call time.

    :param smsnet_path: path to the SMSNet peptide file (one prediction per line)
    :param mgf_file: path to the MGF file the predictions were made from
    :param batchfile_dir: path to the Stitch batch file used as template
    :param resultdir: output directory, must end with '/' and already exist
    :return: None; the results are the files written to ``resultdir``
    :raises ValueError: if the peptide file, the score file and the MGF do not
        contain the same number of entries
    """
    # --- 1. spectrum headers -------------------------------------------------
    # One record per BEGIN IONS ... END IONS block, so a spectrum that lacks
    # CHARGE or RTINSECONDS cannot shift the metadata of the spectra after it.
    spectra = []
    current = None
    with open(mgf_file, 'r') as fr:
        for raw_line in fr:
            line = raw_line.strip()
            if line == 'BEGIN IONS':
                if current is not None:  # previous block was never closed
                    spectra.append(current)
                current = {'title': None, 'mz': None, 'charge': None, 'rt': None}
            elif line == 'END IONS':
                if current is not None:
                    spectra.append(current)
                current = None
            elif current is None:
                continue  # header lines outside of any spectrum block
            elif line.startswith('TITLE='):
                current['title'] = line[len('TITLE='):]
            elif line.startswith('PEPMASS='):
                # PEPMASS may carry an optional intensity after the m/z value.
                fields = line[len('PEPMASS='):].split()
                current['mz'] = float(fields[0]) if fields else None
            elif line.startswith('CHARGE='):
                digits = re.search(r'\d+', line[len('CHARGE='):])
                current['charge'] = int(digits.group()) if digits else None
            elif line.startswith('RTINSECONDS='):
                current['rt'] = line[len('RTINSECONDS='):]
    if current is not None:  # file ended inside an unclosed block
        spectra.append(current)

    smsnet_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Stitch) + '_localScore_' + str(quality_cutoff_Stitch_local) + '.csv'

    # --- 2. SMSNet output: read both files and close them immediately --------
    with open(smsnet_path) as f:
        peptide_lines = [line.rstrip() for line in f]
    # change _rescore and _prob to switch between rescoring and real
    with open(smsnet_path + '_prob') as g:
        prob_lines = [line.strip() for line in g]

    # Rows are paired purely by position, so unequal lengths mean wrong pairing.
    if not (len(peptide_lines) == len(prob_lines) == len(spectra)):
        raise ValueError(
            'SMSNet output does not match the MGF: ' + str(len(peptide_lines)) + ' peptides, '
            + str(len(prob_lines)) + ' score lines, ' + str(len(spectra)) + ' spectra')

    # --- 3./4. one Stitch row per usable prediction ---------------------------
    records = []
    for peptide_line, prob_line, spectrum in zip(peptide_lines, prob_lines, spectra):
        raw_peptide = peptide_line.replace(" ", "").replace("I", "L").strip()
        if raw_peptide == '' or '<unk>' in raw_peptide or '<s>' in raw_peptide:
            continue

        # scale AAscore from 0 to 100; an empty score line counts as a single 0
        tokens = prob_line.split()
        if tokens:
            aa_values = [int(float(np.exp(float(token)) * 100)) for token in tokens]
        else:
            aa_values = [0]
        score = int(np.mean(aa_values))
        aa_scores = " ".join(str(value) for value in aa_values)

        # SMSNet writes modified residues in lower case; expand them for Stitch ...
        peptide = raw_peptide.replace('C','C(+57.02)').replace('m','M(+15.99)').replace('n','N(+.98)').replace('q','Q(+.98)')
        # ... and fold them back to one character per residue for the mass_AA lookup.
        residues = peptide.replace('C(+57.02)','C').replace('N(+.98)','n').replace('Q(+.98)','q').replace('M(+15.99)','m')
        length = len(residues)
        mass_peptide = sum(mass_AA[aa] for aa in residues)+mass_N_terminus + mass_C_terminus

        mz = spectrum['mz']
        z = spectrum['charge']
        if mz is None or z is None:
            ppm = None  # precursor information missing for this spectrum
        else:
            mass_exp = mz * z - z * mass_H
            ppm = 10**6 * (mass_exp - mass_peptide) / mass_peptide

        title = spectrum['title']
        records.append([
            None, title, None, peptide, title, length, score, score, length,
            mz, z, spectrum['rt'], '-', 1, mass_peptide, ppm, None,
            aa_scores, peptide, 'HCD'])

    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    # dtype=object keeps every cell as the Python value it was built with.
    df = pd.DataFrame(records, columns=name, dtype=object)
    df['Denovo Score'] = df['Denovo Score'].astype(int)
    df = df[df['Denovo Score'] >= SocreAll[tool]]
    df.to_csv(smsnet_out,index=False)

    # --- 5. batch file ---------------------------------------------------------
    with open(batchfile_dir,'r') as fr:
        lines = fr.readlines()
    newfile = resultdir + batchfile_dir.split('/')[-1]
    with open(newfile,'w') as fw:
        for line in lines:
            if 'Path     : ' in line:
                fw.write(line.replace('../datasets/200305_HER_test_04_DENOVO.csv', smsnet_out))
            elif 'CutoffALC: ' in line:
                fw.write(line.replace('95', str(quality_cutoff_Stitch_local)))
            else:
                fw.write(line)

    # --- 6. run Stitch ---------------------------------------------------------
    completed = subprocess.run(
        ['../../../stitch-v1.5.0-linux/stitch.bin', 'run', newfile], stdout=subprocess.DEVNULL)
    if completed.returncode != 0:
        # stdout is discarded, so a failed run would otherwise go unnoticed.
        print('Stitch exited with code ' + str(completed.returncode) + ' for ' + newfile)

# Maps each sample directory name to the short antibody label used in the MGF file name.
Names = {
 '50ugmAb1':'mAb1',
 '100ugmAb2':'mAb2',
 '200ugmAb3':'mAb3',
 '20230210-mAb4':'mAb4',
 '20230707-mAb5':'mAb5',
 '20231203-mAb6':'mAb6',
 '20231221-mAb7':'mAb7',
 '20250415-mAb8':'mAb8'}

path = '../../../denovo/SMSNet/'
out_path = '../../../Stitch_Assembly/SMSNet/'
mgf_path = '../../../monoclonal_antibody/'
# os.listdir returns entries in arbitrary order; sort so every run processes the
# samples (and prints them) in the same order.
mabs = sorted(os.listdir(path))
mabs = [mab for mab in mabs if '131ab_EThcd' not in mab and '.yaml' not in mab and 'Herceptin' not in mab and 'anti-FLAG-M2' not in mab]
for mab in mabs:
    # A stray entry without a label would otherwise abort the whole loop with a KeyError.
    if mab not in Names:
        print(mab + ' skipped: no entry in Names')
        continue
    print(mab)
    smsnet_path = path + mab +'/'+'spectrum_'+Names[mab]+'_HCD_forSMSNet'
    resultdir = out_path + mab +'/'
    mgf_file = mgf_path + mab +'/process1/'+'spectrum_'+ Names[mab] +'_HCD.mgf'
    batchfile_dir = '../../../stitch-v1.5.0-linux/batchfiles/' + mab + '.txt'
    process_smsnet_Stitch(smsnet_path,mgf_file,batchfile_dir,resultdir)