In [None]:
import os
import pandas as pd
# Root of the ALPS output tree: one sub-directory per tool, one per sample below it.
path = '../../../ALPS_Assembly/'
tools_list = [
    "AdaNovo", "CasanovoV1", "CasanovoV2", "ContraNovo", "DeepNovo",
    "PepNet", "PGPointNovo", "pi-HelixNovo", "pi-PrimeNovo", "pNovo3", "PointNovo", 'SMSNet', 'InstaNovo'
]
# Every entry of the raw-data directory is treated as one sample.
mabs = os.listdir('../../../monoclonal_antibody/')

# makedirs(exist_ok=True) keeps this cell re-runnable: plain os.mkdir raises
# FileExistsError as soon as the tree exists from a previous run.
os.makedirs(path, exist_ok=True)
for tool in tools_list:
    path1 = path+tool
    os.makedirs(path1, exist_ok=True)
    for mab in mabs:
        path2 = path1+'/'+mab
        os.makedirs(path2, exist_ok=True)

# Per-tool score threshold, keyed by tool name.
# NOTE(review): the name "SocreAll" looks like a typo of "ScoreAll"; it is kept
# because every later cell refers to it under this spelling.
df = pd.read_csv('../../data/Tool_Confidence_Threshold.csv')
SocreAll = pd.Series(df.Confidence_Threshold.values, index=df.Tool).to_dict()

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings for the CasanovoV1 conversion in this cell.
tool = 'CasanovoV1'
kmer, contigs_ALPS = 7, 20                  # passed to ALPS.jar after the CSV path
quality_cutoff_ALPS = 50                    # appears in the output file name
quality_cutoff_ALPS_local = SocreAll[tool]  # per-tool threshold from SocreAll
def process_casanovoV1_ALPS(casanovo_path,mgf_file,resultdir):
    """Convert a Casanovo (v1) mzTab result into an ALPS input CSV and run ALPS on it.

    Reads the module-level settings ``tool``, ``SocreAll``, ``quality_cutoff_ALPS``,
    ``quality_cutoff_ALPS_local``, ``kmer`` and ``contigs_ALPS``.

            :param
                casanovo_path: path to the mztab file of Casanovo
                mgf_file: path to the MGF file; its ``TITLE=`` entries, in file
                    order, are looked up by the integer ``PSM_ID`` of each row
                resultdir: output directory prefix (concatenated as-is, so it
                    must end with a path separator)
            :return
                None; the filtered table is written to CSV and the ALPS output
                is appended to ``assembly.log`` in ``resultdir``
            :raises
                ValueError: if the mzTab file has no line starting with ``PSH``
    """
    # Collect spectrum titles in file order (streamed, not loaded at once).
    titles = []
    with open(mgf_file, 'r') as fr:
        for line in fr:
            if 'TITLE=' in line:
                titles.append(line.strip().split('TITLE=')[-1])

    casanovo_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'

    # The PSM table starts at the first line beginning with "PSH".
    casanovo_header = None
    with open(casanovo_path) as f:
        for i, line in enumerate(f):
            if line.startswith("PSH"):
                casanovo_header = i
                break
    if casanovo_header is None:
        raise ValueError(f"No 'PSH' header line found in {casanovo_path}")

    # Load the data into a DataFrame
    df_deno = pd.read_csv(casanovo_path, sep="\t", header=casanovo_header)
    df_denovo = df_deno[df_deno['sequence'].notna()].copy()

    # PSM_ID is used as a 0-based index into the MGF titles; out-of-range -> None.
    psm_index = df_denovo['PSM_ID'].astype(int).tolist()
    spectrum_names = [titles[x] if x < len(titles) else None for x in psm_index]

    # Linear rescale of the peptide score: -1 -> 0, 1 -> 100 (truncated to int).
    peptide_scores = [int((s * 100 + 100) / 2) for s in df_denovo['search_engine_score[1]'].tolist()]

    # Comma-separated per-residue scores -> space-separated integers (x100, truncated).
    aa_scores = [
        " ".join(str(int(float(v) * 100)) for v in raw.replace(',', ' ').split())
        for raw in df_denovo['opt_ms_run[1]_aa_scores'].tolist()
    ]

    df = pd.DataFrame({
        'Spectrum Name': spectrum_names,
        'CasanovoV1 Peptide': df_denovo['sequence'].tolist(),
        'CasanovoV1 aaScore': aa_scores,
        'CasanovoV1 Score': peptide_scores,
        'Area': 1,
    })
    df['CasanovoV1 Score'] = df['CasanovoV1 Score'].astype(int)
    df = df[df['CasanovoV1 Score'] >= SocreAll[tool]]
    df.to_csv(casanovo_out,index=False)

    # No shell is involved, so a literal '>>' would just be handed to the JVM as
    # an argument. Append ALPS' stdout to the log through a real file handle.
    with open(resultdir + 'assembly.log', 'a') as log_file:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', casanovo_out, str(kmer), str(contigs_ALPS)),
            stdout=log_file)

# Sample directory name -> short antibody label used in the MGF file name.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/casanovo3/'
out_path = '../../../ALPS_Assembly/CasanovoV1/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
# Convert and assemble every sample directory found under `path`.
for mab in mabs:
    casanovo_path = f'{path}{mab}/casanovo_denovo.mztab'
    resultdir = f'{out_path}{mab}/'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    process_casanovoV1_ALPS(casanovo_path, mgf_file, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings for the CasanovoV2 conversion in this cell.
tool = 'CasanovoV2'
kmer, contigs_ALPS = 7, 20                  # passed to ALPS.jar after the CSV path
quality_cutoff_ALPS = 50                    # appears in the output file name
quality_cutoff_ALPS_local = SocreAll[tool]  # per-tool threshold from SocreAll

def process_casanovoV2_ALPS(casanovo_path,mgf_file,resultdir):
    """Convert a Casanovo (v2) mzTab result into an ALPS input CSV and run ALPS on it.

    Reads the module-level settings ``tool``, ``SocreAll``, ``quality_cutoff_ALPS``,
    ``quality_cutoff_ALPS_local``, ``kmer`` and ``contigs_ALPS``.

            :param
                casanovo_path: path to the mztab file of Casanovo
                mgf_file: path to the MGF file; its ``TITLE=`` entries, in file
                    order, are looked up by the index parsed from ``spectra_ref``
                resultdir: output directory prefix (concatenated as-is, so it
                    must end with a path separator)
            :return
                None; the filtered table is written to CSV and the ALPS output
                is appended to ``assembly.log`` in ``resultdir``
            :raises
                ValueError: if the mzTab file has no line starting with ``PSH``
    """
    # Collect spectrum titles in file order (streamed, not loaded at once).
    titles = []
    with open(mgf_file, 'r') as fr:
        for line in fr:
            if 'TITLE=' in line:
                titles.append(line.strip().split('TITLE=')[-1])

    casanovo_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'

    # The PSM table starts at the first line beginning with "PSH".
    casanovo_header = None
    with open(casanovo_path) as f:
        for i, line in enumerate(f):
            if line.startswith("PSH"):
                casanovo_header = i
                break
    if casanovo_header is None:
        raise ValueError(f"No 'PSH' header line found in {casanovo_path}")

    # Load the data into a DataFrame
    df_deno = pd.read_csv(casanovo_path, sep="\t", header=casanovo_header)
    df_denovo = df_deno[df_deno['sequence'].notna()].copy()

    # spectra_ref looks like "ms_run[1]:index=<n>"; <n> indexes the MGF titles.
    spectrum_index = [int(item.replace('ms_run[1]:index=', '')) for item in df_denovo['spectra_ref']]
    spectrum_names = [titles[x] if x < len(titles) else None for x in spectrum_index]

    # Linear rescale of the peptide score: -1 -> 0, 1 -> 100 (truncated to int).
    peptide_scores = [int((s * 100 + 100) / 2) for s in df_denovo['search_engine_score[1]'].tolist()]

    # Comma-separated per-residue scores -> space-separated integers (x100, truncated).
    aa_scores = [
        " ".join(str(int(float(v) * 100)) for v in raw.replace(',', ' ').split())
        for raw in df_denovo['opt_ms_run[1]_aa_scores'].tolist()
    ]

    df = pd.DataFrame({
        'Spectrum Name': spectrum_names,
        'CasanovoV2 Peptide': df_denovo['sequence'].tolist(),
        'CasanovoV2 aaScore': aa_scores,
        'CasanovoV2 Score': peptide_scores,
        'Area': 1,
    })
    df['CasanovoV2 Score'] = df['CasanovoV2 Score'].astype(int)
    df = df[df['CasanovoV2 Score'] >= SocreAll[tool]]
    df.to_csv(casanovo_out,index=False)

    # No shell is involved, so a literal '>>' would just be handed to the JVM as
    # an argument. Append ALPS' stdout to the log through a real file handle.
    with open(resultdir + 'assembly.log', 'a') as log_file:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', casanovo_out, str(kmer), str(contigs_ALPS)),
            stdout=log_file)
    
# Sample directory name -> short antibody label used in the MGF file name.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/casanovo4/'
out_path = '../../../ALPS_Assembly/CasanovoV2/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
# Convert and assemble every sample directory found under `path`.
for mab in mabs:
    casanovo_path = f'{path}{mab}/denovo.mztab'
    resultdir = f'{out_path}{mab}/'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    process_casanovoV2_ALPS(casanovo_path, mgf_file, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings for the PepNet conversion in this cell.
quality_cutoff_ALPS = 50                    # appears in the output file name
tool = 'PepNet'
quality_cutoff_ALPS_local = SocreAll[tool]  # per-tool threshold from SocreAll
kmer = 7
contigs_ALPS = 20

# Sample directory name -> short antibody label (not used by the loop below).
Names = {
 '50ugmAb1':'mAb1',
 '100ugmAb2':'mAb2',
 '200ugmAb3':'mAb3',
 '20230210-mAb4':'mAb4',
 '20230707-mAb5':'mAb5',
 '20231203-mAb6':'mAb6',
 '20231221-mAb7':'mAb7',
 '20250415-mAb8':'mAb8'}

path = '../../../denovo/pepnet/'
out_path = '../../../ALPS_Assembly/PepNet/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
for mab in mabs:
    print(mab)
    pepnet_path = path + mab +'/'+'denovo.tsv'
    pepnet_df = pd.read_csv(pepnet_path, sep='\t')
    pepnet_title = pepnet_df['TITLE'].tolist()
    pepnet_df['Spectrum Name']=pepnet_title

    # Expand PepNet's lower-case / bare residue codes to explicit mass-shift notation.
    pepnet_peptide = pepnet_df['DENOVO'].tolist()
    pepnet_df['PepNet Peptide']=[peptide.replace("m", "M(+15.99)").replace("q", "Q(+.98)").replace("n", "N(+.98)").replace("C", "C(+57.02)") for peptide in pepnet_peptide]

    # Peptide score x100, truncated to int.
    pepnet_score = [int(i*100) for i in pepnet_df['Score'].tolist()]
    pepnet_df['PepNet Score']=pepnet_score

    # "[a, b, c]" -> "A B C" with each value x100 and truncated to int.
    aascore = pepnet_df['Positional Score'].tolist()
    aascore = [i.replace('[', '').replace(']', '').replace(',', ' ').split() for i in aascore]
    aascore = [[str(int(float(j)*100)) for j in i] for i in aascore]
    pepnet_df['PepNet aaScore'] = [" ".join(i) for i in aascore]
    pepnet_df['Area'] = 1

    # .copy() so the assignment below targets an independent frame rather than
    # a column-subset of the original (avoids SettingWithCopyWarning).
    pepnet_df = pepnet_df[['Spectrum Name', 'PepNet Peptide', 'PepNet aaScore', 'PepNet Score', 'Area']].copy()
    pepnet_df['PepNet Score'] = pepnet_df['PepNet Score'].astype(int)
    pepnet_df = pepnet_df[pepnet_df['PepNet Score'] >= SocreAll[tool]]
    resultdir = out_path + mab +'/'
    pepnet_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'
    pepnet_df.to_csv(pepnet_out,index=False)

    # No shell is involved, so a literal '>>' would just be handed to the JVM as
    # an argument. Append ALPS' stdout to the log through a real file handle.
    with open(resultdir + 'assembly.log', 'a') as log_file:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', pepnet_out, str(kmer), str(contigs_ALPS)),
            stdout=log_file)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings for the pNovo3 conversion in this cell.
quality_cutoff_Fusion = 50                    # appears in the output file name
aa_cutoff_Fusion = 50
tool = 'pNovo3'
quality_cutoff_Fusion_local = SocreAll[tool]  # per-tool threshold from SocreAll
# Defined here explicitly: the ALPS call below needs them, and relying on the
# values left behind by a previously executed cell is hidden state.
kmer = 7
contigs_ALPS = 20

# Sample directory name -> short antibody label (not used by the loop below).
Names = {
 '50ugmAb1':'mAb1',
 '100ugmAb2':'mAb2',
 '200ugmAb3':'mAb3',
 '20230210-mAb4':'mAb4',
 '20230707-mAb5':'mAb5',
 '20231203-mAb6':'mAb6',
 '20231221-mAb7':'mAb7',
 '20250415-mAb8':'mAb8'}

path = '../../../denovo/pNovo3/'
out_path = '../../../ALPS_Assembly/pNovo3/'
mabs = os.listdir(path)
for mab in mabs:
    print(mab)
    resultdir = out_path + mab +'/'
    pnovo_path = path + mab + '/HCD/results.res'
    df = pd.read_csv(pnovo_path, sep="\t", header=None)
    # Columns 0, 1, 4, 5 of the headerless file are the ones kept.
    df = df[[0, 1, 4, 5]]
    df.columns = ['Spectrum Name', 'pNovo3 Peptide', 'pNovo3 Score', 'pNovo3 aaScore']

    # Keep only rows with exactly one per-residue score per peptide character.
    peps = list(df['pNovo3 Peptide'])
    aas = list(df['pNovo3 aaScore'])
    loc = [len(p) == len(a.split(',')) for p, a in zip(peps, aas)]
    pnovo_df = df[loc].copy()

    # I -> L, then expand pNovo3's one-letter modification codes.
    pnovo_peptide = pnovo_df['pNovo3 Peptide'].tolist()
    pnovo_peptide = [str(i).replace('I','L').replace('a', 'N(+.98)').replace('b', 'Q(+.98)').replace('B', 'Q(+.98)').replace('c', 'M(+15.99)').replace('C','C(+57.02)') for i in pnovo_peptide]
    pnovo_df['pNovo3 Peptide']=pnovo_peptide
    pnovo_score = [int(i) for i in pnovo_df['pNovo3 Score'].tolist()]
    pnovo_df['pNovo3 Score']=pnovo_score

    # Comma-separated per-residue scores -> space-separated truncated integers.
    aascore = pnovo_df['pNovo3 aaScore'].tolist()
    aascore = [i.replace(',', ' ').split() for i in aascore]
    aascore = [[str(int(float(j))) for j in i] for i in aascore]
    pnovo_df['pNovo3 aaScore'] = [" ".join(i) for i in aascore]

    # .copy() after each selection so later assignments never target a slice
    # of another frame (avoids SettingWithCopyWarning).
    pnovo_df = pnovo_df[['Spectrum Name', 'pNovo3 Peptide', 'pNovo3 aaScore', 'pNovo3 Score']].copy()
    pnovo_df['pNovo3 Score'] = pnovo_df['pNovo3 Score'].astype(int)
    pnovo_df = pnovo_df[pnovo_df['pNovo3 Score'] >= SocreAll[tool]].copy()
    pnovo_df['Area'] = 1
    pnovo_out = resultdir + tool + '_confScoreThreshold_' + str(quality_cutoff_Fusion) + '_localScore_' + str(quality_cutoff_Fusion_local) + '.csv'
    pnovo_df.to_csv(pnovo_out,index=False)

    # No shell is involved, so a literal '>>' would just be handed to the JVM as
    # an argument. Append ALPS' stdout to the log through a real file handle.
    with open(resultdir + 'assembly.log', 'a') as log_file:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', pnovo_out, str(kmer), str(contigs_ALPS)),
            stdout=log_file)


In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings for the pi-HelixNovo conversion in this cell.
tool = 'pi-HelixNovo'
kmer, contigs_ALPS = 7, 20                  # passed to ALPS.jar after the CSV path
quality_cutoff_ALPS = 50                    # appears in the output file name
quality_cutoff_ALPS_local = SocreAll[tool]  # per-tool threshold from SocreAll

def process_pinovo_ALPS(pinovo_path,resultdir):
    """Convert a pi-HelixNovo result file into an ALPS input CSV and run ALPS on it.

    Reads the module-level settings ``tool``, ``SocreAll``, ``quality_cutoff_ALPS``,
    ``quality_cutoff_ALPS_local``, ``kmer`` and ``contigs_ALPS``.

            :param
                pinovo_path: path to the result file of Pi-HelixNovo (headerless,
                    tab-separated: title, sequence, score)
                resultdir: output directory prefix (concatenated as-is, so it
                    must end with a path separator)
            :return
                None; the filtered table is written to CSV and the ALPS output
                is appended to ``assembly.log`` in ``resultdir``
    """
    pinovo_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'
    raw_df = pd.read_csv(pinovo_path, sep="\t", header=None).dropna()
    raw_df.columns = ["Title", "sequence", "Score"]

    pinovo_peptide = raw_df['sequence'].tolist()
    # Collapse every modified residue to a single character so that the string
    # length equals the residue count; used only for sizing the aaScore below.
    single_letter = [
        str(i).replace('M(+15.99)', 'm').replace('Q(+.98)', 'q').replace('N(+.98)', 'n')
              .replace(' ', '').replace('C(+57.02)', 'C')
        for i in pinovo_peptide
    ]

    # scale peptide score from 0 to 100
    pinovo_score = [int(i * 100) for i in raw_df['Score']]

    # No per-residue score is read from the file: the peptide score is repeated
    # once per residue. Built with a plain comprehension so an empty input
    # produces an empty column instead of tripping DataFrame.apply(axis=1).
    aa_scores = [' '.join([str(score)] * len(pep)) for score, pep in zip(pinovo_score, single_letter)]

    pinovo_df = pd.DataFrame({
        'Spectrum Name': raw_df['Title'].tolist(),
        'pi-HelixNovo Peptide': pinovo_peptide,   # original notation is what gets written
        'pi-HelixNovo aaScore': aa_scores,
        'pi-HelixNovo Score': pinovo_score,
        'Area': 1,
    })
    pinovo_df['pi-HelixNovo Score'] = pinovo_df['pi-HelixNovo Score'].astype(int)
    pinovo_df = pinovo_df[pinovo_df['pi-HelixNovo Score'] >= SocreAll[tool]]
    pinovo_df.to_csv(pinovo_out,index=False)

    # No shell is involved, so a literal '>>' would just be handed to the JVM as
    # an argument. Append ALPS' stdout to the log through a real file handle.
    with open(resultdir + 'assembly.log', 'a') as log_file:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', pinovo_out, str(kmer), str(contigs_ALPS)),
            stdout=log_file)
    
# Sample directory name -> short antibody label (not used by the loop below).
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/pi-helixnovo/'
out_path = '../../../ALPS_Assembly/pi-HelixNovo/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
# Convert and assemble every sample directory found under `path`.
for mab in mabs:
    pinovo_path = f'{path}{mab}/denovo_denovo.txt'
    resultdir = f'{out_path}{mab}/'
    process_pinovo_ALPS(pinovo_path, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings for the pi-PrimeNovo conversion in this cell.
tool = 'pi-PrimeNovo'
kmer, contigs_ALPS = 7, 20                  # passed to ALPS.jar after the CSV path
quality_cutoff_ALPS = 50                    # appears in the output file name
quality_cutoff_ALPS_local = SocreAll[tool]  # per-tool threshold from SocreAll


def process_primenovo_ALPS(primenovo_path, resultdir):
    """Convert a pi-PrimeNovo result table into an ALPS input CSV and run ALPS on it.

    Reads the module-level settings ``tool``, ``SocreAll``, ``quality_cutoff_ALPS``,
    ``quality_cutoff_ALPS_local``, ``kmer`` and ``contigs_ALPS``.

            :param
                primenovo_path: path to the pi-PrimeNovo result table; the
                    columns ``label``, ``prediction`` and ``score`` are read
                resultdir: output directory prefix (concatenated as-is, so it
                    must end with a path separator)
            :return
                None; the filtered table is written to CSV and the ALPS output
                is appended to ``assembly.log`` in ``resultdir``
    """
    primenovo_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'

    # Modification tokens that are removed outright, in the order they are tried
    # (the order matters because some tokens are substrings of others).
    stripped_tokens = (
        '[+43.006-17.027]-', '+[43.006-17.027]', '[17.027]-', '[-17.027]-',
        '-[17.027]', '[+43.006]-', '+[43.006]', '[+42.011]-', '+[42.011]',
    )

    def _rewrite(peptide, met_ox, gln_deam, asn_deam, cys_cam, pyro_gln):
        """Apply the residue substitutions, then drop the stripped tokens."""
        text = (str(peptide)
                .replace('M[+15.995]', met_ox)
                .replace('Q[+0.984]', gln_deam)
                .replace('N[+0.984]', asn_deam)
                .replace(' ', '')
                .replace('C[+57.021]', cys_cam)
                .replace('[-17.027]-Q', pyro_gln)
                .replace('-[17.027]-Q', pyro_gln))
        for token in stripped_tokens:
            text = text.replace(token, '')
        return text

    primenovo_df = pd.read_table(primenovo_path)
    primenovo_peptide = primenovo_df['prediction'].tolist()

    # One character per residue, used only to size the aaScore string.
    single_letter = [_rewrite(i, 'm', 'q', 'n', 'C', 'p') for i in primenovo_peptide]
    # Mass-shift notation that is written to the CSV.
    alps_peptide = [_rewrite(i, 'M(+15.99)', 'Q(+.98)', 'N(+.98)', 'C(+57.02)', 'Q(-17.03)')
                    for i in primenovo_peptide]

    # Peptide score x100, truncated to int.
    peptide_scores = [int(i * 100) for i in primenovo_df['score']]
    # No per-residue score is read from the table: the peptide score is repeated
    # once per residue. A plain comprehension also copes with an empty table,
    # unlike DataFrame.apply(axis=1).
    aa_scores = [' '.join([str(score)] * len(pep)) for score, pep in zip(peptide_scores, single_letter)]

    out_df = pd.DataFrame({
        'Spectrum Name': primenovo_df['label'].tolist(),
        'pi-PrimeNovo Peptide': alps_peptide,
        'pi-PrimeNovo aaScore': aa_scores,
        'pi-PrimeNovo Score': peptide_scores,
        'Area': 1,
    })
    out_df['pi-PrimeNovo Score'] = out_df['pi-PrimeNovo Score'].astype(int)
    out_df = out_df[out_df['pi-PrimeNovo Score'] >= SocreAll[tool]]
    out_df.to_csv(primenovo_out,index=False)

    # No shell is involved, so a literal '>>' would just be handed to the JVM as
    # an argument. Append ALPS' stdout to the log through a real file handle.
    with open(resultdir + 'assembly.log', 'a') as log_file:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', primenovo_out, str(kmer), str(contigs_ALPS)),
            stdout=log_file)

# Sample directory name -> short antibody label (not used by the loop below).
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/pi-primenovo/'
out_path = '../../../ALPS_Assembly/pi-PrimeNovo/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
# Convert and assemble every sample directory found under `path`.
for mab in mabs:
    primenovo_path = f'{path}{mab}/denovo.tsv'
    resultdir = f'{out_path}{mab}/'
    process_primenovo_ALPS(primenovo_path, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings for the AdaNovo conversion in this cell.
tool = 'AdaNovo'
kmer, contigs_ALPS = 7, 20                  # passed to ALPS.jar after the CSV path
quality_cutoff_ALPS = 50                    # appears in the output file name
quality_cutoff_ALPS_local = SocreAll[tool]  # per-tool threshold from SocreAll

def process_adanovo_ALPS(adanovo_path, mgf_file, resultdir):
    """Convert an AdaNovo mzTab result into an ALPS input CSV and run ALPS on it.

    Reads the module-level settings ``tool``, ``SocreAll``, ``quality_cutoff_ALPS``,
    ``quality_cutoff_ALPS_local``, ``kmer`` and ``contigs_ALPS``.

            :param
                adanovo_path: path to the mztab file of AdaNovo
                mgf_file: path to the MGF file; its ``TITLE=`` entries, in file
                    order, are looked up by the index parsed from ``spectra_ref``
                resultdir: output directory prefix (concatenated as-is, so it
                    must end with a path separator)
            :return
                None; the filtered table is written to CSV and the ALPS output
                is appended to ``assembly.log`` in ``resultdir``
            :raises
                ValueError: if the mzTab file has no line starting with ``PSH``
    """
    # Collect spectrum titles in file order (streamed, not loaded at once).
    titles = []
    with open(mgf_file, 'r') as fr:
        for line in fr:
            if 'TITLE=' in line:
                titles.append(line.strip().split('TITLE=')[-1])

    adanovo_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'

    # The PSM table starts at the first line beginning with "PSH".
    adanovo_header = None
    with open(adanovo_path) as f:
        for i, line in enumerate(f):
            if line.startswith("PSH"):
                adanovo_header = i
                break
    if adanovo_header is None:
        raise ValueError(f"No 'PSH' header line found in {adanovo_path}")

    # Load the data into a DataFrame
    df_deno = pd.read_csv(adanovo_path, sep="\t", header=adanovo_header)
    df_denovo = df_deno[df_deno['sequence'].notna()].copy()

    # spectra_ref looks like "ms_run[1]:index=<n>"; <n> indexes the MGF titles.
    spectrum_index = [int(item.replace('ms_run[1]:index=', '')) for item in df_denovo['spectra_ref']]
    spectrum_names = [titles[x] if x < len(titles) else None for x in spectrum_index]

    # Linear rescale of the peptide score: -1 -> 0, 1 -> 100 (truncated to int).
    peptide_scores = [int((s * 100 + 100) / 2) for s in df_denovo['search_engine_score[1]'].tolist()]

    # Comma-separated per-residue scores -> space-separated integers (x100, truncated).
    aa_scores = [
        " ".join(str(int(float(v) * 100)) for v in raw.replace(',', ' ').split())
        for raw in df_denovo['opt_ms_run[1]_aa_scores'].tolist()
    ]

    df = pd.DataFrame({
        'Spectrum Name': spectrum_names,
        'AdaNovo Peptide': df_denovo['sequence'].tolist(),
        'AdaNovo aaScore': aa_scores,
        'AdaNovo Score': peptide_scores,
        'Area': 1,
    })
    df['AdaNovo Score'] = df['AdaNovo Score'].astype(int)
    df = df[df['AdaNovo Score'] >= SocreAll[tool]]
    df.to_csv(adanovo_out,index=False)

    # No shell is involved, so a literal '>>' would just be handed to the JVM as
    # an argument. Append ALPS' stdout to the log through a real file handle.
    with open(resultdir + 'assembly.log', 'a') as log_file:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', adanovo_out, str(kmer), str(contigs_ALPS)),
            stdout=log_file)

# Sample directory name -> short antibody label used in the MGF file name.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/adanovo/'
out_path = '../../../ALPS_Assembly/AdaNovo/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
# Convert and assemble every sample directory found under `path`.
for mab in mabs:
    adanovo_path = f'{path}{mab}/denovo.mztab'
    resultdir = f'{out_path}{mab}/'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    process_adanovo_ALPS(adanovo_path, mgf_file, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings for the ContraNovo conversion in this cell.
tool = 'ContraNovo'
kmer, contigs_ALPS = 7, 20                  # passed to ALPS.jar after the CSV path
quality_cutoff_ALPS = 50                    # appears in the output file name
quality_cutoff_ALPS_local = SocreAll[tool]  # per-tool threshold from SocreAll

def process_contranovo_ALPS(contranovo_path, resultdir):
    """Convert a ContraNovo mzTab result into an ALPS input CSV and run ALPS on it.

    Reads the module-level settings ``tool``, ``SocreAll``, ``quality_cutoff_ALPS``,
    ``quality_cutoff_ALPS_local``, ``kmer`` and ``contigs_ALPS``.

            :param
                contranovo_path: path to the mztab file of ContraNovo
                resultdir: output directory prefix (concatenated as-is, so it
                    must end with a path separator)
            :return
                None; the filtered table is written to CSV and the ALPS output
                is appended to ``assembly.log`` in ``resultdir``
            :raises
                ValueError: if the mzTab file has no line starting with ``PSH``,
                    or if a peptide and its per-residue scores still differ in
                    length after the modification tokens have been encoded
    """
    contranovo_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'

    # The PSM table starts at the first line beginning with "PSH".
    contranovo_header = None
    with open(contranovo_path) as f:
        for i, line in enumerate(f):
            if line.startswith("PSH"):
                contranovo_header = i
                break
    if contranovo_header is None:
        raise ValueError(f"No 'PSH' header line found in {contranovo_path}")

    # Load the data into a DataFrame
    df_deno = pd.read_csv(contranovo_path, sep="\t", header=contranovo_header)
    df_denovo = df_deno[df_deno['sequence'].notna()].copy()

    # The spectrum name is spectra_ref with "ms_run[" and "]" removed.
    PSM_ID = [i.replace('ms_run[','').replace(']','') for i in df_denovo['spectra_ref']]
    sequence = df_denovo['sequence'].tolist()

    # Linear rescale of the peptide score: -1 -> 0, 1 -> 100 (truncated to int).
    Score = [int((i * 100 + 100)/2) for i in df_denovo['search_engine_score[1]'].tolist()]

    # Comma-separated per-residue scores -> space-separated integers (x100, truncated).
    aascore = [
        " ".join(str(int(float(j) * 100)) for j in raw.replace(',', ' ').split())
        for raw in df_denovo['opt_ms_run[1]_aa_scores'].tolist()
    ]

    # Encode every modified residue / terminal modification as ONE character so
    # that string positions line up with score positions. The combined token
    # '+43.006-17.027' must be replaced before its two parts.
    seq = []
    for s, aa in zip(sequence, aascore):
        s_modified = (s.replace('M+15.995', 'm')
                       .replace('Q+0.984', 'q')
                       .replace('N+0.984', 'n')
                       .replace('C+57.021', 'C')
                       .replace('+42.011', 'g')
                       .replace('+43.006-17.027', 'd')
                       .replace('-17.027', 'e')
                       .replace('+43.006', 'f'))
        # On a length mismatch, expand the combined token 'd' into its two
        # parts 'f' + 'e' (two positions instead of one).
        if len(s_modified) != len(aa.split()):
            s_modified = s_modified.replace('d', 'fe')
        seq.append(s_modified)

    # Drop the positions encoded as p, d, e, f or g and, in step, the score at
    # the same position.
    excluded_chars = {'p', 'd', 'e', 'f', 'g'}
    filtered_sequences = []
    filtered_aascores = []
    for s, aa in zip(seq, aascore):
        aa_list = aa.split()

        # Sequence and score list must be the same length at this point.
        if len(s) != len(aa_list):
            raise ValueError(f"序列长度与分数数量不匹配: 序列长度 {len(s)}, 分数数量 {len(aa_list)}")

        kept = [(char, score) for char, score in zip(s, aa_list) if char not in excluded_chars]
        filtered_sequences.append(''.join(char for char, _ in kept))
        filtered_aascores.append(' '.join(score for _, score in kept))

    # Back from one-character codes to explicit mass-shift notation.
    filtered_sequences = [seqe.replace('C','C(+57.02)').replace('m','M(+15.99)').replace('q','Q(+.98)').replace('n','N(+.98)')  for seqe in filtered_sequences]

    df = pd.DataFrame({
        'Spectrum Name': PSM_ID,
        'ContraNovo Peptide': filtered_sequences,
        'ContraNovo aaScore': filtered_aascores,
        'ContraNovo Score': Score,
        'Area': 1,
    })
    df['ContraNovo Score'] = df['ContraNovo Score'].astype(int)
    df = df[df['ContraNovo Score'] >= SocreAll[tool]]
    df.to_csv(contranovo_out,index=False)

    # No shell is involved, so a literal '>>' would just be handed to the JVM as
    # an argument. Append ALPS' stdout to the log through a real file handle.
    with open(resultdir + 'assembly.log', 'a') as log_file:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', contranovo_out, str(kmer), str(contigs_ALPS)),
            stdout=log_file)

# Sample directory name -> short antibody label (not used by the loop below).
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/contranovo/'
out_path = '../../../ALPS_Assembly/ContraNovo/'
mabs = os.listdir(path)
# Convert and assemble every sample directory found under `path`.
for mab in mabs:
    print(mab)
    contranovo_path = f'{path}{mab}/denovo.mztab'
    resultdir = f'{out_path}{mab}/'
    process_contranovo_ALPS(contranovo_path, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings for the DeepNovo conversion in this cell.
tool = 'DeepNovo'
kmer, contigs_ALPS = 7, 20                  # passed to ALPS.jar after the CSV path
quality_cutoff_ALPS = 50                    # appears in the output file name
quality_cutoff_ALPS_local = SocreAll[tool]  # per-tool threshold from SocreAll

def process_deepnovo_ALPS(deepnovo_path, resultdir):
    """Convert a DeepNovo result table into an ALPS input CSV and run ALPS on it.

    Reads the module-level settings ``tool``, ``SocreAll``, ``quality_cutoff_ALPS``,
    ``quality_cutoff_ALPS_local``, ``kmer`` and ``contigs_ALPS``.

            :param
                deepnovo_path: path to the result file of DeepNovo (tab-separated
                    with a header row)
                resultdir: output directory prefix (concatenated as-is, so it
                    must end with a path separator)
            :return
                None; the filtered table is written to CSV and the ALPS output
                is appended to ``assembly.log`` in ``resultdir``
    """
    deepnovo_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'
    deepno_df = pd.read_csv(deepnovo_path, sep="\t", header=0)
    deepno_df = deepno_df[['scan', 'predicted_score', 'predicted_sequence',  'predicted_position_score']]

    # Original author's note: a bug in deepnovo (v.PNAS) can leave the result
    # misaligned, e.g.
    #     scan    output_score    output_seq  aa_scpre
    #     1       NaN             NaN         NaN
    #     NaN     24              S,E,L       12, 41, 12
    #     2       NaN ...
    # Nothing here realigns such rows; rows without a predicted sequence are
    # simply dropped.
    deepnovo_df = deepno_df[deepno_df['predicted_sequence'].notna()].copy()

    # change peptide from P,E,P,T,I,D to PEPTLD (I -> L) and expand the "*mod" codes
    deepnovo_df['predicted_sequence'] = [
        str(pep).replace(",", "").replace("I", "L")
                .replace("Cmod", "C(+57.02)").replace("Mmod", "M(+15.99)")
                .replace("Nmod", "N(+.98)").replace("Qmod", "Q(+.98)")
        for pep in deepnovo_df['predicted_sequence'].tolist()
    ]
    deepnovo_df.columns = ['Spectrum Name','DeepNovo Score', 'DeepNovo Peptide', 'DeepNovo aaScore']

    # peptide score: exp(score) x100, truncated; a missing score becomes 0
    deepnovo_df['DeepNovo Score'] = [
        int(np.exp(i) * 100) if not pd.isna(i) else 0
        for i in deepnovo_df['DeepNovo Score']
    ]

    # per-residue score: exp(score) x100, truncated; missing entries become ''
    deepnovo_aascore = [str(i).replace(",", " ").split() for i in deepnovo_df['DeepNovo aaScore'].tolist()]
    deepnovo_aascore = [
        [str(int(np.exp(float(j)) * 100)) if j not in ['nan', 'NaN', '', None] else '' for j in i]
        for i in deepnovo_aascore
    ]
    deepnovo_df['DeepNovo aaScore'] = [" ".join(i) for i in deepnovo_aascore]
    deepnovo_df['Area'] = 1
    deepnovo_df['DeepNovo Score'] = deepnovo_df['DeepNovo Score'].astype(int)
    deepnovo_df = deepnovo_df[deepnovo_df['DeepNovo Score'] >= SocreAll[tool]]
    deepnovo_df = deepnovo_df[['Spectrum Name', 'DeepNovo Peptide', 'DeepNovo aaScore', 'DeepNovo Score', 'Area']]
    deepnovo_df.to_csv(deepnovo_out,index=False)

    # No shell is involved, so a literal '>>' would just be handed to the JVM as
    # an argument. Append ALPS' stdout to the log through a real file handle.
    with open(resultdir + 'assembly.log', 'a') as log_file:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', deepnovo_out, str(kmer), str(contigs_ALPS)),
            stdout=log_file)

path = '../../../denovo/deepnovo2/'
out_path = '../../../ALPS_Assembly/DeepNovo/'
mabs = os.listdir(path)
# Convert and assemble every sample directory found under `path`; the result
# file inside each directory is named after the directory itself.
for mab in mabs:
    print(mab)
    deepnovo_path = f'{path}{mab}/{mab}.deepnovo_denovo'
    resultdir = f'{out_path}{mab}/'
    process_deepnovo_ALPS(deepnovo_path, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings for the InstaNovo conversion in this cell.
tool = 'InstaNovo'
kmer, contigs_ALPS = 7, 20                  # passed to ALPS.jar after the CSV path
quality_cutoff_ALPS = 50                    # appears in the output file name
quality_cutoff_ALPS_local = SocreAll[tool]  # per-tool threshold from SocreAll

def process_instanovo_ALPS(instanovo_path,mgf_file,resultdir):
    """Convert an InstaNovo prediction CSV into an ALPS input CSV and run ALPS on it.

    Reads the module-level settings ``tool``, ``SocreAll``, ``quality_cutoff_ALPS``,
    ``quality_cutoff_ALPS_local``, ``kmer`` and ``contigs_ALPS``.

            :param
                instanovo_path: path to the prediction CSV of InstaNovo; the
                    columns ``scan_number``, ``predictions``,
                    ``log_probabilities`` and ``token_log_probabilities`` are read
                mgf_file: path to the MGF file; its ``TITLE=`` entries, in file
                    order, are looked up by ``scan_number``
                resultdir: output directory prefix (concatenated as-is, so it
                    must end with a path separator)
            :return
                None; the filtered table is written to CSV and the ALPS output
                is appended to ``assembly.log`` in ``resultdir``
    """
    instanovo_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'

    # Collect spectrum titles in file order (streamed, not loaded at once).
    titles = []
    with open(mgf_file, 'r') as fr:
        for line in fr:
            if 'TITLE=' in line:
                titles.append(line.strip().split('TITLE=')[-1])

    # Load the data into a DataFrame
    df_deno = pd.read_csv(instanovo_path)
    df_denovo = df_deno[df_deno['predictions'].notna()].copy()

    # scan_number is used directly as an index into the MGF titles.
    # NOTE(review): this treats scan_number as 0-based - confirm against the data.
    scan_index = [int(item) for item in df_denovo['scan_number']]
    spectrum_names = [titles[x] if x < len(titles) else None for x in scan_index]

    # Peptide score: exp(log-probability) x100, truncated.
    peptide_scores = [int(np.exp(i) * 100) for i in df_denovo['log_probabilities'].tolist()]

    # "[a, b, ..., z]" -> per-token exp(.) x100; the LAST token is dropped.
    # NOTE(review): presumably the last entry is an end-of-sequence token - confirm.
    aa_scores = []
    for raw in df_denovo['token_log_probabilities'].tolist():
        tokens = raw.replace(',', ' ').replace('[', '').replace(']', '').split()
        aa_scores.append(" ".join(str(int(np.exp(float(t)) * 100)) for t in tokens[:-1]))

    # UNIMOD tags -> mass-shift notation, then I -> L.
    seq = [str(i).replace('M[UNIMOD:35]', 'M(+15.99)').replace('Q[UNIMOD:7]', 'Q(+.98)').replace('N[UNIMOD:7]', 'N(+.98)').replace('C[UNIMOD:4]', 'C(+57.02)').replace('I','L') for i in df_denovo['predictions'].tolist()]

    df = pd.DataFrame({
        'Spectrum Name': spectrum_names,
        'InstaNovo Peptide': seq,
        'InstaNovo aaScore': aa_scores,
        'InstaNovo Score': peptide_scores,
        'Area': 1,
    })
    df['InstaNovo Score'] = df['InstaNovo Score'].astype(int)
    df = df[df['InstaNovo Score'] >= SocreAll[tool]]

    # Keep only rows whose residue count (every known modified residue collapsed
    # to one character) equals the number of per-residue scores.
    peps = [i.replace('C(+57.02)','C').replace('M(+15.99)','m').replace('Q(+.98)','q').replace('N(+.98)','n') for i in df['InstaNovo Peptide']]
    aas = list(df['InstaNovo aaScore'])
    loc = [len(p) == len(a.split()) for p, a in zip(peps, aas)]
    df1 = df[loc].copy()

    # Written once: the earlier unfiltered dump to this same path was always
    # overwritten by this one.
    df1.to_csv(instanovo_out,index=False)

    # No shell is involved, so a literal '>>' would just be handed to the JVM as
    # an argument. Append ALPS' stdout to the log through a real file handle.
    with open(resultdir + 'assembly.log', 'a') as log_file:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', instanovo_out, str(kmer), str(contigs_ALPS)),
            stdout=log_file)

# Sample directory name -> short antibody label used in the MGF file name.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/instanovo_new/'
out_path = '../../../ALPS_Assembly/InstaNovo/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
# Convert and assemble every sample directory found under `path`.
for mab in mabs:
    instanovo_path = f'{path}{mab}/predictions_before_refinement.csv'
    resultdir = f'{out_path}{mab}/'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    process_instanovo_ALPS(instanovo_path, mgf_file, resultdir)


In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings read by this cell's converter: the two cutoffs are embedded in the
# output CSV file name, kmer and contigs_ALPS are passed to ALPS.jar.
quality_cutoff_ALPS, tool = 50, 'PointNovo'
quality_cutoff_ALPS_local = SocreAll[tool]
kmer, contigs_ALPS = 7, 20

def process_pointnovo_ALPS(pointnovo_path, mgf_file, resultdir):
    """Convert PointNovo predictions to an ALPS input CSV and run ALPS on it.

    The tab-separated PointNovo table is read, each ``feature_id`` is replaced
    by the matching spectrum title from the MGF file, peptides and scores are
    rewritten, rows scoring below ``SocreAll[tool]`` are dropped, and the
    result is written to a CSV in ``resultdir`` that is then passed to
    ``ALPS.jar``.

            :param
                pointnovo_path: path to the tab-separated result file of PointNovo
                mgf_file: path to the MGF file whose ``TITLE=`` lines name the spectra
                resultdir: output location; it is concatenated as-is with the
                    file names, so it must end with a path separator
            :return
                None. The CSV and ``assembly.log`` are written under ``resultdir``.

    Reads the module-level names ``tool``, ``quality_cutoff_ALPS``,
    ``quality_cutoff_ALPS_local``, ``kmer``, ``contigs_ALPS`` and ``SocreAll``.
    """

    pointnovo_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'

    # Spectrum titles in file order: titles[i] belongs to the (i + 1)-th spectrum.
    titles = []
    with open(mgf_file, 'r') as fr:
        for line in fr:
            if 'TITLE=' in line:
                titles.append(line.strip().split('TITLE=')[-1])

    pointno_df = pd.read_csv(pointnovo_path, sep="\t", header=0)
    pointno_df['PSM_ID'] = [int(item) for item in pointno_df['feature_id']]
    # feature_id is used as a 1-based index (titles[x - 1]), so the valid ids
    # are 1..len(titles). Ids outside that range get no title instead of
    # wrapping around to the end of the list or dropping the last spectrum.
    pointno_df['PSM_ID'] = pointno_df['PSM_ID'].apply(
        lambda x: titles[x - 1] if 1 <= x <= len(titles) else None)

    pointno_df = pointno_df[['PSM_ID', 'predicted_sequence', 'predicted_score', 'predicted_position_score']]
    pointnovo_df = pointno_df[pointno_df['predicted_sequence'].notna()].copy()

    # change peptide from P,E,P,T,I,D to PEPTLD (I is written as L) and
    # rewrite the modification names as mass-shift tags
    pointnovo_df['predicted_sequence'] = [
        str(peptide).replace(",", "").replace("I", "L")
        .replace("N(Deamidation)", "N(+.98)")
        .replace("Q(Deamidation)", "Q(+.98)")
        .replace("C(Carbamidomethylation)", "C(+57.02)")
        .replace("M(Oxidation)", "M(+15.99)")
        for peptide in pointnovo_df['predicted_sequence']
    ]
    pointnovo_df.columns = ['Spectrum Name','PointNovo Peptide', 'PointNovo Score', 'PointNovo aaScore']

    # peptide score: exp(score) * 100 truncated to int; missing scores become 0
    pointnovo_df['PointNovo Score'] = [
        int(np.exp(i) * 100) if not pd.isna(i) else 0
        for i in pointnovo_df['PointNovo Score']
    ]

    # per-residue scores: same transform applied to each comma-separated value,
    # re-joined with single spaces
    position_scores = [str(i).replace(",", " ").split() for i in pointnovo_df['PointNovo aaScore']]
    pointnovo_df['PointNovo aaScore'] = [
        " ".join(
            str(int(np.exp(float(j)) * 100)) if j not in ['nan', 'NaN', '', None] else ''
            for j in row
        )
        for row in position_scores
    ]

    # .copy() so the column assignments below act on an independent frame
    pointnovo_df = pointnovo_df[['Spectrum Name','PointNovo Peptide', 'PointNovo aaScore', 'PointNovo Score']].copy()
    pointnovo_df['Area'] = 1

    pointnovo_df['PointNovo Score'] = pointnovo_df['PointNovo Score'].astype(int)
    pointnovo_df = pointnovo_df[pointnovo_df['PointNovo Score'] >= SocreAll[tool]]
    pointnovo_df.to_csv(pointnovo_out, index=False)

    # '>>' is shell syntax; without a shell it would reach java as a literal
    # argument and nothing would be logged. Append stdout to the log directly.
    with open(resultdir + 'assembly.log', 'a') as log:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', pointnovo_out, str(kmer), str(contigs_ALPS)),
            stdout=log)
            
# Sample folder name -> short label that appears in that sample's MGF file name.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/pointnovo/'
out_path = '../../../ALPS_Assembly/PointNovo/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
# Every entry found under `path` is processed; its name must be a key of Names.
for mab in mabs:
    pointnovo_path = f'{path}{mab}/{mab}.deepnovo_denovo'
    resultdir = f'{out_path}{mab}/'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    process_pointnovo_ALPS(pointnovo_path, mgf_file, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
# Settings read by this cell's converter: the two cutoffs are embedded in the
# output CSV file name, kmer and contigs_ALPS are passed to ALPS.jar.
quality_cutoff_ALPS, tool = 50, 'PGPointNovo'
quality_cutoff_ALPS_local = SocreAll[tool]
kmer, contigs_ALPS = 7, 20

def process_pgpointnovo_ALPS(pgpointnovo_path,mgf_file, resultdir):
    """Convert PGPointNovo predictions to an ALPS input CSV and run ALPS on it.

    The tab-separated PGPointNovo table is read, each ``feature_id`` is mapped
    to the matching spectrum title from the MGF file, peptides and scores are
    rewritten, rows scoring below ``SocreAll[tool]`` are dropped, and the
    result is written to a CSV in ``resultdir`` that is then passed to
    ``ALPS.jar``.

            :param
                pgpointnovo_path: path to the tab-separated result file of PGPointNovo
                mgf_file: path to the MGF file whose ``TITLE=`` lines name the spectra
                resultdir: output location; it is concatenated as-is with the
                    file names, so it must end with a path separator
            :return
                None. The CSV and ``assembly.log`` are written under ``resultdir``.

    Reads the module-level names ``tool``, ``quality_cutoff_ALPS``,
    ``quality_cutoff_ALPS_local``, ``kmer``, ``contigs_ALPS`` and ``SocreAll``.
    """
    pgpointnovo_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'

    # Spectrum titles in file order: titles[i] belongs to the (i + 1)-th spectrum.
    titles = []
    with open(mgf_file, 'r') as fr:
        for line in fr:
            if 'TITLE=' in line:
                titles.append(line.strip().split('TITLE=')[-1])

    pgpointno_df = pd.read_csv(pgpointnovo_path, sep="\t", header=0)

    pgpointno_df['PSM_ID'] = [int(item) for item in pgpointno_df['feature_id']]
    # feature_id is used as a 1-based index (titles[x - 1]), so the valid ids
    # are 1..len(titles). Ids outside that range get no title instead of
    # wrapping around to the end of the list or dropping the last spectrum.
    pgpointno_df['Spectrum Name'] = pgpointno_df['PSM_ID'].apply(
        lambda x: titles[x - 1] if 1 <= x <= len(titles) else None)

    pgpointno_df = pgpointno_df[['Spectrum Name', 'predicted_sequence', 'predicted_score', 'predicted_position_score']]
    pgpointnovo_df = pgpointno_df[pgpointno_df['predicted_sequence'].notna()].copy()

    # change peptide from P,E,P,T,I,D to PEPTLD (I is written as L) and
    # rewrite the modification names as mass-shift tags
    pgpointnovo_df['predicted_sequence'] = [
        str(peptide).replace(",", "").replace("I", "L")
        .replace("N(Deamidation)", "N(+.98)")
        .replace("Q(Deamidation)", "Q(+.98)")
        .replace("C(Carbamidomethylation)", "C(+57.02)")
        .replace("M(Oxidation)", "M(+15.99)")
        for peptide in pgpointnovo_df['predicted_sequence']
    ]
    pgpointnovo_df.columns = ['Spectrum Name', 'PGPointNovo Peptide', 'PGPointNovo Score', 'PGPointNovo aaScore']

    # peptide score: exp(score) * 100 truncated to int; missing scores become 0
    pgpointnovo_df['PGPointNovo Score'] = [
        int(np.exp(i) * 100) if not pd.isna(i) else 0
        for i in pgpointnovo_df['PGPointNovo Score']
    ]

    # per-residue scores: same transform applied to each comma-separated value,
    # re-joined with single spaces
    position_scores = [str(i).replace(",", " ").split() for i in pgpointnovo_df['PGPointNovo aaScore']]
    pgpointnovo_df['PGPointNovo aaScore'] = [
        " ".join(
            str(int(np.exp(float(j)) * 100)) if j not in ['nan', 'NaN', '', None] else ''
            for j in row
        )
        for row in position_scores
    ]

    # .copy() so the column assignments below act on an independent frame
    pgpointnovo_df = pgpointnovo_df[['Spectrum Name','PGPointNovo Peptide', 'PGPointNovo aaScore', 'PGPointNovo Score']].copy()
    pgpointnovo_df['Area'] = 1

    pgpointnovo_df['PGPointNovo Score'] = pgpointnovo_df['PGPointNovo Score'].astype(int)
    pgpointnovo_df = pgpointnovo_df[pgpointnovo_df['PGPointNovo Score'] >= SocreAll[tool]]
    pgpointnovo_df.to_csv(pgpointnovo_out, index=False)

    # '>>' is shell syntax; without a shell it would reach java as a literal
    # argument and nothing would be logged. Append stdout to the log directly.
    with open(resultdir + 'assembly.log', 'a') as log:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', pgpointnovo_out, str(kmer), str(contigs_ALPS)),
            stdout=log)

# Sample folder name -> short label that appears in that sample's MGF file name.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/pgpointnovo/'
out_path = '../../../ALPS_Assembly/PGPointNovo/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
# Every entry found under `path` is processed; its name must be a key of Names.
for mab in mabs:
    pgpointnovo_path = f'{path}{mab}/{mab}.deepnovo_denovo'
    resultdir = f'{out_path}{mab}/'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    process_pgpointnovo_ALPS(pgpointnovo_path, mgf_file, resultdir)

In [None]:
import pandas as pd
import numpy as np
import subprocess 
import statistics
# Settings read by this cell's converter: the two cutoffs are embedded in the
# output CSV file name, kmer and contigs_ALPS are passed to ALPS.jar.
quality_cutoff_ALPS, tool = 50, 'SMSNet'
quality_cutoff_ALPS_local = SocreAll[tool]
kmer, contigs_ALPS = 7, 20

def process_smsnet_ALPS(smsnet_path, mgf_file, resultdir):
    """Convert SMSNet predictions to an ALPS input CSV and run ALPS on it.

    ``smsnet_path`` is read as one peptide per line and ``smsnet_path + '_prob'``
    as one line of space-separated per-residue scores per peptide. Rows are
    paired by position with the ``TITLE=`` lines of the MGF file, so all three
    must describe the same number of spectra (the title assignment fails
    otherwise).

            :param
                smsnet_path: path to the result file of SMSNet (peptides); the
                    per-residue scores are read from the same path + '_prob'
                mgf_file: path to the MGF file whose ``TITLE=`` lines name the spectra
                resultdir: output location; it is concatenated as-is with the
                    file names, so it must end with a path separator
            :return
                None. The CSV and ``assembly.log`` are written under ``resultdir``.

    Reads the module-level names ``tool``, ``quality_cutoff_ALPS``,
    ``quality_cutoff_ALPS_local``, ``kmer``, ``contigs_ALPS`` and ``SocreAll``.
    """
    smsnet_out = resultdir + tool + f'_confScoreThreshold_{str(quality_cutoff_ALPS)}_localScore_{str(quality_cutoff_ALPS_local)}.csv'

    # Spectrum titles in file order.
    titles = []
    with open(mgf_file, 'r') as fr:
        for line in fr:
            if 'TITLE=' in line:
                titles.append(line.strip().split('TITLE=')[-1])

    # Only the reading happens while the files are open; they are closed
    # before the frame is built and before java is started.
    with open(smsnet_path) as f, open(
            smsnet_path + '_prob') as g:  # change _rescore and _prob to switch between rescoring and real
        peptide_list = [line.rstrip().replace(" ", "").replace("I", "L") for line in f]
        prob_rows = [line.strip().split(' ') for line in g]

    # Per-residue score: exp(value) * 100 truncated to int. An empty line
    # (no prediction) is represented as the single score [0].
    aa_score = []
    for tokens in prob_rows:
        if tokens == ['']:
            aa_score.append([0])
        else:
            aa_score.append([int(float(np.exp(float(t)) * 100)) for t in tokens])

    # Peptide score: truncated mean of the per-residue scores.
    score_sum = [int(statistics.mean(row)) if row != [0] else 0 for row in aa_score]

    smsnet_df = pd.concat(
        [pd.DataFrame(peptide_list),
         pd.DataFrame({'aaScore': aa_score, 'Peptide Score': score_sum})],
        axis=1)
    smsnet_df['Spectrum Name'] = titles
    smsnet_df.columns = ['SMSNet Peptide', 'SMSNet aaScore', 'SMSNet Score','Spectrum Name']
    # .copy() so the column assignments below act on an independent frame
    smsnet_df = smsnet_df[['Spectrum Name','SMSNet Peptide', 'SMSNet aaScore', 'SMSNet Score']].copy()

    # "[12, 34, 56]" -> "12 34 56"
    smsnet_df['SMSNet aaScore'] = [
        str(i).replace(' ', '').replace(',', ' ').replace('[', '').replace(']', '')
        for i in smsnet_df['SMSNet aaScore']
    ]
    smsnet_df['Area'] = 1
    smsnet_df['SMSNet Peptide'] = smsnet_df['SMSNet Peptide'].astype(str).str.strip()

    # Drop empty predictions and those containing the <unk> / <s> tokens.
    smsnet_df1 = smsnet_df[smsnet_df['SMSNet Peptide'] != ''].copy()
    smsnet_df1 = smsnet_df1[~smsnet_df1['SMSNet Peptide'].str.contains('<unk>|<s>', na=False)].copy()

    # Every C and the lowercase m / n / q codes are expanded to mass-shift tags.
    smsnet_df1['SMSNet Peptide'] = [
        i.replace('C','C(+57.02)').replace('m','M(+15.99)').replace('n','N(+.98)').replace('q','Q(+.98)')
        for i in smsnet_df1['SMSNet Peptide']
    ]
    smsnet_df1['SMSNet Score'] = smsnet_df1['SMSNet Score'].astype(int)
    smsnet_df1 = smsnet_df1[smsnet_df1['SMSNet Score'] >= SocreAll[tool]]

    smsnet_df1.to_csv(smsnet_out, index=False)

    # '>>' is shell syntax; without a shell it would reach java as a literal
    # argument and nothing would be logged. Append stdout to the log directly.
    with open(resultdir + 'assembly.log', 'a') as log:
        subprocess.run(
            ('java', '-jar', '../../../ALPS.jar', smsnet_out, str(kmer), str(contigs_ALPS)),
            stdout=log)

# Sample folder name -> short label that appears in that sample's file names.
Names = {
    '50ugmAb1': 'mAb1',
    '100ugmAb2': 'mAb2',
    '200ugmAb3': 'mAb3',
    '20230210-mAb4': 'mAb4',
    '20230707-mAb5': 'mAb5',
    '20231203-mAb6': 'mAb6',
    '20231221-mAb7': 'mAb7',
    '20250415-mAb8': 'mAb8',
}

path = '../../../denovo/SMSNet/'
out_path = '../../../ALPS_Assembly/SMSNet/'
mgf_path = '../../../monoclonal_antibody/'
mabs = os.listdir(path)
# Every entry found under `path` is processed; its name must be a key of Names.
for mab in mabs:
    print(mab)
    smsnet_path = f'{path}{mab}/spectrum_{Names[mab]}_HCD_forSMSNet'
    resultdir = f'{out_path}{mab}/'
    mgf_file = f'{mgf_path}{mab}/process1/spectrum_{Names[mab]}_HCD.mgf'
    process_smsnet_ALPS(smsnet_path, mgf_file, resultdir)