In [1]:
from json import tool
import csv
import numpy as np
import os
import pandas as pd
import subprocess
import matplotlib
matplotlib.use('pdf')
from matplotlib import pyplot as plt
from sklearn import metrics
from fast_diff_match_patch import diff
import logging
from collections import Counter 

pd.options.mode.chained_assignment = None
logger = logging.getLogger(__name__)
logging.getLogger('matplotlib').setLevel(logging.ERROR)

In [2]:
# De novo sequencing tools under evaluation. Each name is used as a column
# prefix ("<tool> Peptide", "<tool> Score") when reading the summary table.
tools_list = [
    "CasanovoV1",
    "CasanovoV2",
    "CasanovoV3",
    "DeepNovo",
    "InstaNovo",
    "InstaNovoPlus",
    "PepNet",
    "pi-HelixNovo",
]

In [3]:
def arePermutation(str1, str2):
    """Return True when ``str2`` is a non-identical rearrangement of ``str1``.

    Args:
        str1: First sequence of characters.
        str2: Second sequence of characters.

    Returns:
        True if both inputs have the same length, contain exactly the same
        characters with the same multiplicities, and are not equal to each
        other. False otherwise; identical inputs are deliberately not
        reported as permutations.
    """
    if str1 == str2:
        return False
    if len(str1) != len(str2):
        return False
    # Compare the complete sorted character lists. Joining them with a
    # separator and indexing only the first len(str1) characters would check
    # just the first half of the sorted letters.
    return sorted(str1) == sorted(str2)

In [4]:
# (counter key, report label) pairs, in the order they are written to a report.
_ERROR_REPORT_ROWS = (
    ("permutations_first3",
     "\nAmount of permutations at first three positions: in total numbers "),
    ("permutations_last3",
     "\nAmount of permutations at last three positions: in total numbers "),
    ("permutations_last_and_first3",
     "\nAmount of permutations at last three and first three posistions: in total numbers "),
    ("amount_1AA_replacements",
     "\nNumber where 1 AA was replaced by 1 or 2 AA: "),
    ("amount_1AA_replacements_firstposition",
     "\nNumber where 1 AA was replaced by 1 or 2 AA in the first position: "),
    ("amount_1AA_replacements_twoposition",
     "\nNumber where 1 AA was replaced by 1 or 2 AA in the first two positions: "),
    ("amount_1AA_replacements_lastposition",
     "\nNumber where 1 AA was replaced by 1 or 2 AA in the last position: "),
    ("amount_2AA_replacements", "\nNumber where 2 AA was replaced by 2 AA: "),
    ("amount_3AA_replacements", "\nNumber where 3 AA was replaced by 3 AA: "),
    ("amount_4AA_replacements", "\nNumber where 4 AA was replaced by 4 AA: "),
    ("amount_5AA_replacements", "\nNumber where 5 AA was replaced by 5 AA: "),
    ("amount_6AA_replacements", "\nNumber where 6 AA was replaced by 6 AA: "),
    ("amount_moreThan6AA_replacements", "\nNumber where more than 6 AA Errors: "),
    ("unknown_error", "\nOther Error: "),
)


def _longest_edit_runs(changes):
    """Return the lengths of the longest "+" run and the longest "-" run.

    ``changes`` is the list of ``(op, length)`` pairs produced by ``diff``;
    "=" entries are ignored.
    """
    longest_pos = 0
    longest_neg = 0
    for op, length in changes:
        if op == "+":
            longest_pos = max(longest_pos, length)
        elif op == "-":
            longest_neg = max(longest_neg, length)
    return longest_pos, longest_neg


def _classify_error(pred_peptide, true_peptide, longest_pos, longest_neg):
    """Return the counter key of the error category, or None if there is no mismatch."""
    longest_mismatch = max(longest_pos, longest_neg)
    if longest_mismatch == 0:
        return None

    first3_permuted = arePermutation(pred_peptide[0:3], true_peptide[0:3])
    last3_permuted = arePermutation(pred_peptide[-3:], true_peptide[-3:])
    if (first3_permuted or last3_permuted) and longest_mismatch < 6:
        if first3_permuted and last3_permuted:
            return "permutations_last_and_first3"
        if first3_permuted:
            return "permutations_first3"
        return "permutations_last3"

    # "1 AA replaced by 1 or 2 AA": one side of the edit is exactly one residue
    # and the other side is one or two residues. The bounds must be written as
    # 1 <= x <= 2; the chained form 1 >= x <= 2 only means x <= 1.
    if (longest_pos == 1 and 1 <= longest_neg <= 2) or (
            1 <= longest_pos <= 2 and longest_neg == 1):
        return "amount_1AA_replacements"

    if longest_pos == longest_neg and 2 <= longest_pos <= 6:
        return "amount_" + str(longest_pos) + "AA_replacements"
    if longest_mismatch > 6:
        return "amount_moreThan6AA_replacements"
    return "unknown_error"


def _collect_error_counts(pred_list, true_list, score_list, score_cutoff):
    """Classify every usable prediction and tally the error categories.

    A row is evaluated only when both peptides are strings and the score is
    at least ``score_cutoff`` (a NaN score therefore skips the row).

    Returns:
        ``(counts, number_of_predictions, total_errors)`` where ``counts`` is
        a Counter keyed by the names in ``_ERROR_REPORT_ROWS``,
        ``number_of_predictions`` is the number of evaluated rows and
        ``total_errors`` is the number of evaluated rows with any mismatch.
    """
    counts = Counter()
    number_of_predictions = 0
    total_errors = 0

    for pred_peptide, true_peptide, score in zip(pred_list, true_list, score_list):
        if not (isinstance(pred_peptide, str) and isinstance(true_peptide, str)):
            continue
        if not score >= score_cutoff:
            continue
        number_of_predictions += 1

        changes = diff(pred_peptide, true_peptide, timelimit=0, checklines=False)
        longest_pos, longest_neg = _longest_edit_runs(changes)
        category = _classify_error(pred_peptide, true_peptide, longest_pos, longest_neg)
        if category is None:
            continue

        total_errors += 1
        counts[category] += 1

        if category == "amount_1AA_replacements":
            # Both peptides are non-empty here: this category requires at
            # least one "+" and one "-" character.
            if pred_peptide[0] != true_peptide[0]:
                counts["amount_1AA_replacements_firstposition"] += 1
            elif pred_peptide[0:2] != true_peptide[0:2]:
                # First residue matches, so the difference is at position two.
                # A [0:1] slice here would repeat the test above and never fire.
                counts["amount_1AA_replacements_twoposition"] += 1
            if pred_peptide[-1] != true_peptide[-1]:
                counts["amount_1AA_replacements_lastposition"] += 1

    return counts, number_of_predictions, total_errors


def _write_error_report(text_file, tool, score_cutoff, number_of_predictions,
                        total_errors, counts):
    """Append one tool/cutoff section to an already open report file."""
    # Keep the true error count for display; only the percentage denominator
    # is clamped so an error-free run does not divide by zero.
    denominator = max(total_errors, 1)

    text_file.write("\n\nError Evaluation for " + str(tool))
    text_file.write("\nScore Cutoff: " + str(score_cutoff))
    text_file.write("\nNumber of total errors: " + str(total_errors))
    text_file.write("\nNumber of predictions: " + str(number_of_predictions))
    for key, label in _ERROR_REPORT_ROWS:
        amount = counts[key]
        text_file.write(label + str(amount) + " and in % " + str(amount * 100 / denominator))
    text_file.write("\n--------------------")


def _write_error_evaluation(frame, path, score_cutoffs, count_all_rows):
    """Write the report for every tool in ``tools_list`` and every cutoff to ``path``.

    When ``count_all_rows`` is true, "Number of predictions" is the number of
    rows in ``frame``; otherwise it is the number of rows actually evaluated.
    """
    with open(path, "w") as text_file:
        for tool in tools_list:
            true_list = frame['Modified Sequence'].tolist()
            pred_list = frame[tool + ' Peptide'].tolist()
            score_list = frame[tool + ' Score'].tolist()
            for score_cutoff in score_cutoffs:
                counts, evaluated, total_errors = _collect_error_counts(
                    pred_list, true_list, score_list, score_cutoff)
                number_of_predictions = len(pred_list) if count_all_rows else evaluated
                _write_error_report(text_file, tool, score_cutoff,
                                    number_of_predictions, total_errors, counts)


def error_stats(summary_df, resultdir):
    """Write per-tool error-category statistics to two text reports.

    For every tool in ``tools_list`` the predicted peptide
    (``"<tool> Peptide"``) is diffed against ``"Modified Sequence"`` and each
    mismatching prediction is assigned to exactly one category: a permutation
    within the first and/or last three residues, a replacement of 1 residue by
    1-2 residues, an n-by-n replacement for n = 2..6, a mismatch run longer
    than 6, or "Other Error".

    Two files are (re)created:

    * ``resultdir + "error_eval.txt"`` - all rows, score cutoffs 50 and 0.
    * ``resultdir + "error_eval_nomissingcleavages.txt"`` - only rows with
      ``'Number of missing cleavages' <= 0``, score cutoff 0.

    Args:
        summary_df: DataFrame providing ``'Modified Sequence'``,
            ``'Number of missing cleavages'`` and, for each tool,
            ``'<tool> Peptide'`` and ``'<tool> Score'``.
        resultdir: String prefix for the output files. It is concatenated with
            the file name as-is, so pass a trailing path separator to write
            into a directory.

    Raises:
        KeyError: If a required column is missing from ``summary_df``.
        OSError: If an output file cannot be opened for writing.
    """
    logger.debug("Error Evaluation.")

    _write_error_evaluation(summary_df, resultdir + "error_eval.txt",
                            score_cutoffs=(50, 0), count_all_rows=False)

    summary_df_nomissingCleavages = summary_df[summary_df['Number of missing cleavages'] <= 0]
    # NOTE(review): this report lists the number of table rows as
    # "Number of predictions", whereas error_eval.txt lists the number of
    # evaluated rows; kept as-is so existing reports stay comparable.
    _write_error_evaluation(summary_df_nomissingCleavages,
                            resultdir + "error_eval_nomissingcleavages.txt",
                            score_cutoffs=(0,), count_all_rows=True)

    logger.debug("Error Evaluation finished.")


In [5]:
import pandas as pd

# Read the two CSV files
df1 = pd.read_csv("../../analysis/robutustness_metric.csv")
df2 = pd.read_csv("../../DB_search_merged5.csv")

# Collect the column names of each frame
cols_df1 = set(df1.columns)
cols_df2 = set(df2.columns)

# Find the columns that appear in both frames
common_cols = cols_df1.intersection(cols_df2)
print("重复列名：", common_cols)

# Drop the duplicated columns from df2 (the df1 version is kept)
df2_filtered = df2.drop(columns=common_cols)

# Combine the two DataFrames side by side.
# NOTE(review): concat(axis=1) pairs rows by index label, not by the value of
# 'Modified Sequence'; this assumes both CSVs list the same rows in the same
# order - TODO confirm (a length mismatch would be padded with NaN silently).
combined_df = pd.concat([df1, df2_filtered], axis=1)

# Save the result
combined_df.to_csv("combined_output.csv", index=False)
print("合并后的列名：", combined_df.columns.tolist())

重复列名： {'Modified Sequence'}
合并后的列名： ['Modified Sequence', 'Number of missing cleavages', 'Position of present cleavages', 'Number of missing cleavages (including a-ions)', 'Position of present cleavages (including a-ions)', 'Spectrum Name', 'Length', 'm/z', 'Z', 'Protease', 'Species', 'CasanovoV1 Peptide', 'CasanovoV1 Score', 'CasanovoV1 aaScore', 'CasanovoV2 Peptide', 'CasanovoV2 Score', 'CasanovoV2 aaScore', 'CasanovoV3 Peptide', 'CasanovoV3 Score', 'CasanovoV3 aaScore', 'DeepNovo Peptide', 'DeepNovo Score', 'DeepNovo aaScore', 'InstaNovo Peptide', 'InstaNovo Score', 'InstaNovo aaScore', 'InstaNovoPlus Peptide', 'InstaNovoPlus Score', 'InstaNovoPlus aaScore', 'PepNet Peptide', 'PepNet Score', 'PepNet aaScore', 'pi-HelixNovo Peptide', 'pi-HelixNovo Score']


In [6]:
# Run the error evaluation on the merged table.
# NOTE(review): error_stats concatenates resultdir and the file name without a
# path separator, so this call writes '../../analysis/errorerror_eval.txt' and
# '../../analysis/errorerror_eval_nomissingcleavages.txt' - confirm whether a
# trailing '/' was intended.
error_stats(combined_df, '../../analysis/error')