# Parsing NetMHCpan 4.1 Output

## Importing modules

In [1]:
import numpy as np
import re
import pandas as pd
from scipy import stats

## Parsing peptide scores

In [5]:
#Define a function to parse all binding scores from the file of random peptides
def parse_random_peptide_scores(filename) :
    """Collect the binding score of every random peptide listed in a file.

    Each line is split on whitespace. A line contributes a score when one of
    its fields is exactly "PEPLIST" and none of its fields is exactly
    "Number"; the score is the field at index 11 converted to float
    (presumably the Score_EL column of NetMHCpan 4.1 output - TODO confirm).

    Parameters
    ----------
    filename : str
        Path of the output file produced for the random peptides.

    Returns
    -------
    list of float
        The scores in the order in which they appear in the file.
    """
    with open(filename) as handle :
        #str.split() without arguments already discards trailing whitespace
        tokenised_lines = (raw_line.split() for raw_line in handle)
        return [
            float(tokens[11])
            for tokens in tokenised_lines
            if "PEPLIST" in tokens and "Number" not in tokens
        ]

#Define a function to create a dataframe containing each peptide with its binding score and rank
def parse_binding_scores(filename) :
    """Build a dataframe of peptides with their binding score and rank.

    Each non-blank line of the file is split on whitespace. A line is treated
    as a data row when its first field begins with a digit; from such rows the
    fields at index 2, 11 and 12 are taken as the peptide, its binding score
    and its rank.

    Parameters
    ----------
    filename : str
        Path of the NetMHCpan output file to read.

    Returns
    -------
    pandas.DataFrame
        Columns "Peptide", "Binding Score" and "Built-In Rank", one row per
        data row, in file order.

    Raises
    ------
    IndexError
        If a data row has fewer than 13 fields.
    ValueError
        If field 11 or 12 of a data row cannot be converted to float.
    """
    #Raw string literal: a backslash-d inside a plain literal is an invalid escape sequence
    #Compiled once here instead of being looked up again for every line
    starts_with_digits = re.compile(r"\d+").match

    peptides = []
    binding_scores = []
    ranks = []
    with open(filename) as file :
        for line in file :
            fields = line.split()
            #Blank lines give no fields; headers and separators do not start with a digit
            if not fields or not starts_with_digits(fields[0]) :
                continue
            peptides.append(fields[2])
            binding_scores.append(float(fields[11]))
            ranks.append(float(fields[12]))

    data = {
        "Peptide": peptides,
        "Binding Score": binding_scores,
        "Built-In Rank": ranks
    }

    return pd.DataFrame(data)

#Define a function that transforms a list of binding scores into a list of ranks using random peptide scores
def transform_scores(binding_scores, random_peptide_scores) :
    """Turn binding scores into ranks relative to random peptide scores.

    The rank of a score is 100 minus its percentile within
    random_peptide_scores, as given by scipy.stats.percentileofscore, so a
    higher score receives a lower rank.

    Parameters
    ----------
    binding_scores : iterable of float
        Scores to transform.
    random_peptide_scores : sequence of float
        Scores of the random peptides used as the reference distribution.

    Returns
    -------
    list
        One rank per entry of binding_scores, in the same order.
    """
    return [
        100 - stats.percentileofscore(random_peptide_scores, current_score)
        for current_score in binding_scores
    ]

## Parsing files

In [4]:
#Parse the random peptide scores of each allele
HLA_A_02_01_random_peptide_scores = parse_random_peptide_scores(
    "Random Peptides/Random_peptides_HLA_A_02_01.out"
)
HLA_A_03_01_random_peptide_scores = parse_random_peptide_scores(
    "Random Peptides/Random_peptides_HLA_A_03_01.out"
)
Patr_A_01_01_random_peptide_scores = parse_random_peptide_scores(
    "Random Peptides/Random_peptides_Patr_A_01_01.out"
)
Gogo_A_01_01_01_random_peptide_scores = parse_random_peptide_scores(
    "Random Peptides/Random_peptides_Gogo_A_01_01_01.out"
)

#Column layout of the dataframes before any virus has been added
empty_df_columns = {
    column: [] for column in ("Virus", "Peptide", "Binding Score", "Built-In Rank")
}

#Column layout for Gogo - no built-in rank
empty_df_columns_Gogo = {
    column: [] for column in ("Virus", "Peptide", "Binding Score")
}

#One independent empty dataframe per allele
HLA_A_02_01_df, HLA_A_03_01_df, Patr_A_01_01_df = (
    pd.DataFrame(empty_df_columns) for _ in range(3)
)
Gogo_A_01_01_01_df = pd.DataFrame(empty_df_columns_Gogo)

In [5]:
#Parse each virus file and append them to the dataframes
viruses = [
    "Western_lowland_gorilla_simian_foamy_virus",
    "Torque_teno_hominid_virus_1",
    "Alphapolyomavirus_gorillae",
    "Primate_bocaparvovirus_1",
    "Human_mastadenovirus_B",
    "Simian_immunodeficiency_virus_1",
    "Hepatitis_B_virus",
    "Human_metapneumovirus",
    "Torque_teno_virus_23",
    "Chimpanzee_associated_circovirus_1",
    "Chimpanzee_associated_cyclovirus_1",
    "Chimpanzee_associated_porprismacovirus_1",
    "Chimpanzee_faeces_associated_circular_DNA_virus_1",
    "Panine_alphaherpesvirus_3",
    "Primate_T-lymphotropic_virus",
    "Monkeypox_virus",
    "Rhinovirus_A",
]

#The output file of a virus for an allele is named "<virus>_<allele>.out"
alleles = ["HLA_A_02_01", "HLA_A_03_01", "Patr_A_01_01", "Gogo_A_01_01_01"]

#Collect the parsed frames first and concatenate once per allele afterwards:
#calling pd.concat inside the loop copies every accumulated row again for each virus
parsed_frames = {allele: [] for allele in alleles}
for virus in viruses :
    for allele in alleles :
        parsed = parse_binding_scores("NetMHCpan Output Files/" + virus + "_" + allele + ".out")
        parsed.insert(0, "Virus", virus)
        parsed_frames[allele].append(parsed)

#The existing dataframe goes first so its rows and column order are kept
HLA_A_02_01_df = pd.concat([HLA_A_02_01_df] + parsed_frames["HLA_A_02_01"], ignore_index=True)
HLA_A_03_01_df = pd.concat([HLA_A_03_01_df] + parsed_frames["HLA_A_03_01"], ignore_index=True)
Patr_A_01_01_df = pd.concat([Patr_A_01_01_df] + parsed_frames["Patr_A_01_01"], ignore_index=True)
Gogo_A_01_01_01_df = pd.concat([Gogo_A_01_01_01_df] + parsed_frames["Gogo_A_01_01_01"], ignore_index=True)

In [None]:
#Compute rank scores using random peptides
#The scores are selected by column label rather than by position, so a change
#in column order cannot silently rank the wrong column
HLA_A_02_01_df['Computed Rank'] = transform_scores(list(HLA_A_02_01_df["Binding Score"]), HLA_A_02_01_random_peptide_scores)
HLA_A_03_01_df['Computed Rank'] = transform_scores(list(HLA_A_03_01_df["Binding Score"]), HLA_A_03_01_random_peptide_scores)
Patr_A_01_01_df['Computed Rank'] = transform_scores(list(Patr_A_01_01_df["Binding Score"]), Patr_A_01_01_random_peptide_scores)
Gogo_A_01_01_01_df['Computed Rank'] = transform_scores(list(Gogo_A_01_01_01_df["Binding Score"]), Gogo_A_01_01_01_random_peptide_scores)

In [None]:
#Write each dataframe to its own CSV file, without the index column
for allele_df, csv_path in (
    (HLA_A_02_01_df, "Dataframes/HLA_A_02_01_df.csv"),
    (HLA_A_03_01_df, "Dataframes/HLA_A_03_01_df.csv"),
    (Patr_A_01_01_df, "Dataframes/Patr_A_01_01_df.csv"),
    (Gogo_A_01_01_01_df, "Dataframes/Gogo_A_01_01_01_df.csv"),
) :
    allele_df.to_csv(csv_path, index=False)

In [6]:
#Column layout of the dataframe before any virus has been added
empty_df_columns = {
    "Virus": [],
    "Peptide": [],
    "Binding Score": [],
    "Built-In Rank": []
}

viruses = [
    "Western_lowland_gorilla_simian_foamy_virus",
    "Torque_teno_hominid_virus_1",
    "Alphapolyomavirus_gorillae",
    "Primate_bocaparvovirus_1",
    "Human_mastadenovirus_B",
    "Simian_immunodeficiency_virus_1",
    "Hepatitis_B_virus",
    "Human_metapneumovirus",
    "Torque_teno_virus_23",
    "Chimpanzee_associated_circovirus_1",
    "Chimpanzee_associated_cyclovirus_1",
    "Chimpanzee_associated_porprismacovirus_1",
    "Chimpanzee_faeces_associated_circular_DNA_virus_1",
    "Panine_alphaherpesvirus_3",
    "Primate_T-lymphotropic_virus",
    "Monkeypox_virus",
    "Rhinovirus_A",
]

Patr_A_06_01_random_peptide_scores = parse_random_peptide_scores("Random Peptides/Random_peptides_Patr_A_06_01.out")

#Collect the parsed frames and concatenate once: calling pd.concat inside the
#loop copies every accumulated row again for each virus
Patr_A_06_01_frames = []
for virus in viruses :
    Patr_A_06_01_parsing = parse_binding_scores("NetMHCpan Output Files/" + virus + "_Patr_A_06_01.out")
    Patr_A_06_01_parsing.insert(0, "Virus", virus)
    Patr_A_06_01_frames.append(Patr_A_06_01_parsing)

#The empty frame goes first so the column order is fixed even when no virus was parsed
Patr_A_06_01_df = pd.concat([pd.DataFrame(empty_df_columns)] + Patr_A_06_01_frames, ignore_index=True)

#The scores are selected by column label rather than by position
Patr_A_06_01_df['Computed Rank'] = transform_scores(list(Patr_A_06_01_df["Binding Score"]), Patr_A_06_01_random_peptide_scores)
Patr_A_06_01_df.to_csv("Dataframes/Patr_A_06_01_df.csv", index=False)