In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re

In [6]:
df = pd.read_csv("epitope_classfication/epitope_influenza.csv", header=[0, 1])

In [7]:
df['Epitope'].head()

Unnamed: 0,Epitope ID,Object Type,Description,Epitope Modified Residue(s),Epitope Modification(s),Starting Position,Ending Position,Non-peptidic epitope Accession,Epitope Synonyms,Antigen Name,Antigen Accession,Parent Protein,Parent Protein Accession,Organism Name,Parent Organism,Parent Organism ID,Epitope Comments
0,133,Linear peptide,AAFEDLRVLSFIRG,,,336.0,349.0,,,nucleoprotein,AAO46537.1,Nucleoprotein,P03466,Influenza A virus,Influenza A virus,11320,This reference was originally imported from th...
1,134,Linear peptide,AAFEDLRVLSFIRGTKVSPR,,,336.0,355.0,,,Nucleoprotein,P22435.2,Nucleoprotein,P03466,Influenza A virus,Influenza A virus,11320,A tyrosyl residue was added to NP 336-349 in o...
2,142,Linear peptide,AAGAAVKGV,,,193.0,201.0,,,nucleoprotein,CAZ65591.1,Nucleoprotein,P03466,Influenza A virus,Influenza A virus,11320,
3,570,Linear peptide,ACKRGPGSGFFSRLN,,,138.0,152.0,,,Hemagglutinin,P04663.1,Hemagglutinin,P03452,Influenza A virus,Influenza A virus,11320,
4,798,Linear peptide,ADYEELREQLSSVSSFERFE,,,113.0,132.0,,,hemagglutinin,ABI96104.1,Hemagglutinin,P03452,Influenza A virus,Influenza A virus,11320,


In [8]:
df.columns

MultiIndex(levels=[['Epitope', 'Related Object'], ['Antigen Accession', 'Antigen Name', 'Description', 'Ending Position', 'Epitope Comments', 'Epitope ID', 'Epitope Modification(s)', 'Epitope Modified Residue(s)', 'Epitope Relationship', 'Epitope Synonyms', 'Non-peptidic epitope Accession', 'Non-peptidic object Accession', 'Object Type', 'Organism Name', 'Parent Organism', 'Parent Organism ID', 'Parent Protein', 'Parent Protein Accession', 'Starting Position', 'Synonyms']],
           codes=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [5, 12, 2, 7, 6, 18, 3, 10, 9, 1, 0, 16, 17, 13, 14, 15, 4, 8, 12, 2, 18, 3, 11, 19, 1, 16, 13, 14]])

In [9]:
df = df['Epitope']
df.columns

Index(['Epitope ID', 'Object Type', 'Description',
       'Epitope Modified Residue(s)', 'Epitope Modification(s)',
       'Starting Position', 'Ending Position',
       'Non-peptidic epitope Accession', 'Epitope Synonyms', 'Antigen Name',
       'Antigen Accession', 'Parent Protein', 'Parent Protein Accession',
       'Organism Name', 'Parent Organism', 'Parent Organism ID',
       'Epitope Comments'],
      dtype='object')

In [10]:
df["Object Type"].value_counts()

Linear peptide                          2216
Discontinuous peptide                    120
Discontinuous peptide on multi chain      36
Name: Object Type, dtype: int64

In [11]:
df["Parent Protein"].value_counts()

Hemagglutinin                                    930
Nucleoprotein                                    392
Matrix protein 1                                 277
RNA-directed RNA polymerase catalytic subunit    195
Neuraminidase                                    160
Polymerase basic protein 2                       132
Polymerase acidic protein                        109
Non-structural protein 1                          72
Matrix protein 2                                  34
Nuclear export protein                            29
Protein PB1-F2                                     6
Name: Parent Protein, dtype: int64

In [12]:
ha = df["Parent Protein"] == "Hemagglutinin"
le = df["Object Type"] == "Linear peptide"
df_ha_linear = df[ha & le].sort_values(by=['Starting Position'])

In [14]:
df_ha_linear['Type'] = df_ha_linear["Organism Name"].str.extract(r'([Hh].[Nn].)', expand=False).str.upper()

In [15]:
df_ha_linear["Type"].value_counts(dropna=False)

H1N1    310
H3N2    180
H5N1    115
NaN     100
HONG     40
H7N9     30
H2N2     22
H1N2     13
HANG      4
H5N2      2
H3N8      1
H2N3      1
H4N4      1
Name: Type, dtype: int64

In [16]:
df_ha_linear_h3 = df_ha_linear[df_ha_linear['Type'] == "H3N2"]
df_ha_linear_h1 = df_ha_linear[df_ha_linear['Type'] == "H1N1"]

In [117]:
df_ha_linear_h3[["Antigen Accession", "Description"]].to_json("h3_linear.json", orient='values')

In [18]:
import json
with open('epitope_classfication/h3_linear.json') as f:
    h3_linear = json.load(f)

with open('epitope_classfication/h3_linear.fasta', 'w') as g:
    for item in h3_linear:
        g.write(">%s\n%s\n" % (item[0], item[1]))