# Get UniProt IDs for influenza
## Christian Roncal Cmsc499a Dr. Leiserson

In [1]:
import os 
import pandas as pd
import numpy as np
from utils import *
import re

In [2]:
# List the contents of the current working directory to see which inputs are available.
os.listdir('.')

['make_matrix.py',
 'utils.py',
 'dataprocessing_playground.ipynb',
 'influenza',
 'flustrains.txt',
 '.ipynb_checkpoints',
 'influenza_a_processing.ipynb',
 'flu_interactions_full.csv',
 '__pycache__']

# 1 Load data

In [3]:
# Location of the influenza interaction table, relative to the working directory.
flu_csv = "./influenza/flu_phi.csv"

In [4]:
# Read the interaction table; the first CSV column is used as the row index.
fludf = pd.read_csv(filepath_or_buffer=flu_csv, index_col=0)

In [5]:
# Preview the first five rows of the freshly loaded table.
fludf.head(n=5)

Unnamed: 0,Pathogen,Taxonomy ID,virusUprot,Pathogen Protein,humanUprot,Human Protein,Experimental Method,Pubmed ID
0,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03433,PA_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
1,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03466,NCAP_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
2,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03428,PB2_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
3,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03485,M1_I34A1,P15311,EZRI_HUMAN,anti bait coimmunoprecipitation,17022977
4,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03485,M1_I34A1,P11142,HSP7C_HUMAN,anti bait coimmunoprecipitation,17022977


In [6]:
# Number of distinct 'Pathogen' labels (a NaN label, if any, is counted once).
# Not all of these will be of interest.
fludf['Pathogen'].nunique(dropna=False)

526

# 2 Extract relevant strains

Use regex to get strains we're interested in: H1N1|H3N2|H5N1|H7N3

In [7]:
# Alternation pattern naming the four subtypes of interest.
regex = 'H1N1|H3N2|H5N1|H7N3'
# regex_select is not defined in this notebook (it arrives via `from utils import *`).
# Presumably it keeps the rows whose 'Pathogen' value matches the pattern — confirm in utils.py.
# NOTE: this rebinds fludf, so the unfiltered table is no longer reachable by name afterwards.
fludf = regex_select(fludf, 'Pathogen', regex)

Pathogen H1N1|H3N2|H5N1|H7N3


In [8]:
# Preview the first five rows after the strain selection.
fludf.head(n=5)

Unnamed: 0,Pathogen,Taxonomy ID,virusUprot,Pathogen Protein,humanUprot,Human Protein,Experimental Method,Pubmed ID
0,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03433,PA_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
1,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03466,NCAP_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
2,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03428,PB2_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
3,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03485,M1_I34A1,P15311,EZRI_HUMAN,anti bait coimmunoprecipitation,17022977
4,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03485,M1_I34A1,P11142,HSP7C_HUMAN,anti bait coimmunoprecipitation,17022977


In [9]:
# Number of distinct 'Pathogen' labels currently in fludf (a NaN label, if any, is counted once).
fludf['Pathogen'].nunique(dropna=False)

52

# 3 Get virus|human uniprots of interactions

Build a DataFrame containing only the virus and human UniProt ID columns (`virusUprot`, `humanUprot`) for use in building the interaction matrix.

In [10]:
# Keep only the two UniProt accession columns; every other column is discarded.
interaction_cols = ['virusUprot', 'humanUprot']
fludf = fludf[interaction_cols]

In [11]:
# Preview the first five virus/human accession pairs.
fludf.head(n=5)

Unnamed: 0,virusUprot,humanUprot
0,P03433,P49736
1,P03466,P49736
2,P03428,P49736
3,P03485,P15311
4,P03485,P11142


In [12]:
# Persist the current virus/human pairs to disk.
# The row index is written as the first CSV column (pandas' default, made explicit here).
fludf.to_csv(path_or_buf='flu_interactions_full.csv', index=True)

In [13]:
# rank_by_interactions is not defined in this notebook (it arrives via `from utils import *`).
# The preview of its result shows the columns virus, n_pos, n_neg and ratio.
# NOTE(review): the meaning of the second argument (20) is not visible here — confirm in utils.py.
ranked_fludf = rank_by_interactions(fludf, 20)

In [14]:
# Preview the first five rows of the ranking.
ranked_fludf.head(n=5)

Unnamed: 0,virus,n_pos,n_neg,ratio
0,P03470,721,2007,0.264296
1,P05777,716,2012,0.262463
2,P05780,666,2062,0.244135
3,P15682,613,2115,0.224707
4,P03427,553,2175,0.202713


In [15]:
# First 28 rows of the ranking.
# NOTE(review): the reason for the cutoff of 28 is not recorded here — confirm.
top28 = ranked_fludf.head(28)

In [16]:
# Average of the 'ratio' column over the top-28 subset.
top28.loc[:, 'ratio'].mean()

0.1139374738165061

In [17]:
# First 16 rows of the ranking.
# NOTE(review): the reason for the cutoff of 16 is not recorded here — confirm.
top16 = ranked_fludf.head(16)

In [18]:
# Average of the 'ratio' column over the top-16 subset.
top16.loc[:, 'ratio'].mean()

0.1605800953079179

# Extract topX virus entries

In [26]:
# Load the labelled edge table; the first CSV column is used as the row index.
# NOTE(review): 'flu_train_full.csv' is not written by any cell shown above — confirm where it is produced.
flu_train_full = pd.read_csv(filepath_or_buffer='flu_train_full.csv', index_col=0)

In [27]:
# Preview the first five rows of the labelled edge table.
flu_train_full.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,virusUprot,humanUprot,edge
0,0,0,P03433,P49736,1.0
1,1,1,P03433,P15311,0.0
2,2,2,P03433,P11142,0.0
3,3,3,P03433,Q86U42,0.0
4,4,4,P03433,P33992,1.0


In [24]:
# Pull the 'edge' labels out as a NumPy array.
edges = flu_train_full['edge'].to_numpy()

In [25]:
# Fraction of entries in `edges` that equal 1.
np.count_nonzero(edges == 1) / len(edges)

0.018499992776967192

In [34]:
# Show only the interaction columns.
# Selecting by name instead of by position (.iloc[:, 2:]) keeps this cell correct
# however many leftover index columns (e.g. 'Unnamed: 0', 'index') the CSV carries,
# and raises a KeyError if an expected column is missing rather than silently
# displaying the wrong columns.
flu_train_full[['virusUprot', 'humanUprot', 'edge']]

Unnamed: 0,virusUprot,humanUprot,edge
0,P03433,P49736,1.0
1,P03433,P15311,0.0
2,P03433,P11142,0.0
3,P03433,Q86U42,0.0
4,P03433,P33992,1.0
5,P03433,P33991,1.0
6,P03433,P25205,1.0
7,P03433,P33993,1.0
8,P03433,O00459,0.0
9,P03433,O43719,0.0
