# Processing data - make train matrix / features
## Christian Roncal Cmsc499a Dr. Leiserson

In [6]:
import os 
import pandas as pd
import numpy as np
from utils import *
import re

In [7]:
os.listdir()

['utils.py',
 'influenza',
 'process_data.ipynb',
 '.ipynb_checkpoints',
 '__pycache__']

# 1. Load data

In [8]:
flu_csv = './influenza/flu_phi.csv'

In [9]:
fludf = pd.read_csv(flu_csv, index_col=0)

In [10]:
fludf.head()

Unnamed: 0,Pathogen,Taxonomy ID,VirusUniprot,Pathogen Protein,HumanUniprot,Human Protein,Experimental Method,Pubmed ID
0,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03433,PA_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
1,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03466,NCAP_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
2,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03428,PB2_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
3,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03485,M1_I34A1,P15311,EZRI_HUMAN,anti bait coimmunoprecipitation,17022977
4,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03485,M1_I34A1,P11142,HSP7C_HUMAN,anti bait coimmunoprecipitation,17022977


In [11]:
fludf['Pathogen'].unique()

array(['Influenza A virus STRAIN A / PUERTO RICO / 8 / 1934 (H1N1)',
       'Influenza A virus STRAIN A / TURKEY / OREGON / 1971 (H7N3)',
       'Influenza A virus STRAIN A / hvPR8 / 34 (H1N1)',
       'Influenza A virus STRAIN A / VICTORIA / 3 / 1975 (H3N2)',
       'Influenza A virus STRAIN A / UDORN / 307 / 1972 (H3N2)',
       'Influenza A virus STRAIN A / AICHI / 2 / 1968 (H3N2)',
       'Bacillus anthracis', 'Yersinia pestis',
       'Human herpesvirus 1 STRAIN 17',
       'Influenza A virus STRAIN A / Wilson-Smith / 1933 (H1N1)',
       'Influenza A virus STRAIN A / TEXAS / 36 / 1991 (H1N1)',
       'Influenza A virus STRAIN A / WSN / 1933 (H1N1)',
       'Influenza A virus STRAIN A / NEW CALEDONIA / 20 / 1999 (H1N1)',
       'Influenza A virus STRAIN A / PANAMA / 2007 / 1999 (H3N2)',
       'Influenza A virus STRAIN A / WYOMING / 03 / 2003 (H3N2)',
       'Human immunodeficiency virus 1 (HIV1) ISOLATE HXB2 ',
       'Influenza A virus STRAIN A / ENGLAND / 878 / 1969 (H3N2)',
  

Use a regex to keep only the influenza A subtypes we're interested in (H1N1, H3N2, H5N1, H7N3).

In [12]:
def regex_selector(name):
    """Return True when ``name`` mentions one of the influenza subtypes of interest.

    Parameters
    ----------
    name : object
        A value from the ``Pathogen`` column. Anything that is not a string
        (for example NaN for a missing entry) is treated as non-matching.

    Returns
    -------
    bool
        True if ``name`` is a string containing H1N1, H3N2, H5N1 or H7N3.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise be rejected.
    if not isinstance(name, str):
        return False
    return bool(re.search('H1N1|H3N2|H5N1|H7N3', name))

In [13]:
# Keep only the rows whose Pathogen name matches regex_selector; non-string
# Pathogen values are dropped because the selector returns False for them.
fludf = fludf[fludf['Pathogen'].apply(regex_selector)]

In [14]:
len(fludf)

11684

In [15]:
fludf.head()

Unnamed: 0,Pathogen,Taxonomy ID,VirusUniprot,Pathogen Protein,HumanUniprot,Human Protein,Experimental Method,Pubmed ID
0,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03433,PA_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
1,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03466,NCAP_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
2,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03428,PB2_I34A1,P49736,MCM2_HUMAN,anti bait coimmunoprecipitation,17932485
3,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03485,M1_I34A1,P15311,EZRI_HUMAN,anti bait coimmunoprecipitation,17022977
4,Influenza A virus STRAIN A / PUERTO RICO / 8 /...,211044,P03485,M1_I34A1,P11142,HSP7C_HUMAN,anti bait coimmunoprecipitation,17022977


In [16]:
# drop cols we don't need
fludf = fludf[['VirusUniprot', 'HumanUniprot']]

In [17]:
fludf.head()

Unnamed: 0,VirusUniprot,HumanUniprot
0,P03433,P49736
1,P03466,P49736
2,P03428,P49736
3,P03485,P15311
4,P03485,P11142


# Make interaction matrix

Get the unique UniProt accessions for the virus and human proteins.

In [18]:
virus_uniprots = fludf['VirusUniprot'].unique()
human_uniprots = fludf['HumanUniprot'].unique()

Numericalize: map each UniProt accession to an integer index (and back).

In [19]:
# Reverse lookups (index -> accession) are numbered by position in the
# unique-accession arrays; the forward lookups (accession -> index) are
# obtained by inverting them.
itov = dict(enumerate(virus_uniprots))
vtoi = {accession: idx for idx, accession in itov.items()}
itoh = dict(enumerate(human_uniprots))
htoi = {accession: idx for idx, accession in itoh.items()}

In [20]:
# Integer indices of every unique accession, looked up through vtoi / htoi.
# NOTE(review): these arrays are not referenced again in the visible cells —
# confirm whether they are still needed.
num_virus_uniprots = np.asarray([vtoi[v] for v in virus_uniprots])
num_human_uniprots = np.asarray([htoi[h] for h in human_uniprots])

In [21]:
def make_numerical_df(vtoi, htoi, df):
    """Translate a two-column accession frame into integer indices.

    The first column of ``df`` is looked up in ``vtoi`` and the second in
    ``htoi``; columns are addressed by position, not by name.

    Returns a new DataFrame with columns ``v_idx`` and ``h_idx``, one row per
    row of ``df``, on a fresh default index.
    """
    # Resolve both lookups row by row so a missing key surfaces in row order.
    index_pairs = [(vtoi[record[0]], htoi[record[1]]) for record in df.values]

    return pd.DataFrame({
        'v_idx': [pair[0] for pair in index_pairs],
        'h_idx': [pair[1] for pair in index_pairs],
    })

In [22]:
numdf = make_numerical_df(vtoi, htoi, fludf)

In [47]:
import multiprocessing as mp
import os

def mp_pairmatcher(v, human_idxs, pairs):
    '''
    Build the labelled candidate edges for a single virus index.

    Parameters
    ----------
    v : int
        Index of one virus protein.
    human_idxs : iterable of int
        The UNIQUE human protein indices to pair with ``v``.
    pairs : collection of (v_idx, h_idx) tuples
        The observed interactions. Any iterable of hashable tuples is
        accepted; a set/frozenset is used as-is.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``human_idxs`` with columns ``v_idx``,
        ``h_idx`` and ``edge`` (1.0 when ``(v, h)`` is an observed pair,
        otherwise 0.0).
    '''
    # Membership tests against a list scan it linearly for every human index;
    # hashing the pairs once makes each lookup constant-time.
    if not isinstance(pairs, (set, frozenset)):
        pairs = set(pairs)

    human_idxs = list(human_idxs)
    d = {
        'v_idx': [v] * len(human_idxs),
        'h_idx': human_idxs,
        'edge': [1.0 if (v, h) in pairs else 0.0 for h in human_idxs],
    }

    # All three columns have the same length by construction, so any error
    # raised here is a real failure and is allowed to propagate instead of
    # being swallowed (which previously made the function return None).
    return pd.DataFrame(d)

def makedf(df, processes=20, chunksize=15, random_state=None):
    """Expand observed (virus, human) pairs into a full labelled pair table.

    Every unique virus index in ``df`` is paired with every unique human
    index; each pair is labelled 1.0 if it occurs in ``df`` and 0.0 otherwise.
    The per-virus tables are computed in a process pool by ``mp_pairmatcher``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with integer columns ``v_idx`` and ``h_idx`` (observed pairs).
    processes : int, optional
        Number of worker processes in the pool (default 20, as before).
    chunksize : int, optional
        Number of tasks handed to a worker at a time (default 15, as before).
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the shuffle can be made
        reproducible; the default ``None`` keeps the unseeded shuffle.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with columns ``v_idx``, ``h_idx`` and ``edge``. The
        per-virus row labels are kept by the concatenation, so the returned
        index is not unique.
    """
    virus_idxs_uniq = df['v_idx'].unique()
    human_idxs_uniq = df['h_idx'].unique()

    # Nothing to pair: return an empty table rather than letting
    # pd.concat([]) raise.
    if len(virus_idxs_uniq) == 0:
        return pd.DataFrame({'v_idx': [], 'h_idx': [], 'edge': []})

    # Observed pairs as a set: de-duplicated (less to pickle for each task)
    # and constant-time membership tests in the workers.
    pairs = set(zip(df['v_idx'].values, df['h_idx'].values))

    tasks = [(v, human_idxs_uniq, pairs) for v in virus_idxs_uniq]

    # The context manager shuts the pool down once the blocking starmap call
    # has returned, so worker processes are not leaked on repeated calls.
    with mp.Pool(processes) as pool:
        results = pool.starmap(mp_pairmatcher, tasks, chunksize)

    # Concatenate the per-virus tables and shuffle the rows.
    return pd.concat(results).sample(frac=1, random_state=random_state)

In [49]:
flu_train = makedf(numdf)

In [59]:
flu_train

Unnamed: 0,v_idx,h_idx,edge
2130,166,2130,0.0
798,191,798,0.0
2179,79,2179,0.0
2368,114,2368,0.0
2048,65,2048,0.0
1070,163,1070,0.0
1620,17,1620,0.0
1931,159,1931,0.0
2429,94,2429,0.0
1066,70,1066,0.0


In [61]:
edges = flu_train['edge'].values

In [62]:
len(edges[edges==1])

10245

In [63]:
len(edges)

553784

In [64]:
# Fraction of candidate pairs that are observed interactions, computed from
# the data instead of from numbers copied out of earlier cell outputs.
(edges == 1).mean()

0.018499992776967192

In [65]:
# Write the labelled pair table to disk. to_csv also writes the row index as
# the first (unnamed) column; that index comes from the shuffled per-virus
# concatenation in makedf and is not unique.
flu_train.to_csv('flu_interactions.csv')