In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import os

# 1. Loading data

We're given two types of data: train.txt represents the adjacency matrix (stored as an edge list), while the kmerfeats files for humans/viruses represent x and y respectively. Loading train.txt is straightforward, but we should split the others up just to be clean.

### 1.1 Load train.txt and save it as train.csv

In [2]:
# Read the space-separated edge list; the file has no header row, so the
# column names are supplied here.
df_train = pd.read_csv(
    'data/old_data/train.txt',
    sep=' ',
    header=None,
    names=['node1', 'node2', 'edge'],
)

In [3]:
# Preview the first rows to confirm the three columns were parsed as expected.
df_train.head()

Unnamed: 0,node1,node2,edge
0,151,2841,1
1,151,2874,1
2,151,780,1
3,151,1183,1
4,155,2346,1


In [4]:
# Persist the edge list as a CSV with a header row; index=False leaves out the
# pandas row index so only node1/node2/edge are written.
df_train.to_csv('data/train.csv', index=False)

### 1.2 Split up human/virus kmerfeats

We have three tasks: Influenza, Hepatitis C, and Ebola. I plan to make three separate dataframes, one for each task.

In [5]:
# Read each k-mer feature file into a list with one string per line.
# Iterating over the file handle (instead of `read().split('\n')[:-1]`) keeps
# the final record even when the file does not end with a newline, while still
# producing no trailing empty string when it does.
with open('./data/old_data/human_ids_kmerfeats.txt', mode='r') as h:
    human_feats = [line.rstrip('\n') for line in h]

with open('./data/old_data/virus_ids_kmerfeats.txt', mode='r') as v:
    virus_feats = [line.rstrip('\n') for line in v]

# Sanity check on the expected number of lines in each file; the message
# reports the actual counts so a mismatch is easy to diagnose.
assert len(human_feats) == 7210 and len(virus_feats) == 1108, (
    f'unexpected line counts: human={len(human_feats)}, virus={len(virus_feats)}'
)

The files are in an unusual format that I'm not sure how best to work with. Exploring the data, we find that
human feats contains 7210 lines; each line starts with a row id followed by `key:value` pairs, with keys ranging from 1 to 2799. The same holds for virus feats, except that it has 1108 lines instead of 7210.

In [6]:
# Inspect the first raw line of the human feature file.
human_feats[0]

'1 1:0.5079 2:0.8413 3:0.4286 4:0.4603 5:0.4127 6:0.127 7:0.03175 8:0.9206 9:0.9841 10:0.6667 11:0.3492 12:0.4286 13:0.5397 14:0.03175 15:0.381 16:0.6667 17:0.3492 18:0.3016 19:0.1905 20:0.254 22:0.2381 23:0.619 24:0.254 25:0.1429 26:0.2063 27:0.1429 28:0.03175 29:0.4762 30:0.381 31:0.3175 32:0.1905 33:0.2698 34:0.1746 36:0.2857 37:0.4444 38:0.1111 39:0.1905 40:0.2381 41:0.2381 47:0.04762 48:0.01587 50:0.3889 51:0.4444 52:0.05556 53:0.3333 54:0.3333 57:0.6667 58:0.6667 59:0.6111 60:0.2222 61:0.3889 62:0.1111 64:0.05556 65:0.2222 66:0.1667 67:0.1667 68:0.1667 69:0.3889 71:0.1111 72:0.4444 73:0.1667 74:0.05556 75:0.2222 76:0.2222 77:0.05556 78:0.3889 79:0.05556 80:0.2778 81:0.1111 82:0.2222 83:0.1111 85:0.05556 89:0.1667 99:0.6111 100:0.8333 101:0.6667 102:0.5 103:0.2222 104:0.05556 106:0.9444 107:0.7222 108:0.2778 109:0.1111 110:0.2222 111:0.8333 113:0.3889 114:0.8889 115:0.2778 116:0.2222 117:0.1111 118:0.1667 120:0.3333 121:0.4444 122:0.05556 123:0.05556 124:0.05556 127:0.4444 128:0.2

In [7]:
# Inspect the second raw line of the virus feature file.
virus_feats[1]

'2 1:0.9615 2:0.6154 3:0.8846 4:0.4231 5:0.5769 6:0.1923 7:0.03846 8:0.5385 9:0.6923 10:0.6923 11:0.4231 12:0.6154 13:0.4231 15:0.8462 16:0.7692 17:0.6154 18:0.3846 19:0.3846 20:0.7692 21:0.1154 22:0.4615 23:0.3846 24:0.4615 25:0.2308 26:0.3077 27:0.1538 29:0.6154 30:0.3462 31:0.8462 32:0.2308 33:0.2692 34:0.1538 36:0.2692 37:0.5385 38:0.3077 39:0.2308 40:0.3077 41:0.3077 45:0.03846 46:0.03846 50:0.5455 51:0.6364 52:0.1818 53:0.3636 54:0.1818 57:0.1818 58:0.09091 59:0.3636 61:0.09091 62:0.2727 64:0.4545 65:0.09091 66:0.5455 67:0.09091 68:0.09091 69:0.3636 71:0.09091 72:0.09091 73:0.1818 75:0.2727 78:0.09091 79:0.3636 80:0.3636 83:0.09091 88:0.09091 90:0.09091 99:0.1818 101:0.09091 102:0.1818 103:0.2727 106:0.2727 107:0.1818 108:0.09091 109:0.09091 110:0.5455 113:0.3636 114:0.5455 116:0.09091 118:0.1818 122:0.09091 123:0.1818 124:0.09091 125:0.1818 127:0.4545 128:0.1818 129:0.2727 131:0.2727 135:0.1818 138:0.1818 139:0.3636 148:0.9091 149:0.2727 150:0.2727 151:0.09091 155:0.09091 156:0.

I believe this is supposed to be a sparse matrix: human_feats is 7210 x 2799, while virus_feats is 1108 x 2799. We can easily build a dataframe for both by parsing the `key:value` pairs. Notice, however, that while the keys are increasing, there are gaps (e.g. 2259 follows 2241), so absent keys presumably stand for zero entries. We can also see this by comparing the number of tokens in two different rows:

In [8]:
# Whitespace-separated token counts of the first two human lines; differing
# counts show that rows do not all carry the same set of keys.
print(*(len(human_feats[idx].split()) for idx in (0, 1)))

416 116


We can make a long-format dataframe with columns `row`, `col`, `val` for both:

In [41]:
def make_df(feature_list, skip_cols=(2799,)):
    """Convert sparse ``key:value`` feature lines into a long-format DataFrame.

    Each element of ``feature_list`` is one text line made of
    whitespace-separated tokens. Tokens of the form ``col:val`` describe one
    entry; tokens without a colon (such as the leading id on each line) are
    ignored. The row index of an entry is the 0-based position of its line in
    ``feature_list``.

    Parameters
    ----------
    feature_list : list of str
        Raw lines, one per row.
    skip_cols : iterable of int, optional
        Column keys to leave out of the result. Defaults to ``(2799,)``
        because the previous implementation discarded key 2799 before pairing
        keys with values; the default keeps that output unchanged.
        NOTE(review): confirm whether feature 2799 should really be excluded;
        pass ``skip_cols=()`` to keep every column.

    Returns
    -------
    pandas.DataFrame
        Columns ``row`` (int64), ``col`` (int64) and ``val`` (float64), one
        record per retained ``col:val`` token, in input order.

    Raises
    ------
    ValueError
        If a ``col:val`` token has a non-integer key or a non-numeric value.
    """
    excluded = {int(col) for col in skip_cols}
    rows, cols, vals = [], [], []

    for row_idx, line in enumerate(feature_list):
        for token in line.split():
            key, sep, value = token.partition(':')
            if not sep:
                # Not a key:value pair (e.g. the leading row id).
                continue
            col = int(key)
            if col in excluded:
                continue
            rows.append(row_idx)
            cols.append(col)
            # float() accepts both "0.5079" and integer-looking values such as
            # "1", which a decimals-only regex would have missed.
            vals.append(float(value))

    frame = pd.DataFrame({'row': rows, 'col': cols, 'val': vals})
    # Pin the dtypes so an empty input still yields numeric columns.
    return frame.astype({'row': 'int64', 'col': 'int64', 'val': 'float64'})

In [46]:
# Parse the raw line lists of both files into long-format frames.
human_df = make_df(feature_list=human_feats)
virus_df = make_df(feature_list=virus_feats)

In [48]:
# Largest row index in the virus frame. Rows are 0-based line positions, so
# this is expected to be len(virus_feats) - 1 when the last line has entries.
max(virus_df['row'])

1107

In [43]:
# Preview the first parsed (row, col, val) records for the human features.
human_df.head()

Unnamed: 0,row,col,val
0,0,1,0.5079
1,0,2,0.8413
2,0,3,0.4286
3,0,4,0.4603
4,0,5,0.4127


In [44]:
# Preview the first parsed (row, col, val) records for the virus features.
virus_df.head()

Unnamed: 0,row,col,val
0,0,1,0.8056
1,0,2,0.6667
2,0,3,0.4722
3,0,4,0.05556
4,0,5,0.2222


In [45]:
# Write both long-format frames to CSV files in the current working directory,
# omitting the pandas row index.
human_df.to_csv(path_or_buf='human_feats.csv', index=False)
virus_df.to_csv(path_or_buf='virus_feats.csv', index=False)