In [1]:
import numpy as np
import pandas as pd
import gzip
import pickle
import itertools
import tqdm
from suffix_trees import STree

In [265]:
def read_shape_file(shape):
    """Load one shape track from ``shape_data/<shape>.gz``.

    The file is parsed as tab-separated text. Its first line is skipped and
    the following line is consumed by pandas as the header row, so neither of
    them ends up in the data. Malformed lines are skipped (with a warning),
    rows containing a missing value are dropped, and the last remaining row
    is discarded.

    Parameters
    ----------
    shape : str
        Track name; used to build the file path and as the name of the single
        column of the result.

    Returns
    -------
    pandas.DataFrame
        One column named ``shape`` with a fresh ``0..n-1`` index.
    """
    path = 'shape_data/'+shape+'.gz'
    read_kwargs = dict(skiprows=[0], sep='\t')
    try:
        # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
        # ``on_bad_lines='warn'`` is the replacement for ``error_bad_lines=False``.
        sh = pd.read_csv(path, on_bad_lines='warn', **read_kwargs)
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines`` yet.
        sh = pd.read_csv(path, error_bad_lines=False, **read_kwargs)
    sh.columns = [shape]
    sh = sh.dropna()
    # Drop the last remaining row.
    sh = sh.iloc[:-1]
    sh = sh.reset_index(drop=True)
    print('Built file with name '+ shape+' and with shape '+ str(sh.shape) )
    return sh

In [266]:
# Load each shape track into its own single-column DataFrame.
helt, prot, roll, mgw = [
    read_shape_file(track) for track in ('helt', 'prot', 'roll', 'mgw')
]

Built file with name helt and with shape (100000, 1)
Built file with name prot and with shape (100000, 1)
Built file with name roll and with shape (100000, 1)
Built file with name mgw and with shape (100000, 1)


## Open DNA sequence

In [267]:
! wc -l 'shape_data/chr6.fa'

 3422303 shape_data/chr6.fa


Each line holds 50 nucleotides, so the file contains about 171,115,150 nucleotides in total (3,422,303 lines × 50), an almost perfect match with the shape files.

Chromosome 6, middle part: nucleotides 2,000,000 → 2,100,000 (a window of 100,000 nucleotides).

In [268]:
# Window of the FASTA file to keep, expressed in lines.
N_HEADER_LINES = 1         # the first line of the file is skipped as well (presumably the FASTA header)
N_SKIPPED_SEQ_LINES = 40000
N_READ_LINES = 2000

first_kept_line = N_HEADER_LINES + N_SKIPPED_SEQ_LINES

L = []
skipped = 0
with gzip.open('shape_data/chr6.fa.gz', 'rb') as f:
    for i, line in enumerate(f):
        if i < first_kept_line:
            skipped += 1
            continue
        # Strip only the line terminator (LF or CRLF) instead of blindly
        # dropping the last byte, so a final line without a newline keeps
        # its last nucleotide. Lines stay ``bytes``, lower-cased.
        L.append(line.rstrip(b'\r\n').lower())
        if len(L) == N_READ_LINES:
            break
print('skipped', skipped)
print('included', len(L))

skipped 40001
included 2000


## Match it using pentamers!

In [269]:
# Number of distinct pentamers: 4 possible letters at each of 5 positions.
4**5 #n_pentamers

1024

In [270]:
def get_pentamer(L, idx, line_len=50):
    """Return the pentamer centred on position ``idx`` of a line-wrapped sequence.

    The sequence is stored as consecutive fixed-width lines, so position
    ``idx`` lives at ``L[idx // line_len][idx % line_len]``. The result is the
    five symbols at positions ``idx-2 .. idx+2``, stitched across a line
    boundary when the window straddles two lines.

    Parameters
    ----------
    L : sequence of str or bytes
        Lines of the sequence; every line is expected to hold exactly
        ``line_len`` symbols.
    idx : int
        0-based position of the central symbol.
    line_len : int, optional
        Number of symbols per line (default 50). Must be at least 5 so that a
        window never spans more than two lines.

    Returns
    -------
    str or bytes
        The 5-symbol window, of the same type as the lines of ``L``.

    Raises
    ------
    ValueError
        If ``line_len`` is smaller than 5.
    AssertionError
        If the window would start before the first symbol or end after the
        last symbol of ``L``.
    """
    if line_len < 5:
        raise ValueError('line_len must be at least 5')

    # The valid range follows from the actual amount of sequence in ``L``
    # (with 2000 lines of 50 symbols this is the former fixed bound 99998).
    total = len(L) * line_len
    assert idx>=2 and idx < total-2, 'out of sequence'

    q, r = divmod(idx, line_len)
    lo, hi = r - 2, r + 3

    if hi > line_len:
        # Window runs past the end of line q: finish it with the start of line q+1.
        left = L[q][lo:]
        return left + L[q+1][:5-len(left)]
    if lo < 0:
        # Window starts before line q: prepend the end of line q-1.
        right = L[q][:hi]
        return L[q-1][line_len-(5-len(right)):] + right
    return L[q][lo:hi]

In [271]:
def build_dict(shape_name, shape_df, mean = True):
    """Group the values of one shape track by the pentamer around each position.

    For every position ``idx`` in ``2..99997`` the pentamer is obtained with
    ``get_pentamer(L, idx)`` (``L`` is the module-level list of sequence
    lines) and paired with the value stored under label ``idx`` in column
    ``shape_name`` of ``shape_df``.

    Parameters
    ----------
    shape_name : str
        Column of ``shape_df`` to read; also used in the log message.
    shape_df : pandas.DataFrame
        Frame holding the shape values.
    mean : bool, optional
        When true (default) each pentamer maps to the mean of its values;
        otherwise it maps to the list of all collected values.

    Returns
    -------
    dict
        Pentamer -> mean value (or list of values when ``mean`` is false).
    """
    column = shape_df[shape_name]
    grouped = {}
    for idx in range(2, 99998):
        grouped.setdefault(get_pentamer(L, idx), []).append(float(column[idx]))

    if mean:
        grouped = {pent: np.array(values).mean() for pent, values in grouped.items()}

    print("Built dictionary for shape "+ shape_name+ " and got "+str(len(grouped))+ ' entries')
    return grouped

In [272]:
# One pentamer -> mean-value lookup table per shape track.
dic_helt, dic_prot, dic_roll, dic_mgw = [
    build_dict(track, frame)
    for track, frame in (('helt', helt), ('prot', prot), ('roll', roll), ('mgw', mgw))
]

Built dictionary for shape helt and got 1024 entries
Built dictionary for shape prot and got 1024 entries
Built dictionary for shape roll and got 1024 entries
Built dictionary for shape mgw and got 1024 entries


In [274]:
# Persist the four lookup tables. Each file is opened in a ``with`` block so
# it is closed (and therefore flushed to disk) as soon as it has been written.
for name, table in (('helt', dic_helt), ('prot', dic_prot), ('roll', dic_roll), ('mgw', dic_mgw)):
    with open('shape_data/' + name + '.frq', 'wb') as fh:
        pickle.dump(table, fh)

## Build shape features on dataset

In [196]:
# Reload the lookup tables from disk; ``with`` closes every file handle.
# NOTE: unpickling can execute arbitrary code, so only load ``.frq`` files
# that were produced by this notebook.
with open('shape_data/helt.frq', 'rb') as fh:
    dic_helt = pickle.load(fh)
with open('shape_data/prot.frq', 'rb') as fh:
    dic_prot = pickle.load(fh)
with open('shape_data/roll.frq', 'rb') as fh:
    dic_roll = pickle.load(fh)
with open('shape_data/mgw.frq', 'rb') as fh:
    dic_mgw = pickle.load(fh)

In [4]:
def build_voc(letters, length):
    """Map every word of ``length`` symbols over ``letters`` to its rank.

    Words are enumerated in ``itertools.product`` order (the rightmost symbol
    varies fastest) and numbered from 0.

    Parameters
    ----------
    letters : iterable of str
        Alphabet to draw the symbols from.
    length : int
        Number of symbols per word.

    Returns
    -------
    dict
        Word -> position of that word in the enumeration.
    """
    return {
        ''.join(symbols): rank
        for rank, symbols in enumerate(itertools.product(letters, repeat=length))
    }

In [5]:
def compute_shape_features(x, k, dic_helt, dic_prot, dic_roll, dic_mgw, letters = 'ATCG'):
    """Compute k-mer counts and per-position mean shape values for one sequence.

    For every k-mer of the vocabulary built from ``letters`` the function
    counts its occurrences in ``x`` and, for each of the ``k`` positions
    inside the k-mer, averages the shape value of the pentamer centred on
    that position over all occurrences that pass the boundary check.

    Parameters
    ----------
    x : str
        Sequence to describe.
    k : int
        k-mer length.
    dic_helt, dic_prot, dic_roll, dic_mgw : dict
        Lookup tables keyed by the lower-cased, UTF-8 encoded pentamer
        (``bytes``). A pentamer missing from a table raises ``KeyError``.
    letters : str, optional
        Alphabet used to build the k-mer vocabulary.

    Returns
    -------
    tuple
        ``(cat, enc, helt, prot, roll, mgw)`` where ``enc`` holds the
        occurrence count of every k-mer, the four other arrays have shape
        ``(n_kmers, k)`` and ``cat`` is the concatenation of ``enc`` with the
        flattened ``helt``, ``prot``, ``roll`` and ``mgw`` arrays (in that
        order). Rows of k-mers without any usable occurrence stay at zero.
    """
    tree = STree.STree(x)
    voc = build_voc(letters, k)
    seq_len = len(x)
    n_kmers = len(voc)

    enc = np.zeros(n_kmers)
    mgw = np.zeros((n_kmers, k))
    roll = np.zeros((n_kmers, k))
    helt = np.zeros((n_kmers, k))
    prot = np.zeros((n_kmers, k))

    # (lookup table, output array) pairs, in the order the tables are queried.
    tracks = (
        (dic_mgw, mgw),
        (dic_prot, prot),
        (dic_roll, roll),
        (dic_helt, helt),
    )

    for row, kmer in enumerate(voc):
        hits = tree.find_all(kmer)
        enc[row] = len(hits)
        if len(hits) == 0:
            continue

        # samples[t][j] collects the values of track t at offset j of the k-mer.
        samples = [[[] for _ in range(k)] for _ in tracks]
        found = False
        for pos in hits:
            # Only occurrences with enough flanking sequence are used.
            # NOTE(review): the slice below only needs pos + k + 2 <= len(x);
            # this stricter bound also rejects the last two positions that
            # would still fit. Kept unchanged to preserve the feature values.
            if pos < 2 or pos + k + 3 >= seq_len:
                continue
            found = True
            window = x[pos - 2:pos + k + 2]
            for j in range(k):
                # The pentamer centred on offset j of the k-mer, as a table key.
                key = window[j:j + 5].lower().encode('UTF-8')
                for (table, _), per_offset in zip(tracks, samples):
                    per_offset[j].append(table[key])

        if found:
            for (_, out), per_offset in zip(tracks, samples):
                for j in range(k):
                    out[row, j] = np.array(per_offset[j]).mean()

    cat = np.concatenate((enc, helt.flatten(), prot.flatten(), roll.flatten(), mgw.flatten()))
    # ``cat`` holds all the features; the other arrays are returned for debugging only.
    return cat, enc, helt, prot, roll, mgw

In [199]:
# Use the first sequence of the first training file as a small example input.
df = pd.read_csv('data/Xtr0.csv')
x = df.loc[0, 'seq']

In [200]:
# NOTE: this rebinds helt / prot / roll / mgw, which held the shape DataFrames
# loaded with read_shape_file; re-run those cells before calling build_dict again.
(cat, enc, helt, prot, roll, mgw) = compute_shape_features(
    x, 6, dic_helt, dic_prot, dic_roll, dic_mgw, letters='ATCG'
)

In [201]:
# Sanity check: total of each feature array; enc.sum() is the total number
# of k-mer occurrences counted in x.
mgw.sum(), helt.sum(), prot.sum(), roll.sum(), enc.sum()

(2676.6954703613083,
 18365.655814988837,
 -3377.7499960608525,
 -556.2929045413541,
 96.0)

In [202]:
# enc has 4**6 entries and each of the four shape arrays has 4**6 * 6,
# so cat has 4**6 * (4*6 + 1) = 102400 entries.
cat.shape

(102400,)

In [207]:
# Peek at the concatenated feature vector (numpy abbreviates the repr of long arrays).
cat

array([0., 0., 0., ..., 0., 0., 0.])

## Build the embeddings

In [9]:
# Reload the lookup tables from disk; ``with`` closes every file handle.
# NOTE: unpickling can execute arbitrary code, so only load ``.frq`` files
# that were produced by this notebook.
with open('shape_data/helt.frq', 'rb') as fh:
    dic_helt = pickle.load(fh)
with open('shape_data/prot.frq', 'rb') as fh:
    dic_prot = pickle.load(fh)
with open('shape_data/roll.frq', 'rb') as fh:
    dic_roll = pickle.load(fh)
with open('shape_data/mgw.frq', 'rb') as fh:
    dic_mgw = pickle.load(fh)

# Embedding configuration: k-mer alphabet and k-mer length.
letters = 'ATCG'
length = 3

In [10]:
# Compute the shape-spectrum embedding of every training sequence and write
# one space-separated, header-less CSV per input file (one line per sequence,
# in input order).
for ind in range(3):
    df = pd.read_csv('data/Xtr'+str(ind)+'.csv')
    # k-mer counts plus 4 shape tracks x ``length`` positions for every k-mer.
    n_features = (len(letters)**length)*(4*length+1)
    # Collect the feature vectors in a list and build the frame once; growing
    # a DataFrame row by row with ``.loc`` is slow. (The rows used to be
    # labelled by ``Id``, but the index is not written to the CSV.)
    rows = [
        compute_shape_features(seq, length, dic_helt, dic_prot, dic_roll, dic_mgw,
                               letters = letters)[0]
        for seq in tqdm.tqdm(df['seq'])
    ]
    df_emb = pd.DataFrame(np.asarray(rows, dtype=float).reshape(len(rows), n_features),
                          columns = [str(i) for i in range(n_features)])
    df_emb.to_csv('data/'
              + 'Xtr' +str(ind) + '_shapespectr'+str(length)+'.csv', header = False, index = False, sep=" ")

2000it [00:56, 35.11it/s]
2000it [01:21, 24.53it/s]
2000it [00:42, 34.65it/s]


In [11]:
# Compute the shape-spectrum embedding of every test sequence and write one
# space-separated, header-less CSV per input file (one line per sequence, in
# input order).
for ind in range(3):
    df = pd.read_csv('data/Xte'+str(ind)+'.csv')
    # k-mer counts plus 4 shape tracks x ``length`` positions for every k-mer.
    n_features = (len(letters)**length)*(4*length+1)
    # Collect the feature vectors in a list and build the frame once; growing
    # a DataFrame row by row with ``.loc`` is slow. (The rows used to be
    # labelled by ``Id``, but the index is not written to the CSV.)
    rows = [
        compute_shape_features(seq, length, dic_helt, dic_prot, dic_roll, dic_mgw,
                               letters = letters)[0]
        for seq in tqdm.tqdm(df['seq'])
    ]
    df_emb = pd.DataFrame(np.asarray(rows, dtype=float).reshape(len(rows), n_features),
                          columns = [str(i) for i in range(n_features)])
    df_emb.to_csv('data/'
              + 'Xte' +str(ind) + '_shapespectr'+str(length)+'.csv', header = False, index = False, sep=" ")

1000it [00:22, 45.01it/s]
1000it [00:19, 52.10it/s]
1000it [00:19, 38.22it/s]
