In [261]:
import numpy as np
import pandas as pd
import gzip
import pickle

In [265]:
def read_shape_file(shape, data_dir='shape_data'):
    """Load one gzipped, tab-separated shape track into a single-column DataFrame.

    Parameters
    ----------
    shape : str
        Name of the track; the file read is ``<data_dir>/<shape>.gz`` and the
        resulting column is named after it.
    data_dir : str, optional
        Directory holding the ``.gz`` files. Defaults to ``'shape_data'``.

    Returns
    -------
    pandas.DataFrame
        One column named ``shape``, with missing values dropped, the final
        remaining row removed, and a fresh 0-based index.
    """
    path = data_dir + '/' + shape + '.gz'
    # NOTE(review): with skiprows=[0] and the default header inference, the
    # second physical line of the file is consumed as the header row -- confirm
    # that this line does not carry a data value.
    read_kwargs = dict(skiprows=[0], sep='\t')
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # on_bad_lines='warn' keeps the old "skip the line and warn" behaviour.
        sh = pd.read_csv(path, on_bad_lines='warn', **read_kwargs)
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`.
        sh = pd.read_csv(path, error_bad_lines=False, **read_kwargs)
    # The file is expected to hold exactly one column.
    sh.columns = [shape]
    sh = sh.dropna()
    # Drop the last remaining row.
    sh = sh.iloc[:-1]
    sh = sh.reset_index(drop=True)
    print('Built file with name '+ shape+' and with shape '+ str(sh.shape) )
    return sh

In [266]:
# Load the four shape tracks; each call prints the name and shape of the frame it built.
helt = read_shape_file(shape='helt')
prot = read_shape_file(shape='prot')
roll = read_shape_file(shape='roll')
mgw = read_shape_file(shape='mgw')

Built file with name helt and with shape (100000, 1)
Built file with name prot and with shape (100000, 1)
Built file with name roll and with shape (100000, 1)
Built file with name mgw and with shape (100000, 1)


## Open DNA sequence

In [267]:
# Count the lines of the uncompressed sequence file (the loading cell below reads the gzipped copy, chr6.fa.gz).
! wc -l 'shape_data/chr6.fa'

 3422303 shape_data/chr6.fa


Each line holds 50 nucleotides, so 3,422,303 lines give about 171,115,150 nucleotides in total, an almost perfect match with the shape files.

We use a 100,000-nucleotide window from the middle part of chromosome 6: positions 2,000,000 -> 2,100,000.

In [268]:
# Collect a fixed window of lines from the gzipped sequence file.
L = []        # kept lines: bytes, lower-cased, with the last byte (line terminator) removed
skipped = 0   # number of leading lines passed over
i = 0         # index of the line currently being examined
with gzip.open('shape_data/chr6.fa.gz', 'rb') as f:
    for line in f:
        if i >= 40001:
            # Past the first 1 + 40k lines: keep this one.
            L.append(line[:-1].lower())
            # Stop as soon as 2000 lines have been kept.
            if i + 1 >= 40001 + 2000:
                break
        else:
            skipped += 1
        i += 1
print('skipped', skipped)
print('included', len(L))

skipped 40001
included 2000


## Match it using pentamers!

In [269]:
4**5 # n_pentamers: number of distinct 5-letter words over a 4-letter alphabet

1024

In [270]:
def get_pentamer(L, idx):
    """Return the 5-symbol window centred on position ``idx`` of the sequence.

    ``L`` is a list of sequence lines, treated as 50 symbols each, so position
    ``idx`` sits at column ``idx % 50`` of line ``idx // 50``. When the window
    runs over a line boundary, the missing symbols are taken from the
    neighbouring line.

    Raises ``AssertionError`` unless ``2 <= idx < 99998``.
    """
    assert 2 <= idx < 99998, 'out of sequence'
    row, col = divmod(idx, 50)
    current = L[row]
    start, stop = col - 2, col + 3

    if stop > 50:
        # Window spills into the next line: borrow its leading symbols.
        head = current[start:]
        return head + L[row + 1][:5 - len(head)]
    if start < 0:
        # Window begins on the previous line: borrow its trailing symbols.
        tail = current[:stop]
        return L[row - 1][50 - (5 - len(tail)):] + tail
    # Window lies entirely inside the current line.
    return current[start:stop]

In [271]:
def build_dict(shape_name, shape_df, mean = True):
    """Group the values of one shape track by the pentamer at each position.

    For every position from 2 to 99997 the pentamer is looked up in the
    module-level sequence lines ``L`` via ``get_pentamer`` and the value
    ``shape_df[shape_name][position]`` is filed under it.

    Parameters
    ----------
    shape_name : str
        Column of ``shape_df`` to read; also used in the progress message.
    shape_df : pandas.DataFrame
        Frame holding the shape values, indexed by position.
    mean : bool, optional
        If true (default), each pentamer maps to the mean of its values;
        otherwise to the list of values in order of occurrence.

    Returns
    -------
    dict
        Pentamer -> mean value, or pentamer -> list of floats.
    """
    per_pentamer = {}
    for position in range(2, 99998):
        key = get_pentamer(L, position)
        value = float(shape_df[shape_name][position])
        per_pentamer.setdefault(key, []).append(value)
    if mean:
        # Collapse each list of observations into its average.
        per_pentamer = {
            key: np.array(values).mean()
            for key, values in per_pentamer.items()
        }
    n_entries = len(per_pentamer)
    print("Built dictionary for shape "+ shape_name+ " and got "+str(n_entries)+ ' entries')
    return per_pentamer

In [272]:
# Build one pentamer -> mean-value lookup table per shape track.
dic_helt = build_dict(shape_name='helt', shape_df=helt)
dic_prot = build_dict(shape_name='prot', shape_df=prot)
dic_roll = build_dict(shape_name='roll', shape_df=roll)
dic_mgw = build_dict(shape_name='mgw', shape_df=mgw)

Built dictionary for shape helt and got 1024 entries
Built dictionary for shape prot and got 1024 entries
Built dictionary for shape roll and got 1024 entries
Built dictionary for shape mgw and got 1024 entries


In [274]:
# Persist each pentamer lookup table with pickle. Opening the file in a `with`
# block guarantees the handle is flushed and closed once the dump finishes,
# instead of leaving that to garbage collection.
for _path, _table in (
    ('shape_data/helt.frq', dic_helt),
    ('shape_data/prot.frq', dic_prot),
    ('shape_data/roll.frq', dic_roll),
    ('shape_data/mgw.frq', dic_mgw),
):
    with open(_path, 'wb') as _fh:
        pickle.dump(_table, _fh)

## Build shape features on dataset