### Converting windows to 1D vectors and then 2D arrays

1. Since a CNN needs a numeric matrix as input, we first convert each nucleotide to a 1D vector of length 4 (one position per base: a, c, g, u).
2. Each window then becomes a 2D array built from those per-nucleotide vectors (4 × window length).  

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
# NOTE(review): presumably a workaround for duplicate OpenMP runtime errors
# raised by some numeric libraries -- confirm it is still required.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'



In [2]:
# Load the windowed table; the first CSV column is used as the index.
windowed_csv_path = '../data/intermediate/ago1234_windowed.csv'
ago1234 = pd.read_csv(windowed_csv_path, index_col=0)

# Preview the first rows to confirm the window columns are present.
ago1234.head()

Unnamed: 0,ID,sequence_identity,sequence,RBP_binding,seq_len,padded_seqs,padded_len,window0,window50,window100,window150,window200,window250,window300
0,"CID_041533;chrX,47028785,47028819,+",positive_test,gaacacaucaccugggccucuugcaccuuuuagaaagggcaaccuu...,1,334,gaacacaucaccugggccucuugcaccuuuuagaaagggcaaccuu...,400,gaacacaucaccugggccucuugcaccuuuuagaaagggcaaccuu...,ucccuccccauccuucuuccuccacuagugguggucguggugacag...,ggccgcuauggagccacugaccgcucgcaggaugaugguggggaga...,cagccgagaccacgacuaccgggacauggacuaccguucauauccu...,aguauggcagccaggagggcaagcaugacuaugacgacucaucuga...,cagagugcggaggugaggagggggcgcgcugcgccaggccuggcug...,gggcucccaagggcccucugugucuggcugcagcnnnnnnnnnnnn...
1,"CID_014973;chr15,90774588,90774613,-",positive_test,guccuguggcguuuguucuccuaggccaaccccuucaaggagcgaa...,1,325,guccuguggcguuuguucuccuaggccaaccccuucaaggagcgaa...,400,guccuguggcguuuguucuccuaggccaaccccuucaaggagcgaa...,cagggucuucuccacauccccagccaaagacagccuuagcuuugag...,uccuggaucuccucaguguguucagugacacagccacgccagacau...,ucccauuaugccuuccgcaucuuuggugagaaccaggagcagcccg...,gacaccaccccuucccacgugggcuuugugguggcccugcuuggga...,ggcgccguuuggggaaccucagagccccucccaaccgugucuuuug...,ccugugucugcucucuagacuuugannnnnnnnnnnnnnnnnnnnn...
2,"CID_015737;chr16,30959327,30959347,+",positive_test,ccuucccccugacccugacuccuugaacgucacugaaaacggcagc...,1,320,ccuucccccugacccugacuccuugaacgucacugaaaacggcagc...,400,ccuucccccugacccugacuccuugaacgucacugaaaacggcagc...,gcaaggagugggggccgcgggcagccgcucuucagcucgcggccca...,aguggcgaggggcgccccaacccccugcccgccucuccgcacaaua...,aacauucaucuguacugaaguguuacuugaaccgggggaaucucgg...,gggggagccggggugugaggggacuggaccagcuuggacugagacc...,accgggccggugggcgcccauuugggacugcgccacccccaggcuu...,uuguuuuacuguauugagcgnnnnnnnnnnnnnnnnnnnnnnnnnn...
3,"CID_025246;chr2,227661131,227661173,-",positive_test,ugccucaccccaaacccccaguggagagcagcggugguaagcucuu...,1,342,ugccucaccccaaacccccaguggagagcagcggugguaagcucuu...,400,ugccucaccccaaacccccaguggagagcagcggugguaagcucuu...,ugcacaggugacuacaugaacaugucaccagugggggacuccaaca...,cagccccuccgacugcuacuacggcccugaggacccccagcacaag...,uccucuccuacuacucauugccaagauccuuuaagcacacccagcg...,ggggagccggaggagggugcccggcaucagcaccuccgccuuucca...,cucuggucgccuucucuaugcugcaacagcagaugauucuuccucu...,ccagcagcgacagccugggugggggauacugcggggcuaggcnnnn...
4,"CID_009003;chr12,6347000,6347052,+",positive_test,gacccaugucucucccuuucccucagccuuccuucagaucaaacca...,1,352,gacccaugucucucccuuucccucagccuuccuucagaucaaacca...,400,gacccaugucucucccuuucccucagccuuccuucagaucaaacca...,gauccucauguuucuuccuaucuccuagauauuuggcaugaucuuc...,ugaucuugugcugugcuauccgcaggaaccgcgagauggucuagag...,cuuacaucccugagcaggaaaguuuacccaugaagauuggugggau...,uguuuguuuguuuuguuuuguuuguuguuuguuguuuguuuuuuug...,uaauuuuaguauucauucugcauugcuagauaaaagcugaaguuac...,uguuugucuuuuaaugcuucauucaauauugacauuuguaguugag...


In [3]:
def get_RNA_conv_array(seq):
    """Encode an RNA sequence as a (len(seq), 4) numeric array.

    Each position becomes one row whose four columns correspond to the
    bases 'a', 'c', 'g', 'u' (in that order). Matching is case-sensitive.

    Parameters
    ----------
    seq : str
        Sequence to encode. Characters outside 'acgu' (for example the
        'n' padding character) are treated as unknown.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(seq), 4). A known base gets a 1 in its
        column and 0 elsewhere; an unknown character gets 0.25 in every
        column, so every row sums to 1.
    """
    alpha = 'acgu'
    seq_arr = np.zeros([len(seq), 4])
    for idx, char in enumerate(seq):
        if char in alpha:
            seq_arr[idx][alpha.index(char)] = 1
        else:
            # Uniform distribution over the four bases. The previous
            # expression np.array([0.25*4]) evaluated to [1.0] and
            # broadcast to [1, 1, 1, 1], marking all bases as present.
            seq_arr[idx] = 0.25

    return seq_arr

In [4]:
def get_all_subseq_arrays(row):
    """Encode every window column of one table row.

    Reads row['window0'], row['window50'], ..., row['window300'] (seven
    windows, start offsets 0 to 300 in steps of 50), encodes each with
    get_RNA_conv_array and transposes the result.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the seven 'window<offset>' entries as sequences.

    Returns
    -------
    list of numpy.ndarray
        Seven arrays in window order, each of shape (4, len(window)).
    """
    window_starts = range(0, 301, 50)
    # Transpose so the four base channels come first.
    return [get_RNA_conv_array(row[f'window{i}']).T for i in window_starts]

In [5]:
# Features: one list of encoded windows per row. Labels: the RBP_binding value.
X, y = [], []

# The row label yielded by iterrows() is not needed, only the row itself.
for _row_label, row in ago1234.iterrows():
    X.append(get_all_subseq_arrays(row))
    y.append(row['RBP_binding'])

In [6]:
# Stack the nested Python lists into dense arrays.
X, y = np.array(X), np.array(y)

# Report both shapes, one per line, as a quick sanity check.
print(X.shape, y.shape, sep='\n')

(63118, 7, 4, 100)
(63118,)


In [7]:
# Sanity check: count the window columns expected per row.
window_columns = ['window0', 'window50', 'window100', 'window150', 'window200', 'window250', 'window300']
print(len(window_columns))

7
