# Criando transformações

Precisamos de um dicionário que mapeie cada arranjo com repetição (de tamanho fixo) do nosso alfabeto de nucleotídeos para a sua representação binária, sem usar vetores one-hot.

In [42]:
import itertools 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

In [7]:
# Every 3-letter word over the nucleotide alphabet, in the order
# itertools.product yields them.
arrangement = [
    ''.join(letters)
    for letters in itertools.product(['a', 'c', 'g', 't'], repeat=3)
]

# Lexicographically sorted copy of the words.
sorted_arrangement = sorted(arrangement)



In [8]:
# Scratch experiment: growing a 2-D array row by row with NumPy.
x = np.empty((0, 6))  # zero rows, six columns
k = np.array([])
y = np.array([5, 7, 9])
w = np.array([1, 2, 3])

# Join the three 1-D pieces into one flat vector.
z0 = np.concatenate((k, y, w))
print(z0.shape)

# Stacking the flat vector under the empty matrix yields a single row;
# stacking that result on itself yields two identical rows.
a = np.vstack((x, z0))
b = np.vstack((a, a))
print(b)

(6,)
[[5. 7. 9. 1. 2. 3.]
 [5. 7. 9. 1. 2. 3.]]


In [9]:
def generateArrangementsOfsizeN(nucleotideAlphabetList, subSeqSize):
    """Return every word of length ``subSeqSize`` over the given alphabet.

    Each word is the concatenation of one tuple produced by
    ``itertools.product`` over ``nucleotideAlphabetList``, so a symbol may
    appear more than once in a word. The list is returned in sorted order.
    """
    symbolTuples = itertools.product(nucleotideAlphabetList, repeat=subSeqSize)
    return sorted(''.join(symbols) for symbols in symbolTuples)


def intToBinaryCharArray(number, numOfBits):
    """Return the binary digits of ``number`` as a list of 1-character strings.

    The digits are ``bin(number)`` with its first two characters dropped,
    left-padded with ``'0'`` to ``numOfBits`` characters. A value that needs
    more than ``numOfBits`` digits is not truncated.
    """
    digits = bin(number)[2:]
    padded = digits.zfill(numOfBits)
    return [ch for ch in padded]

def binaryCharArrayToBinaryIntArray(binaryCharArray):
    """Return a list holding ``int(c)`` for each element ``c`` of the input."""
    return [int(bitChar) for bitChar in binaryCharArray]


def numToBinaryArray(num, numOfBits):
    """Return ``num`` as a list of integer bits, zero-padded to ``numOfBits``.

    Feeds the digit characters from ``intToBinaryCharArray`` through
    ``binaryCharArrayToBinaryIntArray``.
    """
    return binaryCharArrayToBinaryIntArray(intToBinaryCharArray(num, numOfBits))

def createDictForNucleotidesAlphabet(nucleotideAlphabet, subSeqSize, numOfBits):
    """Map every word of length ``subSeqSize`` over the alphabet to a bit list.

    The words come from ``generateArrangementsOfsizeN`` and are numbered
    starting at 1 in the order that function returns them. Each word's value
    is ``numToBinaryArray(position, numOfBits)``.
    """
    words = generateArrangementsOfsizeN(nucleotideAlphabet, subSeqSize)
    return {
        word: numToBinaryArray(code, numOfBits)
        for code, word in enumerate(words, start=1)
    }




In [10]:
def testIfSeqWindowAndStrideMatches(seqSize, windowSize, strideSize):
    """Print how sliding windows of a given size and stride cover a sequence.

    Windows start at position 0 and advance by ``strideSize``. They are
    counted until one ends at or beyond the last sequence position
    (``seqSize - 1``); that final window is included in the count, even when
    it overshoots. Window and stride fit the sequence exactly when the
    printed "last window end" equals the printed "max seq position".

    Args:
        seqSize: Length of the sequence.
        windowSize: Number of positions covered by each window.
        strideSize: Distance between consecutive window starts; must be > 0.

    Raises:
        ValueError: If ``strideSize`` is not positive, since the window would
            never advance and the scan would not terminate.
    """
    if strideSize <= 0:
        raise ValueError('strideSize must be a positive integer')

    # The window starting at 0 always counts, so a sequence exactly one
    # window long reports 1 window (it used to report 0).
    last_window_start = 0
    nOfWindows = 1
    while (last_window_start + windowSize - 1) < (seqSize - 1):
        last_window_start += strideSize
        nOfWindows += 1

    print('n of windows: ' + str(nOfWindows))
    print('last window start: ' + str(last_window_start))
    print('last window end: ' + str(last_window_start + windowSize - 1))
    print('max seq position: ' + str(seqSize - 1))



In [24]:
# Build the lookup table for 3-letter words over a/c/g/t, requesting 8 bits per word.
dict_test = createDictForNucleotidesAlphabet(['a','c','g','t'], 3, 8)

# Bare expression so the notebook displays the resulting mapping.
dict_test

{'aaa': [0, 0, 0, 0, 0, 0, 0, 1],
 'aac': [0, 0, 0, 0, 0, 0, 1, 0],
 'aag': [0, 0, 0, 0, 0, 0, 1, 1],
 'aat': [0, 0, 0, 0, 0, 1, 0, 0],
 'aca': [0, 0, 0, 0, 0, 1, 0, 1],
 'acc': [0, 0, 0, 0, 0, 1, 1, 0],
 'acg': [0, 0, 0, 0, 0, 1, 1, 1],
 'act': [0, 0, 0, 0, 1, 0, 0, 0],
 'aga': [0, 0, 0, 0, 1, 0, 0, 1],
 'agc': [0, 0, 0, 0, 1, 0, 1, 0],
 'agg': [0, 0, 0, 0, 1, 0, 1, 1],
 'agt': [0, 0, 0, 0, 1, 1, 0, 0],
 'ata': [0, 0, 0, 0, 1, 1, 0, 1],
 'atc': [0, 0, 0, 0, 1, 1, 1, 0],
 'atg': [0, 0, 0, 0, 1, 1, 1, 1],
 'att': [0, 0, 0, 1, 0, 0, 0, 0],
 'caa': [0, 0, 0, 1, 0, 0, 0, 1],
 'cac': [0, 0, 0, 1, 0, 0, 1, 0],
 'cag': [0, 0, 0, 1, 0, 0, 1, 1],
 'cat': [0, 0, 0, 1, 0, 1, 0, 0],
 'cca': [0, 0, 0, 1, 0, 1, 0, 1],
 'ccc': [0, 0, 0, 1, 0, 1, 1, 0],
 'ccg': [0, 0, 0, 1, 0, 1, 1, 1],
 'cct': [0, 0, 0, 1, 1, 0, 0, 0],
 'cga': [0, 0, 0, 1, 1, 0, 0, 1],
 'cgc': [0, 0, 0, 1, 1, 0, 1, 0],
 'cgg': [0, 0, 0, 1, 1, 0, 1, 1],
 'cgt': [0, 0, 0, 1, 1, 1, 0, 0],
 'cta': [0, 0, 0, 1, 1, 1, 0, 1],
 'ctc': [0, 0,

In [12]:
# Check a 57-position sequence against window size 3 and stride 1; the function prints its findings.
testIfSeqWindowAndStrideMatches(57, 3, 1)



n of windows: 55
last window start: 54
last window end: 56
max seq position: 56


In [13]:

seqTeste = 'gtactagagaactagtgcattagcttatttttttgttatcatgctaaccacccggcg'
windowSize, strideSize = 3, 1
seqSize = len(seqTeste)

# Every window of windowSize characters that ends inside the sequence,
# with consecutive starts strideSize apart.
listSubSeqs = [
    seqTeste[begin:begin + windowSize]
    for begin in range(0, seqSize - windowSize + 1, strideSize)
]

print(listSubSeqs)
print(seqSize)
print(len(listSubSeqs))










['gta', 'tac', 'act', 'cta', 'tag', 'aga', 'gag', 'aga', 'gaa', 'aac', 'act', 'cta', 'tag', 'agt', 'gtg', 'tgc', 'gca', 'cat', 'att', 'tta', 'tag', 'agc', 'gct', 'ctt', 'tta', 'tat', 'att', 'ttt', 'ttt', 'ttt', 'ttt', 'ttt', 'ttg', 'tgt', 'gtt', 'tta', 'tat', 'atc', 'tca', 'cat', 'atg', 'tgc', 'gct', 'cta', 'taa', 'aac', 'acc', 'cca', 'cac', 'acc', 'ccc', 'ccg', 'cgg', 'ggc', 'gcg']
57
55


In [None]:
def generateSubSeqsFromSeq(seq, windowSize = 3, strideSize = 1, printInfo = False):
    """Slice ``seq`` into sliding windows and return them as a list.

    Windows start at index 0 and advance by ``strideSize``; only windows that
    fit entirely inside ``seq`` are kept, so a sequence shorter than
    ``windowSize`` yields an empty list.

    Args:
        seq: Sliceable sequence (e.g. a nucleotide string).
        windowSize: Length of each window; must be >= 1.
        strideSize: Distance between consecutive window starts; must be >= 1.
        printInfo: When true, print the windows, ``len(seq)`` and the number
            of windows.

    Returns:
        List of slices ``seq[start:start + windowSize]``.

    Raises:
        ValueError: If ``windowSize`` or ``strideSize`` is smaller than 1.
    """
    # The arguments are honoured as passed; they used to be overwritten with
    # 3 and 1 at the top of the body, silently ignoring the caller's values.
    if windowSize < 1 or strideSize < 1:
        raise ValueError('windowSize and strideSize must be >= 1')

    seqSize = len(seq)
    listSubSeqs = [
        seq[start:start + windowSize]
        for start in range(0, seqSize - windowSize + 1, strideSize)
    ]

    if printInfo:
        print(listSubSeqs)
        print(seqSize)
        print(len(listSubSeqs))

    return listSubSeqs
    
    


In [21]:
# TODO: testar...
genMatrix = np.empty((0,2))
regionSize = 2
subseqIndex = 0
subSeqLength = len(listSubSeqs)
subSeqLastIndex = subSeqLength - 1
while subseqIndex <= (subSeqLastIndex - (regionSize - 1)):
    newColumnStart = subseqIndex
    newColumn = np.array([])
    while newColumnStart < subseqIndex + regionSize:
        newColumn = np.append(newColumn, listSubSeqs[newColumnStart])   #listSubSeqs[newColumnStart])
        newColumnStart = newColumnStart + 1
    genMatrix = np.vstack((genMatrix, newColumn))
    subseqIndex = subseqIndex + 1

genMatrix

array([['gta', 'tac'],
       ['tac', 'act'],
       ['act', 'cta'],
       ['cta', 'tag'],
       ['tag', 'aga'],
       ['aga', 'gag'],
       ['gag', 'aga'],
       ['aga', 'gaa'],
       ['gaa', 'aac'],
       ['aac', 'act'],
       ['act', 'cta'],
       ['cta', 'tag'],
       ['tag', 'agt'],
       ['agt', 'gtg'],
       ['gtg', 'tgc'],
       ['tgc', 'gca'],
       ['gca', 'cat'],
       ['cat', 'att'],
       ['att', 'tta'],
       ['tta', 'tag'],
       ['tag', 'agc'],
       ['agc', 'gct'],
       ['gct', 'ctt'],
       ['ctt', 'tta'],
       ['tta', 'tat'],
       ['tat', 'att'],
       ['att', 'ttt'],
       ['ttt', 'ttt'],
       ['ttt', 'ttt'],
       ['ttt', 'ttt'],
       ['ttt', 'ttt'],
       ['ttt', 'ttg'],
       ['ttg', 'tgt'],
       ['tgt', 'gtt'],
       ['gtt', 'tta'],
       ['tta', 'tat'],
       ['tat', 'atc'],
       ['atc', 'tca'],
       ['tca', 'cat'],
       ['cat', 'atg'],
       ['atg', 'tgc'],
       ['tgc', 'gct'],
       ['gct', 'cta'],
       ['ct

In [None]:
def organizeSubSeqs(listSubSeqs, regionSize = 2):
    """Arrange consecutive sub-sequences into overlapping rows.

    Row ``i`` of the result holds ``listSubSeqs[i]`` through
    ``listSubSeqs[i + regionSize - 1]``; one row is produced for every start
    index at which a full region fits. When no region fits, the empty
    ``(0, regionSize)`` array is returned unchanged.
    """
    genMatrix = np.empty((0, regionSize))
    for rowStart in range(len(listSubSeqs) - regionSize + 1):
        regionRow = np.array([])
        for subSeq in listSubSeqs[rowStart:rowStart + regionSize]:
            regionRow = np.append(regionRow, subSeq)
        genMatrix = np.vstack((genMatrix, regionRow))

    return genMatrix



In [39]:
num_lines, num_cols = genMatrix.shape
bitsMatrix = np.empty((0, 16))

# Encode one genMatrix row at a time: look up each sub-sequence in dict_test
# and lay the bit lists side by side to form a single row of bitsMatrix.
for subSeqRow in genMatrix:
    bitsLine = np.array([])
    for subSeq in subSeqRow:
        bitsLine = np.append(bitsLine, dict_test[subSeq])
    bitsMatrix = np.vstack((bitsMatrix, bitsLine))

bitsMatrix








    



array([[0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1.],
       [0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1.],
       [0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1.],
       [0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1.],
       [0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1.],
       [0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1.,

In [None]:
def genBitMatrixFromSubSeqMatrix(subSeqMatrix, dictSubSeqToArrayBit, colSize = 16):
    """Encode a matrix of sub-sequences as a matrix of bits.

    Each entry of a ``subSeqMatrix`` row is looked up in
    ``dictSubSeqToArrayBit`` and the resulting bit lists are laid side by
    side to form one output row. ``colSize`` is the width of the output and
    has to equal the total number of bits in an encoded row.
    """
    wordsPerRow = subSeqMatrix.shape[1]
    bitsMatrix = np.empty((0, colSize))  # TODO: width is still fixed by the caller

    for subSeqRow in subSeqMatrix:
        encodedRow = np.array([])
        for subSeq in subSeqRow[:wordsPerRow]:
            encodedRow = np.append(encodedRow, dictSubSeqToArrayBit[subSeq])
        bitsMatrix = np.vstack((bitsMatrix, encodedRow))

    return bitsMatrix

In [48]:
# Render the bit matrix as an 8-bit greyscale ('L') image: the 0/1 entries
# are scaled to 0/255 and cast to uint8 before being handed to PIL.
# (Calling Image.fromarray(bitsMatrix) on the unscaled matrix is left disabled.)
pixelValues = np.uint8(bitsMatrix * 255)
img = Image.fromarray(pixelValues, 'L')
img.save('teste00.png')
img.show()

Fizemos a primeira transformação de uma sequência de nucleotídeos em uma matriz que, supostamente, respeita a informação posicional da sequência. Os próximos passos são transformar esses trechos de código em métodos genéricos e gerar representações para o dataset todo. Depois, podemos implementar uma rede para ver se essa joça funciona de alguma forma.