In [1]:
import itertools
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

In [2]:
# utils

def generateArrangementsOfsizeN(nucleotideAlphabetList, subSeqSize):
    """Return every string of length ``subSeqSize`` over the given alphabet.

    The strings are the Cartesian power of ``nucleotideAlphabetList`` (symbols
    may repeat), each joined into a single ``str``, and the resulting list is
    sorted in ascending order.

    Args:
        nucleotideAlphabetList: iterable of string symbols, e.g. ['a', 'c', 'g', 't'].
        subSeqSize: length of each generated string.

    Returns:
        Sorted list containing len(alphabet) ** subSeqSize strings.
    """
    return sorted(
        ''.join(symbols)
        for symbols in itertools.product(nucleotideAlphabetList, repeat=subSeqSize)
    )


def intToBinaryCharArray(number, numOfBits):
    """Return the binary digits of ``number`` as a list of '0'/'1' characters.

    The digits are ordered most-significant first and left-padded with '0' up
    to ``numOfBits`` characters. Values that need more than ``numOfBits``
    digits are not truncated, so the list can be longer than ``numOfBits``.
    """
    digits = bin(number)[2:]          # drop the '0b' prefix added by bin()
    padded = digits.zfill(numOfBits)
    return [char for char in padded]

def binaryCharArrayToBinaryIntArray(binaryCharArray):
    """Convert an iterable of digit characters into a list of ints.

    Each element is passed through ``int()``, so ['0', '1', '1'] becomes
    [0, 1, 1]. An element that ``int()`` cannot parse raises ValueError.
    """
    return [int(char) for char in binaryCharArray]


def numToBinaryArray(num, numOfBits):
    """Return ``num`` as a list of 0/1 ints, most-significant bit first.

    The list is left-padded with zeros to ``numOfBits`` entries; it is longer
    than ``numOfBits`` when ``num`` needs more digits than that.
    """
    return binaryCharArrayToBinaryIntArray(intToBinaryCharArray(num, numOfBits))

def createDictForNucleotidesAlphabet(nucleotideAlphabet, subSeqSize, numOfBits):
    """Map every sub-sequence of length ``subSeqSize`` to a bit-list code.

    Sub-sequences are enumerated in sorted order and numbered starting at 1
    (so the all-zero pattern is never assigned); each number is encoded with
    ``numToBinaryArray(code, numOfBits)``.

    Args:
        nucleotideAlphabet: iterable of string symbols.
        subSeqSize: length of the sub-sequences used as keys.
        numOfBits: minimum width of every bit list.

    Returns:
        dict mapping sub-sequence string -> list of 0/1 ints.
    """
    orderedSubSeqs = generateArrangementsOfsizeN(nucleotideAlphabet, subSeqSize)
    return {
        subSeq: numToBinaryArray(code, numOfBits)
        for code, subSeq in enumerate(orderedSubSeqs, start=1)
    }

def generateSubSeqsFromSeq(seq, windowSize = 3, strideSize = 1, printInfo = False):
    """Slide a fixed-size window over ``seq`` and collect the slices.

    Windows start at 0, strideSize, 2 * strideSize, ... and only windows that
    fit entirely inside ``seq`` are kept, so trailing characters that do not
    fill a whole window are dropped.

    Args:
        seq: sliceable sequence (typically a str).
        windowSize: number of items in each window.
        strideSize: distance between consecutive window starts; must be >= 1.
        printInfo: when True, print the windows, len(seq) and the window count.

    Returns:
        List of slices ``seq[start : start + windowSize]``.

    Raises:
        ValueError: if ``strideSize`` is smaller than 1. The window would never
            advance, which previously made the loop run forever.
    """
    if strideSize < 1:
        raise ValueError('strideSize must be >= 1, got ' + str(strideSize))

    seqSize = len(seq)
    # The last valid start is seqSize - windowSize; range() excludes its stop value.
    listSubSeqs = [
        seq[start : start + windowSize]
        for start in range(0, seqSize - windowSize + 1, strideSize)
    ]

    if printInfo:
        print(listSubSeqs)
        print(seqSize)
        print(len(listSubSeqs))

    return listSubSeqs

def organizeSubSeqs(listSubSeqs, regionSize = 2):
    """Group consecutive sub-sequences into overlapping rows.

    Row ``i`` of the result holds ``listSubSeqs[i], ..., listSubSeqs[i + regionSize - 1]``;
    the row start advances one position at a time.

    Args:
        listSubSeqs: list of sub-sequences (e.g. from generateSubSeqsFromSeq).
        regionSize: number of consecutive sub-sequences per row.

    Returns:
        NumPy array with one row per group. When ``listSubSeqs`` has fewer than
        ``regionSize`` items, the untouched empty array of shape
        (0, regionSize) is returned.
    """
    genMatrix = np.empty((0, regionSize))
    numRows = len(listSubSeqs) - regionSize + 1

    for rowStart in range(numRows):
        rowValues = np.array([])
        for subSeq in listSubSeqs[rowStart : rowStart + regionSize]:
            rowValues = np.append(rowValues, subSeq)
        genMatrix = np.vstack((genMatrix, rowValues))

    return genMatrix


def genBitMatrixFromSubSeqMatrix(subSeqMatrix, dictSubSeqToArrayBit, colSize = 16):
    """Replace every sub-sequence in a 2-D matrix by its bit code.

    For each row, the bit lists of all its entries are concatenated left to
    right into one flat row; the rows are then stacked.

    Args:
        subSeqMatrix: 2-D array of sub-sequence keys.
        dictSubSeqToArrayBit: mapping from sub-sequence to its list of bits.
        colSize: width of each output row. It has to equal the total number of
            bits produced per input row, otherwise np.vstack raises.

    Returns:
        Array with one row of bits per input row (shape (0, colSize) when the
        input has no rows).
    """
    rowCount = subSeqMatrix.shape[0]
    colCount = subSeqMatrix.shape[1]
    bitsMatrix = np.empty((0, colSize))

    for rowIdx in range(rowCount):
        rowBits = np.array([])
        for colIdx in range(colCount):
            code = dictSubSeqToArrayBit[subSeqMatrix[rowIdx][colIdx]]
            rowBits = np.append(rowBits, code)
        bitsMatrix = np.vstack((bitsMatrix, rowBits))

    return bitsMatrix


def verifyIfSeqWindowAndStrideMatches(seqSize, windowSize, strideSize):
    """Print where the last window lands for a given sequence/window/stride.

    Window starts are stepped by ``strideSize`` until a window's last index
    reaches or passes the last index of the sequence. Four lines are printed:
    the iteration count, the start and end index of the last window visited,
    and the last valid sequence index. Comparing the last two shows whether the
    windows end exactly at the end of the sequence or overshoot it.

    When the very first window already reaches the end
    (windowSize >= seqSize) the loop body never runs and the reported count is 0.

    Returns:
        None; the result is only printed.
    """
    windowStart = 0
    stepCount = 0
    lastSeqIndex = seqSize - 1

    while windowStart + windowSize - 1 < lastSeqIndex:
        # The first pass keeps the start at 0; later passes advance by one stride.
        windowStart = strideSize * stepCount
        stepCount += 1

    windowEnd = windowStart + windowSize - 1
    print(f'n of windows: {stepCount}')
    print(f'last window start: {windowStart}')
    print(f'last window end: {windowEnd}')
    print(f'max seq position: {lastSeqIndex}')


def salvar_pickle(objeto, nome_arquivo):
    """Serialize ``objeto`` with pickle into the file at ``nome_arquivo``.

    The file is opened in binary write mode, so an existing file is overwritten.
    """
    with open(nome_arquivo, mode='wb') as destino:
        pickle.dump(objeto, destino)

def carregar_pickle(nome_arquivo):
    """Load and return the object pickled in the file at ``nome_arquivo``.

    NOTE(review): pickle.load can execute arbitrary code while unpickling;
    only use this on files produced by a trusted source.
    """
    with open(nome_arquivo, mode='rb') as origem:
        return pickle.load(origem)




In [3]:
# Load the comma-separated promoters file.
promoters = pd.read_table('promoters.data', delimiter= ',')

linesSize = promoters.shape[0]

# Columns are addressed by position, as before: 0 = class label, 2 = sequence text.
# Strip the tab characters embedded in the sequence field (whole column at once).
promoters.iloc[:, 2] = promoters.iloc[:, 2].str.replace('\t', '', regex=False)
# Recode the label: '+' becomes '1', anything else becomes '0' (kept as strings).
promoters.iloc[:, 0] = promoters.iloc[:, 0].map(lambda label: '1' if label == '+' else '0')

promoters

Unnamed: 0,class,instance name,sequence
0,1,S10,tactagcaatacgcttgcgttcggtggttaagtatgtataatgcgc...
1,1,AMPC,tgctatcctgacagttgtcacgctgattggtgtcgttacaatctaa...
2,1,AROH,gtactagagaactagtgcattagcttatttttttgttatcatgcta...
3,1,DEOP2,aattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaata...
4,1,LEU1_TRNA,tcgataattaactattgacgaaaagctgaaaaccactagaatgcgc...
...,...,...,...
101,0,799,cctcaatggcctctaaacgggtcttgaggggttttttgctgaaagg...
102,0,987,gtattctcaacaagattaaccgacagattcaatctcgtggatggac...
103,0,1226,cgcgactacgatgagatgcctgagtgcttccgttactggattgtca...
104,0,794,ctcgtcctcaatggcctctaaacgggtcttgaggggttttttgctg...


In [4]:
# Extract the sequence and class columns as NumPy arrays for the encoding steps.
seqList, classList = (
    promoters[column].to_numpy() for column in ('sequence', 'class')
)

In [8]:
# Code book for every 6-letter word over {a, c, g, t}: 4**6 = 4096 entries,
# numbered from 1 and written as bit lists padded to 16 bits.
dictProm = createDictForNucleotidesAlphabet(
    nucleotideAlphabet=['a','c','g','t'],
    subSeqSize=6,
    numOfBits=16,
)

In [9]:
# Encode every sequence as a bit matrix, keyed by the sequence string itself
# (identical sequences therefore share a single entry).
dictMatrixBySeq = {}
for seq in seqList:
    # 6-letter windows taken every 3 letters.
    subSeqsList = generateSubSeqsFromSeq(seq, windowSize=6, strideSize=3)
    # Rows of 2 consecutive windows (default regionSize).
    subSeqMatrix = organizeSubSeqs(subSeqsList)
    # 2 windows per row x 16 bits per window = 32 columns.
    bitsMatrix = genBitMatrixFromSubSeqMatrix(subSeqMatrix, dictProm, colSize=32)
    dictMatrixBySeq[seq] = bitsMatrix

dictMatrixBySeq

{'tactagcaatacgcttgcgttcggtggttaagtatgtataatgcgcgggcttgtcgt': array([[0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1.,
         0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0.,
         0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
         0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1.],
        [0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 1., 

In [6]:
# Write each bit matrix as an 8-bit grayscale PNG (bit 0 -> 0, bit 1 -> 255).
outputDir = 'imagesHumanDataSetMatrix'
# Image.save does not create missing directories, so make sure the target
# exists before writing; a fresh checkout would otherwise fail here.
os.makedirs(outputDir, exist_ok=True)

keyList = list(dictMatrixBySeq.keys())
for key in keyList:
    bitsMatrix = dictMatrixBySeq[key]
    img = Image.fromarray(np.uint8(bitsMatrix * 255) , 'L')
    # NOTE(review): the file name uses only the first 15 characters of the
    # sequence, so sequences sharing that prefix overwrite each other's image.
    img.save(outputDir + '/' + key[0:15] + '.png')

#### montando dataset

In [10]:
# Assemble the exportable dataset: bit matrices, integer class labels and the
# originating sequences, all in the iteration order of dictMatrixBySeq.

# Build the sequence -> class lookup once instead of running a string-built
# DataFrame.query per sequence. setdefault keeps the first occurrence, matching
# the previous "first matching row" behaviour for duplicated sequences.
classBySequence = {}
for seqText, classLabel in zip(promoters['sequence'], promoters['class']):
    classBySequence.setdefault(seqText, classLabel)

sequences = list(dictMatrixBySeq.keys())

listSequence = list(sequences)
listMatrix = [dictMatrixBySeq[sequence] for sequence in sequences]
# Labels are stored as the strings '1'/'0' in the frame; convert to int here.
listClasses = [int(classBySequence[sequence]) for sequence in sequences]

# NOTE(review): np.array stacks the matrices into one array only if they all
# share the same shape, i.e. all sequences have the same length - confirm.
listMatrixNp = np.array(listMatrix)
listClassesNp = np.array(listClasses)

dictToExport = {
    'inputs': listMatrixNp,
    'outputs': listClassesNp,
    'sequences': listSequence,
}
    

In [11]:
# Persist the assembled dataset to disk as a pickle file.
salvar_pickle(objeto=dictToExport, nome_arquivo='dataset_prom_16v1.pkl')