In [1]:
import itertools
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

In [2]:
# utils 

def generateArrangementsOfsizeN(nucleotideAlphabetList, subSeqSize):
    """Return all words of length ``subSeqSize`` over the alphabet, sorted.

    Args:
        nucleotideAlphabetList: iterable of string symbols (e.g. ['A', 'C', 'G', 'T']).
        subSeqSize: number of symbols in each generated word.

    Returns:
        A list with the Cartesian-product words joined into strings, in
        ascending string order.
    """
    words = [
        ''.join(symbols)
        for symbols in itertools.product(nucleotideAlphabetList, repeat=subSeqSize)
    ]
    words.sort()
    return words


def intToBinaryCharArray(number, numOfBits):
    """Return the binary digits of ``number`` as a list of '0'/'1' characters.

    The digit string is left-padded with zeros up to ``numOfBits`` characters;
    it is never truncated, so larger numbers yield longer lists.
    """
    digits = bin(number)[2:]  # strip the leading '0b'
    padded = digits.zfill(numOfBits)
    return [char for char in padded]

def binaryCharArrayToBinaryIntArray(binaryCharArray):
    """Convert each element of ``binaryCharArray`` with ``int`` and return the list."""
    return [int(char) for char in binaryCharArray]


def numToBinaryArray(num, numOfBits):
    """Return ``num`` as a list of integer bits, zero-padded to ``numOfBits``.

    Composes intToBinaryCharArray (digits as characters) with
    binaryCharArrayToBinaryIntArray (characters to ints).
    """
    return binaryCharArrayToBinaryIntArray(intToBinaryCharArray(num, numOfBits))

def createDictForNucleotidesAlphabet(nucleotideAlphabet, subSeqSize, numOfBits):
    """Map every word of length ``subSeqSize`` to a fixed-width bit list.

    Words are produced by generateArrangementsOfsizeN (sorted order) and the
    word at position ``i`` receives the code ``i + 1`` written on
    ``numOfBits`` bits via numToBinaryArray.

    Args:
        nucleotideAlphabet: iterable of string symbols.
        subSeqSize: length of each word used as a dictionary key.
        numOfBits: width of every bit list stored as a value.

    Returns:
        dict mapping each word to a list of ``numOfBits`` integers (0/1).

    Raises:
        ValueError: if ``numOfBits`` cannot hold the largest code. The binary
            conversion pads but never truncates, so without this check the
            highest codes would silently get longer bit lists than the rest.
    """
    subSeqs = generateArrangementsOfsizeN(nucleotideAlphabet, subSeqSize)

    # Codes run from 1 to len(subSeqs); the largest one fixes the width needed.
    bitsNeeded = len(subSeqs).bit_length()
    if bitsNeeded > numOfBits:
        raise ValueError(
            'numOfBits=' + str(numOfBits) + ' is too small: ' + str(len(subSeqs))
            + ' codes need at least ' + str(bitsNeeded) + ' bits'
        )

    gen_dict = {}
    for i, subSeq in enumerate(subSeqs):
        gen_dict[subSeq] = numToBinaryArray(i + 1, numOfBits)

    return gen_dict

def generateSubSeqsFromSeq(seq, windowSize = 3, strideSize = 1, printInfo = False):
    """Slide a window over ``seq`` and return the slices that fit entirely.

    Args:
        seq: sliceable sequence (typically a string of nucleotides).
        windowSize: number of elements in each slice.
        strideSize: step between consecutive window starts; must be >= 1.
        printInfo: when True, print the slices, ``len(seq)`` and the number
            of slices.

    Returns:
        List of ``seq[start : start + windowSize]`` for start = 0, strideSize,
        2 * strideSize, ... while the window still ends inside ``seq``.
        Empty when ``seq`` is shorter than ``windowSize``.

    Raises:
        ValueError: if ``strideSize`` is smaller than 1 (the window would
            never advance, so the scan would not terminate).
    """
    if strideSize < 1:
        raise ValueError('strideSize must be >= 1, got ' + str(strideSize))

    seqSize = len(seq)

    # The last valid start is seqSize - windowSize; range's stop is exclusive.
    listSubSeqs = [
        seq[start : start + windowSize]
        for start in range(0, seqSize - windowSize + 1, strideSize)
    ]

    if printInfo:
        print(listSubSeqs)
        print(seqSize)
        print(len(listSubSeqs))

    return listSubSeqs

def organizeSubSeqs(listSubSeqs, regionSize = 2):
    """Arrange consecutive subsequences into overlapping rows.

    Row ``i`` of the result holds ``listSubSeqs[i : i + regionSize]``, so
    neighbouring rows share ``regionSize - 1`` entries.

    Args:
        listSubSeqs: sequence of subsequence strings.
        regionSize: number of consecutive subsequences per row.

    Returns:
        NumPy array of shape ``(len(listSubSeqs) - regionSize + 1, regionSize)``
        holding the subsequences, or an empty ``(0, regionSize)`` array when
        there are fewer than ``regionSize`` subsequences.
    """
    # Also serves as the shape check: a negative regionSize fails here.
    emptyMatrix = np.empty((0, regionSize))

    numRows = len(listSubSeqs) - regionSize + 1
    if numRows <= 0:
        return emptyMatrix

    # Collect all rows first and convert once; growing the array with
    # np.append/np.vstack inside the loop copies the whole matrix every time.
    rows = [list(listSubSeqs[first : first + regionSize]) for first in range(numRows)]

    return np.array(rows)


def genBitMatrixFromSubSeqMatrix(subSeqMatrix, dictSubSeqToArrayBit, colSize = 16):
    """Translate a matrix of subsequences into a matrix of bits.

    Every cell of ``subSeqMatrix`` is looked up in ``dictSubSeqToArrayBit`` and
    the bit lists of one row are concatenated left to right.

    Args:
        subSeqMatrix: 2-D NumPy array whose cells are dictionary keys.
        dictSubSeqToArrayBit: mapping from subsequence to its list of bits.
        colSize: expected number of bits per output row (columns of the row
            times the bits stored per dictionary entry).

    Returns:
        float64 array of shape ``(subSeqMatrix.shape[0], colSize)``; an empty
        ``(0, colSize)`` array when ``subSeqMatrix`` has no rows.

    Raises:
        KeyError: if a cell is not a key of ``dictSubSeqToArrayBit``.
        ValueError: if a row does not expand to exactly ``colSize`` bits.
    """
    num_lines = subSeqMatrix.shape[0]
    num_cols = subSeqMatrix.shape[1]

    if num_lines == 0:
        return np.empty((0, colSize))

    # Gather plain Python rows and allocate the array once at the end instead
    # of re-copying the whole matrix with np.vstack on every row.
    bitRows = []
    for l in range(num_lines):
        bitsLine = []
        for c in range(num_cols):
            bitsLine.extend(np.ravel(dictSubSeqToArrayBit[subSeqMatrix[l][c]]))

        if len(bitsLine) != colSize:
            raise ValueError(
                'row ' + str(l) + ' expands to ' + str(len(bitsLine))
                + ' bits but colSize is ' + str(colSize)
            )
        bitRows.append(bitsLine)

    return np.array(bitRows, dtype=np.float64)


def verifyIfSeqWindowAndStrideMatches(seqSize, windowSize, strideSize):
    """Print how windows of ``windowSize`` with step ``strideSize`` tile a sequence.

    Windows start at 0, strideSize, 2 * strideSize, ... The report stops at the
    first window whose end reaches or passes the last sequence position, so
    the configuration fits exactly when the printed 'last window end' equals
    the printed 'max seq position'; a larger value means that final window
    overshoots the sequence.

    Args:
        seqSize: length of the sequence.
        windowSize: length of each window.
        strideSize: step between window starts; must be >= 1.

    Raises:
        ValueError: if ``strideSize`` is smaller than 1 (the windows would
            never advance).
    """
    if strideSize < 1:
        raise ValueError('strideSize must be >= 1, got ' + str(strideSize))

    if windowSize >= seqSize:
        # The very first window already reaches the end of the sequence.
        lastWindowIndex = 0
    else:
        # Smallest k with k * strideSize + windowSize >= seqSize (integer ceil).
        lastWindowIndex = -(-(seqSize - windowSize) // strideSize)

    last_window_start = lastWindowIndex * strideSize
    nOfStrides = lastWindowIndex + 1

    print('n of windows: ' + str(nOfStrides))
    print('last window start: ' + str(last_window_start))
    print('last window end: ' + str(last_window_start + windowSize - 1))
    print('max seq position: ' + str(seqSize - 1))


def salvar_pickle(objeto, nome_arquivo):
    """Serialize ``objeto`` with pickle into the file ``nome_arquivo``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(nome_arquivo, 'wb') as handle:
        pickle.dump(objeto, handle)

def carregar_pickle(nome_arquivo):
    """Read the file ``nome_arquivo`` in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(nome_arquivo, 'rb') as handle:
        return pickle.load(handle)




In [4]:
# Read the H3 table and drop the 'Unnamed: 0' and 'seqSize' columns in one step.
h3df = pd.read_csv('h3_csv.csv').drop(columns=['Unnamed: 0', 'seqSize'])
h3df
# promoters.iloc[0,2]

# linesSize = promoters.shape[0]

# for line in range(linesSize):
#     promoters.iloc[line, 2] = promoters.iloc[line, 2].replace('\t', '')
#     if(promoters.iloc[line, 0] == '+'):
#         promoters.iloc[line, 0] = '1'
#     else:
#         promoters.iloc[line, 0] = '0'

# promoters

Unnamed: 0,seqName,sequence,class
0,>iYAL067W-A_5085,AATTTTTATAGGTCGACCCTTCTGTCGCTTACTGGGTTGATTATCT...,0
1,>iYAL067W-A_6786,AATTATATTTCCATCAGCTCAATACCGCAGTACTTTGAAACCTGAT...,0
2,>iYAL067W-A_7060,AACAATAGTGGGTATGAGTAAAGATATATAGATCGATATTTTGAAT...,0
3,>YAL067C_SEO1_8061,CAAAGATTTCAACCATAGTAGGTATGCCCCAGATGAAACATTACTG...,1
4,>YAL067C_SEO1_8340,CACAATAGCGTCAATAATAAAGTTCCATCTCCATCCCTCTAAACCA...,1
...,...,...,...
14958,>iYPR198W_936110,TCATAATTACAAAATACCCTGTGATTAGGATATCATTATTACTAAC...,0
14959,>iYPR198W_937594,ATATGTTTACAACAAAGTTATAAAAAGTTTTCTCAAACCTTTTCCA...,1
14960,>YPR200C_ARR2_939397,ATTATATTTAATTTAATAAGAAAAGAAACGAAAAAAAAAAAAAAAA...,1
14961,>YPR201W_ARR3_940099,TATAAATGAATGCTCTCGTTGTAATTCAAGAGAACCCAACCAACAA...,1


In [5]:
# Extract the 'sequence' and 'class' columns as NumPy arrays.
seqList = h3df.loc[:, 'sequence'].to_numpy()
classList = h3df.loc[:, 'class'].to_numpy()

#### calculando tamanhos

In [17]:
# Report the tiling for (seqSize, windowSize, strideSize) = (500, 3, 1); another setting to try: (500, 15, 5).
verifyIfSeqWindowAndStrideMatches(seqSize=500, windowSize=3, strideSize=1)

n of windows: 498
last window start: 497
last window end: 499
max seq position: 499


In [6]:
# Lookup table from every 3-letter word over A/C/G/T to an 8-bit code.
dictH3 = createDictForNucleotidesAlphabet(
    nucleotideAlphabet=['A','C','G','T'],
    subSeqSize=3,
    numOfBits=8,
)

In [14]:
# Spot-check one entry: with 3-letter keys, 'AGC' is the 10th word in sorted order, so it maps to 10 on 8 bits.
dictH3['AGC']

[0, 0, 0, 0, 1, 0, 1, 0]

In [9]:
# Encode every sequence as a bit matrix: slide a window over the sequence, pair
# consecutive windows into rows, then replace each window by its bit code.
WINDOW_SIZE = 4        # letters per window
STRIDE_SIZE = 2        # step between window starts
REGION_SIZE = 2        # consecutive windows per matrix row
BITS_PER_WINDOW = 9    # 4**4 = 256 codes numbered from 1, so 9 bits are required

# The lookup table must be keyed by words of WINDOW_SIZE letters. dictH3 holds
# 3-letter keys, so using it here raises KeyError on the first 4-letter window.
dictWindowToBits = createDictForNucleotidesAlphabet(['A','C','G','T'], WINDOW_SIZE, BITS_PER_WINDOW)

dictMatrixBySeq = {}
progressCounter = 0
for seq in seqList:
    subSeqsList = generateSubSeqsFromSeq(seq, WINDOW_SIZE, STRIDE_SIZE)
    subSeqMatrix = organizeSubSeqs(subSeqsList, REGION_SIZE)
    # Each row concatenates REGION_SIZE codes of BITS_PER_WINDOW bits.
    bitsMatrix = genBitMatrixFromSubSeqMatrix(subSeqMatrix, dictWindowToBits, colSize = REGION_SIZE * BITS_PER_WINDOW)
    # Keyed by the sequence text, so repeated sequences share a single entry.
    dictMatrixBySeq[seq] = bitsMatrix
    progressCounter = progressCounter + 1
    print(str(progressCounter) + '/' + str(len(seqList)) + ' sequências processadas')

dictMatrixBySeq

1/14963 sequências processadas
2/14963 sequências processadas
3/14963 sequências processadas
4/14963 sequências processadas
5/14963 sequências processadas
6/14963 sequências processadas
7/14963 sequências processadas
8/14963 sequências processadas
9/14963 sequências processadas
10/14963 sequências processadas
11/14963 sequências processadas
12/14963 sequências processadas
13/14963 sequências processadas
14/14963 sequências processadas
15/14963 sequências processadas
16/14963 sequências processadas
17/14963 sequências processadas
18/14963 sequências processadas
19/14963 sequências processadas
20/14963 sequências processadas
21/14963 sequências processadas
22/14963 sequências processadas
23/14963 sequências processadas
24/14963 sequências processadas
25/14963 sequências processadas
26/14963 sequências processadas
27/14963 sequências processadas
28/14963 sequências processadas
29/14963 sequências processadas
30/14963 sequências processadas
31/14963 sequências processadas
32/14963 sequênci

{'AATTTTTATAGGTCGACCCTTCTGTCGCTTACTGGGTTGATTATCTTGTGCTTTCTTAGTATCTATCACAAAGGAGACAAAATCGTTGATAAAAAGTGCATCAACATTCCCAGCCAGAAAATGCACATCATAAAGACATGTTATTCAAGAGCCACGACCGTCTTCAATTTATCTTTTATAAAAAACCCTTGTTCTACTGACAGGATGGAATAGATATTAAATATACATTTTGCATTTTTTTTTTTTTCTGTATTGAAGATTTGTATATGAAAGATGTTTATACATCAAATGCTTTGAATAAAGCCATCTTAATTTCAATTTCATGCCCTCCTTCACCGTTTTCTGTTGGTCTAGAGGTAGCTTGTTGTGGTCACTAATGAGAACTTAAATAGTTTTCAACTGCTGGTGGTAAATCAATAATTTATGTTCTTAACCTAACATTTGATGACCTTTGATGCGTTGGTTATGTTGAAGACAAATTGCCTCTAATCAGTTCCATT': array([[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 1., 0., 1.],
        [0., 1., 1., ..., 1., 0., 1.],
        ...,
        [0., 0., 1., ..., 1., 1., 0.],
        [0., 1., 0., ..., 1., 0., 1.],
        [0., 1., 1., ..., 0., 0., 0.]]),
 'AATTATATTTCCATCAGCTCAATACCGCAGTACTTTGAAACCTGATTTATATATTGCAGAACTTAATTAAAAGTACATTGTAGTTCAAAAAATAAATATCAAACTTTTGGACCCTCTCTTATTGCCTCCCAATTAATTAAAACATCTTTTCTTCCAATCTACAGGTTTGAAAAGGTAATAAGTAATATAAACTTGAGAACCAAAAAAAAAAAAAAAAAAATACTGATCCTTACAGGTTTTAAGGT

In [36]:
# NOTE(review): this looks up a 12-letter key, but the only dictH3 built in this notebook uses
# 3-letter keys, so on a fresh kernel this line raises KeyError. Presumably a leftover from a run
# where dictH3 was created with subSeqSize=12 — confirm, then rebuild that dictionary or drop the cell.
dictH3['AAAAAAAAAAAA']

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [11]:
# Save the first 15 encoded sequences as 8-bit grayscale PNGs (bit 1 -> 255, bit 0 -> 0).
outputDir = 'imagesH3S2W4'
os.makedirs(outputDir, exist_ok=True)  # img.save does not create missing folders

keyList = list(dictMatrixBySeq.keys())
# Slicing instead of indexing keeps this working when fewer than 15 sequences exist.
for key in keyList[:15]:
    bitsMatrix = dictMatrixBySeq[key]
    img = Image.fromarray(np.uint8(bitsMatrix * 255) , 'L')
    # The file name is the first 15 letters of the sequence; sequences sharing
    # that prefix overwrite each other.
    img.save(os.path.join(outputDir, key[0:15] + '.png'))

In [43]:
# Save the first 10 encoded sequences as 8-bit grayscale PNGs (bit 1 -> 255, bit 0 -> 0).
# NOTE(review): the folder name suggests stride 4 / window 12, while dictMatrixBySeq is
# filled with window 4 / stride 2 in this notebook — confirm the label matches the data.
outputDir = 'imagesH3S4W12'
os.makedirs(outputDir, exist_ok=True)  # img.save does not create missing folders

keyList = list(dictMatrixBySeq.keys())
# Slicing instead of indexing keeps this working when fewer than 10 sequences exist.
for key in keyList[:10]:
    bitsMatrix = dictMatrixBySeq[key]
    img = Image.fromarray(np.uint8(bitsMatrix * 255) , 'L')
    # The file name is the first 15 letters of the sequence; sequences sharing
    # that prefix overwrite each other.
    img.save(os.path.join(outputDir, key[0:15] + '.png'))

#### montando dataset

In [12]:
# Assemble the dataset to export: one bit matrix, one class label and the raw
# sequence text per distinct sequence, all in the same order.

# One pass over the table instead of one DataFrame.query per sequence. Keeping
# the first occurrence matches what query(...).iloc[0] returned for repeats.
classBySequence = (
    h3df.drop_duplicates(subset='sequence', keep='first')
        .set_index('sequence')['class']
        .to_dict()
)

sequences = list(dictMatrixBySeq.keys())

listSequence = list(sequences)
listMatrix = [dictMatrixBySeq[sequence] for sequence in sequences]
listClasses = [int(classBySequence[sequence]) for sequence in sequences]

# Stacking assumes every matrix has the same shape (equal-length sequences).
listMatrixNp = np.array(listMatrix)
listClassesNp = np.array(listClasses)

dictToExport = {
    'inputs': listMatrixNp,
    'outputs': listClassesNp,
    'sequences': listSequence,
}
    

In [13]:
# Persist the assembled dataset dictionary to disk.
salvar_pickle(objeto=dictToExport, nome_arquivo='dataset_H3_W4S2.pkl')