In [107]:
"""This submodule handles conversion between DNA sequences and one-hot encoded sequences."""
import numpy as np
class CodeException(Exception):
    """Signal that a single code lies outside the defined code space.

    The offending code and, when supplied, the accepted codes are kept on the
    exception and exposed through read-only properties.
    """

    def __init__(self, invalid_code, valid_codes=None):
        self._invalid_code = invalid_code
        self._valid_codes = valid_codes
        pieces = [str(invalid_code), ' is not in defined space']
        # The list of accepted codes is appended only when the raiser gave one.
        if valid_codes is not None:
            pieces.append(",valid codes are " + str(valid_codes))
        super().__init__("".join(pieces))

    @property
    def invalid_code(self):
        """The code that was rejected."""
        return self._invalid_code

    @property
    def valid_codes(self):
        """The accepted codes, or None when they were not supplied."""
        return self._valid_codes
class SeqException(Exception):
    """Signal that a sequence contains at least one code outside the defined space."""

    def __init__(self, invalid_code, valid_codes=None):
        # Mention the accepted codes only when the raiser supplied them.
        if valid_codes is None:
            tail = ""
        else:
            tail = " ,valid codes are " + str(valid_codes)
        super().__init__("Seqeunce has a invalid code," + str(invalid_code) + tail)
class CodesConverter():
    """Convert between codes (e.g. nucleotides) and their vector encodings.

    The mapping is either supplied explicitly through ``code_vec_dictionay``
    or generated as a one-hot encoding from ``codes`` (default: A, T, C, G).

    Args:
        code_vec_dictionay: optional mapping of code -> vector. Its vectors
            must be pairwise distinct so the mapping can be inverted.
        codes: optional sequence of distinct codes used to build a one-hot
            mapping; ignored when ``code_vec_dictionay`` is given.
        is_case_sensitivity: when false (the default), codes are upper-cased
            both when stored and when looked up.

    Raises:
        ValueError: if the dictionary is not invertible or the codes are not
            unique (after case normalisation).
    """

    def __init__(self,code_vec_dictionay=None,codes=None,is_case_sensitivity=False):
        # Set first: _preprocess() reads this flag while normalising the keys.
        self._is_case_sensitivity = is_case_sensitivity
        if code_vec_dictionay is not None:
            if not CodesConverter.is_dict_inversible(code_vec_dictionay):
                raise ValueError("Diciotnary is not inversible")
            # Store keys in the same normalised form that lookups use.
            self._code_vec_dictionary = {
                self._preprocess(code): vec
                for code, vec in code_vec_dictionay.items()
            }
            # Two keys that differ only by case collapse into one entry.
            if len(self._code_vec_dictionary) != len(code_vec_dictionay):
                raise ValueError("Codes is not unique")
            self._codes = list(self._code_vec_dictionary)
        else:
            raw_codes = codes or self._default_code()
            self._codes = [self._preprocess(code) for code in raw_codes]
            if not CodesConverter.is_values_unique(self._codes):
                raise ValueError("Codes is not unique")
            self._code_vec_dictionary = self._codes_to_vecs_dictionay(self._codes)
        # Vectors are usually lists (unhashable), so the reverse mapping is
        # keyed by their normalised string form, exactly as vec2code looks up.
        self._vec_code_dictionary = {
            self._preprocess(vec): code
            for code, vec in self._code_vec_dictionary.items()
        }

    @staticmethod
    def is_values_unique(values):
        """Return True when ``values`` (hashable items) has no duplicates."""
        return len(values)==len(set(values))

    @staticmethod
    def is_dict_inversible(dict_):
        """Return True when no two keys of ``dict_`` share the same value.

        Values are compared by their string form because vectors are usually
        lists, which cannot be placed in a set.
        """
        return CodesConverter.is_values_unique([str(value) for value in dict_.values()])

    def _default_code(self):
        """Return the default code space."""
        return ['A', 'T', 'C', 'G']

    def _codes_to_vecs_dictionay(self, codes):
        """Build a code -> one-hot list mapping, in the order of ``codes``."""
        size = len(codes)
        return {
            code: [int(position == index) for position in range(size)]
            for index, code in enumerate(codes)
        }

    def _preprocess(self,value):
        """Return the dictionary key for ``value`` (string, upper-cased unless case sensitive)."""
        value = str(value)
        if self._is_case_sensitivity:
            return value
        return value.upper()

    def _element_convert(self,element,dictionary):
        """Look ``element`` up in ``dictionary``.

        Raises:
            CodeException: if the normalised element is not a key.
        """
        key = self._preprocess(element)
        if key not in dictionary:
            raise CodeException(str(element),list(dictionary.keys()))
        return dictionary[key]

    def _seq_convert(self,seq,element_convert_method):
        """Convert every element of ``seq`` with ``element_convert_method``.

        Raises:
            SeqException: if any element is rejected with a CodeException.
        """
        converted = []
        for code in seq:
            try:
                converted.append(element_convert_method(code))
            except CodeException as exp:
                # Chain the cause so the original lookup failure stays visible.
                raise SeqException(exp.invalid_code,exp.valid_codes) from exp
        return converted

    def code2vec(self, code):
        """Convert a single code to its vector encoding."""
        return self._element_convert(code,self._code_vec_dictionary)

    def vec2code(self, vec):
        """Convert a single vector encoding back to its code."""
        return self._element_convert(vec,self._vec_code_dictionary)

    def seq2vec(self,seq):
        """Convert a sequence of codes to a list of vector encodings."""
        return self._seq_convert(seq,self.code2vec)

    def vecs2seq(self,vecs,join=True):
        """Convert a sequence of vectors back to codes.

        Returns the codes joined into one string when ``join`` is true,
        otherwise the list of codes.
        """
        result = self._seq_convert(vecs,self.vec2code)
        if join:
            return "".join(result)
        return result
    def seqs2dnn_data(data, discard_dirty_sequence):
    """read and return valid sequnece's one-hot-encoding vector"""
    code_dim = 4
    vec_data = {}
    for name, seq in data.items():
        try:
            vec = codes2vec(seq)
            vec_data[name] = vec#np.array(vec).reshape(len(seq), code_dim)
        except SeqException as exception:
            if not discard_dirty_sequence:
                raise exception
    return vec_data
def code2vec(code):
    """Convert a single DNA code to its one-hot encoding.

    Matching is case-insensitive. The order of the encoding is A, T, C, G.

    Args:
        code: the nucleotide code; it is converted with ``str`` before
            matching.

    Returns:
        A new 4-element list with a single 1 at the position of the code.

    Raises:
        CodeException: if ``code`` is not one of A, T, C, G.
    """
    # Built per call so every caller receives its own mutable list.
    one_hot = {
        'A': [1, 0, 0, 0],
        'T': [0, 1, 0, 0],
        'C': [0, 0, 1, 0],
        'G': [0, 0, 0, 1],
    }
    key = str(code).upper()
    if key not in one_hot:
        # Pass the raw code: CodeException composes the message itself.
        raise CodeException(code, list(one_hot))
    return one_hot[key]
def vec2code(vector):
    """Convert a one-hot encoding to its DNA code.

    Args:
        vector: a list compared by equality against the four one-hot lists
            for A, T, C, G (in that order).

    Returns:
        The matching code: 'A', 'T', 'C' or 'G'.

    Raises:
        CodeException: if ``vector`` equals none of the four encodings.
    """
    codes = ['A', 'T', 'C', 'G']
    encodings = [
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
    ]
    for code, encoding in zip(codes, encodings):
        if vector == encoding:
            return code
    # Pass the raw vector: CodeException composes the message itself.
    raise CodeException(vector, encodings)

def codes2vec(codes):
    """Convert a DNA sequence to a list of one-hot encodings.

    Args:
        codes: iterable of single codes (e.g. a string such as 'ATCG').

    Returns:
        A list with one ``code2vec`` result per code, in input order.

    Raises:
        SeqException: if any code is rejected by ``code2vec``; the exception
            names the offending code and chains the original CodeException.
    """
    arr = []
    for code in codes:
        try:
            arr.append(code2vec(code))
        except CodeException as exp:
            # Report the offending code itself; SeqException builds the message.
            raise SeqException(code, exp.valid_codes) from exp
    return arr

def vec2codes(vector):
    """Convert a sequence of one-hot encodings to a list of DNA codes.

    Args:
        vector: iterable of one-hot encodings accepted by ``vec2code``.

    Returns:
        A list of codes (not joined into a string), in input order.

    Raises:
        SeqException: if any element is rejected by ``vec2code``; the
            exception names the offending element and chains the original
            CodeException.
    """
    arr = []
    for encoding in vector:
        try:
            arr.append(vec2code(encoding))
        except CodeException as exp:
            # Report the offending vector itself; SeqException builds the message.
            raise SeqException(encoding, exp.valid_codes) from exp
    return arr



In [108]:
# Round-trip check: encode a sequence with seq2vec, then decode it with vecs2seq.
converter = CodesConverter(is_case_sensitivity=True)
converter.vecs2seq(converter.seq2vec('ATTCGTTC'))

'ATTCGTTC'

In [109]:
# Encode two named sequences; the second argument (True) discards sequences with invalid codes.
seqs2dnn_data({'An':'ATTCGTTC','B':'AAA'},True)

{'An': [[1, 0, 0, 0],
  [0, 1, 0, 0],
  [0, 1, 0, 0],
  [0, 0, 1, 0],
  [0, 0, 0, 1],
  [0, 1, 0, 0],
  [0, 1, 0, 0],
  [0, 0, 1, 0]],
 'B': [[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0]]}

In [106]:
# NOTE(review): same call as above; the array output below presumably comes from an
# earlier run (cell In [106]) that still reshaped the result with numpy — confirm.
seqs2dnn_data({'An':'ATTCGTTC','B':'AAA'},True)

{'An': array([[1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0]]), 'B': array([[1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0]])}