In [14]:
import re
import string
import math

# Standard genetic code: maps each three-letter RNA codon (written with U,
# not T) to a one-letter amino-acid code. The three stop codons
# (UAA, UAG, UGA) map to "*".
standard_code = {
     "UUU": "F", "UUC": "F", "UUA": "L", "UUG": "L", "UCU": "S",
     "UCC": "S", "UCA": "S", "UCG": "S", "UAU": "Y", "UAC": "Y",
     "UAA": "*", "UAG": "*", "UGA": "*", "UGU": "C", "UGC": "C",
     "UGG": "W", "CUU": "L", "CUC": "L", "CUA": "L", "CUG": "L",
     "CCU": "P", "CCC": "P", "CCA": "P", "CCG": "P", "CAU": "H",
     "CAC": "H", "CAA": "Q", "CAG": "Q", "CGU": "R", "CGC": "R",
     "CGA": "R", "CGG": "R", "AUU": "I", "AUC": "I", "AUA": "I",
     "AUG": "M", "ACU": "T", "ACC": "T", "ACA": "T", "ACG": "T",
     "AAU": "N", "AAC": "N", "AAA": "K", "AAG": "K", "AGU": "S",
     "AGC": "S", "AGA": "R", "AGG": "R", "GUU": "V", "GUC": "V",
     "GUA": "V", "GUG": "V", "GCU": "A", "GCC": "A", "GCA": "A",
     "GCG": "A", "GAU": "D", "GAC": "D", "GAA": "E", "GAG": "E",
     "GGU": "G", "GGC": "G", "GGA": "G", "GGG": "G"}

# Kyte-Doolittle hydropathy value for each of the 20 one-letter amino-acid
# codes. NOTE: the values are stored as strings, so they must be converted
# with float() before any arithmetic.
kyte_doolittle = {
    "A": "1.8", "R": "-4.5", "N": "-3.5", "D": "-3.5", "C": "2.5", 
    "Q": "-3.5", "E": "-3.5", "G": "-0.4", "H": "-3.2", "I": "4.5", 
    "L": "3.8", "K": "-3.9", "M": "1.9", "F": "2.8", "P": "-1.6", 
    "S": "-0.8", "T": "-0.7", "W": "-0.9", "Y": "-1.3", "V": "4.2"}

In [2]:
## Seq class

class Seq:
    """Generic sequence record: a sequence string plus gene and species labels.

    Attributes:
        sequence: the input text with surrounding whitespace stripped and
            letters upper-cased. Characters inside the sequence (including
            interior spaces or digits) are kept as they are.
        gene: gene name supplied by the caller.
        species: species name supplied by the caller.
        kmers: list of overlapping substrings; empty until make_kmers() runs.
    """

    def __init__ (self, sequence, gene, species, kmers=''):
        """Store the normalized sequence and its labels.

        Args:
            sequence: raw sequence text.
            gene: gene name.
            species: species name.
            kmers: accepted for call compatibility but not used; the
                instance always starts with an empty kmers list.
        """
        self.sequence = sequence.strip().upper()
        self.gene = gene
        self.species = species
        self.kmers = []

    def __str__(self):
        """Return the stored sequence string."""
        return self.sequence

    def print_record(self):
        """Print the record as 'species, gene : sequence'."""
        print(self.species +", "+ self.gene +" : "+ self.sequence)

    def make_kmers(self, subsequence_length = 3):
        """Fill self.kmers with every overlapping window of the sequence.

        Args:
            subsequence_length: window size; must be at least 1.

        Raises:
            ValueError: if subsequence_length is smaller than 1. The check
                happens before self.kmers is touched, so a bad call never
                leaves a partially built list behind.

        A window size larger than the sequence yields an empty list.
        """
        if subsequence_length < 1:
            raise ValueError("subsequence_length must be at least 1")
        # Number of start positions that still leave room for a full window;
        # range() treats a negative count as empty.
        window_count = len(self.sequence) - subsequence_length + 1
        self.kmers = [self.sequence[start:start + subsequence_length]
                      for start in range(window_count)]

    def fasta(self):
        """Return the record as FASTA text.

        The result starts with a newline, followed by a '>species gene'
        header line and then the sequence on its own line.
        """
        return "\n>" + self.species + " " + self.gene + "\n" + self.sequence
            
    
    
        

In [3]:
## Testing class Seq
# Smoke test: the input deliberately has leading spaces, mixed case, an
# interior space and a digit. Seq.__init__ only strips the ends and
# upper-cases, so the interior space and the '9' stay in the sequence.

myseq=Seq("    aT 9ATAG","my_gene","H.sapiens")
myseq.print_record()
print(myseq.gene)
print(myseq.species)
# print() goes through Seq.__str__, which returns the stored sequence.
print(myseq)    
# Build overlapping windows of length 3 and show them.
myseq.make_kmers(3)
print(myseq.kmers)
print(myseq.fasta())

H.sapiens, my_gene : AT 9ATAG
my_gene
H.sapiens
AT 9ATAG
['AT ', 'T 9', ' 9A', '9AT', 'ATA', 'TAG']

>H.sapiens my_gene
AT 9ATAG


In [4]:
## DNA class

class DNA(Seq):
    """Nucleotide sequence record with a gene identifier.

    Any character of the sequence other than A, T, G, C or U is replaced
    by 'N'.

    Attributes:
        gene_id: identifier supplied by the caller.
    """

    def __init__(self, sequence, gene, species, gene_id, **kwargs):
        """Store the labels and the sanitized sequence.

        Args:
            sequence: raw sequence text.
            gene: gene name.
            species: species name.
            gene_id: identifier for the record.
            **kwargs: accepted and ignored.
        """
        super().__init__(sequence, gene, species)
        # Sanitize the sequence the base class already stored, not the raw
        # argument: using the raw text would discard the base-class
        # normalization and turn lower-case bases and padding into 'N'.
        self.sequence = re.sub('[^ATGCU]', 'N', self.sequence)
        self.gene_id = gene_id

    def analysis(self):
        """Return how many characters of the sequence are 'G' or 'C'."""
        return sum(1 for base in self.sequence if base in ('G', 'C'))

    def print_info(self):
        """Print the record as 'gene_id species, gene : sequence'."""
        print(self.gene_id + " " + self.species +", "+ self.gene +" : "+ self.sequence)

In [5]:



## Testing class DNA
# Smoke test: build a DNA record, print it, and count its G/C bases.

d=DNA("GATCTC","my_dna","D.terebrans","AX5667.2")
d.print_record()
print(d)

# 'source' is not set by the class; it is attached to this one instance here.
d.source="Mexico"
print(d.source)
# analysis() returns the number of 'G' and 'C' characters in the sequence.
print(d.analysis())
d.print_info()

D.terebrans, my_dna : GATCTC
GATCTC
Mexico
3
AX5667.2 D.terebrans, my_dna : GATCTC


In [6]:
class RNA(DNA):
    """RNA record: a DNA record whose 'T' characters are rewritten as 'U'.

    Attributes:
        codons: list filled by make_codons() with three-letter codons and
            replaced by translate() with one-letter amino-acid codes.
    """

    def __init__(self, sequence, gene, species, gene_id, codons='', **kwargs):
        """Store the labels and the transcribed sequence.

        Args:
            sequence: raw sequence text.
            gene: gene name.
            species: species name.
            gene_id: identifier for the record.
            codons: accepted for call compatibility but not used; the
                instance always starts with an empty codons list.
            **kwargs: accepted and ignored.
        """
        super().__init__(sequence, gene, species, gene_id)
        # Start from the sequence prepared by the parent initializer rather
        # than the raw argument, so its clean-up is not thrown away.
        self.sequence = self.sequence.replace('T', 'U')
        self.codons = []

    def make_codons(self):
        """Split the sequence into consecutive, non-overlapping triplets.

        Leftover characters at the end that do not form a complete triplet
        are dropped. The result replaces self.codons.
        """
        full_codons = len(self.sequence) // 3
        self.codons = [self.sequence[3 * index:3 * index + 3]
                       for index in range(full_codons)]

    def translate(self):
        """Replace every codon in self.codons with its amino-acid letter.

        Call make_codons() first; with an empty codons list this does
        nothing. The lookup uses the module-level standard_code table.

        Raises:
            KeyError: if an entry of self.codons is not a key of
                standard_code (for example a codon containing 'N', or an
                entry that was already translated). The list is only
                replaced after every lookup has succeeded, so a failure
                leaves self.codons unchanged.
        """
        self.codons = [standard_code[codon] for codon in self.codons]
            
        

In [7]:
# Smoke test for RNA: 19 bases, so make_codons() yields six triplets and
# the final leftover base is not part of any codon.
r = RNA("GATCTCGATCTCGATCTAA","my_dna","D.terebrans","AX5667.2")
r.print_record()
r.make_codons()
print(r.codons)
# translate() overwrites r.codons with amino-acid letters.
r.translate()
print(r.codons)

D.terebrans, my_dna : GAUCUCGAUCUCGAUCUAA
['GAU', 'CUC', 'GAU', 'CUC', 'GAU', 'CUA']
['D', 'L', 'D', 'L', 'D', 'L']


In [12]:
class Protein(Seq):
    """Protein sequence record with residue counting and a hydropathy scan.

    Attributes:
        counts: dictionary filled by tabulate_amino_acids(); empty until then.
    """

    # Template of residue letters. tabulate_amino_acids() uses only the
    # keys; the string "0" values are kept unchanged.
    aa_counts = {"H": "0", "F": "0", "R": "0", "P": "0", "N": "0",
                 "I": "0", "T": "0", "C": "0", "Y": "0", "E": "0",
                 "L": "0", "W": "0", "Q": "0", "A": "0", "S": "0",
                 "K": "0", "V": "0", "G": "0", "D": "0", "U": "0",
                 "M": "0", "O": "0"}

    def __init__(self, sequence, gene, species, kmers='', counts='', **kwargs):
        """Store the labels and the sanitized sequence.

        Args:
            sequence: raw sequence text.
            gene: gene name.
            species: species name.
            kmers: forwarded to Seq.__init__.
            counts: accepted for call compatibility but not used; the
                instance always starts with an empty counts dictionary.
            **kwargs: accepted and ignored.
        """
        super().__init__(sequence, gene, species, kmers)
        # Sanitize the sequence stored by the base class, not the raw
        # argument, so the base-class normalization is kept.
        # NOTE(review): the '+' collapses each run of characters outside
        # A-Z into a single 'X', which shortens the sequence; confirm
        # that this is intended.
        self.sequence = re.sub('[^A-Z]+', 'X', self.sequence)
        self.counts = {}

    def tabulate_amino_acids(self):
        """Count the residues of self.sequence into self.counts.

        self.counts is rebuilt on every call. It holds an integer for each
        letter in aa_counts (0 when absent from the sequence) plus an entry
        for any other character that occurs in the sequence, such as 'X'.

        Returns:
            0, as the previous placeholder implementation did.
        """
        tally = {residue: 0 for residue in self.aa_counts}
        for residue in self.sequence:
            tally[residue] = tally.get(residue, 0) + 1
        self.counts = tally
        return 0

    def total_hydro(self, subsequence):
        """Return the summed hydropathy value of the given residues.

        Values come from the module-level kyte_doolittle table, whose
        entries are strings and are converted with float(). Residues that
        have no entry in the table add nothing to the sum.

        Args:
            subsequence: string of one-letter residue codes.

        Returns:
            The sum as a float, or 0 for an empty subsequence.
        """
        return sum(float(kyte_doolittle[residue])
                   for residue in subsequence
                   if residue in kyte_doolittle)

    def hydro_scan(self):
        """Return the average hydropathy of every k-mer in self.kmers.

        If self.kmers is empty, k-mers of length 5 are generated first.
        Each average is total_hydro(kmer) divided by the k-mer length, so
        residues without a table entry still count towards the length.

        Returns:
            A list with one float per k-mer, in the order of self.kmers.
        """
        if self.kmers == []:
            self.make_kmers(5)
        hydro_list = []
        for kmer in self.kmers:
            # Guard against an empty string placed in self.kmers by hand.
            kmer_average = self.total_hydro(kmer) / len(kmer) if kmer else 0.0
            hydro_list.append(kmer_average)
        return hydro_list
        

In [13]:
# Smoke test for Protein: same messy input as the Seq test.
p = Protein("    aT 9ATAG","my_gene","H.sapiens")

# Run the tabulation, then show the instance's counts dictionary.
p.tabulate_amino_acids()
print(p.counts)













{'first': 'A', 'Second': 'B', 'third': 'C'}
