In [3]:
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import geneddit as ge
from collections import Counter

ModuleNotFoundError: No module named 'pandas'

In [None]:
def readFASTA(filename):
    """Read a FASTA-format file and return a ``(header, sequence)`` pair.

    Every line is stripped of surrounding whitespace and blank lines are
    discarded. If the first remaining line starts with ``'>'`` it is returned
    as the header and all following lines are concatenated into the sequence;
    otherwise the header is ``''`` and every line is part of the sequence.

    An empty (or whitespace-only) file yields ``('', '')``.

    Note: only the first line is treated as a header. In a multi-record file
    the later ``'>'`` lines are concatenated into the sequence as-is.
    """
    with open(filename) as handle:
        # Strip the newline/padding from each line and keep only non-empty ones.
        lines = [stripped for stripped in (raw.strip() for raw in handle) if stripped]

    if not lines:
        # Nothing to index into: avoid the IndexError that lines[0] would raise.
        return '', ''

    if lines[0].startswith('>'):
        # Header present: everything after it is sequence data.
        return lines[0], ''.join(lines[1:])
    # Bare sequence file without a header line.
    return '', ''.join(lines)

# Iterating over the file object returned by 'open' yields one line at a time,
# so each line of the file becomes one element of the list built in readFASTA.
# readFASTA then returns the header and the joined sequence as a pair of strings.

# Nucleotide alphabets, one letter per base.
basesDNA = 'ATGC'
basesRNA = 'AUGC'

# One-letter amino-acid codes (the same 20 letters used as keys in trans123 and aa_masses).
aa_residues   = "ACDEFGHIKLMNPQRSTVWY"

# Base-pairing table for DNA: each DNA base maps to its DNA partner.
complementDNA = { 'A':'T', 'T':'A', 'G':'C', 'C':'G'}
# Keys are DNA bases, values are the pairing RNA bases (A pairs with U instead of T).
complementRNA = { 'A':'U', 'T':'A', 'G':'C', 'C':'G'}

# RNA codon (three letters over AUGC) -> one-letter amino acid, or 'STOP' for the three stop codons.
gencode = {'UUU': 'F', 'UUC': 'F', 'UUA': 'L', 'UUG': 'L', 'UCU': 'S',
     'UCC': 'S', 'UCA': 'S', 'UCG': 'S', 'UAU': 'Y', 'UAC': 'Y',
     'UGU': 'C', 'UGC': 'C', 'UGG': 'W', 'CUU': 'L', 'CUC': 'L',
     'CUA': 'L', 'CUG': 'L', 'CCU': 'P', 'CCC': 'P', 'CCA': 'P',
     'CCG': 'P', 'CAU': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AUU': 'I',
     'AUC': 'I', 'AUA': 'I', 'AUG': 'M', 'ACU': 'T', 'ACC': 'T',
     'ACA': 'T', 'ACG': 'T', 'AAU': 'N', 'AAC': 'N', 'AAA': 'K',
     'AAG': 'K', 'AGU': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V', 'GCU': 'A',
     'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAU': 'D', 'GAC': 'D',
     'GAA': 'E', 'GAG': 'E', 'GGU': 'G', 'GGC': 'G', 'GGA': 'G',
     'GGG': 'G', 'UAA': 'STOP', 'UAG': 'STOP', 'UGA': 'STOP'}

# One-letter amino-acid code -> three-letter abbreviation.
trans123 = {'A': 'Ala', 'C': 'Cys', 'E': 'Glu', 'D': 'Asp', 'G': 'Gly',
            'F': 'Phe', 'I': 'Ile', 'H': 'His', 'K': 'Lys', 'M': 'Met',
            'L': 'Leu', 'N': 'Asn', 'Q': 'Gln', 'P': 'Pro', 'S': 'Ser',
            'R': 'Arg', 'T': 'Thr', 'W': 'Trp', 'V': 'Val', 'Y': 'Tyr'}

# Mass contributed by each amino acid, keyed by one-letter code; summed by mass().
# NOTE(review): presumably average residue masses in g/mol (a later cell prints
# the sums with that unit) -- confirm the source of these values.
aa_masses = {'A': 71.0788, 
             'C': 103.1448, 
             'E': 129.1155, 
             'D': 115.0886, 
             'G': 57.0519, 
             'F': 147.1766, 
             'I': 113.1594, 
             'H': 137.1411, 
             'K': 128.1741, 
             'M': 131.1926, 
             'L': 113.1594, 
             'N': 114.1038, 
             'Q': 128.1307, 
             'P': 97.11667, 
             'S': 87.0782, 
             'R': 156.1875, 
             'T': 101.1051, 
             'W': 186.2132, 
             'V': 99.1326, 
             'Y': 163.176
             }

# Property class name -> string of the one-letter codes belonging to that class.
# NOTE(review): 'D' is listed twice in 'polar'; harmless for membership tests,
# but it would be double-counted by anything that iterates over the string.
aa_classes = { 'small'       : 'PCAGVTDSN',
    'tiny'        : 'AGCS',
    'aliphatic'   : 'ILV',
    'aromatic'    : 'FYWH',
    'positive'    : 'KHR',
    'negative'    : 'DE',
    'charged'     : 'KHRDE',
    'hydrophobic' : 'CAGIVLTMHYWF',
    'polar'       : "CSTNDYWHKQRDE",
    'proline'     : 'P'}


def complement(seq):
    """Return the base-by-base DNA complement of ``seq``.

    Each character is looked up in ``complementDNA`` and the results are
    joined in the original order (the strand is not reversed). A character
    that is not a key of ``complementDNA`` raises ``KeyError``.
    """
    return ''.join(complementDNA[base] for base in seq)

def prot_noncod(seq):
    """Translate a DNA strand into a protein, complementing it before transcription.

    Steps:
      1. ``seq`` is complemented base-by-base with ``complement``.
      2. That complement is transcribed through ``complementRNA``.
      3. The mRNA is read in consecutive, non-overlapping codons starting at
         position 0 and each codon is looked up in ``gencode``.

    Returns the translated residues joined with ``'-'`` (stop codons appear
    as ``'STOP'``; translation does not halt at them). An empty input yields
    ``''``.

    Trailing bases that do not form a complete codon are ignored; previously
    any length not divisible by three raised ``IndexError``.

    NOTE(review): steps 1 and 2 cancel out, so the mRNA equals ``seq`` with
    T replaced by U, and no strand reversal is applied -- confirm this matches
    the intended strand convention.
    """
    comp = complement(seq)
    mRNA = ''.join(complementRNA[base] for base in comp)

    # Stop at len - 2 so that only full three-base codons are produced.
    codons = [mRNA[i:i + 3] for i in range(0, len(mRNA) - 2, 3)]
    return '-'.join(gencode[codon] for codon in codons)

def prot_cod(seq):
    """Transcribe a DNA strand directly and translate the result into a protein.

    Each base of ``seq`` is mapped through ``complementRNA`` to build the
    mRNA, which is then read in consecutive, non-overlapping codons starting
    at position 0 and translated with ``gencode``.

    Returns the translated residues joined with ``'-'`` (stop codons appear
    as ``'STOP'``; translation does not halt at them). An empty input yields
    ``''``.

    Trailing bases that do not form a complete codon are ignored; previously
    any length not divisible by three raised ``IndexError``.

    NOTE(review): the mRNA is the RNA complement of ``seq``, without strand
    reversal -- confirm this matches the intended strand convention.
    """
    mRNA = ''.join(complementRNA[base] for base in seq)

    # Stop at len - 2 so that only full three-base codons are produced.
    codons = [mRNA[i:i + 3] for i in range(0, len(mRNA) - 2, 3)]
    return '-'.join(gencode[codon] for codon in codons)
    
def BLAST(seq, seq1):
    """Print whether ``seq1`` occurs in ``seq`` or in the complement of ``seq``.

    ``seq`` is the sequence to search and ``seq1`` the (shorter) query, which
    is converted with ``str`` before searching. The plain sequence is checked
    first, then its base-by-base complement. Exactly one message is printed
    and the function returns ``None``.
    """
    # Built before any check, so an unsupported base in `seq` raises KeyError
    # no matter which branch would have matched.
    partner_strand = ''.join(complementDNA[base] for base in seq)
    query = str(seq1)

    if query in seq:
        message = f'{query} appears in the requested sequence'
    elif query in partner_strand:
        message = f'{query} appears in the sequence complementary to the requested one'
    else:
        message = f'{query} does not appear in any sequence'
    return print(message)

def gene_size(gene, primerfw, primerrv):
    """Print the length, in bp, of the amplicon delimited by two primers.

    The amplicon starts at the first occurrence of ``primerfw`` in ``gene``
    and ends with the first occurrence, after that primer, of the
    base-by-base complement of ``primerrv`` (both primers included).

    Prints a diagnostic instead of a size when either site is missing;
    previously a missing forward primer raised ``UnboundLocalError`` and a
    missing reverse site produced a meaningless length. Always returns
    ``None``.
    """
    start = gene.find(primerfw)
    if start == -1:
        return print('Forward primer not found')

    rv_site = complement(primerrv)
    # Search only downstream of the forward primer, as the amplicon must end after it.
    end = gene.find(rv_site, start + len(primerfw))
    if end == -1:
        return print('Reverse primer binding site not found downstream of the forward primer')

    size = end + len(rv_site) - start
    return print(f'The requested amplicon has {size} bp')

def amplicon(gene, primerfw, primerrv):
    """Return the amplicon of ``gene`` delimited by the two primers.

    The amplicon starts at the first occurrence of ``primerfw`` and ends with
    the first occurrence, after that primer, of the base-by-base complement
    of ``primerrv`` (both primers included).

    Returns the amplicon string, or ``None`` after printing a diagnostic when
    the forward primer is absent, the reverse primer contains a base missing
    from the complement table, or the reverse site is not found downstream.
    Previously a missing forward primer returned ``None`` silently and a
    missing reverse site returned a fabricated sequence.
    """
    start = gene.find(primerfw)
    if start == -1:
        print('Foward primer not found')
        return None

    try:
        rv_site = complement(primerrv)
    except KeyError as bad_base:
        # The original bare `except` also swallowed this case; report it explicitly.
        print(f'Reverse primer contains an unsupported base: {bad_base}')
        return None

    # Search only downstream of the forward primer, as the amplicon must end after it.
    end = gene.find(rv_site, start + len(primerfw))
    if end == -1:
        print('Reverse primer binding site not found downstream of the forward primer')
        return None

    return gene[start:end + len(rv_site)]
    
def reverse(x):
    """Return ``x`` with its elements in reverse order (works on any sliceable sequence)."""
    backwards = slice(None, None, -1)
    return x[backwards]

def mass(seq):
    """Return the sum of the ``aa_masses`` entries for every residue in ``seq``.

    ``seq`` is iterated character by character; a character that is not a key
    of ``aa_masses`` raises ``KeyError``. The result is the plain sum of the
    table values -- nothing is added for the chain termini. An empty sequence
    gives ``0``.
    """
    return sum([aa_masses[residue] for residue in seq])

## Gene analyser functions' test

In [None]:
# geneddit is also imported in the first cell; here it provides ge.amplicon below.
import geneddit as ge
# readFASTA returns (header, sequence); [1] keeps only the sequence string.
gene = readFASTA('GLO1_flanking.fsa')[1]
gene2 = readFASTA('VPS13_flanking.fsa')[1]

#print(gene)
# Primer pair for GLO1 (strand orientation as annotated on each line):
primerfw = 'AAGGGGCTTTACGATGGAGT' #5' - 3'
primerrv = 'TCAGTTCCTAGGTCGTTCCT' #3' - 5'

# Primer pair for VPS13:
primerfw1 = 'CGATCAGGCGAAAATAGC'
primerrv1 = 'TACGCGGTGTTTTTTAGTTATA'

# Report, for each primer, whether it is found in the gene or in its complement.
BLAST(gene, primerrv)
BLAST(gene, primerfw)
BLAST(gene2, primerfw1)
BLAST(gene2, primerrv1)
# Print the amplicon length for each gene / primer pair.
gene_size(gene, primerfw, primerrv)
gene_size(gene2, primerfw1, primerrv1)
# NOTE(review): this assignment rebinds the name `amplicon`, hiding the
# amplicon() function defined earlier in this notebook; a later call to
# amplicon(...) would fail because the name then refers to this result.
amplicon = ge.amplicon(gene, primerfw, primerrv)
print(len(amplicon))

#print(prot_noncod(gene))


TCAGTTCCTAGGTCGTTCCT appears in the sequence complementary to the requested one
AAGGGGCTTTACGATGGAGT appears in the requested sequence
CGATCAGGCGAAAATAGC appears in the requested sequence
TACGCGGTGTTTTTTAGTTATA appears in the sequence complementary to the requested one
The requested amplicon has 336 bp
The requested amplicon has 489 bp
336


## Altering a protein sequence for structure homology modeling 

In [None]:
HMGcoa = 'GAMASSVLVTQEPEIELPREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETLIETHERGVSIRRQLLSKKLSEPSSLQYLPYRDYNYSLVMGACCENVIGYMPIPVGVAGPLCLDEKEFQVPMATTEGCLVASTNRGCRAIGLGGGASSRVLADGMTRGPVVRLPRACDSAEVKAWLETSEGFAVIKEAFDSTSRFARLQKLHTSIAGRNLYIRFQSRSGDAMGMNMISKGTEKALSKLHEYFPEMQILAVSGNYCTDKKPAAINWIEGRGKSVVCEAVIPAKVVREVLKTTTEAMIEVNINKNLVGSAMAGSIGGYNAHAANIVTAIYIACGQDAAQNVGSSNCITLMEASGPTNEDLYISCTMPSIEIGTVGGGTNLLPQQACLQMLGVQGACKDNPGENARQLARIVCGTVMAGELSLMAALAAGHLVKSHMIHNRSKINLQDLQGACTKKTA'
FIRST_RESIDUE = 422  # residue number assigned to HMGcoa[0]

# One row per residue, labelled by residue number (422..888) so that the
# substitutions below can be addressed by their position in the protein.
aminoacids = list(HMGcoa)
index = range(FIRST_RESIDUE, FIRST_RESIDUE + len(aminoacids))
df = pd.DataFrame(data=aminoacids, index=index, columns=['aminoacids'])

# Point mutations to introduce: residue number -> replacement residue.
substitutions = {590: 'A', 692: 'A', 735: 'D', 752: 'E'}
for residue_number, replacement in substitutions.items():
    df.loc[residue_number, 'aminoacids'] = replacement

mutated_aa = df['aminoacids'].tolist()
mutated_HMGcoa = ''.join(mutated_aa)
print(HMGcoa, mutated_HMGcoa, sep='\n')

# Sanity check: list every position where the two sequences differ.
for offset, (before, after) in enumerate(zip(HMGcoa, mutated_HMGcoa)):
    if before != after:
        print(offset + FIRST_RESIDUE, before, '->', after)

# Residue composition before and after the mutations (keys in first-seen order).
count = dict(Counter(HMGcoa))
print(count)
count2 = dict(Counter(mutated_HMGcoa))
print(count2)

GAMASSVLVTQEPEIELPREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETLIETHERGVSIRRQLLSKKLSEPSSLQYLPYRDYNYSLVMGACCENVIGYMPIPVGVAGPLCLDEKEFQVPMATTEGCLVASTNRGCRAIGLGGGASSRVLADGMTRGPVVRLPRACDSAEVKAWLETSEGFAVIKEAFDSTSRFARLQKLHTSIAGRNLYIRFQSRSGDAMGMNMISKGTEKALSKLHEYFPEMQILAVSGNYCTDKKPAAINWIEGRGKSVVCEAVIPAKVVREVLKTTTEAMIEVNINKNLVGSAMAGSIGGYNAHAANIVTAIYIACGQDAAQNVGSSNCITLMEASGPTNEDLYISCTMPSIEIGTVGGGTNLLPQQACLQMLGVQGACKDNPGENARQLARIVCGTVMAGELSLMAALAAGHLVKSHMIHNRSKINLQDLQGACTKKTA
GAMASSVLVTQEPEIELPREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETLIETHERGVSIRRQLLSKKLSEPSSLQYLPYRDYNYSLVMGACCENVIGYMPIPVGVAGPLCLDEKEFQVPMATTEGCLVASTNRGCRAIGLGGGASSRVLADGMTAGPVVRLPRACDSAEVKAWLETSEGFAVIKEAFDSTSRFARLQKLHTSIAGRNLYIRFQSRSGDAMGMNMISKGTEKALSKLHEYFPEMQILAVSGNYCTDKAPAAINWIEGRGKSVVCEAVIPAKVVREVLKTTTEAMIEVNINDNLVGSAMAGSIGGYNAEAANIVTAIYIACGQDAAQNVGSSNCITLMEASGPTNEDLYISCTMPSIEIGTVGGGTNLLPQQACLQMLGVQGACKDNPGENARQLARIVCGTVMAGELSLMAALAAGHLVKSHMIHNRSKINLQDLQGACTKKTA
590 R -> A
692 K -> A
735 K -> D
752 H -> E
{'G': 42, 'A': 50, '

In [None]:
def _read_protein_sequence(path):
    """Return the sequence stored in a FASTA file as one string.

    Lines starting with '>' (headers) are skipped; every other line is
    stripped of surrounding whitespace and concatenated.
    """
    # The context manager closes the file even if reading fails; the previous
    # code left both file handles open.
    with open(path) as handle:
        return ''.join(line.strip() for line in handle if not line.startswith('>'))


# P19838: print the sequence, then the summed residue mass.
seq_NFKBI = _read_protein_sequence('P19838.fasta.txt')
print(seq_NFKBI)
mass_NFKBI = mass(seq_NFKBI)
print(mass_NFKBI, 'g/mol')

# Q04206: only the summed residue mass is printed.
seq_RELA = _read_protein_sequence('Q04206.fasta.txt')
mass_RELA = mass(seq_RELA)
print(mass_RELA, 'g/mol')

MAEDDPYLGRPEQMFHLDPSLTHTIFNPEVFQPQMALPTDGPYLQILEQPKQRGFRFRYVCEGPSHGGLPGASSEKNKKSYPQVKICNYVGPAKVIVQLVTNGKNIHLHAHSLVGKHCEDGICTVTAGPKDMVVGFANLGILHVTKKKVFETLEARMTEACIRGYNPGLLVHPDLAYLQAEGGGDRQLGDREKELIRQAALQQTKEMDLSVVRLMFTAFLPDSTGSFTRRLEPVVSDAIYDSKAPNASNLKIVRMDRTAGCVTGGEEIYLLCDKVQKDDIQIRFYEEEENGGVWEGFGDFSPTDVHRQFAIVFKTPKYKDINITKPASVFVQLRRKSDLETSEPKPFLYYPEIKDKEEVQRKRQKLMPNFSDSFGGGSGAGAGGGGMFGSGGGGGGTGSTGPGYSFPHYGFPTYGGITFHPGTTKSNAGMKHGTMDTESKKDPEGCDKSDDKNTVNLFGKVIETTEQDQEPSEATVGNGEVTLTYATGTKEESAGVQDNLFLEKAMQLAKRHANALFDYAVTGDVKMLLAVQRHLTAVQDENGDSVLHLAIIHLHSQLVRDLLEVTSGLISDDIINMRNDLYQTPLHLAVITKQEDVVEDLLRAGADLSLLDRLGNSVLHLAAKEGHDKVLSILLKHKKAALLLDHPNGDGLNAIHLAMMSNSLPCLLLLVAAGADVNAQEQKSGRTALHLAVEHDNISLAGCLLLEGDAHVDSTTYDGTTPLHIAAGRGSTRLAALLKAAGADPLVENFEPLYDLDDSWENAGEDEGVVPGTTPLDMATSWQVFDILNGKPYEPEFTSDDLLAQGDMKQLAEDVKLQLYKLLEIPDPDKNWATLAQKLGLGILNNAFRLSPAPSKTLMDNYEVSGGTVRELVEALRQMGYTEAIEVIQAASSPVKTTSQAHSLPLSPASTRQQIDELRDSDSVCDSGVETSFRKLSFTESLTSGASLLTLNKMPHDYGQEGPLEGKI
105338.04820000062 g/mol
60201.