# Scratch for HW 12.1

[RefSeq human proteins](https://www.ncbi.nlm.nih.gov/projects/genome/guide/human/index.shtml)

In [19]:
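# Note: SeqIO.index() keeps only record offsets in memory and re-reads each
# record from disk on access, which makes it slow.

# The key is to avoid loading every record into memory at once.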
import sys
import gzip
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [20]:
# Input selection.
# The sequence file is fixed here. The previous `len(sys.argv) < 2` guard was
# dropped: the sys.argv[1] assignment it protected was commented out, so the
# check validated a value that was never read and could only abort the
# session through sys.exit(1).
seq_file = 'human.protein.fasta'

In [24]:
def aa_count(filename, filetype):
    """Tally amino-acid occurrences across every record in a sequence file.

    Records are consumed one at a time from ``SeqIO.parse`` and only the
    running totals are kept.

    Args:
        filename: Path of the sequence file to read.
        filetype: Format name handed to ``SeqIO.parse`` (this notebook calls
            it with "fasta" and "swiss").

    Returns:
        dict mapping each residue letter reported by
        ``ProteinAnalysis.count_amino_acids`` to its total over all records;
        an empty dict if the file yields no records.
    """
    res_dict = {}
    # The context manager closes the handle even if parsing fails part-way;
    # a bare open() here would leave the file open after the call returns.
    with open(filename) as handle:
        for record in SeqIO.parse(handle, filetype):
            counts = ProteinAnalysis(str(record.seq)).count_amino_acids()
            for residue, n in counts.items():
                res_dict[residue] = res_dict.get(residue, 0) + n
    return res_dict

In [28]:
# Residue totals for the RefSeq human protein FASTA file.
refseq = aa_count(filename='human.protein.fasta', filetype="fasta")

In [29]:
# Residue totals for the Swiss-Prot human flat file.
swissprot = aa_count(filename='uniprot_sprot_human.dat', filetype="swiss")

In [31]:
# Display the accumulated RefSeq residue totals.
refseq

{'A': 4483022,
 'C': 1415928,
 'D': 3126834,
 'E': 4777624,
 'F': 2266714,
 'G': 4165441,
 'H': 1737711,
 'I': 2801647,
 'K': 3844605,
 'L': 6425942,
 'M': 1382974,
 'N': 2366382,
 'P': 4177524,
 'Q': 3240476,
 'R': 3691190,
 'S': 5604342,
 'T': 3527664,
 'V': 3844408,
 'W': 771259,
 'Y': 1658945}

In [30]:
# Most frequent residue: take the (residue, count) pair with the largest
# count and keep only the residue letter.
max(refseq.items(), key=lambda pair: pair[1])[0]

'L'

In [32]:
# Least frequent residue: take the (residue, count) pair with the smallest
# count and keep only the residue letter.
min(refseq.items(), key=lambda pair: pair[1])[0]

'W'

In [21]:
#%%timeit
# Indexed access: each record is fetched through the SeqIO index by its key
# and its residue counts are folded into the running totals - slow (13 sec).
index = SeqIO.index(seq_file, "fasta")

res_dict = {}
for record_id in index:
    sequence = str(index[record_id].seq)
    per_record = ProteinAnalysis(sequence).count_amino_acids()
    for residue, count in per_record.items():
        res_dict[residue] = res_dict.get(residue, 0) + count

res_dict

12.9 s ± 143 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
#%%timeit
# Dictionary approach: load every record into memory with SeqIO.to_dict, then
# total the residue counts - a bit faster than the index (7 sec).
# The file is read inside a `with` block so the handle is closed as soon as
# the records are loaded; a bare open() would leave it open.
with open(seq_file) as f:
    seq_dict = SeqIO.to_dict(SeqIO.parse(f, 'fasta'))

res_dict = {}
for record in seq_dict.values():
    per_record = ProteinAnalysis(str(record.seq)).count_amino_acids()
    for residue, count in per_record.items():
        res_dict[residue] = res_dict.get(residue, 0) + count

res_dict

7.01 s ± 295 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
#%%timeit
# Repeat the count for UniProt/Swiss-Prot.

# Using the dictionary method - 36 sec.
seq_file = 'uniprot_sprot_human.dat'

# The file is read inside a `with` block so the handle is closed as soon as
# the records are loaded; a bare open() would leave it open.
with open(seq_file) as f:
    seq_dict = SeqIO.to_dict(SeqIO.parse(f, 'swiss'))

res_dict = {}
for record in seq_dict.values():
    per_record = ProteinAnalysis(str(record.seq)).count_amino_acids()
    for residue, count in per_record.items():
        res_dict[residue] = res_dict.get(residue, 0) + count

res_dict

36.2 s ± 2.09 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%%timeit
# use index - slow (1 min)
seq_file  = 'uniprot_sprot_human.dat'
index = SeqIO.index(seq_file, "swiss")

res_dict = {}
for i in index:
    temp_dict = ProteinAnalysis(str(index[i].seq)).count_amino_acids()

    for key, val in temp_dict.items():
        if key in res_dict:
            res_dict[key] = res_dict[key] + val
        else:
            res_dict[key] = val

res_dict

1min 4s ± 703 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
# Pull one record out of seq_dict by its full identifier and keep its
# sequence as a plain string.
# NOTE(review): this "gi|...|ref|..." key looks like a RefSeq FASTA id, so it
# presumably needs seq_dict built from the FASTA file rather than the
# Swiss-Prot one - confirm which cell was run last.
seq = str(seq_dict['gi|767999332|ref|XP_011524511.1|'].seq)

In [30]:
# Residue counts for the single sequence held in `seq`; the bare `d` on the
# last line displays the resulting dict.
a = ProteinAnalysis(seq)
d = a.count_amino_acids()
d

{'A': 153,
 'C': 44,
 'D': 75,
 'E': 181,
 'F': 50,
 'G': 111,
 'H': 50,
 'I': 94,
 'K': 138,
 'L': 175,
 'M': 50,
 'N': 98,
 'P': 194,
 'Q': 115,
 'R': 73,
 'S': 303,
 'T': 152,
 'V': 113,
 'W': 6,
 'Y': 17}

In [21]:
# FIXME: unfinished scratch cell. `coun` is not defined in any cell visible
# here, so the loop body presumably raises NameError on the first key -
# complete the loop or delete the cell.
res_dict = {}
for i in index:
    coun
    

<dict_keyiterator at 0x107bb3048>