In [3]:
def get_class(amino_acid):
    """Return the side-chain class name for a one-letter amino-acid code.

    The lookup is case-sensitive and checks five explicit groups in order.
    Any value that is not a member of one of them (for example D, E, N, Q,
    a lowercase letter, or an unknown symbol) is reported as 'Acidic'.

    Parameters
    ----------
    amino_acid : str
        Single upper-case residue letter.

    Returns
    -------
    str
        One of 'Aliphatic', 'Hydroxyl', 'Cyclic', 'Aromatic', 'Basic',
        'Acidic'.
    """
    class_members = (
        ('Aliphatic', ('G', 'A', 'V', 'L', 'I')),
        # hydroxyl / selenium / sulfur side chains share one label
        ('Hydroxyl', ('S', 'C', 'T', 'M')),
        ('Cyclic', ('P',)),
        ('Aromatic', ('F', 'Y', 'W')),
        ('Basic', ('H', 'K', 'R')),
    )
    for label, members in class_members:
        if amino_acid in members:
            return label
    # acidic / amide residues, plus the catch-all for everything else
    return 'Acidic'

def levenshtein(s1, s2):
    """Class-aware Levenshtein distance between two residue sequences.

    Insertions and deletions cost 1. A substitution costs 0 when
    get_class() puts both residues in the same class and 1 otherwise, so
    two different sequences can have distance 0.

    Parameters
    ----------
    s1, s2 : str or iterable of str
        Sequences to compare; each is concatenated with ''.join first, so
        a list of characters or a list of string chunks is accepted.

    Returns
    -------
    int
        The edit distance under the cost model above.
    """
    longer, shorter = ''.join(s1), ''.join(s2)
    # Keep the longer sequence on the outer loop; the distance is symmetric.
    if len(longer) < len(shorter):
        longer, shorter = shorter, longer

    # Nothing to align against: every residue of the longer one is an edit.
    if not shorter:
        return len(longer)

    # Only two rows of the dynamic-programming table are kept at a time.
    prev = range(len(shorter) + 1)
    for row_no, a in enumerate(longer, start=1):
        row = [row_no]
        for col, b in enumerate(shorter):
            # cell directly above in the table
            from_above = prev[col + 1] + 1
            # cell to the left in the current row
            from_left = row[col] + 1
            # diagonal cell; free when both residues share a class
            from_diagonal = prev[col] + (get_class(a) != get_class(b))
            row.append(min(from_above, from_left, from_diagonal))
        prev = row

    return prev[-1]

In [4]:
import sys
# Parse proteins.fasta into a dict that maps each header line (without the
# leading '>') to a flat list of that record's residue characters.
fasta = {}
# Reset on every run so a name left in the kernel by an earlier execution
# can never be mistaken for the current record.
active_sequence_name = None
with open('proteins.fasta') as file_one:
    for line in file_one:
        line = line.strip()
        if not line:
            continue  # skip blank lines
        if line.startswith(">"):
            active_sequence_name = line[1:]
            # Records that repeat a header are merged into one entry.
            fasta.setdefault(active_sequence_name, [])
            continue
        if active_sequence_name is None:
            # Fail with a clear message instead of a NameError/KeyError.
            raise ValueError(
                "proteins.fasta: sequence data found before the first '>' header"
            )
        fasta[active_sequence_name].append(line)
# Collapse each record's lines into one list of single characters.
for keys in fasta:
    fasta[keys] = list(''.join(fasta[keys]))

In [5]:
def output_k_homologs(reference, test):
    """Score every sequence in `test` against `reference`.

    Parameters
    ----------
    reference : str or iterable of str
        Query sequence; concatenated with ''.join before comparison.
    test : dict
        Maps a sequence name to its sequence.

    Returns
    -------
    dict
        Maps each key of `test` to levenshtein(reference, test[key]),
        in the iteration order of `test`.
    """
    query = ''.join(reference)
    return {name: levenshtein(query, test[name]) for name in test}

## Step 3.1

In [6]:
# Distance between the human prolactin precursor record and one short
# peptide. The peptide is passed as a one-element list, which levenshtein
# joins back into a single string before comparing.
levenshtein(fasta['NP_001157030.1 prolactin precursor [Homo sapiens]'],
    ['PIDNYLKLLKCRIIHNNNC'])

208

In [7]:
from operator import itemgetter

# Distance from the human prolactin precursor to every record loaded from the
# FASTA file; the reference record itself is part of the comparison set.
LD = output_k_homologs(
    fasta['NP_001157030.1 prolactin precursor [Homo sapiens]'],
    fasta,
)
# Smallest distance first; sorted() is stable, so ties keep dict order.
sorted_LD = sorted(LD.items(), key=itemgetter(1))
sorted_LD

[('NP_001157030.1 prolactin precursor [Homo sapiens]', 0),
 ('XP_011513055.1 PREDICTED: prolactin isoform X1 [Homo sapiens]', 1),
 ('XP_011513056.1 PREDICTED: prolactin isoform X1 [Homo sapiens]', 1),
 ('XP_014991125.1 PREDICTED: prolactin isoform X2 [Macaca mulatta]', 2),
 ('XP_002816520.2 PREDICTED: prolactin [Pongo abelii]', 2),
 ('NP_001040593.1 prolactin precursor [Macaca mulatta]', 2),
 ('XP_005553987.1 PREDICTED: prolactin isoform X2 [Macaca fascicularis]', 3),
 ('XP_011740959.1 PREDICTED: prolactin isoform X2 [Macaca nemestrina]', 3),
 ('XP_018885279.1 PREDICTED: prolactin [Gorilla gorilla gorilla]', 3),
 ('XP_011886824.1 PREDICTED: prolactin isoform X2 [Cercocebus atys]', 3),
 ('XP_009448859.1 PREDICTED: prolactin isoform X2 [Pan troglodytes]', 3),
 ('XP_004043380.1 PREDICTED: prolactin [Gorilla gorilla gorilla]', 3),
 ('XP_014991124.1 PREDICTED: prolactin isoform X1 [Macaca mulatta]', 3),
 ('XP_012364516.1 PREDICTED: prolactin [Nomascus leucogenys]', 3),
 ('XP_016810474.1 PRE

In [8]:
# Show only the first ten (name, distance) pairs of the ascending ranking.
sorted_LD[:10]

[('NP_001157030.1 prolactin precursor [Homo sapiens]', 0),
 ('XP_011513055.1 PREDICTED: prolactin isoform X1 [Homo sapiens]', 1),
 ('XP_011513056.1 PREDICTED: prolactin isoform X1 [Homo sapiens]', 1),
 ('XP_014991125.1 PREDICTED: prolactin isoform X2 [Macaca mulatta]', 2),
 ('XP_002816520.2 PREDICTED: prolactin [Pongo abelii]', 2),
 ('NP_001040593.1 prolactin precursor [Macaca mulatta]', 2),
 ('XP_005553987.1 PREDICTED: prolactin isoform X2 [Macaca fascicularis]', 3),
 ('XP_011740959.1 PREDICTED: prolactin isoform X2 [Macaca nemestrina]', 3),
 ('XP_018885279.1 PREDICTED: prolactin [Gorilla gorilla gorilla]', 3),
 ('XP_011886824.1 PREDICTED: prolactin isoform X2 [Cercocebus atys]', 3)]