In [1]:
# import libraries
import re
import pandas as pd
import csv
import numpy as np

In [2]:
# load waltz data: read the file 'Waltz_Data' (relative path, no extension) with
# pandas' default CSV parsing; the cells below use its 'Sequence' column
waltz_data = pd.read_csv('Waltz_Data')

In [4]:
# remove any sequences not of length 6, so only hexapeptides remain
# .str.len() yields NaN for a missing entry, so that row is dropped by the
# comparison instead of raising TypeError as map(len) does on a float NaN
waltz_data = waltz_data[waltz_data['Sequence'].str.len() == 6]

In [5]:
# count the positions at which two peptide sequences carry the same amino acid
def similarity(string1, string2):
    """Return the number of indices where string1 and string2 hold equal items.

    Only the first min(len(string1), len(string2)) positions are compared, so
    any trailing part of the longer argument is ignored.
    """
    shared_length = min(len(string1), len(string2))
    return sum(1 for pos in range(shared_length) if string1[pos] == string2[pos])

In [6]:
# set the threshold maximum similarity: the largest number of matching
# positions a new sequence may share with any already-accepted sequence
threshold = 3

# create an empty list, to which 'non-similar' sequences will be added
non_similar_list = []

# greedy pass in row order: a sequence is accepted only if it is not too
# similar to anything accepted before it, so the result depends on row order
for sequence in waltz_data['Sequence']:
    # any() stops at the first accepted sequence that exceeds the threshold,
    # rather than scoring the candidate against the whole list and taking max
    too_similar = any(similarity(sequence, kept) > threshold for kept in non_similar_list)
    # an empty non_similar_list gives too_similar == False, so the first
    # sequence is always accepted
    if not too_similar:
        non_similar_list.append(sequence)

In [7]:
# filter out similar hexapeptides
# isin() alone would also keep every repeated row of an accepted sequence,
# even though the loop rejected the repeat as too similar (a sequence matches
# itself at all positions); keep only the first occurrence of each sequence
waltz_data = (
    waltz_data[waltz_data['Sequence'].isin(non_similar_list)]
    .drop_duplicates(subset='Sequence', keep='first')
)

# save as comma-separated text without the index column
# (note: the output file name carries no .csv extension)
waltz_data.to_csv('Waltz_Data_Filtered', sep = ',', index = False)