Check whether the protein sequences Zach sent in the Excel file "initial_enzymes_1" are in the training set. If so, remove them from the training set.

This initial attempt uses two measures: finding the longest common substring (a run of consecutive amino acids) shared by two sequences, and comparing the amino acids at each position of the two sequences.

In [6]:
%matplotlib inline
import pickle
import random
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from Bio import pairwise2 as pw2

In [7]:
# Load the training-set protein sequences: the 'Sequence' column of the
# UniProt signal-peptide spreadsheet, taken as an array via `.values`.
df = pd.read_excel('../data/uniprot_reviewed_signalpeptides.xlsx')
dataset = df['Sequence'].values   # append [:10] to work on a small subset while debugging

# Load the query protein sequences from Zach's Excel workbook.
# sheet_name=1 selects the second sheet (pandas sheet positions are zero-indexed).
# NOTE: `df` is rebound here, so the training DataFrame is no longer reachable by name.
df = pd.read_excel('../data/initial_enzymes_1.xlsx', sheet_name=1)
prot_excel = df['Protein -met -sigp'].values   # append [:10] to work on a small subset while debugging

In [8]:
def lcs(S, T):
    """Return the set of longest common substrings of two strings.

    A substring here is a run of consecutive characters. When several
    distinct substrings tie for the maximum length, all of them are
    returned.

    Args:
        S: First sequence (indexable, sliceable, e.g. ``str``).
        T: Second sequence.

    Returns:
        A ``set`` of slices of ``S``, all of the same (maximal) length.
        The set is empty when the inputs share no character, including
        when either input is empty.
    """
    n = len(T)
    longest = 0
    lcs_set = set()
    # Only the previous row of the dynamic-programming table is ever read,
    # so keep two rows instead of the full (len(S)+1) x (len(T)+1) matrix.
    # prev[j + 1] is the length of the common suffix of S[:i] and T[:j + 1].
    prev = [0] * (n + 1)
    for i, s_char in enumerate(S):
        curr = [0] * (n + 1)
        for j, t_char in enumerate(T):
            if s_char != t_char:
                continue
            c = prev[j] + 1
            curr[j + 1] = c
            if c > longest:
                # Strictly longer match: discard the shorter candidates.
                longest = c
                lcs_set = {S[i - c + 1:i + 1]}
            elif c == longest:
                lcs_set.add(S[i - c + 1:i + 1])
        prev = curr

    return lcs_set

In [9]:
def compare(a, b):
    """Count the positions at which two sequences hold the same amino acid.

    The sequences are walked in lockstep from index 0, so positions past
    the end of the shorter sequence are not considered.
    """
    return sum(1 for left, right in zip(a, b) if left == right)

In [10]:
# Print every (query, training) pair whose longest common substring is
# longer than 20 residues, together with the shared substring(s).
for prot in prot_excel:
    for dat in dataset:
        sim = lcs(prot, dat)
        # lcs() returns an empty set when the two sequences share no
        # residue; default=0 keeps max() from raising ValueError on it.
        max_length = max(map(len, sim), default=0)
        if max_length > 20:
            print(prot)
            print("-----")
            print(dat)
            print("-----")
            print(sim)
            print("**********************************************")

In [11]:
# Inspect which container type holds the training sequences.
type(dataset)

numpy.ndarray

In [12]:
# 50% similarity
#
# Collect every training sequence for which at least 50% of its positions
# carry the same amino acid as a query sequence. compare() matches position
# by position from index 0 (no alignment, no gaps), and the match count is
# normalised by the length of the training sequence.

sim_50 = []

for prot in prot_excel:
    for dat in dataset:
        if not dat:
            # An empty training sequence has no positions to normalise by;
            # skipping it avoids a ZeroDivisionError.
            continue
        sim = compare(prot, dat) / len(dat)
        if sim >= 0.5:
            print(prot)
            print("-----")
            print(dat)
            print("-----")
            print(sim)
            print("**********************************************")
            sim_50.append(dat)

sim_50 = np.asarray(sim_50)

In [13]:
# 75% similarity
#
# Collect every training sequence for which at least 75% of its positions
# carry the same amino acid as a query sequence. compare() matches position
# by position from index 0 (no alignment, no gaps), and the match count is
# normalised by the length of the training sequence.

sim_75 = []

for prot in prot_excel:
    for dat in dataset:
        if not dat:
            # An empty training sequence has no positions to normalise by;
            # skipping it avoids a ZeroDivisionError.
            continue
        sim = compare(prot, dat) / len(dat)
        if sim >= 0.75:
            print(prot)
            print("-----")
            print(dat)
            print("-----")
            print(sim)
            print("**********************************************")
            sim_75.append(dat)

sim_75 = np.asarray(sim_75)

In [14]:
# 90% similarity
#
# Collect every training sequence for which at least 90% of its positions
# carry the same amino acid as a query sequence. compare() matches position
# by position from index 0 (no alignment, no gaps), and the match count is
# normalised by the length of the training sequence.

sim_90 = []

for prot in prot_excel:
    for dat in dataset:
        if not dat:
            # An empty training sequence has no positions to normalise by;
            # skipping it avoids a ZeroDivisionError.
            continue
        sim = compare(prot, dat) / len(dat)
        if sim >= 0.9:
            print(prot)
            print("-----")
            print(dat)
            print("-----")
            print(sim)
            print("**********************************************")
            sim_90.append(dat)

sim_90 = np.asarray(sim_90)

In [15]:
# Length of the training sequence at index 2.
len(dataset[2])

2351