Check whether the protein sequences Zach sent in the Excel file "initial_enzymes_1" are present in the training set. If they are, remove them from the training set.

First, align each pair of sequences using Biopython's `pairwise2` module, then compute the percentage similarity (number of matches divided by the length of the shorter sequence).

Reference: https://www.biostars.org/p/208540/

In [1]:
%matplotlib inline
import pickle
import random
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

import csv

from Bio import pairwise2 as pw2

In [2]:
# Training-set protein sequences: the 'Prot Sequences' column of the dataset workbook
dataset = pd.read_excel('../dataset.xls')['Prot Sequences'].values

# Query protein sequences: the 'Protein -met -sigp' column of Zach's workbook
# (`df` stays bound to this second workbook)
df = pd.read_excel('initial_enzymes_1.xlsx')
prot_excel = df['Protein -met -sigp'].values

In [3]:
# Number of protein sequences loaded from the dataset spreadsheet
len(dataset)

28723

In [None]:
def write_to_csv(filename, query_prot, dataset_prot, sim):
    """Append one comparison record to `filename`.

    A record is three consecutive lines: the query sequence, the dataset
    sequence, and their similarity, each converted with `str()`.
    """
    with open(filename, 'a') as out:
        out.writelines(str(field) + '\n' for field in (query_prot, dataset_prot, sim))

In [None]:
# Create csv files for each query protein in Zach's excel with the similarity to each protein in dataset--after alignment

# Half-open slice of `prot_excel` processed by this run. The 1-based number used
# in the file name is derived from the slice start, so the two cannot drift apart.
query_start, query_stop = 20, 21

# NOTE(review): this cell writes "Query protein <n>.csv", while the cell that
# loads the results opens "Query+protein+<n>.csv" -- confirm the files are
# renamed (e.g. on download) between the two steps.
for i, prot in enumerate(prot_excel[query_start:query_stop], start=query_start + 1):
    filename = "Query protein " + str(i) + ".csv"
    for dat in dataset:
        # similarity is normalised by the shorter of the two sequences
        seq_length = min(len(prot), len(dat))
        if seq_length == 0:
            # an empty sequence has nothing to match; record 0% instead of
            # failing, so the csv keeps one record per dataset sequence
            percent_match = 0.0
        else:
            # globalxx scores 1 per identical position with no gap penalties, so
            # the score is the match count; score_only=True returns just that
            # score without building the alignments
            matches = pw2.align.globalxx(prot, dat, score_only=True)
            percent_match = (matches / seq_length) * 100
        write_to_csv(filename, prot, dat, percent_match)

In [4]:
sim = [] # sim[k] holds the similarities read for the k-th record, one value per query protein file

for query_no in range(1, len(prot_excel) + 1): # one csv per query protein
    filename = "Query+protein+" + str(query_no) + ".csv"
    with open(filename, "r") as handle:
        # every record spans three rows; the similarity is on the third one
        scores = [
            float(row[0])
            for row_no, row in enumerate(csv.reader(handle, delimiter="\t"), start=1)
            if row_no % 3 == 0
        ]
    if query_no == 1: # the first file creates one list per record
        sim.extend([score] for score in scores)
    else: # later files append to the existing lists, in record order
        for position, score in enumerate(scores):
            sim[position].append(score)

In [5]:
# Highest similarity of each dataset sequence to any query protein, computed once
# per sequence. Indexing `sim` by position keeps the IndexError if `sim` has
# fewer entries than `dataset`.
best_sim = [max(sim[i]) for i in range(len(dataset))]

indices_50 = [i for i, best in enumerate(best_sim) if best >= 50] # stores indices of protein sequences in dataset at least 50% similar
indices_75 = [i for i, best in enumerate(best_sim) if best >= 75] # stores indices of protein sequences in dataset at least 75% similar
indices_90 = [i for i, best in enumerate(best_sim) if best >= 90] # stores indices of protein sequences in dataset at least 90% similar
indices_95 = [i for i, best in enumerate(best_sim) if best >= 95] # stores indices of protein sequences in dataset at least 95% similar
indices_99 = [i for i, best in enumerate(best_sim) if best >= 99] # stores indices of protein sequences in dataset at least 99% similar

dataset_50 = np.delete(dataset, indices_50) # remove protein sequences at least 50% similar
dataset_75 = np.delete(dataset, indices_75) # remove protein sequences at least 75% similar
dataset_90 = np.delete(dataset, indices_90) # remove protein sequences at least 90% similar
dataset_95 = np.delete(dataset, indices_95) # remove protein sequences at least 95% similar
dataset_99 = np.delete(dataset, indices_99) # remove protein sequences at least 99% similar

In [7]:
# Number of sequences left after filtering at the 75%, 90%, 95% and 99% cut-offs
for filtered in (dataset_75, dataset_90, dataset_95, dataset_99):
    print(len(filtered))

22120
27687
28218
28546


In [None]:
# write new datasets out to csv

def write_prot_to_csv(filename, prot):
    """Append `prot`, converted with `str()`, to `filename` as one single-column csv row."""
    with open(filename, 'a', newline='') as out:
        csv.writer(out).writerow([str(prot)])
            
# The loop variable is deliberately not called `dataset`: reusing that name
# would rebind the notebook-level array of all sequences to the last filtered
# array once the loop finishes.
# Files are numbered in list order: dataset_1.csv is the 50% cut-off, dataset_5.csv the 99% one.
for i, filtered in enumerate([dataset_50, dataset_75, dataset_90, dataset_95, dataset_99]):
    filename = "dataset_" + str(i + 1) + ".csv"
    for prot in filtered:
        # rows are appended, so re-running this cell adds duplicate rows to existing files
        write_prot_to_csv(filename, prot)

In [None]:
# Show the length of whatever `dataset` is bound to at this point in the notebook
len(dataset)

In [None]:
# function that writes out to csv - similarity
# take out of instance
# c5.xlarge
# top for multiple notebooks, new file for each query (41 files total)