# Import dependencies

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from Bio import SeqIO



# Import data & convert it to dataframe

In [10]:
# Load the enzyme table and drop the 'EC number' and 'Entry' columns in one call.
# NOTE(review): the CSV path is an empty string in this export - fill it in before running.
enzyme_df = pd.read_csv('', header=0, sep=',')
enzyme_df = enzyme_df.drop(columns=['EC number', 'Entry'])

# Read the non-enzyme sequences from a FASTA file, keeping only the residue strings.
# NOTE(review): the FASTA path is an empty string in this export as well.
with open("") as file:
    recs = SeqIO.parse(file, 'fasta')
    # The record id used to be collected only to be dropped again, and that
    # drop('id') raised KeyError when the file held no records. Collecting just
    # the sequences gives a 'sequence' column in every case.
    data = [str(rec.seq) for rec in recs]
# The parser is exhausted at this point, so the frame can be built after the file is closed.
non_enzyme_df = pd.DataFrame({'sequence': data})

#### Look at dataframes

In [None]:
# Preview the first five rows of the enzyme table.
enzyme_df.head(n=5)

In [None]:
# Preview the first five rows of the non-enzyme table.
non_enzyme_df.head(n=5)

#### Check the data for errors

In [None]:
# True only when both sequence columns contain at least one entry.
# The list is built first, so both columns are always looked up.
all([len(enzyme_df['Sequence']) > 0, len(non_enzyme_df['sequence']) > 0])

In [None]:
# Number of rows in the enzyme table.
enzyme_df.shape[0]

In [None]:
# Number of rows in the non-enzyme table.
non_enzyme_df.shape[0]

In [89]:
# Draw a seeded random subset of 1000 enzyme rows without replacement,
# then report how many rows were kept.
enzyme_df_downsampled = resample(
    enzyme_df,
    n_samples=1000,
    replace=False,
    random_state=42,
)
print(enzyme_df_downsampled.shape[0])

5000


In [91]:
# Down-sample the non-enzyme rows to the same size as the enzyme subset so the
# two classes stay balanced. The previous literal `000` evaluates to 0, i.e. it
# requested no samples at all.
non_enzyme_df_downsampled = resample(
    non_enzyme_df,
    replace=False,
    n_samples=len(enzyme_df_downsampled),
    random_state=42,
)
print(len(non_enzyme_df_downsampled))

5000


# Format data

In [92]:
# One-letter residue codes, in the order used for the one-hot columns
# ('X' is listed last, after 'Y').
amino_acids = list('ACDEFGHIKLMNOPQRSTUVWYX')
# scikit-learn encoder configured with that fixed category list.
encoder = OneHotEncoder(categories=[amino_acids])
def one_hot_encode_sequence(sequence, alphabet=None):
    """One-hot encode a residue string, one row per position.

    Args:
        sequence: Iterable of single-character residue codes (normally a str).
        alphabet: Ordered residue codes that define the columns. Defaults to
            the module-level ``amino_acids`` list.

    Returns:
        A list with ``len(sequence)`` rows. Each row is a list of
        ``len(alphabet)`` booleans, True only in the column whose code equals
        the residue at that position. A residue that is not in ``alphabet``
        yields an all-False row.
    """
    if alphabet is None:
        alphabet = amino_acids
    # A direct comparison replaces the get_dummies/iterrows round-trip and the
    # per-cell `acid not in list(sequence)` scan, which made the cost quadratic
    # in the sequence length. The shape checks that used to follow (row count
    # and total cell count) now hold by construction.
    return [[residue == acid for acid in alphabet] for residue in sequence]


all_data = []

In [93]:
# Encode every down-sampled enzyme sequence and store it with the positive label (1).
# Feeding a generator to extend() avoids rebinding the builtin name `bin` at
# notebook scope, which the previous loop did.
all_data.extend(
    {'label': 1, 'sequence': one_hot_encode_sequence(seq)}
    for seq in enzyme_df_downsampled['Sequence']
)

KeyboardInterrupt: 

In [86]:
# Encode every down-sampled non-enzyme sequence and store it with the negative label (0).
# Feeding a generator to extend() avoids rebinding the builtin name `bin` at
# notebook scope, which the previous loop did.
all_data.extend(
    {'label': 0, 'sequence': one_hot_encode_sequence(seq)}
    for seq in non_enzyme_df_downsampled['sequence']
)

#### Make test and train set

In [87]:
# Assemble the labelled, encoded records into one frame and preview the first rows.
df = pd.DataFrame(data=all_data)
df.head(n=5)
#df.to_csv('./test.csv')


Unnamed: 0,label,sequence
0,1,"[[False, False, False, False, False, False, Fa..."
1,1,"[[False, False, False, False, False, False, Fa..."
2,1,"[[False, False, False, False, False, False, Fa..."
3,1,"[[False, False, False, False, False, False, Fa..."
4,1,"[[False, False, False, False, False, False, Fa..."
