# Import dependencies

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from Bio import SeqIO



# Import data & convert it to dataframes

In [5]:
# Enzyme table: read the CSV and keep everything except the
# 'EC number' and 'Entry' columns.
# NOTE(review): both file paths in this cell are empty strings — presumably
# placeholders; point them at the real CSV / FASTA files before running.
enzyme_df = (
    pd.read_csv('', header=0, sep=',')
    .drop('EC number', axis=1)
    .drop('Entry', axis=1)
)

# Non-enzyme table: parse the FASTA file into one record per entry, then keep
# only the sequence text (the record id is collected and dropped again).
with open("") as file:
    recs = SeqIO.parse(file, 'fasta')
    data = [{'id': rec.id, 'sequence': str(rec.seq)} for rec in recs]
    non_enzyme_df = pd.DataFrame(data).drop('id', axis=1)

#### Look at dataframes

In [6]:
# Preview the first rows of the enzyme table.
enzyme_df.head()

Unnamed: 0,Sequence
0,MPAPTQLFFPLVRNCELSRIYGTACYCHHKHLCCSPPYIPQNRLRY...
1,MRLGNAYAYCKPSQNVGLKLDLLRGLPGYVGHATSRINRLENQDNY...
2,MAKLARVVGLVQEEQPSDMTNHPRYSPPPQQPGTPGYAQGQQQTYS...
3,MNKLAVLYAEHIATLQKRTREIIERENLDGVVFHSGQAKRQFLDDM...
4,MEDGPSNNASCFRRLTECFLSPSLTDEKVKAYLSLHPQVLDEFVSE...


In [7]:
# Preview the first rows of the non-enzyme table.
non_enzyme_df.head()

Unnamed: 0,sequence
0,MRKVLKKAALCTFGFSMLFGCASEEDTIVMAPVPVVQNQFEPTTEW...
1,MRYFVIGTMIALAGLLVGGGVGSYFTSSKLLKQFQNIPGSPIVLSA...
2,MICASKITMLGLLVMCTLGGVLGKVDIRQTTANKAFMETMRADGYE...
3,MIICNFMNICMIDPNDVYKWFMEFSLDSYDWVMINNVYSMGLFADG...
4,DCEQHTDCSAASGPVYCCQDSDCCGGVDYICTNYGQCVRHF


#### Check the data for errors

In [8]:
# True only when both sequence columns contain at least one row.
all([len(enzyme_df['Sequence']) > 0, len(non_enzyme_df['sequence']) > 0])

True

In [9]:
# Number of rows in the enzyme table.
len(enzyme_df)

9204

In [10]:
# Number of rows in the non-enzyme table.
len(non_enzyme_df)

39502

In [11]:
# Draw 1000 enzyme rows without replacement; the fixed random_state makes the
# draw repeatable.
enzyme_df_downsampled = resample(
    enzyme_df,
    n_samples=1000,
    replace=False,
    random_state=42,
)
print(enzyme_df_downsampled.shape[0])

1000


In [12]:
# Draw 1000 non-enzyme rows without replacement, using the same seed as the
# enzyme draw, so both classes end up the same size.
non_enzyme_df_downsampled = resample(
    non_enzyme_df,
    n_samples=1000,
    replace=False,
    random_state=42,
)
print(non_enzyme_df_downsampled.shape[0])

1000


# Format data

In [44]:
# The 23 one-letter residue codes recognised by the encoding; their order here
# is the column order produced by one_hot_encode_sequence.
amino_acids = list('ACDEFGHIKLMNOPQRSTUVWYX')

# scikit-learn encoder configured with the same category list.
# NOTE(review): not used by any cell visible in this part of the notebook.
encoder = OneHotEncoder(categories=[amino_acids])
def one_hot_encode_sequence(sequence, alphabet=None):
    """One-hot encode a residue sequence, one row per position.

    Each position is compared directly against every code in the alphabet.
    This replaces the earlier pandas ``get_dummies`` / ``iterrows`` version,
    which rebuilt ``list(sequence)`` for every (position, code) pair and was
    therefore quadratic in the sequence length.

    Parameters
    ----------
    sequence : str or iterable of str
        The residues to encode, one single-character code per position.
    alphabet : sequence of str, optional
        Codes that define the columns of the encoding, in column order.
        Defaults to the notebook-level ``amino_acids`` list.

    Returns
    -------
    list of list of bool
        One inner list per position in ``sequence``, each holding one flag per
        code in ``alphabet``. A flag is True exactly when the residue at that
        position equals the code. A residue that is not in the alphabet
        produces an all-False row; an empty sequence produces ``[]``.
    """
    if alphabet is None:
        # Resolved at call time so the function follows the notebook's list.
        alphabet = amino_acids

    # The result has len(sequence) rows of len(alphabet) flags by
    # construction, so the former post-hoc shape checks are not needed.
    return [[residue == acid for acid in alphabet] for residue in sequence]

# Empty accumulator for the encoded sequences and their class labels; the
# cells below add the enzyme and non-enzyme rows to it.
all_data = pd.DataFrame(dict(Sequence=[], Label=[]))

In [45]:
# Encode every downsampled enzyme sequence and label it '1'.
# TODO: fix bug of only ms => above should be fine
#
# All rows are built first and concatenated once. Appending one row at a time
# through the private DataFrame._append copies the whole frame on every
# iteration (quadratic), and _append is not part of the public pandas API.
# NOTE: this cell adds to all_data, so re-running it adds the rows again.
enzyme_rows = pd.DataFrame({
    'Sequence': [one_hot_encode_sequence(seq) for seq in enzyme_df_downsampled.Sequence],
})
enzyme_rows['Label'] = '1'
all_data = pd.concat([all_data, enzyme_rows], ignore_index=True)

KeyboardInterrupt: 

In [41]:
# Encode every downsampled non-enzyme sequence and label it '0'.
#
# All rows are built first and concatenated once. Appending one row at a time
# through the private DataFrame._append copies the whole frame on every
# iteration (quadratic), and _append is not part of the public pandas API.
# NOTE: this cell adds to all_data, so re-running it adds the rows again.
non_enzyme_rows = pd.DataFrame({
    'Sequence': [one_hot_encode_sequence(seq) for seq in non_enzyme_df_downsampled.sequence],
})
non_enzyme_rows['Label'] = '0'
all_data = pd.concat([all_data, non_enzyme_rows], ignore_index=True)

217 * 23 = 4991 >>> 217 -- 4991
___
226 * 23 = 5198 >>> 226 -- 5198
___
264 * 23 = 6072 >>> 264 -- 6072
___
954 * 23 = 21942 >>> 954 -- 21942
___
238 * 23 = 5474 >>> 238 -- 5474
___
76 * 23 = 1748 >>> 76 -- 1748
___
466 * 23 = 10718 >>> 466 -- 10718
___
96 * 23 = 2208 >>> 96 -- 2208
___
67 * 23 = 1541 >>> 67 -- 1541
___
337 * 23 = 7751 >>> 337 -- 7751
___
323 * 23 = 7429 >>> 323 -- 7429
___
120 * 23 = 2760 >>> 120 -- 2760
___
367 * 23 = 8441 >>> 367 -- 8441
___
1671 * 23 = 38433 >>> 1671 -- 38433
___
89 * 23 = 2047 >>> 89 -- 2047
___
1520 * 23 = 34960 >>> 1520 -- 34960
___
43 * 23 = 989 >>> 43 -- 989
___
175 * 23 = 4025 >>> 175 -- 4025
___
734 * 23 = 16882 >>> 734 -- 16882
___
48 * 23 = 1104 >>> 48 -- 1104
___
729 * 23 = 16767 >>> 729 -- 16767
___
219 * 23 = 5037 >>> 219 -- 5037
___
434 * 23 = 9982 >>> 434 -- 9982
___
103 * 23 = 2369 >>> 103 -- 2369
___
190 * 23 = 4370 >>> 190 -- 4370
___
483 * 23 = 11109 >>> 483 -- 11109
___
57 * 23 = 1311 >>> 57 -- 1311
___
97 * 23 = 2231 >>> 97 -- 2

KeyboardInterrupt: 

#### Make test and train sets

In [26]:
# Preview the first rows of the combined, labelled frame.
# NOTE(review): this cell's execution count (26) is lower than those of the
# encoding cells above it, so the saved output may be stale — re-run in order.
all_data.head()

Unnamed: 0,Sequence,Label
0,"[False, False, False, False, False, False, Fal...",1
1,"[False, False, False, False, False, False, Fal...",1
2,"[False, False, False, False, False, False, Fal...",1
3,"[False, False, False, False, False, False, Fal...",1
4,"[False, False, False, False, False, False, Fal...",1
