# Data Exploration

In [1]:
import mxnet as mx
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd


In [2]:
def download_file(url, path):
    """Download ``url`` and write the response body to ``path``.

    Args:
        url: Address fetched with an HTTP GET request.
        path: Destination file. It is opened in binary write mode, so an
            existing file at that location is overwritten.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Function-local import, as in the original cell.
    import requests
    response = requests.get(url)
    # Fail loudly rather than saving an error page as if it were the data set.
    response.raise_for_status()
    # Write to the caller-supplied destination rather than a fixed file name.
    with open(path, 'wb') as f:
        f.write(response.content)
        

In [3]:
# Local path of the FASTA data file and the URL it can be fetched from.
data_file = 'data/deeploc_data.fasta'
data_url = 'http://www.cbs.dtu.dk/services/DeepLoc-1.0/deeploc_data.fasta'
# The download call is left disabled; uncomment it to fetch the file.
# download_file(data_url, data_file)

In [4]:
def parse_classifications(text):
    """Split a dash-separated classification token into its two fields.

    Every piece except the last is kept, as a list, under ``'location'``;
    the last piece is stored under ``'membrane_or_soluable'``.
    """
    *locations, last_piece = text.split('-')
    return {'location': locations, 'membrane_or_soluable': last_piece}

def parse_description(desc):
    """Turn one header line into a record dict.

    The first character (the leading '>') is discarded and the remainder is
    split on whitespace. Field 0 becomes ``'id'``, field 1 is handed to
    ``parse_classifications``, and ``'is_test'`` is True only when there are
    exactly three fields and the third one is ``'test'``.
    """
    fields = desc[1:].split()

    record = {'id': fields[0]}
    record.update(parse_classifications(fields[1]))

    # Only a three-field header can carry the 'test' marker.
    record['is_test'] = len(fields) == 3 and fields[2] == 'test'
    return record

def parse_record(description, sequence):
    """Build one record dict from a header line and its sequence line.

    Both inputs are stripped of surrounding whitespace. The header is parsed
    by ``parse_description`` and the sequence is stored, one list element per
    character, under ``'sequence'``.
    """
    header = description.strip()
    residues = sequence.strip()

    record = parse_description(header)
    record['sequence'] = [symbol for symbol in residues]
    return record

def parse_data_file(file):
    """Read a FASTA-style file and return one record dict per entry.

    A line starting with '>' opens a new entry; every following non-blank
    line up to the next '>' line is treated as part of that entry's sequence
    and the pieces are concatenated. Blank lines are skipped, so a trailing
    empty line or a sequence wrapped over several lines no longer breaks
    parsing. Files with exactly one header line and one sequence line per
    entry produce the same records as before.

    Args:
        file: Path of the text file to read.

    Returns:
        list of dicts as produced by ``parse_record``.
    """
    data = []
    desc = None          # header of the entry currently being collected
    seq_chunks = []      # sequence lines seen so far for that entry
    with open(file, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the previous entry, if there was one.
                if desc is not None:
                    data.append(parse_record(desc, ''.join(seq_chunks)))
                desc = line
                seq_chunks = []
            elif desc is not None:
                seq_chunks.append(line)
    # Flush the final entry, which has no following header to close it.
    if desc is not None:
        data.append(parse_record(desc, ''.join(seq_chunks)))
    return data

In [5]:
# Parse every entry of the data file into a list of record dicts.
data_all = parse_data_file(data_file)

In [6]:
# One DataFrame row per parsed record.
df_all = pd.DataFrame(data_all)

In [7]:
# Non-null entry count for each column.
df_all.count()

id                      14004
location                14004
membrane_or_soluable    14004
is_test                 14004
sequence                14004
dtype: int64

In [8]:
def is_test_pred(m):
    """Return the record's ``'is_test'`` value unchanged."""
    test_flag = m['is_test']
    return test_flag

def is_unknown_pred(m):
    """Return True when the record's ``'membrane_or_soluable'`` field is 'U'.

    The field is read under the key that ``parse_classifications`` writes;
    the previously used key ``'soluable'`` is never set on a record and
    raised KeyError.
    """
    return m['membrane_or_soluable'] == 'U'

def single_loc_pred(m):
    """Return True when the record's ``'location'`` holds exactly one item."""
    locations = m['location']
    return len(locations) == 1

def valid_record_pred(m):
    """Return whether the record is kept; currently the single-location check only."""
    keep = single_loc_pred(m)
    return keep

def training_pred(m):
    """Return True for a record that is not test, not unknown, and single-location.

    The checks run in that order and stop at the first one that rules the
    record out.
    """
    if is_test_pred(m):
        return False
    if is_unknown_pred(m):
        return False
    return single_loc_pred(m)

In [9]:
def select_valid_records(data):
    """Return a new list with the records for which ``valid_record_pred`` is truthy."""
    return [record for record in data if valid_record_pred(record)]

# Keep only the records that pass valid_record_pred.
data_valid = select_valid_records(data_all)
    

In [10]:
# DataFrame of the filtered records; later cells add columns to it.
df = pd.DataFrame(data_valid)

In [11]:
# Non-null entry count per column after filtering.
df.count()

id                      13858
location                13858
membrane_or_soluable    13858
is_test                 13858
sequence                13858
dtype: int64

In [12]:
# Convert each single-element location list to its only element.
# The isinstance guard makes the cell safe to re-run: an already-converted
# string is left alone instead of being cut down to its first character.
df['location'] = df['location'].apply(lambda x: x[0] if isinstance(x, list) else x)

In [13]:
# Per-location counts over the rows whose is_test flag is False.
df[df['is_test'] == False].groupby(['location']).count()

Unnamed: 0_level_0,id,membrane_or_soluable,is_test,sequence
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cell.membrane,1067,1067,1067,1067
Cytoplasm,2034,2034,2034,2034
Endoplasmic.reticulum,689,689,689,689
Extracellular,1580,1580,1580,1580
Golgi.apparatus,286,286,286,286
Lysosome/Vacuole,257,257,257,257
Mitochondrion,1208,1208,1208,1208
Nucleus,3235,3235,3235,3235
Peroxisome,124,124,124,124
Plastid,605,605,605,605


In [14]:
# Distinct location labels present in the frame.
df['location'].unique()

array(['Cell.membrane', 'Cytoplasm', 'Endoplasmic.reticulum',
       'Golgi.apparatus', 'Lysosome/Vacuole', 'Mitochondrion', 'Nucleus',
       'Peroxisome', 'Plastid', 'Extracellular'], dtype=object)

In [15]:
def limit_sequence(xs, max_len=1000):
    """Shorten ``xs`` to at most ``max_len`` items by keeping both ends.

    A sequence no longer than ``max_len`` is returned unchanged (the same
    object). Otherwise the result is the first ``ceil(max_len / 2)`` items
    followed by the last ``floor(max_len / 2)`` items, dropping the middle.
    With the default of 1000 this keeps the first 500 and the last 500.

    Args:
        xs: Sliceable sequence that supports ``+`` (list, str, ...).
        max_len: Maximum length of the result; must not be negative.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError('max_len must be non-negative')
    if len(xs) <= max_len:
        return xs
    head = (max_len + 1) // 2
    tail = max_len // 2
    # An explicit start index is used because xs[-0:] would be the whole sequence.
    return xs[:head] + xs[len(xs) - tail:]


In [16]:
# Add a column holding each sequence after limit_sequence has been applied.
df['sequence_limited'] =  df['sequence'].apply(limit_sequence)

In [17]:
# Training frame: every row except those flagged is_test == True.
df_train = df[~(df.is_test == True)]
# Test frame: every row except those flagged is_test == False.
df_test = df[~(df.is_test == False)]


In [18]:
def encode_sequence(df):
    """Encode each row's ``'sequence_limited'`` as a (rows, 1000, 20) float array.

    The 20 channels follow the letter order ``ACDEFGHIKLMNPQRSTVWY``; each of
    those letters sets its own channel to 1. Ambiguity codes are spread out:
    'B' puts 0.5 on the D and N channels, 'Z' puts 0.5 on the E and Q
    channels, and 'X' puts 0.05 on every channel. 'U' and any symbol not
    listed leave the position all zeros. The row count is taken from the
    non-null count of the ``'id'`` column.
    """
    channel_order = 'ACDEFGHIKLMNPQRSTVWY'
    # symbol -> ((channel, weight), ...)
    weights = {symbol: ((k, 1),) for k, symbol in enumerate(channel_order)}
    weights['B'] = ((2, 0.5), (11, 0.5))
    weights['Z'] = ((3, 0.5), (13, 0.5))
    weights['X'] = tuple((k, 0.05) for k in range(20))
    weights['U'] = ()

    n_records = df['id'].count()
    encoded = np.zeros((n_records, 1000, 20))
    for row_no, (_, record) in enumerate(df.iterrows()):
        for pos, symbol in enumerate(record['sequence_limited']):
            for channel, weight in weights.get(symbol, ()):
                encoded[row_no, pos, channel] = weight
    return encoded


In [19]:
def encode_label(df):
    """Map each row's ``'location'`` string to its integer class id.

    Args:
        df: DataFrame with ``'id'`` and ``'location'`` columns. The output
            length is the non-null count of ``'id'``.

    Returns:
        1-D float64 array holding the class ids 0-9 in row order.

    Raises:
        ValueError: If a row's location is not one of the ten known labels.
            The array is allocated with ``np.empty``, so an unrecognised
            label would otherwise leave an arbitrary uninitialised value.
    """
    class_ids = {
        'Nucleus': 0,
        'Cytoplasm': 1,
        'Extracellular': 2,
        'Mitochondrion': 3,
        'Cell.membrane': 4,
        'Endoplasmic.reticulum': 5,
        'Plastid': 6,
        'Golgi.apparatus': 7,
        'Lysosome/Vacuole': 8,
        'Peroxisome': 9,
    }
    total = df['id'].count()
    data = np.empty((total))
    for i, (_, row) in enumerate(df.iterrows()):
        location = row['location']
        # The isinstance check keeps unhashable values (e.g. a list that was
        # never unwrapped) from raising TypeError inside the dict lookup.
        class_id = class_ids.get(location) if isinstance(location, str) else None
        if class_id is None:
            raise ValueError('unknown location label: %r' % (location,))
        data[i] = class_id
    return data
    

In [20]:
def encode_mask(df):
    """Build a (rows, 1000) float array marking the occupied sequence positions.

    For every row, position ``p`` is set to 1 for each element index ``p`` of
    that row's ``'sequence_limited'``; all other entries stay 0. The row
    count is the non-null count of the ``'id'`` column.
    """
    n_records = df['id'].count()
    mask = np.zeros((n_records, 1000))
    for row_no, (_, record) in enumerate(df.iterrows()):
        for pos, _symbol in enumerate(record['sequence_limited']):
            mask[row_no, pos] = 1
    return mask
    

In [21]:
def encode_partition(df):
    """Assign the partition ids 1, 2, 3, 4, 1, 2, ... to the rows in order.

    Returns a 1-D float64 array whose length is the non-null count of the
    ``'id'`` column.
    """
    n_records = df['id'].count()
    return (np.arange(n_records) % 4 + 1).astype(np.float64)
        

In [22]:
# Preview the partition ids generated for the training frame.
encode_partition(df_train)

array([1., 2., 3., ..., 3., 4., 1.])

In [24]:
# Encode masks, sequence features and labels for both splits, plus the
# partition ids for the training split.
mask_train = encode_mask(df_train)
mask_test = encode_mask(df_test)
X_train = encode_sequence(df_train)
X_test = encode_sequence(df_test)
y_train = encode_label(df_train)
y_test = encode_label(df_test)
partition = encode_partition(df_train)

# Store all arrays in one uncompressed .npz archive, keyed by variable name.
np.savez('data/deeploc_full.npz',
         X_train=X_train,
         X_test=X_test,
         mask_train=mask_train,
         mask_test=mask_test,
         y_train=y_train,
         y_test=y_test,
         partition=partition)


In [None]:
# Occurrence count of every symbol across the truncated sequences.
m = dict()

def f(l):
    """Add each element of ``l`` to the running tally in the global ``m``."""
    for symbol in l:
        m[symbol] = m.get(symbol, 0) + 1

df['sequence_limited'].apply(f)
m

In [None]:
# Occurrence count of every symbol across the full sequences.
m = dict()

def f(l):
    """Tally each element of ``l`` into the global dict ``m``."""
    for symbol in l:
        try:
            m[symbol] += 1
        except KeyError:
            m[symbol] = 1

df['sequence'].apply(f)
m

In [None]:
# Boolean mask: True for rows whose sequence contains the symbol 'X'.
cond = df['sequence'].apply(lambda xs : 'X' in xs)

# Show only those rows.
df[cond]

In [None]:
# Number of elements in each full (untruncated) sequence.
df['length'] = df['sequence'].apply(len)


In [None]:
# Reference table: each symbol handled by encode_sequence, annotated with the
# feature channel it sets (or how an ambiguity code is spread over channels).
{'A',  # 0
 'B',  # aspartate or asparagine (D or N): 0.5 on channels 2 and 11
 'C',  # 1
 'D',  # 2
 'E',  # 3
 'F',  # 4
 'G',  # 5
 'H',  # 6
 'I',  # 7
 'K',  # 8
 'L',  # 9
 'M',  # 10
 'N',  # 11
 'P',  # 12
 'Q',  # 13
 'R',  # 14
 'S',  # 15
 'T',  # 16
 'U',  # -- (no channel; position is left all zeros)
 'V',  # 17
 'W',  # 18
 'X',  # any: 0.05 on every channel
 'Y',  # 19
 'Z'} # glutamate or glutamine (E or Q): 0.5 on channels 3 and 13

In [None]:
# Bin counts and bin edges of the sequence lengths. The notebook imports
# numpy only under the alias `np`, so the bare name `numpy` is undefined.
np.histogram(df['length'])

In [None]:
# Histogram of the sequence lengths with matplotlib's default binning.
plt.hist(df['length'])
plt.show()

In [None]:
# Toy illustration of the head-plus-tail slicing used by limit_sequence.
# `xs` is defined here so the cell also runs on a fresh kernel.
xs = list('ABCDEFGH')
ys = xs[:2] + xs[-2:]

In [None]:
# Display the working DataFrame.
df

In [None]:
# NOTE(review): these paths differ from the archive written earlier in this
# notebook ('data/deeploc_full.npz'), and the assignments below overwrite the
# arrays produced by the encoding cell. Confirm that this is intended.
train_file = 'subcellular_localization/data/train.npz'
test_file = 'subcellular_localization/data/test.npz'

# numpy is imported as `np`; the bare name `numpy` is undefined here.
# The context manager closes both archives even if reading a key fails.
with np.load(train_file) as train_npz, np.load(test_file) as test_npz:
    mask_train = train_npz['mask_train']
    partition = train_npz['partition']
    X_train = train_npz['X_train']
    y_train = train_npz['y_train']
    X_test = test_npz['X_test']
    mask_test = test_npz['mask_test']
    y_test = test_npz['y_test']

In [None]:
# Histogram of y_test using 10 bins.
plt.hist(y_test, bins=10)

In [None]:
 # Distinct y_train values and their counts (numpy is imported as `np`).
 np.unique(y_train, return_counts=True)

In [None]:
 # Distinct y_test values and their counts (numpy is imported as `np`).
 np.unique(y_test, return_counts=True)

In [None]:
# Dimensions of the training label array.
y_train.shape