### Este notebook nos facilitará la codificación o caracterización por propiedades de las secuencias

Este notebook hace:

1. Lee el dataset de secuencias
2. Maneja las secuencias y extrae una muestra aleatoria de cada clase
3. Caracteriza las secuencias usando propiedades fisicoquímicas

In [1]:
pip install pandas modlamp

Note: you may need to restart the kernel to use updated packages.


- Sección de importación de librerías/módulos

In [2]:
import pandas as pd
import numpy as np
from modlamp.descriptors import GlobalDescriptor

- Sección de implementación de funciones auxiliares

In [3]:
def mw(sequence):
    """Return the molecular weight of one sequence as a rounded float.

    Runs ``GlobalDescriptor.calculate_MW`` with ``amide=True`` on a
    one-element list containing ``sequence`` and returns the computed value
    rounded to 5 decimal places.
    """
    descriptor = GlobalDescriptor([sequence])
    descriptor.calculate_MW(amide=True)
    # Only one sequence was submitted, so the single result sits at [0][0].
    raw_value = descriptor.descriptor[0][0]
    return float(np.round(raw_value, 5))

In [4]:
def isoelectric_point(sequence):
    """Return the isoelectric point of one sequence as a rounded float.

    Runs ``GlobalDescriptor.isoelectric_point`` with ``amide=True`` on a
    one-element list containing ``sequence`` and returns the computed value
    rounded to 5 decimal places.
    """
    descriptor = GlobalDescriptor([sequence])
    descriptor.isoelectric_point(amide=True)
    # Only one sequence was submitted, so the single result sits at [0][0].
    raw_value = descriptor.descriptor[0][0]
    return float(np.round(raw_value, 5))

In [5]:
def charge_density(sequence):
    """Return the charge density of one sequence as a rounded float.

    Runs ``GlobalDescriptor.charge_density`` with ``amide=True`` on a
    one-element list containing ``sequence`` and returns the computed value
    rounded to 5 decimal places.
    """
    descriptor = GlobalDescriptor([sequence])
    descriptor.charge_density(amide=True)
    # Only one sequence was submitted, so the single result sits at [0][0].
    raw_value = descriptor.descriptor[0][0]
    return float(np.round(raw_value, 5))

In [6]:
def instability_index(sequence):
    """Return the instability index of one sequence as a rounded float.

    Runs ``GlobalDescriptor.instability_index`` on a one-element list
    containing ``sequence`` and returns the computed value rounded to
    5 decimal places.
    """
    descriptor = GlobalDescriptor([sequence])
    descriptor.instability_index()
    # Only one sequence was submitted, so the single result sits at [0][0].
    raw_value = descriptor.descriptor[0][0]
    return float(np.round(raw_value, 5))

In [7]:
def boman_index(sequence):
    """Return the Boman index of one sequence as a rounded float.

    Runs ``GlobalDescriptor.boman_index`` on a one-element list containing
    ``sequence`` and returns the computed value rounded to 5 decimal places.
    """
    descriptor = GlobalDescriptor([sequence])
    descriptor.boman_index()
    # Only one sequence was submitted, so the single result sits at [0][0].
    raw_value = descriptor.descriptor[0][0]
    return float(np.round(raw_value, 5))

In [8]:
def hydrophobic_ratio(sequence):
    """Return the hydrophobic ratio of one sequence as a rounded float.

    Runs ``GlobalDescriptor.hydrophobic_ratio`` on a one-element list
    containing ``sequence`` and returns the computed value rounded to
    5 decimal places.
    """
    descriptor = GlobalDescriptor([sequence])
    descriptor.hydrophobic_ratio()
    # Only one sequence was submitted, so the single result sits at [0][0].
    raw_value = descriptor.descriptor[0][0]
    return float(np.round(raw_value, 5))

In [9]:
# Load the sequence dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../raw_data/demo_amp.csv")
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,sequence,label
0,QEDCELCINVACTGC,0
1,MAATTTATSLFSSRLHFQNQNQGYGFPAKTPNSLQVNQIIDGRKMR...,0
2,SKGKKANKDVELARG,1
3,ADLEVVAATYVLVA,1
4,MAESPSESTSDSLSTTTSTKPAQSGTVSISSPQSHHVVFPEIPIEIVS,0


In [10]:
# Show the (rows, columns) dimensions of the loaded dataset.
df.shape

(21220, 2)

In [11]:
# Count how many rows carry each distinct value of the "label" column.
df["label"].value_counts()

label
0    10610
1    10610
Name: count, dtype: int64

In [12]:
# Display the first ten rows of the dataset.
df.head(10)

Unnamed: 0,sequence,label
0,QEDCELCINVACTGC,0
1,MAATTTATSLFSSRLHFQNQNQGYGFPAKTPNSLQVNQIIDGRKMR...,0
2,SKGKKANKDVELARG,1
3,ADLEVVAATYVLVA,1
4,MAESPSESTSDSLSTTTSTKPAQSGTVSISSPQSHHVVFPEIPIEIVS,0
5,MLRFTHVLNNGAKRSALSLGRSYLRGFGSMHGPRVA,0
6,MFRRAAFIKPRLTGFIRFN,0
7,ISIGIKCSPSIDLCEGQCRIRKYFTGYCSGDTCHCSG,0
8,AISCGQVSSALSPCISYARGNGAKPPVACCSGVKRLAGAAQSTADK...,1
9,STQEVSGHPEHHLV,0


In [13]:
# Build one feature row per sequence, in the same order as df["sequence"].
matrix_data = [
    [mw(sequence), charge_density(sequence), hydrophobic_ratio(sequence)]
    for sequence in df["sequence"].values
]

# Reuse df's index so the later column-wise concat with df aligns row-for-row
# even when df does not carry a default 0..n-1 index.
df_1 = pd.DataFrame(
    data=matrix_data,
    columns=["mw", "charge_density", "hydrophobic_ratio"],
    index=df.index,
)
df_1

Unnamed: 0,mw,charge_density,hydrophobic_ratio
0,1599.84,-0.00142,0.53333
1,5854.58,0.00087,0.37037
2,1599.84,0.00250,0.26667
3,1432.67,-0.00070,0.71429
4,4971.40,-0.00056,0.27083
...,...,...,...
21215,5109.73,-0.00076,0.39583
21216,3794.47,0.00134,0.44118
21217,2823.28,-0.00121,0.57692
21218,2991.63,0.00064,0.53571


In [14]:
# Drop any feature columns already present in df before concatenating, so that
# re-running this cell replaces them instead of appending duplicate columns.
# On a first run nothing is dropped (errors="ignore") and the result is unchanged.
df = pd.concat([df.drop(columns=df_1.columns, errors="ignore"), df_1], axis=1)
df.head(5)

Unnamed: 0,sequence,label,mw,charge_density,hydrophobic_ratio
0,QEDCELCINVACTGC,0,1599.84,-0.00142,0.53333
1,MAATTTATSLFSSRLHFQNQNQGYGFPAKTPNSLQVNQIIDGRKMR...,0,5854.58,0.00087,0.37037
2,SKGKKANKDVELARG,1,1599.84,0.0025,0.26667
3,ADLEVVAATYVLVA,1,1432.67,-0.0007,0.71429
4,MAESPSESTSDSLSTTTSTKPAQSGTVSISSPQSHHVVFPEIPIEIVS,0,4971.4,-0.00056,0.27083


In [15]:
# Add one column per remaining descriptor by applying its helper to every sequence.
for column_name, descriptor_fn in (
    ("boman_index", boman_index),
    ("isoelectric_point", isoelectric_point),
    ("instability_index", instability_index),
):
    df[column_name] = df["sequence"].apply(descriptor_fn)

In [16]:
# Write the dataset with its descriptor columns to CSV, omitting the index.
# NOTE(review): the file name is spelled "procesed"; kept unchanged because
# other code may read this exact path — confirm before renaming.
df.to_csv(path_or_buf="../results/procesed_data.csv", index=False)