In [1]:
import warnings
# Silence every warning for the rest of the session. This also hides deprecation
# notices raised by the libraries used in the cells below.
# NOTE(review): consider narrowing this filter (e.g. by category) so that new
# warnings are not lost silently.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from modlamp.descriptors import GlobalDescriptor
import numpy as np

In [3]:
def mw(sequence):
    """Return the molecular weight of one peptide, rounded to 5 decimals.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence handed to modlAMP's ``GlobalDescriptor``.

    Returns
    -------
    float
        The value computed by ``GlobalDescriptor.calculate_MW(amide=True)``,
        rounded to 5 decimal places.
    """
    gd = GlobalDescriptor(sequence)
    gd.calculate_MW(amide=True)
    # ``descriptor`` is an array holding the single computed value. Pull the
    # element out explicitly: calling float() on an array that is not 0-d is
    # deprecated since NumPy 1.25.
    return float(np.round(np.ravel(gd.descriptor)[0], 5))

In [4]:
def isoelectric_point(sequence):
    """Return the isoelectric point of one peptide, rounded to 5 decimals.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence handed to modlAMP's ``GlobalDescriptor``.

    Returns
    -------
    float
        The value computed by ``GlobalDescriptor.isoelectric_point(amide=True)``,
        rounded to 5 decimal places.
    """
    gd = GlobalDescriptor(sequence)
    gd.isoelectric_point(amide=True)
    # ``descriptor`` is an array holding the single computed value. Pull the
    # element out explicitly: calling float() on an array that is not 0-d is
    # deprecated since NumPy 1.25.
    return float(np.round(np.ravel(gd.descriptor)[0], 5))

In [5]:
def charge_density(sequence):
    """Return the charge density of one peptide, rounded to 5 decimals.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence handed to modlAMP's ``GlobalDescriptor``.

    Returns
    -------
    float
        The value computed by ``GlobalDescriptor.charge_density(amide=True)``,
        rounded to 5 decimal places.
    """
    gd = GlobalDescriptor(sequence)
    gd.charge_density(amide=True)
    # ``descriptor`` is an array holding the single computed value. Pull the
    # element out explicitly: calling float() on an array that is not 0-d is
    # deprecated since NumPy 1.25.
    return float(np.round(np.ravel(gd.descriptor)[0], 5))

In [6]:
def instability_index(sequence):
    """Return the instability index of one peptide, rounded to 5 decimals.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence handed to modlAMP's ``GlobalDescriptor``.

    Returns
    -------
    float
        The value computed by ``GlobalDescriptor.instability_index()``,
        rounded to 5 decimal places.
    """
    gd = GlobalDescriptor(sequence)
    gd.instability_index()
    # ``descriptor`` is an array holding the single computed value. Pull the
    # element out explicitly: calling float() on an array that is not 0-d is
    # deprecated since NumPy 1.25.
    return float(np.round(np.ravel(gd.descriptor)[0], 5))

In [7]:
def boman_index(sequence):
    """Return the Boman index of one peptide, rounded to 5 decimals.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence handed to modlAMP's ``GlobalDescriptor``.

    Returns
    -------
    float
        The value computed by ``GlobalDescriptor.boman_index()``,
        rounded to 5 decimal places.
    """
    gd = GlobalDescriptor(sequence)
    gd.boman_index()
    # ``descriptor`` is an array holding the single computed value. Pull the
    # element out explicitly: calling float() on an array that is not 0-d is
    # deprecated since NumPy 1.25.
    return float(np.round(np.ravel(gd.descriptor)[0], 5))

In [8]:
def hydrophobic_ratio(sequence):
    """Return the hydrophobic ratio of one peptide, rounded to 5 decimals.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence handed to modlAMP's ``GlobalDescriptor``.

    Returns
    -------
    float
        The value computed by ``GlobalDescriptor.hydrophobic_ratio()``,
        rounded to 5 decimal places.
    """
    gd = GlobalDescriptor(sequence)
    gd.hydrophobic_ratio()
    # ``descriptor`` is an array holding the single computed value. Pull the
    # element out explicitly: calling float() on an array that is not 0-d is
    # deprecated since NumPy 1.25.
    return float(np.round(np.ravel(gd.descriptor)[0], 5))

In [9]:
# Load the peptide table; the cells below read its "sequence" and "label" columns.
df_data = pd.read_csv("../raw_data/demo_amp.csv")
# Show how many rows carry each label value (class-balance check).
df_data["label"].value_counts()

label
0    10610
1    10610
Name: count, dtype: int64

In [10]:
# Add the molecular-weight column by running mw() on every sequence.
# assign() returns a new frame with the column appended (or replaced on re-run).
df_data = df_data.assign(mw=df_data["sequence"].apply(mw))

In [11]:
# Compute the remaining descriptors, one column per helper function.
# Keyword order fixes the column order, matching the original cell.
df_data = df_data.assign(
    hydrophobic_ratio=df_data["sequence"].apply(hydrophobic_ratio),
    boman_index=df_data["sequence"].apply(boman_index),
    instability_index=df_data["sequence"].apply(instability_index),
    charge_density=df_data["sequence"].apply(charge_density),
    isoelectric_point=df_data["sequence"].apply(isoelectric_point),
)

In [12]:
# Preview the first five rows (head() defaults to n=5), now including the descriptor columns.
df_data.head()

Unnamed: 0,sequence,label,mw,hydrophobic_ratio,boman_index,instability_index,charge_density,isoelectric_point
0,QEDCELCINVACTGC,0,1599.84,0.53333,1.02267,80.38,-0.00142,3.6875
1,MAATTTATSLFSSRLHFQNQNQGYGFPAKTPNSLQVNQIIDGRKMR...,0,5854.58,0.37037,1.77722,28.67963,0.00087,11.88379
2,SKGKKANKDVELARG,1,1599.84,0.26667,3.21533,8.14,0.0025,11.10547
3,ADLEVVAATYVLVA,1,1432.67,0.71429,-1.07143,3.22143,-0.0007,3.92969
4,MAESPSESTSDSLSTTTSTKPAQSGTVSISSPQSHHVVFPEIPIEIVS,0,4971.4,0.27083,1.47667,100.18333,-0.00056,4.64307


In [13]:
# List the column names to confirm that the descriptor columns were added.
df_data.columns

Index(['sequence', 'label', 'mw', 'hydrophobic_ratio', 'boman_index',
       'instability_index', 'charge_density', 'isoelectric_point'],
      dtype='object')

- Dataset con secuencias
- Descripción de las secuencias
- División en datasets de entrenamiento, validación y testeo
- Aplicación de transformaciones, etc...

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Target vector: the label column.
y = df_data["label"]
# Feature matrix: every column except the raw sequence string and the target itself.
X = df_data.drop(["sequence", "label"], axis="columns")

In [16]:
# Stage 1: hold out 10 % of all rows as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Stage 2: from the remaining rows, set 20 % aside for validation;
# what is left becomes the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42
)

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
# Fit the scaler on the training split only, then standardise that split.
# transform() returns a NumPy array, so X_train loses its column names here.
# NOTE(review): X_val and X_test are never passed through scaler.transform()
# in this notebook, yet they are written to CSV further down next to the
# scaled X_train — confirm that downstream code applies the saved scaler to them.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [19]:
# Sanity check: print the (rows, columns) shape of each split on one line.
print(*(split.shape for split in (X_train, X_val, X_test)))

(15278, 6) (3820, 6) (2122, 6)


In [20]:
import os

from joblib import dump

# Persist the fitted scaler so the same transformation can be re-applied later.
scaler_path = "../model_amp/model_0710/scaler.joblib"
# Create the output folder if it is missing; otherwise dump() raises
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
dump(scaler, scaler_path)

['../model_amp/model_0710/scaler.joblib']

In [21]:
# Rebuild the scaled training matrix as a DataFrame. The column names come from
# the feature frame X rather than a hand-copied list, so they cannot drift from
# the features that were actually scaled.
X_train = pd.DataFrame(data=X_train, columns=X.columns).assign(label=y_train.values)

# Attach the target column to the held-out splits. assign() returns a new frame
# instead of writing into the objects handed back by train_test_split, where an
# in-place column write can trigger pandas' SettingWithCopyWarning.
# .values drops the index, so labels are matched to rows by position.
X_val = X_val.assign(label=y_val.values)
X_test = X_test.assign(label=y_test.values)


In [22]:
# Write each split (features plus label column) to its own CSV, without the index.
for split_frame, csv_path in (
    (X_train, "../model_amp/model_0710/X_train.csv"),
    (X_val, "../model_amp/model_0710/X_val.csv"),
    (X_test, "../model_amp/model_0710/X_test.csv"),
):
    split_frame.to_csv(csv_path, index=False)