In [6]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from Bio import SeqIO
import math
from scipy.fft import fft
import numpy as np

In [8]:
def encoding_residue(residue, df_properties, name_property):
    return df_properties[name_property][residue]

In [9]:
def encoding_sequence(sequence, df_properties, name_property, max_length):
    sequence = sequence.upper()
    sequence_encoding = []

    for i in range(len(sequence)):
        residue = sequence[i]
        response_encoding = encoding_residue(residue, df_properties, name_property)
        if response_encoding:
            sequence_encoding.append(response_encoding)

    for k in range(len(sequence_encoding), max_length + 1):
        sequence_encoding.append(0)

    return sequence_encoding

In [19]:
def get_near_pow(max_length):
    print("Get near pow 2 value")
    list_data = [math.pow(2, i) for i in range(1, 20)]
    stop_value = list_data[0]

    for value in list_data:
        if value >= max_length:
            stop_value = value
            break

    stop_value = int(stop_value)
    return stop_value

In [23]:
def complete_zero_padding(stop_value, dataset, max_length):
    print("Apply zero padding")
    list_df = [dataset]
    for i in range(max_length, stop_value):
        column = [0 for k in range(len(dataset))]
        key_name = "p_{}".format(i)
        df_tmp = pd.DataFrame()
        df_tmp[key_name] = column
        list_df.append(df_tmp)

    dataset = pd.concat(list_df, axis=1)
    return dataset

In [25]:
def apply_fft(index, dataset, stop_value):
    row = row = dataset.iloc[index].tolist()
    T = 1.0 / float(stop_value)
    yf = fft(row)

    xf = np.linspace(0.0, 1.0 / (2.0 * T), stop_value // 2)
    yf = np.abs(yf[0 : stop_value // 2])
    return [value for value in yf]

In [14]:
processed_sequences = []

for record in SeqIO.parse("tmp.fasta", "fasta"):

    if ("Alpha" in record.description or "Beta" in record.description):
        subtype = record.description.split("|")[-1]
    elif "Studied" in record.description:
        subtype = "A. purpuratus"
    else:
        subtype = "Non specificied"

    processed_sequences.append({
        "Organism" : record.description.replace("-", " ").split("]")[1].split("|")[0].split("Frame")[0],
        "Accession" : record.description.split("]")[0].replace("[", ""),
        "Subtype" : subtype,
        "Sequence" : str(record.seq).replace("*", "").replace("X", "")
    })

raw_df = pd.DataFrame(processed_sequences)
raw_df.head(5)

Unnamed: 0,Organism,Accession,Subtype,Sequence
0,A. irradians,XP_069111413,Alpha,MDKLKKERIRKVNPVSQKVKLFRASEPLLSVFMWGINHTINGLNHV...
1,P. maximus,XP_033762842,Alpha,MDKLKKERIRKVNPVSQKVKLFRASEPLLSVFMWGINHTINGLNHV...
2,Y. balloti,XP_060083998,Alpha,MDKLKKERIRKVNPVSQKVKLFRASEPLLSVFMWGINHTINGLNHV...
3,M. yessoensis,XP_021377781,Beta,MDKLKKERIRKVNPVSQKVKLFRASEPLLSVFMWGINHTINGLNHV...
4,R. philippinarum,XP_060589942,Alpha,MASVQTVPKQKVLKVKAVYQKRKLFRANEPLLSVFMWGINHTVSEL...


In [12]:
raw_df["length"] = raw_df["Sequence"].str.len()
raw_df["length"].describe()

count     46.000000
mean     384.130435
std       56.969235
min      156.000000
25%      389.250000
50%      407.000000
75%      411.750000
max      439.000000
Name: length, dtype: float64

- Encoding by physicochemical properties

In [11]:
input_encoder = pd.read_csv("../config_data/cluster_encoders.csv")
input_encoder.index = input_encoder["residue"].values
input_encoder.head()

Unnamed: 0,residue,Group_0,Group_1,Group_2,Group_3,Group_4,Group_5,Group_6,Group_7
A,A,290.40675,71.850787,6.250299,44.65141,-107.792042,15.33599,56.16028,92.925289
R,R,172.577375,-6.96389,84.091653,200.152218,51.157141,172.36012,1.448105,-37.39311
N,N,-38.377385,-90.145475,-21.731374,-191.180531,73.940581,-259.135737,-54.69043,-77.746565
D,D,159.436015,-56.585499,-28.963699,-232.261465,55.369736,-216.012067,-29.383132,-7.421269
C,C,-4.241925,15.678516,-34.886819,-156.2126,-54.192823,-242.000209,10.074813,40.041394


- Encoding dataset

In [20]:
matrix_data = []

for index in raw_df.index:
    sequence_encoder = encoding_sequence(
        raw_df["Sequence"][index],
        input_encoder,
        "Group_0",
        438
    )
    matrix_data.append(sequence_encoder)

print("Creating dataset")
header = ["p_{}".format(i) for i in range(len(matrix_data[0]))]
print("Export dataset")

df_data_encoded = pd.DataFrame(matrix_data, columns=header)

for column in ['Organism', 'Accession', 'Subtype']:
    df_data_encoded[column] = raw_df[column].values

Creating dataset
Export dataset


In [21]:
df_data_encoded.head()

Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_432,p_433,p_434,p_435,p_436,p_437,p_438,Organism,Accession,Subtype
0,21.944601,159.436015,195.599646,-91.117252,195.599646,195.599646,-0.028483,172.577375,-34.080828,172.577375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A. irradians,XP_069111413,Alpha
1,21.944601,159.436015,195.599646,-91.117252,195.599646,195.599646,-0.028483,172.577375,-34.080828,172.577375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,P. maximus,XP_033762842,Alpha
2,21.944601,159.436015,195.599646,-91.117252,195.599646,195.599646,-0.028483,172.577375,-34.080828,172.577375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Y. balloti,XP_060083998,Alpha
3,21.944601,159.436015,195.599646,-91.117252,195.599646,195.599646,-0.028483,172.577375,-34.080828,172.577375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M. yessoensis,XP_021377781,Beta
4,21.944601,290.40675,-314.201739,150.752932,-268.556728,-252.509397,150.752932,317.102424,195.599646,-268.556728,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,R. philippinarum,XP_060589942,Alpha


- Apply FFT

In [27]:
stop_value = get_near_pow(438)
dataset_to_encode = df_data_encoded.drop(columns=['Organism', 'Accession', 'Subtype'])

matrix_response = []
for index in dataset_to_encode.index:
    row_fft = apply_fft(index, dataset_to_encode, stop_value)
    matrix_response.append(row_fft)

print("Creating dataset")
header = ["p_{}".format(i) for i in range(len(matrix_response[0]))]
print("Export dataset")
df_fft = pd.DataFrame(matrix_response, columns=header)

for column in ['Organism', 'Accession', 'Subtype']:
    df_fft[column] = df_data_encoded[column].values

df_fft.head()


Get near pow 2 value
Creating dataset
Export dataset


Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_249,p_250,p_251,p_252,p_253,p_254,p_255,Organism,Accession,Subtype
0,10066.964891,3793.231221,3229.635002,3956.656936,4185.598907,2479.181915,2093.966295,4773.325969,3093.432544,634.764359,...,1283.922863,1479.330855,4476.214398,2340.273264,7128.741932,4520.215276,3675.829982,A. irradians,XP_069111413,Alpha
1,10048.393921,4116.477902,2679.810419,4272.576129,4196.89172,2354.879091,1639.914921,4574.025977,3043.244132,927.151416,...,2156.948751,2354.595902,3421.975039,2020.358627,6868.737586,4540.924566,3756.95991,P. maximus,XP_033762842,Alpha
2,9677.909136,4364.625224,1757.239238,4206.509038,4387.379489,2873.642213,1559.104347,4261.424107,3082.02848,991.630753,...,2088.116467,2953.6197,2883.430557,2292.868879,6723.58036,5641.556423,3840.515973,Y. balloti,XP_060083998,Alpha
3,10625.107866,4365.614571,2078.322536,4476.277038,4135.47937,3048.834549,1813.399086,3469.079749,3555.547452,496.230201,...,1592.340878,2376.664796,3800.785734,2325.244816,6969.198836,4861.487759,3484.67479,M. yessoensis,XP_021377781,Beta
4,11203.737755,4215.018682,3775.736308,5188.280523,3874.77259,1304.078198,1465.219467,4095.589896,4555.423897,2023.061669,...,2149.210534,2456.762959,883.888311,6018.2646,6840.889325,4238.324786,3388.747863,R. philippinarum,XP_060589942,Alpha
