In [9]:
import pandas as pd
import os
from Bio import SeqIO
from constants import (PATH_INPUT, PATH_OUTPUT, NAME_COL_LABEL)
from commons import CommonsFunctions

In [6]:
# Label of the data source being processed. The value is interpolated into the
# input and output directory paths used by the cells below.
# NOTE(review): the value ends with a period, so the directory name does too;
# Windows does not preserve trailing periods in path components - confirm the
# notebook is only run where that is not an issue.
name_source = "Hasnat et al."

In [10]:
# Records read from the "S1 file" folder form the positive set: every row gets
# the value 1 in the NAME_COL_LABEL column.
path_s1 = f"{PATH_INPUT}{name_source}/S1 file"
df_s1 = CommonsFunctions.read_fasta_docs(path_s1).assign(**{NAME_COL_LABEL: 1})

In [11]:
# Records read from the "S2 file" folder form the negative set: every row gets
# the value 0 in the NAME_COL_LABEL column.
path_s2 = f"{PATH_INPUT}{name_source}/S2 file"
df_s2 = CommonsFunctions.read_fasta_docs(path_s2).assign(**{NAME_COL_LABEL: 0})

In [12]:
# Stack the two labelled sets row-wise. Each frame keeps its own row labels
# (ignore_index is not set), so index values may repeat in the combined frame.
df_concat = pd.concat((df_s1, df_s2))

# Class balance of the combined data.
label_counts = df_concat[NAME_COL_LABEL].value_counts()
label_counts

plastic_degrading_enzyme
0    1555
1     182
Name: count, dtype: int64

In [13]:
# (rows, columns) of the combined frame before any cleaning is applied.
df_concat.shape

(1737, 3)

In [21]:
# Trim surrounding whitespace once, store the cleaned text, and measure the
# length on the cleaned value so stray whitespace is not counted.
stripped_sequences = df_concat["sequence"].str.strip()
df_concat["sequence"] = stripped_sequences
df_concat["length"] = stripped_sequences.str.len()

length_summary = df_concat["length"].describe()
length_summary

count    1737.000000
mean      407.771445
std       242.357633
min         0.000000
25%       301.000000
50%       360.000000
75%       457.000000
max      3414.000000
Name: length, dtype: float64

In [22]:
# Keep only records whose stripped sequence has at least one character.
has_residues = df_concat["length"].gt(0)
df_concat = df_concat.loc[has_residues]

df_concat["length"].describe()

count    1735.000000
mean      408.241499
std       242.101144
min        12.000000
25%       301.000000
50%       360.000000
75%       457.000000
max      3414.000000
Name: length, dtype: float64

In [23]:
# Number of distinct sequences; a count below the number of rows means some
# sequences occur more than once.
distinct_sequences = df_concat["sequence"].unique()
distinct_sequences.shape

(1649,)

In [26]:
# Occurrence count per distinct sequence.
#
# Result columns: "id_seq" (number of non-null ids sharing the sequence) and
# "sequence", with a fresh 0..n-1 index, ordered by the sorted group keys.
#
# Built as a single chain: the previous version added a column to the result of
# a `df[[...]]` selection (an assignment on a derived frame that pandas may
# flag with SettingWithCopyWarning) and rebuilt the index by hand.
df_group_data = (
    df_concat
    .groupby(by="sequence")["id_seq"]
    .count()
    .reset_index()                       # "sequence" moves from index to column
    .loc[:, ["id_seq", "sequence"]]      # keep the original column order
)

# Most duplicated sequences first (display only; df_group_data itself is not reordered).
df_group_data.sort_values(by="id_seq", ascending=False)

Unnamed: 0,id_seq,sequence
1319,10,MTAILERRESESLWGRFCNWITSTENRLYIGWFGVLMIPTLLTATS...
1321,7,MTAILERRESESLWGRFCNWITSTENRLYIGWFGVLMIPTLLTATS...
1413,4,MTIALGKFTKDENDLFDIMDDWLRRDRFVFVGWSGLLLFPCAYFAV...
1060,3,MRSIRLKRLIAAVALGGAAAATQAASPLPRLNVDKTQISVSGLSAG...
1412,3,MTIALGKFTKDENDLFDIMDDWLRRDRFVFVGWSGLLLFPCAYFAL...
...,...,...
556,1,MIKSFNEIIMKVKSKEMKKVAVAVAQDEPVLEAVRDAKKNGIADAI...
555,1,MIKLFSLKQQKKEEESAGGTKGSSKKASAAQLRIQKDITELNLPKT...
554,1,MIKLFSLKQQKKEEESAGGTKGSSKKASAAQLRIQKDINELNLPKT...
553,1,MIKKTILNDTHQALGAKMVDFSGWEMPIHYGSQIDEHHHVRRNAGI...


In [27]:
# Array of the distinct sequences, in the row order of df_group_data.
unique_sequences = pd.unique(df_group_data["sequence"])
unique_sequences.shape

(1649,)

In [30]:
# De-duplicate by sequence.
#
# For every entry of unique_sequences:
#   * if all records with that sequence carry the same label, keep ONE row
#     (the id and label of the first such record, in df_concat row order);
#   * otherwise the sequence is ambiguous and is collected in seqs_erros
#     instead of being written to the dataset.
#
# A sequence that occurs once trivially has a single label, so it needs no
# separate branch.
seqs_erros = []
processed_data = []

# Group once up front and look each sequence up by position, instead of
# comparing every row of df_concat against every unique sequence.
# Positions (not index labels) are used because row labels in df_concat are
# not guaranteed to be unique.
positions_by_sequence = df_concat.groupby("sequence", sort=False).indices

for sequence in unique_sequences:
    # An unknown sequence yields an empty selection, which has zero distinct
    # labels and therefore lands in seqs_erros below.
    df_filter = df_concat.iloc[positions_by_sequence.get(sequence, [])]

    if df_filter[NAME_COL_LABEL].nunique(dropna=False) != 1:
        seqs_erros.append(sequence)
        continue

    processed_data.append({
        "id_seq": df_filter["id_seq"].iloc[0],
        "sequence": sequence,
        NAME_COL_LABEL: df_filter[NAME_COL_LABEL].iloc[0],
    })

df_processed_data = pd.DataFrame(processed_data)
df_processed_data.shape

(1649, 3)

In [31]:
# Sequences set aside by the de-duplication loop because their copies did not
# all carry the same label; these are not part of df_processed_data.
seqs_erros

[]

In [32]:
# De-duplicated dataset: one row per kept sequence with its id and label.
df_processed_data

Unnamed: 0,id_seq,sequence,plastic_degrading_enzyme
0,sp|O42412|IOD3_CHICK,AACILLFPRFLLTAVMLWLLDFLCIRKKMLTMPTAEEAAGAGEGPP...,0
1,sp|P35915|HMGCL_CHICK,AFPQRVKVVEVGPRDGLQNEKSVVPTPVKIRLIDMLSETGLPVIEA...,0
2,sp|Q7SIC3|SODM_VIRHA,AKFELPELPYAYDALEPTIDKETMNIHHTKHHNTYVTKLNGALEGH...,0
3,00136,ANPYERGPDPTESSIEAVRGPFAVAQTTVSRLQADGFGGGTIYYPT...,1
4,00106,ANPYERGPNPTDALLEARSGPFSVSEENVSRLSASGFGGGTIYYPR...,1
...,...,...,...
1644,sp|Q6KF81|PHCY2_HOMAM,VLLCSLVAATAAWPYFGGFQRDEPDGVPTAQKQHDINFLLHKLYEP...,0
1645,sp|Q6DN07|IOD3_SHEEP,VVGEGRGALGGAATMLRSLLLHSLRLCAQTASCLVLFPRFLGTAFM...,0
1646,00149,YDVRGGDAYYINNSPRCSIGFSVNGGFLTAGHCGPGTVTGSNRVAM...,1
1647,00104,YGHFYTEHNRGHHVRVATPEDPASSRLGESFWAFLPRSVWFSAVSA...,1


In [33]:
# Make sure the per-source output folder exists; no error if it already does.
output_dir = f"{PATH_OUTPUT}{name_source}"
os.makedirs(output_dir, exist_ok=True)

In [34]:
# Persist the de-duplicated dataset; the row index is not written.
output_csv = f"{PATH_OUTPUT}{name_source}/processed_dataset.csv"
df_processed_data.to_csv(output_csv, index=False)