In [113]:
import pandas as pd
from joblib import load
import os

In [114]:
def process_concat(path_data, list_df_sequences, columns_to_use, n_chunks=None):
    """Attach sequence metadata columns to each per-chunk encoded dataset.

    For every chunk index ``i`` the file ``f"{path_data}coded_dataset_{i}.csv"``
    is read, its ``EC`` column is dropped, and each column listed in
    ``columns_to_use`` is copied positionally from ``list_df_sequences[i]``.

    Parameters
    ----------
    path_data : str
        Prefix prepended verbatim to ``coded_dataset_{i}.csv``. It is not
        joined with a path separator, so a directory must end with one.
    list_df_sequences : list of pandas.DataFrame
        One frame per chunk; frame ``i`` must have as many rows as
        ``coded_dataset_{i}.csv``.
    columns_to_use : iterable of str
        Columns taken from the sequence frames and written into the encoded
        frames (an existing column of the same name is overwritten).
    n_chunks : int, optional
        How many chunk files to process. Defaults to
        ``len(list_df_sequences)`` so the function follows whatever number of
        chunks the caller produced instead of a fixed count.

    Returns
    -------
    list of pandas.DataFrame
        The enriched encoded frames, in chunk order.

    Raises
    ------
    FileNotFoundError
        If a ``coded_dataset_{i}.csv`` file is missing.
    IndexError
        If ``n_chunks`` exceeds ``len(list_df_sequences)``.
    KeyError
        If the CSV has no ``EC`` column or a requested column is absent from
        the sequence frame.
    ValueError
        If a sequence frame and its CSV differ in row count.
    """
    if n_chunks is None:
        n_chunks = len(list_df_sequences)

    list_df = []

    for i in range(n_chunks):
        df = pd.read_csv(f"{path_data}coded_dataset_{i}.csv")
        # The CSV's own EC column is discarded; callers that want EC get it
        # from the sequence frame through ``columns_to_use``.
        df = df.drop(columns=["EC"])
        df_seqs = list_df_sequences[i]

        for column in columns_to_use:
            # ``.values`` copies by position and ignores the source index.
            df[column] = df_seqs[column].values
        list_df.append(df)

    return list_df

In [115]:
# Read the generated-sequence table in pieces of at most 20000 rows; iterating
# the chunked reader yields one DataFrame per piece, in file order.
df_data = pd.read_csv(
    "../../plastic_sequences/generated_sequences/all_generated_sequences.csv",
    chunksize=20000,
)
list_generated_sequences = list(df_data)

# Display the number of chunks that were read.
len(list_generated_sequences)

6

In [116]:
# Metadata columns that travel with each sequence. They are dropped from the
# feature tables before prediction and copied onto the prediction tables.
columns_with_info = [
    "sequence",
    "perplexity",
    "EC",
    "length",
]

In [117]:
# For each encoding folder: load its per-chunk tables, attach the metadata
# columns, then stack the chunks row-wise (pd.concat's default axis) into one
# frame. The stacked frames keep each chunk's own row index.
list_df_protrans_xlu50 = process_concat(
    path_data="../../plastic_sequences/generated_sequences/protrans_xlu50/",
    list_df_sequences=list_generated_sequences,
    columns_to_use=columns_with_info,
)
df_protrans_xlu50 = pd.concat(list_df_protrans_xlu50)

list_df_esm1b = process_concat(
    path_data="../../plastic_sequences/generated_sequences/esm1b/",
    list_df_sequences=list_generated_sequences,
    columns_to_use=columns_with_info,
)
df_esm1b = pd.concat(list_df_esm1b)

list_df_esme = process_concat(
    path_data="../../plastic_sequences/generated_sequences/esme/",
    list_df_sequences=list_generated_sequences,
    columns_to_use=columns_with_info,
)
df_esme = pd.concat(list_df_esme)

list_df_plusrnn = process_concat(
    path_data="../../plastic_sequences/generated_sequences/plusrnn/",
    list_df_sequences=list_generated_sequences,
    columns_to_use=columns_with_info,
)
df_plusrnn = pd.concat(list_df_plusrnn)

list_df_protrans_bdf = process_concat(
    path_data="../../plastic_sequences/generated_sequences/protrans_bdf/",
    list_df_sequences=list_generated_sequences,
    columns_to_use=columns_with_info,
)
df_protrans_bdf = pd.concat(list_df_protrans_bdf)

list_df_protrans_albert = process_concat(
    path_data="../../plastic_sequences/generated_sequences/protrans_albert/",
    list_df_sequences=list_generated_sequences,
    columns_to_use=columns_with_info,
)
df_protrans_albert = pd.concat(list_df_protrans_albert)

In [118]:
# Load the seven serialized model objects, one per file under generated_models.
# The names on the left line up one-to-one, in order, with the loads on the right.
# NOTE(review): joblib.load unpickles its input, so only trusted files belong here.
(
    pet_model,
    pla_model,
    pcl_model,
    nylon_model,
    pha_model,
    phb_model,
    pu_model,
) = (
    load("../../generated_models/PET_clf.joblib"),
    load("../../generated_models/PLA_clf.joblib"),
    load("../../generated_models/PCL_clf.joblib"),
    load("../../generated_models/NYLON_PA_clf.joblib"),
    load("../../generated_models/PHA_clf.joblib"),
    load("../../generated_models/PHB_clf.joblib"),
    load("../../generated_models/PU_PUR_clf.joblib"),
)

In [119]:
# Score every row of df_protrans_xlu50 with pet_model. The metadata columns are
# removed first so that only the remaining columns are passed to the model.
predictions_pet = pet_model.predict_proba(df_protrans_xlu50.drop(columns=columns_with_info))
# Naming the output p0/p1 requires predict_proba to return exactly two columns.
df_pet_predictions = pd.DataFrame(data=predictions_pet, columns=["p0", "p1"])

# Copy the metadata back one column at a time. The loop previously ignored its
# variable and re-assigned the whole column block on every pass through a single
# mixed-type array; per-column assignment does the work once and keeps each
# column's own dtype. `.values` copies by position, ignoring the source index.
for column in columns_with_info:
    df_pet_predictions[column] = df_protrans_xlu50[column].values



In [120]:
# Flag a row as positive (1) when its p1 value is at least 0.5, otherwise 0.
df_pet_predictions["positive_class"] = df_pet_predictions["p1"].ge(0.5).astype(int)

# Display how many rows received each label.
df_pet_predictions.loc[:, "positive_class"].value_counts()

positive_class
0    115346
1      3729
Name: count, dtype: int64

In [121]:
# Preview the three rows labelled positive with the highest p1.
(
    df_pet_predictions
    .loc[df_pet_predictions["positive_class"].eq(1)]
    .sort_values(by="p1", ascending=False)
    .head(3)
)

Unnamed: 0,p0,p1,sequence,perplexity,EC,length,positive_class
74881,0.12,0.88,MKILAPKPFTFEGGDRAVLLLHGFTGNSADVRMLGRFLEKKGYTCH...,1.314282,3.1.1.1,246.0,1
73620,0.13,0.87,MKISAPQPFTFEGGERAVLLLHGFTGNSADVRMLGRFLEKKGYTCH...,1.094859,3.1.1.1,245.0,1
73782,0.14,0.86,MTAPLILQPVKPADACVIWLHGLGADRYDFLPVAEALQESLLTTRF...,1.243911,3.1.1.1,218.0,1


In [122]:
# Score every row of df_esm1b with phb_model. The metadata columns are removed
# first so that only the remaining columns are passed to the model.
predictions_phb = phb_model.predict_proba(df_esm1b.drop(columns=columns_with_info))
# Naming the output p0/p1 requires predict_proba to return exactly two columns.
df_phb_predictions = pd.DataFrame(data=predictions_phb, columns=["p0", "p1"])

# Copy the metadata back one column at a time. The loop previously ignored its
# variable and re-assigned the whole column block on every pass through a single
# mixed-type array; per-column assignment does the work once and keeps each
# column's own dtype. `.values` copies by position, ignoring the source index.
for column in columns_with_info:
    df_phb_predictions[column] = df_esm1b[column].values



In [123]:
# Flag a row as positive (1) when its p1 value is at least 0.5, otherwise 0.
df_phb_predictions["positive_class"] = df_phb_predictions["p1"].ge(0.5).astype(int)

# Display how many rows received each label.
df_phb_predictions.loc[:, "positive_class"].value_counts()

positive_class
0    103540
1     15535
Name: count, dtype: int64

In [124]:
# Preview the three rows labelled positive with the highest p1.
(
    df_phb_predictions
    .loc[df_phb_predictions["positive_class"].eq(1)]
    .sort_values(by="p1", ascending=False)
    .head(3)
)

Unnamed: 0,p0,p1,sequence,perplexity,EC,length,positive_class
80687,0.13,0.87,MKRVLSLTLSAISVLGVLPAVSASGSVTAPAGCGKQPTLANGTYKT...,2.364434,3.1.1.102,397.0,1
92799,0.13,0.87,MKRFILTSCVALVAGTGLVQTNNTTPTQSPGPTSTSTQTPPSNEGN...,2.873978,3.1.1.102,332.0,1
82124,0.14,0.86,MKRIGVILATAIFFSAQSYTNGSPELASATSSASATASASAANSPT...,2.520978,3.1.1.102,338.0,1


In [125]:
# Score every row of df_esme with pha_model. The metadata columns are removed
# first so that only the remaining columns are passed to the model.
predictions_pha = pha_model.predict_proba(df_esme.drop(columns=columns_with_info))
# Naming the output p0/p1 requires predict_proba to return exactly two columns.
df_pha_predictions = pd.DataFrame(data=predictions_pha, columns=["p0", "p1"])

# Copy the metadata back one column at a time. The loop previously ignored its
# variable and re-assigned the whole column block on every pass through a single
# mixed-type array; per-column assignment does the work once and keeps each
# column's own dtype. `.values` copies by position, ignoring the source index.
for column in columns_with_info:
    df_pha_predictions[column] = df_esme[column].values



In [126]:
# Flag a row as positive (1) when its p1 value is at least 0.5, otherwise 0.
df_pha_predictions["positive_class"] = df_pha_predictions["p1"].ge(0.5).astype(int)

# Display how many rows received each label.
df_pha_predictions.loc[:, "positive_class"].value_counts()

positive_class
0    72373
1    46702
Name: count, dtype: int64

In [127]:
# Preview the three rows labelled positive with the highest p1.
(
    df_pha_predictions
    .loc[df_pha_predictions["positive_class"].eq(1)]
    .sort_values(by="p1", ascending=False)
    .head(3)
)

Unnamed: 0,p0,p1,sequence,perplexity,EC,length,positive_class
96284,0.0,1.0,MSVLAVGERIDPQVISFTARAPDPAPLVVVLHGCSGDDYGWNKHAS...,4.692308,3.1.1.2,274.0,1
119038,0.0,1.0,MTVPGKSLFRVRDGELQVIARAEPGGEAILLHGYPYDSLAMNEFVD...,1.733365,3.1.1.2,252.0,1
72518,0.0,1.0,MLTPVARIDSATGGRTPLFFDADTGEVRVTGTTVSTSAASPRTRGA...,4.146861,3.1.1.1,333.0,1


In [128]:
# Score every row of df_plusrnn with pla_model. The metadata columns are removed
# first so that only the remaining columns are passed to the model.
predictions_pla = pla_model.predict_proba(df_plusrnn.drop(columns=columns_with_info))
# Naming the output p0/p1 requires predict_proba to return exactly two columns.
df_pla_predictions = pd.DataFrame(data=predictions_pla, columns=["p0", "p1"])

# Copy the metadata back one column at a time. The loop previously ignored its
# variable and re-assigned the whole column block on every pass through a single
# mixed-type array; per-column assignment does the work once and keeps each
# column's own dtype. `.values` copies by position, ignoring the source index.
for column in columns_with_info:
    df_pla_predictions[column] = df_plusrnn[column].values




In [129]:
# Flag a row as positive (1) when its p1 value is at least 0.5, otherwise 0.
df_pla_predictions["positive_class"] = df_pla_predictions["p1"].ge(0.5).astype(int)

# Display how many rows received each label.
df_pla_predictions.loc[:, "positive_class"].value_counts()

positive_class
0    73953
1    45122
Name: count, dtype: int64

In [130]:
# Preview the three rows labelled positive with the highest p1.
(
    df_pla_predictions
    .loc[df_pla_predictions["positive_class"].eq(1)]
    .sort_values(by="p1", ascending=False)
    .head(3)
)

Unnamed: 0,p0,p1,sequence,perplexity,EC,length,positive_class
15597,0.0001,0.9999,MKLTAIFSLAALAVPAMGQAALEARQLGTTNDLENGSCRTYVLIYA...,2.423614,3.1.1.74,214.0,1
1105,0.0001,0.9999,MKASVLALALAGVATAYPLDQRQSLESARDVLTRNDLEHGDSSNCP...,1.710498,3.1.1.74,224.0,1
15518,0.0001,0.9999,MHASTILLSALLGTAAVASPLGIESRQFGSASSGNSASSVDLFSEG...,3.54578,3.1.1.74,216.0,1


In [131]:
# Score every row of df_protrans_bdf with pcl_model. The metadata columns are
# removed first so that only the remaining columns are passed to the model.
predictions_pcl = pcl_model.predict_proba(df_protrans_bdf.drop(columns=columns_with_info))
# Naming the output p0/p1 requires predict_proba to return exactly two columns.
df_pcl_predictions = pd.DataFrame(data=predictions_pcl, columns=["p0", "p1"])

# Copy the metadata back one column at a time. The loop previously ignored its
# variable and re-assigned the whole column block on every pass through a single
# mixed-type array; per-column assignment does the work once and keeps each
# column's own dtype. `.values` copies by position, ignoring the source index.
for column in columns_with_info:
    df_pcl_predictions[column] = df_protrans_bdf[column].values



In [132]:
# Flag a row as positive (1) when its p1 value is at least 0.5, otherwise 0.
df_pcl_predictions["positive_class"] = df_pcl_predictions["p1"].ge(0.5).astype(int)

# Display how many rows received each label.
df_pcl_predictions.loc[:, "positive_class"].value_counts()

positive_class
0    74353
1    44722
Name: count, dtype: int64

In [133]:
# Preview the three rows labelled positive with the highest p1.
(
    df_pcl_predictions
    .loc[df_pcl_predictions["positive_class"].eq(1)]
    .sort_values(by="p1", ascending=False)
    .head(3)
)

Unnamed: 0,p0,p1,sequence,perplexity,EC,length,positive_class
18718,0.03,0.97,MHRSATLLTTCLVVLLGATPTLAQAADGPAASSVTELENRLAGNDL...,2.588698,3.1.1.74,220.0,1
18370,0.03,0.97,MFTAVVLAGLFTPTAQALDIRQSVTNGLEAGECKPITYIFSRGTGE...,1.962059,3.1.1.74,213.0,1
20405,0.04,0.96,MVLSLIGTLVAATPVDLQERQVLTENDLKNGSCKPIIYVFARATTE...,1.701613,3.1.1.74,200.0,1


In [134]:
# Score every row of df_protrans_albert with pu_model. The metadata columns are
# removed first so that only the remaining columns are passed to the model.
predictions_pu = pu_model.predict_proba(df_protrans_albert.drop(columns=columns_with_info))
# Naming the output p0/p1 requires predict_proba to return exactly two columns.
df_pu_predictions = pd.DataFrame(data=predictions_pu, columns=["p0", "p1"])

# Copy the metadata back one column at a time. The loop previously ignored its
# variable and re-assigned the whole column block on every pass through a single
# mixed-type array; per-column assignment does the work once and keeps each
# column's own dtype. `.values` copies by position, ignoring the source index.
for column in columns_with_info:
    df_pu_predictions[column] = df_protrans_albert[column].values



In [135]:
# Flag a row as positive (1) when its p1 value is at least 0.5, otherwise 0.
df_pu_predictions["positive_class"] = df_pu_predictions["p1"].ge(0.5).astype(int)

# Display how many rows received each label.
df_pu_predictions.loc[:, "positive_class"].value_counts()

positive_class
0    66361
1    52714
Name: count, dtype: int64

In [136]:
# Preview the three rows labelled positive with the highest p1.
(
    df_pu_predictions
    .loc[df_pu_predictions["positive_class"].eq(1)]
    .sort_values(by="p1", ascending=False)
    .head(3)
)

Unnamed: 0,p0,p1,sequence,perplexity,EC,length,positive_class
71432,0.2,0.8,MNLDEYLSKGELCARELCQAVKSGEASPLEITQSCLKRIADINPKV...,3.914504,3.5.2.12,473.0,1
50722,0.2,0.8,MRSTITEVGDVTLDAWDVVGFNALVDAGKIEPTELLKESYLRIQQV...,4.317424,3.5.2.12,489.0,1
52846,0.22,0.78,MDYESLAKLDAVGLADLIRREEITPRELMDSAVAKAGKLNPTIHIV...,5.092028,3.5.2.12,454.0,1


In [137]:
# Score every row of df_esme with nylon_model. The metadata columns are removed
# first so that only the remaining columns are passed to the model.
# NOTE(review): df_esme is also the input of the pha_model cell; confirm that
# nylon_model was fitted on this same feature table.
predictions_nylon = nylon_model.predict_proba(df_esme.drop(columns=columns_with_info))
# Naming the output p0/p1 requires predict_proba to return exactly two columns.
df_nylon_predictions = pd.DataFrame(data=predictions_nylon, columns=["p0", "p1"])

# Copy the metadata back one column at a time. The loop previously ignored its
# variable and re-assigned the whole column block on every pass through a single
# mixed-type array; per-column assignment does the work once and keeps each
# column's own dtype. `.values` copies by position, ignoring the source index.
for column in columns_with_info:
    df_nylon_predictions[column] = df_esme[column].values



In [138]:
# Flag a row as positive (1) when its p1 value is at least 0.5, otherwise 0.
df_nylon_predictions["positive_class"] = df_nylon_predictions["p1"].ge(0.5).astype(int)

# Display how many rows received each label.
df_nylon_predictions.loc[:, "positive_class"].value_counts()

positive_class
0    71790
1    47285
Name: count, dtype: int64

In [139]:
# Preview the three rows labelled positive with the highest p1.
(
    df_nylon_predictions
    .loc[df_nylon_predictions["positive_class"].eq(1)]
    .sort_values(by="p1", ascending=False)
    .head(3)
)

Unnamed: 0,p0,p1,sequence,perplexity,EC,length,positive_class
50489,0.01,0.99,MNTTALLEDLSAYDAVGLAALIEADRVTPRELMESAHAAFERTNAR...,4.798578,3.5.2.12,469.0,1
50330,0.01,0.99,MTQIDATQLSPEELARTDAVALADRVRAGQFSAEQVIARAHDAVEQ...,4.691217,3.5.2.12,496.0,1
50413,0.01,0.99,MTEILNAGEAATPSDIASLDTLAVRARQVSPEELRDSAYARIESID...,4.598907,3.5.2.12,474.0,1


In [140]:
# Keep only the rows of the PET prediction table labelled positive.
df_pet_predictions_filter = df_pet_predictions.loc[
    df_pet_predictions["positive_class"].eq(1)
]

# Write them to disk ordered by descending p1, without the row index.
(
    df_pet_predictions_filter
    .sort_values(by="p1", ascending=False)
    .to_csv("seqs_to_evaluate.csv", index=False)
)