# Playground

Resources:
1. http://biopython.org/DIST/docs/tutorial/Tutorial.html

In [252]:
# Imports.
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from utility_functions import desc_df
from utility_functions import print_color

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [253]:
df_iupac_aa_code = pd.read_csv('../Data/__iupac_amino_acid_code.csv')
df_iupac_aa_code.set_index('code1letter', inplace=True, drop=True)
df_iupac_aa_code

Unnamed: 0_level_0,code3letter,name
code1letter,Unnamed: 1_level_1,Unnamed: 2_level_1
A,Ala,Alanine
C,Cys,Cysteine
D,Asp,Aspartic Acid
E,Glu,Glutamic Acid
F,Phe,Phenylalanine
G,Gly,Glycine
H,His,Histidine
I,Ile,Isoleucine
K,Lys,Lysine
L,Leu,Leucine


In [254]:
proteins = {"id":[], "name":[], "source":[], "plastic":[], "n_plastics":[], "aa_seq":[]}
for p_data in SeqIO.parse("../Data/__plasticdb_protein_sequences.fasta", "fasta"):
    p_id, name, source, plastic = p_data.id.lower().replace('-','_').split("||")
    aa_seq = p_data.seq
    proteins['id'].append(int(p_id))
    proteins['name'].append(name)
    proteins['source'].append(source)
    proteins['plastic'].append(plastic)
    proteins['n_plastics'].append(len(plastic.split("_")))
    proteins['aa_seq'].append(aa_seq)

In [255]:
df_proteins = pd.DataFrame(proteins)
df_proteins.set_index('id', drop=True, inplace=True)
df_proteins

Unnamed: 0_level_0,name,source,plastic,n_plastics,aa_seq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,phb_depolymerase,ralstonia_pickettii,phb_pha,2,"(M, K, H, P, Y, G, Y, R, W, H, W, L, Y, A, L, ..."
2,pha_depolymerase,pseudomonas_fluorescens,pho_pha,2,"(M, P, L, R, T, L, L, C, G, L, L, L, A, V, C, ..."
3,phb_depolymerase_a,paucimonas_lemoignei,phb_pha,2,"(M, R, N, T, L, K, A, A, F, K, L, G, V, I, S, ..."
4,phb_depolymerase,paucimonas_lemoignei,phb_pha,2,"(M, L, A, K, Q, I, K, K, A, N, S, R, S, T, L, ..."
5,pha_depolymerase_b,paucimonas_lemoignei,phb_pha,2,"(M, M, S, S, Q, T, T, Q, S, S, K, F, S, L, F, ..."
...,...,...,...,...,...
204,petase,thermobifida_fusca,pet,1,"(M, A, A, N, P, Y, E, R, G, P, N, P, T, D, A, ..."
205,petase,thermobifida_alba,pet,1,"(M, A, N, P, Y, E, R, G, P, N, P, T, E, S, M, ..."
206,petase,thermobifida_cellulosilytica,pet,1,"(M, A, N, P, Y, E, R, G, P, D, P, T, Q, A, S, ..."
207,petase,thermobifida_halotolerans,pet,1,"(M, A, N, P, Y, E, R, G, P, N, P, T, N, S, S, ..."


In [256]:
vc = df_proteins['aa_seq'].value_counts()
print(len(vc))
print(vc)

197
aa_seq
(M, A, V, M, T, P, R, R, E, R, S, S, L, L, S, R, A, L, Q, V, T, A, A, A, A, T, A, L, V, T, A, V, S, L, A, A, P, A, H, A, A, N, P, Y, E, R, G, P, N, P, T, D, A, L, L, E, A, S, S, G, P, F, S, V, S, E, E, N, V, S, R, L, S, A, S, G, F, G, G, G, T, I, Y, Y, P, R, E, N, N, T, Y, G, A, V, A, I, S, P, G, Y, ...)    3
(M, R, S, I, R, L, K, R, L, I, A, A, V, A, L, G, G, A, A, A, A, T, Q, A, A, S, P, L, P, R, L, N, V, D, K, T, Q, I, S, V, S, G, L, S, A, G, G, F, M, A, N, Q, L, G, Y, A, Y, S, G, T, F, M, G, V, G, I, F, A, G, G, P, Y, M, C, A, G, H, S, N, Y, T, S, C, M, Y, N, A, T, I, T, S, S, M, R, S, A, M, Q, A, S, ...)    3
(M, G, V, F, D, Y, K, N, F, T, A, S, D, S, K, A, L, F, S, D, A, L, A, I, T, L, Y, S, Y, H, N, I, D, N, G, F, A, E, G, Y, Q, H, N, G, F, G, L, G, L, P, A, T, L, V, T, A, L, I, G, S, G, N, S, Q, G, V, I, P, G, I, P, W, N, P, D, S, E, K, A, A, L, D, A, L, H, Q, A, G, W, S, T, I, S, A, Q, Q, L, G, Y, D, ...)    2
(M, N, F, P, R, A, S, R, L, M, Q, A, A, V, L, G, G, L, M

In [257]:
min(df_proteins['aa_seq'].apply(len)), max(df_proteins['aa_seq'].apply(len))

(108, 914)

In [258]:
df_microbes = pd.read_csv('../Data/__plasticdb_microbes.tsv', delimiter='\t')
desc_df(df_microbes)

No. of rows = 1701
No. of columns = 22
[33m
Feature Overview[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1701 entries, 0 to 1700
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Microorganism                         1701 non-null   object 
 1   Tax ID                                1701 non-null   int64  
 2   Plastic                               1701 non-null   object 
 3   Ref                                   1701 non-null   object 
 4   Enzyme                                1654 non-null   object 
 5   Enzyme ID                             384 non-null    float64
 6   Database Enzyne Name                  384 non-null    object 
 7   Gene                                  1691 non-null   object 
 8   GenbankID                             320 non-null    object 
 9   Sequence                              384 non-null    object 
 10  Year              

Unnamed: 0,Microorganism,Tax ID,Plastic,Ref,Enzyme,Enzyme ID,Database Enzyne Name,Gene,GenbankID,Sequence,...,Plastic used,Manufacturer,Analitical grade,Thermophilic conditions,Isolation sample type,Isolation environment,Isolation location,Degradation extrapolated from enzyme,Paper id,lineage
0,Ralstonia pickettii,329,PHB,"Yukawa, H., Uchida, Y., Kohama, K., & Kurusu, ...",PHB depolymerase,1.0,00001 | PHB depolymerase | Ralstonia pickettii...,Yes,BAA04986.1,MKHPYGYRWHWLYALVVTLMTALATFSAHAAVTAGPGAWSSQQTWA...,...,,,,,Soil,Soil,Japan,Yes,,"superkingdom:Bacteria,phylum:Proteobacteria,cl..."
1,Ralstonia pickettii,329,PHA,"Yukawa, H., Uchida, Y., Kohama, K., & Kurusu, ...",PHB depolymerase,1.0,00001 | PHB depolymerase | Ralstonia pickettii...,Yes,BAA04986.1,MKHPYGYRWHWLYALVVTLMTALATFSAHAAVTAGPGAWSSQQTWA...,...,,,,,Soil,Soil,Japan,Yes,,"superkingdom:Bacteria,phylum:Proteobacteria,cl..."
2,Pseudomonas fluorescens,294,PHO,"Schirmer, A., & Jendrossek, D. (1994). Molecul...",PHA depolymerase,2.0,00002 | PHA depolymerase | Pseudomonas fluores...,Yes,AAA64538.1,MPLRTLLCGLLLAVCLGQHALAASRCSERPRTLLRPAEVSCSYQST...,...,P(3HO) was isolated from P. oleovorans grown ...,,Yes,No,,,,Yes,,"superkingdom:Bacteria,phylum:Proteobacteria,cl..."
3,Pseudomonas fluorescens,294,PHA,"Schirmer, A., & Jendrossek, D. (1994). Molecul...",PHA depolymerase,2.0,00002 | PHA depolymerase | Pseudomonas fluores...,Yes,AAA64538.1,MPLRTLLCGLLLAVCLGQHALAASRCSERPRTLLRPAEVSCSYQST...,...,P(3HO) was isolated from P. oleovorans grown ...,,Yes,No,,,,Yes,,"superkingdom:Bacteria,phylum:Proteobacteria,cl..."
4,Paucimonas lemoignei,29443,PHB,"Jendrossek, D. I. E. T. E. R., Frisse, A. N. D...",PHB depolymerase A,3.0,00003 | PHB depolymerase-A | Paucimonas lemoig...,Yes,AAA65705.1,MRNTLKAAFKLGVISAALLAPFATQAATAGPGAWSSQQTWAADSVN...,...,The homopolyester PHB was isolated from sodium...,,Yes,No,Culture collection,Culture collection,,No,,"superkingdom:Bacteria,phylum:Proteobacteria,cl..."


In [259]:
useful_features = ["Enzyme ID", 'Degradation extrapolated from enzyme']
df_microbes_useful = df_microbes[useful_features]
df_microbes_useful = df_microbes_useful[~df_microbes_useful['Enzyme ID'].isna()]
df_microbes_useful = df_microbes_useful[~df_microbes_useful['Degradation extrapolated from enzyme'].isna()]
df_microbes_useful

Unnamed: 0,Enzyme ID,Degradation extrapolated from enzyme
0,1.0,Yes
1,1.0,Yes
2,2.0,Yes
3,2.0,Yes
4,3.0,No
...,...,...
1696,204.0,Yes
1697,205.0,Yes
1698,206.0,Yes
1699,207.0,Yes


In [260]:
enzyme_degradation = {}
for index, row in df_microbes_useful.iterrows():
    enzyme_id = row['Enzyme ID']
    degradation = int(row['Degradation extrapolated from enzyme'] == 'Yes')
    if (
        enzyme_id not in enzyme_degradation 
        or enzyme_id in enzyme_degradation 
        and enzyme_degradation[int(enzyme_id)] == 0
    ): enzyme_degradation[int(enzyme_id)] = degradation
enzyme_degradation = {
    "id": list(enzyme_degradation.keys()), 
    "degradation": list(enzyme_degradation.values())
}
df_enzyme_degradation = pd.DataFrame(enzyme_degradation)
df_enzyme_degradation.set_index('id', inplace=True, drop=True)
df_enzyme_degradation

Unnamed: 0_level_0,degradation
id,Unnamed: 1_level_1
1,1
2,1
3,0
4,0
5,0
...,...
204,1
205,1
206,1
207,1


In [261]:
df_proteins = pd.concat(
    [df_proteins, df_enzyme_degradation],
    axis=1,
    join="outer",
    ignore_index=False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity=False,
    copy=True,
)
df_proteins

Unnamed: 0_level_0,name,source,plastic,n_plastics,aa_seq,degradation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,phb_depolymerase,ralstonia_pickettii,phb_pha,2,"(M, K, H, P, Y, G, Y, R, W, H, W, L, Y, A, L, ...",1.0
2,pha_depolymerase,pseudomonas_fluorescens,pho_pha,2,"(M, P, L, R, T, L, L, C, G, L, L, L, A, V, C, ...",1.0
3,phb_depolymerase_a,paucimonas_lemoignei,phb_pha,2,"(M, R, N, T, L, K, A, A, F, K, L, G, V, I, S, ...",0.0
4,phb_depolymerase,paucimonas_lemoignei,phb_pha,2,"(M, L, A, K, Q, I, K, K, A, N, S, R, S, T, L, ...",0.0
5,pha_depolymerase_b,paucimonas_lemoignei,phb_pha,2,"(M, M, S, S, Q, T, T, Q, S, S, K, F, S, L, F, ...",0.0
...,...,...,...,...,...,...
204,petase,thermobifida_fusca,pet,1,"(M, A, A, N, P, Y, E, R, G, P, N, P, T, D, A, ...",1.0
205,petase,thermobifida_alba,pet,1,"(M, A, N, P, Y, E, R, G, P, N, P, T, E, S, M, ...",1.0
206,petase,thermobifida_cellulosilytica,pet,1,"(M, A, N, P, Y, E, R, G, P, D, P, T, Q, A, S, ...",1.0
207,petase,thermobifida_halotolerans,pet,1,"(M, A, N, P, Y, E, R, G, P, N, P, T, N, S, S, ...",1.0


In [262]:
df_proteins = df_proteins[df_proteins['degradation'] == 1].reset_index(drop=True)
df_proteins.drop('degradation', axis=1, inplace=True)
df_proteins

Unnamed: 0,name,source,plastic,n_plastics,aa_seq
0,phb_depolymerase,ralstonia_pickettii,phb_pha,2,"(M, K, H, P, Y, G, Y, R, W, H, W, L, Y, A, L, ..."
1,pha_depolymerase,pseudomonas_fluorescens,pho_pha,2,"(M, P, L, R, T, L, L, C, G, L, L, L, A, V, C, ..."
2,pva_dehydrogenase,pseudomonas_sp.,pva,1,"(M, Q, Q, N, I, E, R, N, Q, V, S, M, T, T, S, ..."
3,phb_depolymerase,comamonas_acidovorans,p3hp_p4hb_pea_pes_phb_pha,6,"(M, A, F, N, F, I, R, A, A, A, A, G, A, A, M, ..."
4,phb_depolymerase,comamonas_testosteroni,phb_pha,2,"(M, R, V, Q, S, W, R, S, G, V, A, A, L, A, L, ..."
...,...,...,...,...,...
116,petase,thermobifida_fusca,pet,1,"(M, A, A, N, P, Y, E, R, G, P, N, P, T, D, A, ..."
117,petase,thermobifida_alba,pet,1,"(M, A, N, P, Y, E, R, G, P, N, P, T, E, S, M, ..."
118,petase,thermobifida_cellulosilytica,pet,1,"(M, A, N, P, Y, E, R, G, P, D, P, T, Q, A, S, ..."
119,petase,thermobifida_halotolerans,pet,1,"(M, A, N, P, Y, E, R, G, P, N, P, T, N, S, S, ..."


In [263]:
# df_proteins.to_csv('../Data/__plasticdb_protein_sequences_clean.csv', index=False)