In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, interactive, fixed, interact_manual



# Analysis of Enzyme Commission (EC) numbers in the proteins of interest

This notebook is to analyze the Enzyme Commission (EC) numbers in the proteins of interest. The EC numbers are a numerical classification scheme for enzymes, based on the chemical reactions they catalyze. The EC numbers are a hierarchical classification, with four levels of increasing specificity. The first level is the broadest, and the fourth level is the most specific. The EC numbers are used to classify enzymes in the Enzyme Commission of the International Union of Biochemistry and Molecular Biology (IUBMB).


EC --> Forth level of EC

EC2 ---> Second level of EC

EC1 ---> First level of EC

In [2]:
# load data from csv
df = pd.read_csv('pocket_active_sites_gass_prank.csv', sep=',', header=0)
# create new column with first two letters of EC number
df['EC_NUMBER_2'] = df['EC_NUMBER'].str[:3]

# create new column with first letter of EC number
df['EC_NUMBER_1'] = df['EC_NUMBER'].str[:1]

#strip whitespace from the values in the 'POCKET' column
df['POCKET'] = df['POCKET'].str.strip()

# filter according to POCKET (filter out pocket1 and pocket2)
df_filtered = df[df['POCKET'] != 'pocket1']
df_filtered = df_filtered[df_filtered['POCKET'] != 'pocket2']

# create a new column with only residue number (strip the chain)
df_filtered['RESIDUE_NUMBER'] = df_filtered['RESIDUE'].str[2:]

# create a new column with only chain 
df_filtered['CHAIN'] = df_filtered['RESIDUE'].str[0]

# filter data according to fitness score < 5
df_filtered = df_filtered[df_filtered['FITNESS'] < 5]


# form a column which calculates average FITNESS score for each POCKET in each PDB_ID
df_filtered['avg_fitness'] = df_filtered.groupby(['PDB_ID', 'POCKET'])['FITNESS'].transform('mean')



## Dataframe by EC number
You can filter the data by EC number and see the residues, pockets, and proteins that have that EC number.

In [3]:
@interact
def filter_data_by_EC(EC_NUMBER = df_filtered['EC_NUMBER_2'].unique()):
    return df_filtered[df_filtered['EC_NUMBER_2'] == EC_NUMBER]

interactive(children=(Dropdown(description='EC_NUMBER', options=('1.6', '1.9', '4.2', '2.5', '4.3', '4.1', '1.…

In [5]:
def heatmap_residue_pdbID_by_fitness_score(df_filtered_ec):
    # Pivot the data for the heatmap
    heatmap_data = df_filtered_ec.pivot_table(index='RESIDUE_NUMBER', columns='PDB_ID', values='FITNESS', aggfunc='mean')

    # Creating the heatmap
    plt.figure(figsize=(14, 10))
    sns.heatmap(heatmap_data, annot=False, cmap="viridis")
    plt.title('Heatmap of FITNESS Scores by Residue number and PDB ID')
    plt.xlabel('PDB ID')
    plt.ylabel('Residue')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [14]:
def heatmap_pocket_pdbID_by_fitness_score(df_filtered_ec):
    heatmap_data = df_filtered_ec.pivot_table(index='POCKET', columns='PDB_ID', values='avg_fitness', aggfunc='mean')
    # Creating the heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(heatmap_data, annot=False, cmap="viridis")
    plt.title('Heatmap of Average Fitness Scores by Pocket and PDB ID')
    plt.xlabel('PDB ID')
    plt.ylabel('Residue')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## Visualization of EC numbers in pockets and proteins
You can see the distribution of EC numbers in the pockets and proteins. The color of heatmap represents fitness score where 0 (zero) is the best score.

In [15]:
@interact
def visualize_heatmap_pocket_pdbid_ecnumber(ec_number = df_filtered['EC_NUMBER'].unique()):
    # Filter data according to the selected EC number
    df_filtered_ec = df_filtered[df_filtered['EC_NUMBER'] == ec_number]
    heatmap_pocket_pdbID_by_fitness_score(df_filtered_ec)
   

interactive(children=(Dropdown(description='ec_number', options=('1.6.6.-', '1.9.3.1', '4.2.1.22', '2.5.1.17',…

## Visualization of EC numbers (second level) in pockets and proteins
You can see the distribution of EC numbers (second level) in the pockets and proteins. The color of heatmap represents fitness score where 0 (zero) is the best score.

In [16]:
@interact
def visualize_heatmap_pocket_pdbid_ecnumber(ec_number= df_filtered['EC_NUMBER_2'].unique()):
    # Filter data according to the selected EC number
    df_filtered_ec = df_filtered[df_filtered['EC_NUMBER_2'] == ec_number]
    heatmap_pocket_pdbID_by_fitness_score(df_filtered_ec)

interactive(children=(Dropdown(description='ec_number', options=('1.6', '1.9', '4.2', '2.5', '4.3', '4.1', '1.…

## Visualization of EC numbers (first level) in pockets and proteins
You can see the distribution of EC numbers (first level) in the pockets and proteins. The color of heatmap represents fitness score where 0 (zero) is the best score.


In [17]:
@interact
def visualize_heatmap_pocket_pdbid_ecnumber(ec_number= df_filtered['EC_NUMBER_1'].unique()):
    # Filter data according to the selected EC number
    df_filtered_ec = df_filtered[df_filtered['EC_NUMBER_1'] == ec_number]
    heatmap_pocket_pdbID_by_fitness_score(df_filtered_ec)

interactive(children=(Dropdown(description='ec_number', options=('1', '4', '2', '6', '5', nan, '3'), value='1'…

## Visualization of EC numbers in residues and proteins
You can see the distribution of EC numbers in the residues and proteins. The color of heatmap represents fitness score where 0 (zero) is the best score.

In [18]:
@interact
def visualize_heatmap_pocket_pdbid_ecnumber(ec_number = df_filtered['EC_NUMBER'].unique()):
    # Filter data according to the selected EC number
    df_filtered_ec = df_filtered[df_filtered['EC_NUMBER'] == ec_number]
    heatmap_residue_pdbID_by_fitness_score(df_filtered_ec)

interactive(children=(Dropdown(description='ec_number', options=('1.6.6.-', '1.9.3.1', '4.2.1.22', '2.5.1.17',…

## Visualization of EC numbers (second level) in residues and proteins
You can see the distribution of EC numbers (second level) in the residues and proteins. The color of heatmap represents fitness score where 0 (zero) is the best score.

In [19]:
@interact
def visualize_heatmap_pocket_pdbid_ecnumber(ec_number = df_filtered['EC_NUMBER_2'].unique()):
    # Filter data according to the selected EC number
    df_filtered_ec = df_filtered[df_filtered['EC_NUMBER_2'] == ec_number]
    heatmap_residue_pdbID_by_fitness_score(df_filtered_ec)

interactive(children=(Dropdown(description='ec_number', options=('1.6', '1.9', '4.2', '2.5', '4.3', '4.1', '1.…

## Visualization of EC numbers (first level) in residues and proteins
You can see the distribution of EC numbers (first level) in the residues and proteins. The color of heatmap represents fitness score where 0 (zero) is the best score.

In [20]:
@interact
def visualize_heatmap_pocket_pdbid_ecnumber(ec_number = df_filtered['EC_NUMBER_1'].unique()):
    # Filter data according to the selected EC number
    df_filtered_ec = df_filtered[df_filtered['EC_NUMBER_1'] == ec_number]
    heatmap_residue_pdbID_by_fitness_score(df_filtered_ec)

interactive(children=(Dropdown(description='ec_number', options=('1', '4', '2', '6', '5', nan, '3'), value='1'…