# Approved Protein Kinase Inhibitors (PKIs)
This [overview](https://klifs.net/drugs.php#) is powered by KLIFS in-house annotations and is complemented with data from the [PKIDB](https://www.icoa.fr/pkidb/).

In [None]:
# General python libraries and packages
import numpy as np
import pandas as pd
import os

# Accessing Databases
import requests

# Processing chemical data
from rdkit.Chem import Draw, PandasTools

In [None]:
# Query the KLIFS API for its drug list. The timeout keeps the cell from
# hanging indefinitely if the server does not answer.
response = requests.get('https://klifs.net/api_v2/drug_list', timeout=30)

# Stop on an HTTP error status instead of trying to parse an error page as JSON
response.raise_for_status()

# Decode the JSON payload and load it into a pandas DataFrame
df = pd.DataFrame(response.json())

print(len(df))
df.head(1)

In [None]:
# Treat empty-string cells as missing values (NaN)
df = df.replace(to_replace='', value=np.nan, regex=True)

# Swap every tab character in the Synonyms column for ';  ' (semicolon plus two spaces)
synonyms = df['Synonyms']
df['Synonyms'] = synonyms.str.replace('\t', ';  ')

def extract_comment_from_numeric_col(df, column_name):
    """Split a mixed column into a numeric column and a text comment column.

    Values of ``df[column_name]`` that ``pd.to_numeric`` can parse are kept in
    ``column_name`` as numbers. Values it cannot parse become NaN there and
    are preserved in a new ``{column_name}_Comment`` column, which is appended
    as the last column. Cells that were already missing stay missing in both.

    The input frame is left untouched; a modified copy is returned.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to split.

    Returns:
        A new DataFrame with ``column_name`` converted to numeric values and
        an additional ``{column_name}_Comment`` column.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    # Work on a copy so the caller's DataFrame is not modified as a side effect
    result = df.copy()
    original = result[column_name]
    # Anything that cannot be parsed as a number turns into NaN
    numeric = pd.to_numeric(original, errors='coerce')
    # Keep the original value as the comment wherever no number could be parsed
    result[f'{column_name}_Comment'] = np.where(numeric.isna(), original, np.nan)
    result[column_name] = numeric
    return result

# Split both 'Phase' and 'Approval' into a numeric column plus a comment column
for mixed_column in ('Phase', 'Approval'):
    df = extract_comment_from_numeric_col(df, mixed_column)


In [None]:
# Give the columns the names used in the rest of the notebook
column_renames = {
    'INN': 'Drug_Name',
    'Brand Name': 'Brand_Name',
    'ChEMBL': 'ChEMBL_Molecule_ID',
    'PDB': 'PDB_Ligand_ID',
}
df = df.rename(columns=column_renames)

## Filter for approved drugs only

In [None]:
# To see how many PKIs there are per clinical phase, run: df['Phase'].value_counts()

# Keep only approved drugs (phase 4), then discard the Phase_Comment column
is_approved = df['Phase'] == 4.0
df = df[is_approved].drop(columns=['Phase_Comment'])

## Sort by the Year of First Approval

In [None]:
# Rebuild 'Approval' as text: the year (without a trailing '.0') followed by the
# comment. Missing parts are left out and surplus whitespace is stripped.
approval_year = df['Approval'].fillna('').astype(str).str.replace(r'\.0$', '', regex=True)
approval_comment = df['Approval_Comment'].fillna('').astype(str)
df['Approval'] = (approval_year + ' ' + approval_comment).str.strip()

# Drop the original Approval_Comment column
df = df.drop(columns=['Approval_Comment'])

# Sort by the first four-digit number found in 'Approval', newest first, with the help of a temporary column.
# pd.to_numeric leaves rows without a year as NaN (sorted last) instead of raising like astype(int) would.
df['Year_Of_First_Approval'] = pd.to_numeric(df['Approval'].str.extract(r'(\d{4})', expand=False), errors='coerce')
df = df.sort_values('Year_Of_First_Approval', ascending=False)
df = df.drop('Year_Of_First_Approval', axis=1)
df = df.reset_index(drop=True)

# Save to file; create the output directory first so to_excel does not fail when it is missing
filepath = os.path.join('Data', 'KLIFS_PKIs')
filename = os.path.join(filepath, 'KLIFS_Approved_PKIs.xlsx')
os.makedirs(filepath, exist_ok=True)
df.to_excel(filename, index=False)

df.head(1)

## Cross-References to ChEMBL and the PDB

In [None]:
# Count the rows whose PDB_Ligand_ID is not NaN
has_pdb_ligand_id = df['PDB_Ligand_ID'].notna()
print(has_pdb_ligand_id.sum())

In [None]:
# Count the rows whose ChEMBL_Molecule_ID is not NaN
has_chembl_id = df['ChEMBL_Molecule_ID'].notna()
print(has_chembl_id.sum())

In [None]:
# Total number of rows, for comparison with the two counts above
print(df.shape[0])

## Add PDB Ligand ID 

In [None]:
# Input tables, created on Oct-10 from http://dunbrack.fccc.edu/kincore/FDA
structures_path = os.path.join(filepath, 'PKIs_PDB_Structures.xlsx')
kincore_path = os.path.join(filepath, 'KinCore_Results_All.tab')
pdb_lig_id_df = pd.read_excel(structures_path)
kinase_labels = pd.read_csv(kincore_path, sep='\t')


def _ligand_ids(ligand_field):
    '''Return the first three characters of each space-separated entry, skipping 'No_ligand'.'''
    return [entry[:3] for entry in ligand_field.split(' ') if entry != 'No_ligand']


# Build one list of ligand IDs per row, then expand to one row per ligand ID
kinase_labels['PDB_Lig_ID'] = kinase_labels['Ligand'].apply(_ligand_ids)
kinase_labels = kinase_labels.explode('PDB_Lig_ID').reset_index(drop=True)
# Rows whose ligand list was empty come out of explode as NaN; remove them
kinase_labels = kinase_labels.dropna(subset=['PDB_Lig_ID']).reset_index(drop=True)

# Attach the KinCore rows to the structure table via the shared PDB ligand ID
pdb_lig_id_df = pdb_lig_id_df.merge(kinase_labels, on='PDB_Lig_ID')

# Collapse to one row per Drug_Name: keep the name once, gather every other column into a list
aggregations = {'Drug_Name': 'first'}
for col in pdb_lig_id_df.columns.to_list():
    if col != 'Drug_Name':
        aggregations[col] = list
pdb_lig_id_df_grouped = pdb_lig_id_df.groupby('Drug_Name').agg(aggregations).reset_index(drop=True)

grouped_path = os.path.join(filepath, 'PKIs_PDB_Structures_Kincore_Results.xlsx')
pdb_lig_id_df_grouped.to_excel(grouped_path, index=False)

# Left-join the grouped table onto df and write the result to `filename`
df = df.merge(pdb_lig_id_df_grouped, on='Drug_Name', how='left')
df.to_excel(filename, index=False)

df.head(2)

## Add Molecular Structures

In [None]:
# Add a molecule column to df, built from the strings in its "SMILES" column
# (RDKit PandasTools; presumably yields one RDKit molecule per row — confirm against the RDKit docs)
PandasTools.AddMoleculeColumnToFrame(df, smilesCol="SMILES")
# Switch on image rendering for DataFrames (presumably so molecules show as drawings — confirm)
PandasTools.RenderImagesInAllDataFrames(True)
# Last expression of the cell: display the full table
df