# Kinome stats

Code taken from https://github.com/volkamerlab/teachopencadd/tree/master/teachopencadd/talktorials/T023_what_is_a_kinase

In [1]:
from pathlib import Path

import pandas as pd
import opencadd

## KinMap data

There are some KinMap trees shown in this notebook. The code below generates the KinMap CSV files to be uploaded to KinMap:
http://www.kinhub.org/kinmap.

**I used Safari for the KinMap download (Chrome produced blurry figures after conversion to PDF).**

_Note_:
1. Download as SVG.
2. Open SVG with a text editor and remove `TypicalHoverControls` and `AtypicalHoverControls` tags.
3. Convert the edited SVG to PDF in your terminal (Linux) via `convert my_kinmap_figure.svg my_kinmap_figure.pdf`.
4. If the downloaded SVG doesn't render the figure properly, open it in a text editor and add the attribute `xmlns:xlink="http://www.w3.org/1999/xlink"` to the opening `<svg>` tag, resulting in something similar to this in the first few lines:

    `<svg id="svgCopy" viewBox="0 0 1591 1959" preserveAspectRatio="xMinYMin meet" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" style=""><desc>Created with Snap</desc><defs></defs><g`


In [2]:
def format_for_kinmap(kinase_names, kinase_values, size_min=10, size_max=50):
    """
    Given kinase names and some associated values, generates a KinMap data file
    that will display values as circles of size [`size_min`, `size_max`].

    Values are scaled linearly: the smallest input value maps to `size_min` and the
    largest input value maps to `size_max`. If all input values are identical there is
    no range to scale over, and every circle is given `size_min`.

    The minimum and maximum input values are printed as a quick sanity check.

    Parameters
    ----------
    kinase_names : list of str
        Kinase names.
    kinase_values : list of float
        Some associated values, such as the number of bioactivities.
    size_min : int
        Minimum circle size on KinMap tree (minimum input value will be scaled to `size_min`).
    size_max : int
        Maximum circle size on KinMap tree (maximum input value will be scaled to `size_max`).

    Returns
    -------
    pandas.DataFrame
        KinMap data with columns `xName` (kinase name), `size` (circle size for KinMap tree).
    """

    data = pd.DataFrame({"xName": kinase_names, "values": kinase_values})
    min_ = data["values"].min()
    max_ = data["values"].max()
    print(min_, max_)

    value_range = max_ - min_
    if value_range == 0:
        # All values are equal: avoid 0/0 (which would produce NaN sizes)
        data["size"] = float(size_min)
    else:
        # Min-max normalise to [0, 1], then stretch onto [size_min, size_max].
        # The multiplier must be the size *span*; multiplying by `size_max` alone
        # would place the largest value at `size_min + size_max`.
        data["size"] = (data["values"] - min_) / value_range * (size_max - size_min) + size_min
    return data[["xName", "size"]]

## Number of PDB structures per kinase

Generate the number of structures per kinase in the KinMap format to be mapped onto the kinome tree.

In [3]:
from opencadd.databases.klifs import setup_remote

# Set up the remote KLIFS client and fetch the table of all structures
klifs = setup_remote()
structures_df = klifs.structures.all_structures()

# Keep only rows whose species is "Human" (copy so the subset is an independent frame)
is_human = structures_df["species.klifs"] == "Human"
structures_df = structures_df[is_human].copy()
print(structures_df.shape)

# Get number of structures per kinase:
# first collapse the table to one row per (PDB ID, kinase name) pair,
# then count how many such rows each kinase has
unique_pdb_kinase_pairs = (
    structures_df.groupby(["structure.pdb_id", "kinase.klifs_name"]).first().reset_index()
)
n_structures_per_kinase = unique_pdb_kinase_pairs.groupby("kinase.klifs_name").size()

(12314, 46)


In [4]:
# Save in KinMap format: kinase names plus scaled circle sizes
kinmap_n_structures_per_kinase = format_for_kinmap(
    n_structures_per_kinase.index, n_structures_per_kinase.values
)

# Formatting! Apply the name substitutions one after the other
# (presumably to match the kinase names KinMap expects - TODO confirm)
formatted_names = kinmap_n_structures_per_kinase["xName"]
for old, new in [("-", "_"), ("DCLK1", "DCAMKL1")]:
    formatted_names = formatted_names.str.replace(old, new)
kinmap_n_structures_per_kinase["xName"] = formatted_names

# Save! (CSV without the row index)
kinmap_n_structures_per_kinase.to_csv("kinmap_n_structures_per_kinase.csv", index=None)
# Some kinases will not be resolved in KinMap and will be simply dropped (check manually)

1 432


Identify the kinase which has the most structures.

In [5]:
# Row position of the largest count; the KinMap table was built from the same
# index/values in the same order, so the position can be reused to look up the name
top_position = n_structures_per_kinase.argmax()
kinmap_n_structures_per_kinase.iloc[top_position].xName, max(n_structures_per_kinase)

('CDK2', 432)

## Number of ChEMBL bioactivities per kinase

In [8]:
from opencadd.databases.klifs import setup_remote

# Get bioactivity data (read straight from the .zip release file at this URL)
path = "https://github.com/openkinome/kinodata/releases/download/v0.3/activities-chembl29_v0.3.zip"
data = pd.read_csv(path, index_col=None)
# Keep pIC50 rows only, then discard every row with a missing value in any column
is_pic50 = data["activities.standard_type"] == "pIC50"
data = data.loc[is_pic50].dropna()

# Get kinase data
klifs = setup_remote()
kinases_df = klifs.kinases.all_kinases()
# Exclude entries whose UniProt field is "0" (presumably a placeholder for "no UniProt ID" - TODO confirm)
has_uniprot = kinases_df["kinase.uniprot"] != "0"
# Some UniProt IDs have several names in KLIFS, keep only first
# (after this step the UniProt ID is the index of `kinases_df`)
kinases_df = kinases_df[has_uniprot].groupby("kinase.uniprot").first()

# Map UniProt ID > kinase KLIFS name
# (left join: activities without a matching kinase are kept, with missing kinase columns)
data = data.merge(kinases_df, how="left", left_on="UniprotID", right_on="kinase.uniprot")

# Get number of activities per kinase
n_activities_per_kinase = data.groupby("kinase.klifs_name").size()

In [9]:
# Save in KinMap format: kinase names plus scaled circle sizes, written as CSV without the row index
# NOTE(review): unlike the structures table, no "-" > "_" / "DCLK1" > "DCAMKL1" renaming
# is applied to the names here - confirm whether that is intentional
activity_counts = n_activities_per_kinase
kinmap_n_activities_per_kinase = format_for_kinmap(
    activity_counts.index,
    activity_counts.values,
)
kinmap_n_activities_per_kinase.to_csv("kinmap_n_activities_per_kinase.csv", index=None)
# Some kinases will not be resolved in KinMap and will be simply dropped

1 5637
