In [2]:
import pandas as pd


Many records have the target id CHEMBL612545, which in ChEMBL is an unchecked protein and has no sequence. The values from `activities.csv` need to be extracted and sorted so that manually searching for those sequences is as convenient as possible.

In [42]:
# Load the raw activity export and list its columns.
# NOTE(review): the path is absolute ('/content/...', presumably a Colab upload) and the file
# name is spelled 'actvities', unlike the `activities.csv` named in the text — confirm.
df = pd.read_csv('/content/actvities.csv')
print(df.columns)

Index(['Unnamed: 0', 'action_type', 'activity_comment', 'activity_id',
       'activity_properties', 'assay_chembl_id', 'assay_description',
       'assay_type', 'assay_variant_accession', 'assay_variant_mutation',
       'bao_endpoint', 'bao_format', 'bao_label', 'canonical_smiles',
       'data_validity_comment', 'data_validity_description',
       'document_chembl_id', 'document_journal', 'document_year',
       'ligand_efficiency', 'molecule_chembl_id', 'molecule_pref_name',
       'parent_molecule_chembl_id', 'pchembl_value', 'potential_duplicate',
       'qudt_units', 'record_id', 'relation', 'src_id', 'standard_flag',
       'standard_relation', 'standard_text_value', 'standard_type',
       'standard_units', 'standard_upper_value', 'standard_value',
       'target_chembl_id', 'target_organism', 'target_pref_name',
       'target_tax_id', 'text_value', 'toid', 'type', 'units', 'uo_units',
       'upper_value', 'value'],
      dtype='object')


In [47]:
# Number of activity rows per assay id, most frequent assay first.
assay_id_counts = df['assay_chembl_id'].value_counts()

In [65]:
# Show the per-assay row counts.
assay_id_counts

CHEMBL1614220    125
CHEMBL2060535     62
CHEMBL2060534     62
CHEMBL2060537     62
CHEMBL4042368     60
                ... 
CHEMBL2043712      1
CHEMBL2043705      1
CHEMBL2043706      1
CHEMBL1944232      1
CHEMBL3094684      1
Name: assay_chembl_id, Length: 152, dtype: int64

In [63]:
# Restrict to assays that have more than 2 activity rows:
#   i = total number of rows those assays cover, n = number of such assays.
# NOTE(review): the counts cover assays of every target (not only CHEMBL612545) and the message
# treats one assay as one protein / one paper — confirm that this is the intended estimate.
i = int(assay_id_counts[assay_id_counts > 2].sum())
n = int((assay_id_counts > 2).sum())
print(f'To get IC50 for this many target-ligand pairs:{i} we have to manualy set sequence for this many proteins: {n}, which means checking {n} papers')

To get IC50 for this many target-ligand pairs:929 we have to manualy set sequence for this many proteins: 67, which means checking 67 papers


In [48]:
# Number of activity rows per target id.
df['target_chembl_id'].value_counts()

CHEMBL612545     760
CHEMBL1293269    157
CHEMBL5118        38
CHEMBL4523163     35
CHEMBL4380        32
CHEMBL4523582      7
CHEMBL5980         4
CHEMBL345          3
CHEMBL4953         1
CHEMBL379          1
Name: target_chembl_id, dtype: int64

In [60]:
# One row per distinct (assay, description, document, target) combination, indexed by assay id.
unique_assays = (
    df[['assay_chembl_id', 'assay_description', 'document_chembl_id', 'target_chembl_id']]
    .drop_duplicates()
    .rename(columns={'assay_chembl_id': 'assay_id'})
    .set_index('assay_id')
)

In [61]:
# Show the de-duplicated assay table.
unique_assays

Unnamed: 0_level_0,assay_description,document_chembl_id,target_chembl_id
assay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CHEMBL857450,Inhibition against DNA-Dependent ATPase activi...,CHEMBL1128737,CHEMBL4380
CHEMBL689340,Inhibition against DNA-Dependent GTPase activi...,CHEMBL1128737,CHEMBL4380
CHEMBL689341,Inhibition against the primase activity of HSV...,CHEMBL1128737,CHEMBL4380
CHEMBL690907,Inhibition against Primase by coupled primase-...,CHEMBL1128737,CHEMBL4380
CHEMBL695117,Inhibitory concentration against herpes simple...,CHEMBL1139458,CHEMBL4380
...,...,...,...
CHEMBL5098106,Inhibition of Hepatitis C virus NS3 helicase m...,CHEMBL5096190,CHEMBL612545
CHEMBL5135910,Inhibition of SARS-CoV-2 nsp13 helicase-associ...,CHEMBL5131518,CHEMBL4523582
CHEMBL5135911,Inhibition of SARS-CoV-2 nsp13 helicase-associ...,CHEMBL5131518,CHEMBL4523582
CHEMBL5138458,Inhibition of Cy5-dT15 ssDNA to Hepatitis C vi...,CHEMBL5137003,CHEMBL612545


In [77]:
# Attach the per-assay row count as `n_compounds` and list the largest assays first.
# The count Series is renamed explicitly before the join: its default name depends on the
# pandas version ('assay_chembl_id' before 2.0, 'count' from 2.0 on), so sorting/renaming by
# the implicit name is not portable.
# Dropping a pre-existing `n_compounds` column keeps this cell safe to re-run.
# NOTE(review): the value is the number of activity rows per assay — presumably one row per
# compound; confirm before reading it as a compound count.
unique_assays = (
    unique_assays
    .drop(columns='n_compounds', errors='ignore')
    .join(assay_id_counts.rename('n_compounds'))
    .sort_values(by='n_compounds', ascending=False)
)


In [78]:
# Assays with more than 5 activity rows.
unique_assays.loc[unique_assays['n_compounds'] > 5]

Unnamed: 0_level_0,assay_description,document_chembl_id,target_chembl_id,n_compounds
assay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CHEMBL1614220,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,CHEMBL1201862,CHEMBL1293269,125
CHEMBL2060535,Inhibition of HCV NS3 helicase overexpressed i...,CHEMBL2057104,CHEMBL612545,62
CHEMBL2060534,Inhibition of SARS coronavirus nsP13 helicase ...,CHEMBL2057104,CHEMBL612545,62
CHEMBL2060537,Inhibition of HCV NS3 helicase ATP hydrolysis ...,CHEMBL2057104,CHEMBL612545,62
CHEMBL4042368,Inhibition of HCV NS3 helicase DNA unwinding a...,CHEMBL4041513,CHEMBL612545,60
CHEMBL3870401,Inhibition of Hepatitis C virus genotype 1b (c...,CHEMBL3870302,CHEMBL612545,43
CHEMBL4428655,Inhibition of helicase activity of JC polyomav...,CHEMBL4428043,CHEMBL4523163,35
CHEMBL4428657,Inhibition of helicase activity of BK polyomav...,CHEMBL4428043,CHEMBL612545,34
CHEMBL3122911,Inhibition of HCV H genotype 1a N-terminal hex...,CHEMBL3120058,CHEMBL612545,18
CHEMBL1816593,Inhibition of helicase/NTPase activity of full...,CHEMBL1811822,CHEMBL612545,16


In [80]:
# Total number of activity rows covered by the assays with more than 5 rows.
unique_assays.loc[unique_assays['n_compounds'] > 5, 'n_compounds'].sum()

861

In [81]:
# Write the assay table (index = assay id) to a CSV file; the relative path resolves against the working directory.
unique_assays.to_csv('assays_targets.csv')