In [1]:
import pandas as pd
import numpy as np
import json, time, os, requests
import openai

# Endpoint of the RCSB PDB search service, used by get_trypanosomatida_column.
PDB_URL_SEARCH = "https://search.rcsb.org/rcsbsearch/v2/query"

# SECURITY: never commit API keys to a notebook. The key is read from the
# environment instead; the key that used to be hardcoded here has been exposed
# and should be revoked. `.get` is used so the notebook still loads when the
# variable is unset (only the GPT-based classifier needs it).
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
def get_PDB_id_from_pharma_model(str):
    """Return the upper-cased PDB id found at the start of a 'Pharma Model' value.

    The id is the text that precedes the first underscore; a value without
    any underscore is returned upper-cased in full.
    """
    # NOTE: the parameter name shadows the built-in ``str``; it is kept as-is
    # so that existing callers are unaffected.
    pdb_id, _separator, _rest = str.partition("_")
    return pdb_id.upper()

def get_ids_column(df):
    """Derive the PDB-id column from the 'Pharma Model' column of ``df``.

    Each value of ``df['Pharma Model']`` is passed through
    ``get_PDB_id_from_pharma_model``; the mapped Series is returned.
    """
    pharma_models = df['Pharma Model']
    return pharma_models.map(get_PDB_id_from_pharma_model)


In [23]:
def get_keywords_column(df_ids, timeout=30):
    """Return the RCSB ``pdbx_keywords`` value for every PDB id in ``df_ids``.

    Successful lookups are cached in ``./keywords.json`` so that later runs
    only query the RCSB data API for ids that are not known yet.

    Parameters
    ----------
    df_ids : iterable of str (typically a pandas Series)
        PDB entry ids, in the order the output should follow.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.Series
        One keyword string per input id, with a fresh default index. Ids whose
        lookup failed get the placeholder ``"ERROR try this entry again"``.
    """
    json_path = './keywords.json'
    error_marker = "ERROR try this entry again"

    if os.path.exists(json_path):
        with open(json_path, 'r') as json_file:
            keywords_dict = json.load(json_file)
    else:
        keywords_dict = {}

    keywords_list = []
    try:
        for entry_id in df_ids:
            cached = keywords_dict.get(entry_id)
            # A cached error placeholder (written by earlier runs) counts as a
            # miss; otherwise the entry would never actually be tried again.
            if cached is not None and cached != error_marker:
                keywords_list.append(cached)
                continue

            print(entry_id)
            url = f"https://data.rcsb.org/rest/v1/core/entry/{entry_id}"
            keyword = None
            try:
                response = requests.get(url, timeout=timeout)
                if response.status_code == 200:
                    # Parse the body once and tolerate entries without keywords.
                    payload = response.json()
                    keyword = payload.get('struct_keywords', {}).get('pdbx_keywords')
            except (requests.RequestException, ValueError):
                # Network failure or unparsable body: report it as a failed
                # lookup for this id instead of aborting the whole column.
                keyword = None

            if keyword is None:
                # Failures are reported to the caller but never persisted.
                keywords_dict.pop(entry_id, None)
                keywords_list.append(error_marker)
            else:
                keywords_dict[entry_id] = keyword
                keywords_list.append(keyword)
    finally:
        # Persist whatever was fetched, even if the loop was interrupted.
        with open(json_path, 'w') as json_file:
            json.dump(keywords_dict, json_file)

    return pd.Series(keywords_list)

In [4]:
def refine_class(classification):
    """Collapse a free-form classification into one of the enzyme class labels.

    The first stem (checked in the order listed below) that occurs inside
    ``classification`` decides the label, which is the stem with an ``S``
    appended. ``"OTHER"`` is returned when no stem occurs.
    """
    # Order matters: when several stems occur, the earliest one listed wins.
    enzyme_stems = (
        "OXIDOREDUCTASE",
        "TRANSFERASE",
        "HYDROLASE",
        "LYASE",
        "ISOMERASE",
        "LIGASE",
        "TRANSLOCASE",
    )
    for stem in enzyme_stems:
        if stem in classification:
            return stem + "S"
    return "OTHER"

def get_class_column(df, df_keywords):
    """Classify every keyword into an enzyme class by asking the OpenAI chat API.

    Answers are cached per keyword in ``./classifications-gpt.json``; the API
    is only called for keywords missing from that cache, with a 20 second
    pause after each call. The upper-cased raw answers are stored in the
    cache, and are normalised with ``refine_class`` before being mapped back
    onto ``df_keywords``.

    NOTE: a later cell in this notebook defines another ``get_class_column``
    that takes only ``df``; whichever definition was executed last is the one
    that is actually in effect.

    Parameters
    ----------
    df : DataFrame providing the ``'Name'`` column used in the prompt.
    df_keywords : Series of keywords, indexed by position labels 0..n-1.

    Returns
    -------
    Series with the refined class of every keyword in ``df_keywords``.
    """
    json_path = './classifications-gpt.json'

    class_dict_gpt = {}
    if os.path.exists(json_path):
        with open(json_path,'r+') as json_file:
            class_dict_gpt = json.load(json_file)

    # Fixed part of the conversation, sent unchanged with every request.
    fixed_messages = [
        {"role": "system", "content": "You have to determine in which classification group fits best the given protein, by its name and a keyword associated with it. Which are given in name-keyword format"},
        {"role": "system", "content": "Answer with only the classification no more words"} ,
        {"role": "system", "content": "For each element of the list given you can ONLY choose from one of these options: oxidoreductases, transferases, hydrolases, lyases, isomerases, ligases, translocases, not an enzyme. Do not answer out of these options."},
        {"role": "assistant", "content": "['oxidoreductase', 'not an enzyme', ...]"},
    ]

    for row in range(len(df_keywords)):
        keyword = df_keywords[row]
        if keyword in class_dict_gpt:
            continue
        user_message = {"role": "user", "content": f"{df['Name'][row]},{keyword}"}
        completion = openai.ChatCompletion.create(
            model = "gpt-3.5-turbo",
            temperature = 0.5,
            max_tokens = 300,
            messages = fixed_messages + [user_message],
        )
        class_dict_gpt[keyword] = completion.choices[0].message.content.upper()
        # Pause between consecutive API calls.
        time.sleep(20)

    with open(json_path,'+w') as json_file:
        json.dump(class_dict_gpt, json_file)

    # Normalise the raw answers into the fixed set of class labels.
    refined_by_keyword = {
        keyword: refine_class(raw_answer)
        for keyword, raw_answer in class_dict_gpt.items()
    }

    def lookup(keyword):
        return refined_by_keyword[keyword]

    return df_keywords.map(lookup)


In [29]:
def get_class(ecnumber):
    """Translate an EC number string into its top-level enzyme class label.

    Only the text before the first dot is inspected. Digits ``1`` to ``7``
    map to the labels in the table below; anything else yields ``"OTHER"``.
    """
    classes_by_first_digit = {
        "1": "OXIDOREDUCTASES",
        "2": "TRANSFERASES",
        "3": "HYDROLASES",
        "4": "LYASES",
        "5": "ISOMERASES",
        "6": "LIGASES",
        "7": "TRANSLOCASES",
    }
    first_field = ecnumber.split(".")[0]
    return classes_by_first_digit.get(first_field, "OTHER")


def get_class_column(df, timeout=30):
    """Return the enzyme class of every row of ``df`` based on its UniProt EC number.

    For each value of ``df['Uniplot']`` the UniProt REST search is queried,
    the first EC number of the first hit's recommended name is read, and it
    is translated with ``get_class``. Results are cached per code in
    ``./classifications-ecnumber.json``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Uniplot'`` column holding the UniProt codes.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    list of str
        One class label per row, in row order. ``"ERROR"`` marks rows whose
        lookup failed; such failures are not cached, so they are retried on
        the next run.
    """
    json_path = './classifications-ecnumber.json'
    if os.path.exists(json_path):
        with open(json_path, 'r') as json_file:
            class_dict_ec = json.load(json_file)
    else:
        class_dict_ec = {}

    codes = df['Uniplot']
    class_column = []
    try:
        for code in codes:
            cached = class_dict_ec.get(code)
            # "ERROR" entries written by earlier runs are treated as misses.
            if cached is not None and cached != "ERROR":
                class_column.append(cached)
                continue

            classif = "ERROR"
            try:
                response = requests.get(
                    url='https://rest.uniprot.org/uniprotkb/search',
                    params={'query': code},  # let requests URL-encode the code
                    timeout=timeout,
                )
                if response.status_code == 200:
                    results = response.json().get('results') or []
                    # Start from "NONE" for every row so that an empty result
                    # list can neither crash nor reuse the previous row's EC.
                    ecnumber = "NONE"
                    if results:
                        description = results[0].get('proteinDescription') or {}
                        recommended = description.get('recommendedName') or {}
                        ec_numbers = recommended.get('ecNumbers') or []
                        if ec_numbers:
                            ecnumber = ec_numbers[0]['value']
                    classif = get_class(ecnumber)
            except (requests.RequestException, ValueError):
                # Network failure or unparsable body: flag only this row.
                classif = "ERROR"

            print(classif)
            if classif == "ERROR":
                class_dict_ec.pop(code, None)
            else:
                class_dict_ec[code] = classif
            class_column.append(classif)
    finally:
        # Persist whatever was resolved, even if the loop was interrupted.
        with open(json_path, 'w') as json_file:
            json.dump(class_dict_ec, json_file)

    return class_column

In [5]:
def get_trypanosomatida_column(df_ids, species="Trypanosomatida", timeout=60):
    """Flag which PDB ids belong to entries from the given taxonomic lineage.

    Sends one text query to the RCSB search service (``PDB_URL_SEARCH``)
    asking for all entries whose source-organism taxonomy lineage contains
    ``species`` exactly, then checks every id of ``df_ids`` against the hits.

    Parameters
    ----------
    df_ids : pandas.Series
        PDB entry ids to test.
    species : str, optional
        Lineage name to search for; defaults to ``"Trypanosomatida"``.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    pandas.Series of bool
        ``True`` where the id is among the search hits.

    Raises
    ------
    requests.HTTPError
        If the search service answers with an error status.
    """
    # Named ``query`` (not ``json``) so the json module is not shadowed.
    query = {
        "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "operator": "exact_match",
                "value": f"{species}",
                "attribute": "rcsb_entity_source_organism.taxonomy_lineage.name",
            },
        },
        "request_options": {
            "return_all_hits": True,
        },
        "return_type": "entry",
    }
    response = requests.post(PDB_URL_SEARCH, json=query, timeout=timeout)
    response.raise_for_status()

    # A 204 (no content) answer has no JSON body; treat it as "no hits".
    if response.status_code == 204:
        hits = []
    else:
        hits = response.json().get('result_set', [])

    matching_ids = {hit['identifier'] for hit in hits}
    return df_ids.isin(matching_ids)

In [27]:
def organize_PHRMPR_data(data_path):
    '''Read a PHARMMAPPER search result in CSV format and return it reorganised.

    The first line of the file is skipped when reading. The returned pandas
    DataFrame has the columns Id, Fit, Norm Fit, Name, Keywords, Class,
    UNIPROT and Trypanosomatida, in that order.
    '''
    original_df = pd.read_csv(data_path,skiprows=1)

    print("Creating Ids column")
    ids_column = get_ids_column(original_df)

    print("Creating Keywords column")
    keywords_column = get_keywords_column(ids_column)

    # The class is derived from the single-argument get_class_column(df).
    print("Creating Class column")
    class_column = get_class_column(original_df)

    print("Creating Trypanosomatida column")
    trypanosomatida_column = get_trypanosomatida_column(ids_column)

    # Output columns, listed in their final order.
    output_columns = [
        ('Id', ids_column),
        ('Fit', original_df['Fit']),
        ('Norm Fit', original_df['Norm Fit']),
        ('Name', original_df['Name']),
        ('Keywords', keywords_column),
        ('Class', class_column),
        ('UNIPROT', original_df['Uniplot']),
        ('Trypanosomatida', trypanosomatida_column),
    ]
    result_df = pd.DataFrame()
    for column_name, column_values in output_columns:
        result_df[column_name] = column_values

    print('Done')
    return result_df


In [32]:
# Run the whole pipeline on the PHARMMAPPER CSV export. The bare `test` on the
# last line makes the notebook display the resulting DataFrame.
test = organize_PHRMPR_data('./t11-300-conform300-PHARMMAPPER.csv')
test

Creating Ids column
Creating Keywords column
4US0
2NVB
1JQK
1UZL
2K49
1BUC
2ROW
1G01
1FCX
3BFX
1KP9
2V68
2I0F
1E19
2Z5V
2E0G
2PH7
2IDB
2RFO
1XDZ
1PMA
2J2S
7FAB
1XNH
2HZ7
2OG7
2AD1
5LDH
2DLZ
1IIU
1Y89
1N35
1SGF
1DNV
2QJI
2DHZ
2DY3
2PAA
1Z68
1B7Y
1T3W
2JS4
1VQ0
1ATI
1K4W
1DBU
1O8V
2FV4
2AII
3GG6
3D54
1UF0
1IQ6
1PZQ
1ZQ3
2I3S
2P3N
2H12
3CWZ
1MC8
2UYY
2H59
2C95
1WJV
Creating Class column
OXIDOREDUCTASES
OXIDOREDUCTASES
TRANSLOCASES
OTHER
OTHER
OTHER
OXIDOREDUCTASES
OXIDOREDUCTASES
OXIDOREDUCTASES
OXIDOREDUCTASES
TRANSFERASES
HYDROLASES
OTHER
OTHER
OXIDOREDUCTASES
TRANSFERASES
OTHER
HYDROLASES
TRANSFERASES
OTHER
OTHER
HYDROLASES
OXIDOREDUCTASES
LYASES
TRANSFERASES
HYDROLASES
TRANSFERASES
TRANSFERASES
OTHER
OTHER
TRANSFERASES
OTHER
OTHER
TRANSLOCASES
OTHER
ISOMERASES
OTHER
HYDROLASES
HYDROLASES
OTHER
OTHER
LYASES
HYDROLASES
OTHER
LYASES
HYDROLASES
TRANSFERASES
HYDROLASES
HYDROLASES
OTHER
HYDROLASES
OTHER
HYDROLASES
ISOMERASES
TRANSFERASES
TRANSFERASES
OTHER
OTHER
TRANSFERASES
OTHER
TRANSFERA

Unnamed: 0,Id,Fit,Norm Fit,Name,Keywords,Class,UNIPROT,Trypanosomatida
0,4US0,3.999,0.9998,NONE,SIGNALING PROTEIN,OTHER,NONE,False
1,2NVB,3.986,0.9964,NADP-dependent alcohol dehydrogenase,OXIDOREDUCTASE,OXIDOREDUCTASES,P14941,False
2,1JQK,3.984,0.9961,Carbon monoxide dehydrogenase,OXIDOREDUCTASE,OXIDOREDUCTASES,P31896,False
3,1U2E,3.980,0.9949,2-hydroxy-6-oxononadienedioate/2-hydroxy-6-oxo...,HYDROLASE,HYDROLASES,P77044,False
4,2FSF,3.977,0.9941,Protein translocase subunit secA,PROTEIN TRANSPORT,TRANSLOCASES,SECA_ECOLI,False
...,...,...,...,...,...,...,...,...
294,2UYY,2.525,0.5050,Putative oxidoreductase GLYR1,CYTOKINE,OTHER,Q49A26,False
295,2H59,4.037,0.5046,Cellular tumor antigen p53,HYDROLASE,OTHER,P53_HUMAN,False
296,1HXD,4.031,0.5038,Bifunctional protein birA synthetase,LIGASE,OTHER,P06709,False
297,2C95,2.519,0.5038,Adenylate kinase isoenzyme 1,TRANSFERASE,TRANSFERASES,P00568,False


In [33]:
# Create the output folder first: DataFrame.to_csv does not create missing
# directories and would fail on a fresh checkout.
os.makedirs("./Processed Data", exist_ok=True)
test.to_csv("./Processed Data/t11-PHMPR-300-300.csv")