In [1]:
from bioservices import UniProt
import pandas as pd
from Bio import SeqIO
import urllib
import re
import ast

### Get UniProt entry id and feature info for EC numbers for top hundred organisms

In [2]:
#Get cleaned data from BRENDA
df_brenda = pd.read_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/interactions/BRENDA_interactions_intracellular.txt', index_col=0, header=0)

#Get the top hundred organisms from previously created file
top_hundred_list = pd.read_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/support/top_hundred_organisms.txt', sep='\t', index_col=0)['Org'].tolist()

#Filter dataframe for interactions in the top hundred organisms
df_brenda_top_hundred = df_brenda[df_brenda['Org'].isin(top_hundred_list)]

#### Add column of entry-string for search in UniProt

In [55]:
#Define formula for making entry-string
def entry_formula(EC, Org):
    return f'EC:{EC}+AND+{Org}+AND+reviewed:true'

In [11]:
#apply formula to dataframe
df_brenda_top_hundred['Entry_string'] = df_brenda_top_hundred.apply(lambda x: entry_formula(x['EC'], x['Org']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brenda_top_hundred['Entry_string'] = df_brenda_top_hundred.apply(lambda x: entry_formula(x['EC'], x['Org']), axis=1)


In [13]:
#Make dataframe of only enzymes (EC numbers), organisms and entry-strings
df_ec_enz_org = df_brenda_top_hundred[['EC', 'Enz', 'Org', 'Entry_string']].drop_duplicates()

##### Search in UniProt

In [58]:
u = UniProt(verbose=False)

In [16]:
#Get info for every EC number+Organism combination
#The get_df() function has a default limit of 10 entries per search for the sake of efficiency, this was changed to 
#100 to get all entries for each search
#Also: the UniProt database changes all the time, and a new search might therefore not give the same results

feature_dfs = []

entries = set(df_ec_enz_org['Entry_string'])

#This takes about 30 minutes
for entry in entries:
    feature_dfs.append(u.get_df(entry, limit=100))

df = pd.concat(feature_dfs, axis=0)

In [21]:
#Reduce dataframe to fewer columns
columns_of_interest = ['Entry', 'Gene Names (primary)', 'Organism', 'Protein names', 'Domain [CC]', 'Domain [FT]', 'Motif', 'Protein families', 'Region', 'Active site', 'Activity regulation', 'Binding site']
df_features = df[columns_of_interest]

In [22]:
#Drop duplicates
df_features = df_features.drop_duplicates()

In [24]:
#Save the dataframe to file as the dataretrieving process takes time
df_features.to_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/features/protein_features_uniprot.txt')

##### Get InterPro ids for every UniProt entry

In [2]:
#Read from the file created above
df_features = pd.read_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/features/protein_features_uniprot.txt')

In [4]:
#Define function for retrieving the InterPro ids
def retrieve_interpro_ids(uniprot_entry):
    url = "http://www.uniprot.org/uniprot/{}.xml".format(uniprot_entry)
    try:
        handle = urllib.request.urlopen(url)
        record = SeqIO.read(handle, "uniprot-xml")
        interpro_ids = list(filter(lambda ref: 'InterPro' in ref, record.dbxrefs))  
    except:
        interpro_ids =  [0]
    return interpro_ids

In [5]:
#Apply function to dataframe
#This takes about 6 hours
df_features['Interpro_ids'] = df_features['Entry'].apply(lambda entry: retrieve_interpro_ids(entry))

###### Clean up the dataframe

In [7]:
#Define function for extracting the EC number from the protein name
def extract_EC(string):
    ECs = []
    array = string.split('(')
    for word in array:
        if word.startswith('EC'):
            ECs.append(word.strip(') '))
    return ECs

In [9]:
#Apply function to dataframe to extract EC numbers
df_features['EC numbers'] = df_features['Protein names'].apply(lambda string: extract_EC(string))

#Keep only relevant columns
df_features_small = df_features[['Entry', 'Organism', 'EC numbers', 'Interpro_ids']]

#Explode on EC numbers to get one Entry, Organism, EC number and InterPro ids on each row
df_features_exploded = df_features_small.explode('EC numbers')

#Remove 'EC ' from EC numbers
df_features_exploded['EC numbers'] = df_features_exploded['EC numbers'].str.replace('EC ', '')

#Extract organism name from Organism columns
df_features_exploded['Org'] = df_features_exploded['Organism'].apply(lambda x: ' '.join(x.split()[:2]))

In [18]:
#Save to csv
df_features_exploded.to_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/features/protein_features_interpro.txt')

### Merge the dataframe of InterPro ids for each EC number with interactions dataframe

In [121]:
#Get cleaned data from BRENDA
df_brenda = pd.read_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/interactions/BRENDA_interactions_intracellular.txt', index_col=0, header=0)

#Get the top hundred organisms from previously created file
top_hundred_list = pd.read_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/support/top_hundred_organisms.txt', sep='\t', index_col=0)['Org'].tolist()

#Filter dataframe for interactions in the top hundred organisms
df_brenda_top_hundred = df_brenda[df_brenda['Org'].isin(top_hundred_list)]

In [122]:
#Merge the dataframe of interactions with the dataframe of features to associate EC number - Organism pairs with features
df_merged = pd.merge(df_brenda_top_hundred, df_features_exploded, left_on=['EC', 'Org'], right_on=['EC numbers', 'Org'], how='left')

#### Clean up the dataframe to get one feature per row

In [131]:
#Drop unnecessary columns
df_merged_new = df_merged.drop(['Organism', 'Entry', 'EC numbers', 'Enz'], axis=1)

In [126]:
#Explode on InterPro ids (need to make sure that the InterPro ids are evaluated as strings before exploding)
import ast
df_merged_new['Interpro_ids'] = df_merged_new['Interpro_ids'].fillna("[]").apply(ast.literal_eval)
df_merged_new = df_merged_new.explode('Interpro_ids')

#Remove 'InterPro' from ids
df_merged_new['Interpro_ids'] = df_merged_new['Interpro_ids'].str.replace('InterPro:', '')

In [135]:
#Drop duplicates and reset index
df_merged_new.drop_duplicates(inplace=True)
df_merged_new.reset_index(drop=True, inplace=True)

In [139]:
#Save to csv
df_merged_new.to_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/features/features_interactions_merged.txt')

### Map InterPro ids to entry type and entry name

In [140]:
#Get list of entry type and name for InterPro ids, downloaded from the InterPro website
df_entry_list = pd.read_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/support/entry.list.txt', sep='\t', header=0)

#Merge dataframe of features and interactions with the entry-list to map InterPro id to entry type and name
df_merged_type = pd.merge(df_merged_new, df_entry_list, left_on='Interpro_ids', right_on='ENTRY_AC', how='left').drop('ENTRY_AC', axis=1)

In [143]:
#Save file to csv
df_merged_type.to_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/features/features_interactions_merged_types.txt')

### Get the features/InterPro ids for organisms and EC numbers in phylogenetic tree

###### This part retrieves the InterPro ids for the Organism-EC number pairs that are mapped in the phylogenetic tree used to identify predicted interactions. The reasoning for doing so is that not all of these EC numbers were documented in BRENDA for all top hundred organisms, and therefore we did not have data on the features for these organisms, resulting in false-negative predictions. 

In [51]:
#Create list of the EC numbers mapped on the phylogenetic tree
top_ten_ECs = ['1.3.5.1', '2.2.1.6', '2.7.1.1', '2.7.1.11', '2.7.1.30', '2.7.1.40', '2.7.2.4', '2.7.7.27', '3.1.3.11', '6.4.1.1']

In [52]:
#Make a dataframe of the top hundred organisms 
df_top_hundred = pd.DataFrame(top_hundred_list, columns=['Org'])

In [53]:
#Associate every organism with every EC number in the top ten group
df_top_hundred['EC'] = [top_ten_ECs]*100
df_top_hundred_EC = df_top_hundred.explode('EC').reset_index(drop=True)

#Apply string-formula to dataframe to get entry-strings
df_top_hundred_EC['Entry_string'] = df_top_hundred_EC.apply(lambda x: entry_formula(x['EC'], x['Org']), axis=1)

In [None]:
#Get feature info for every EC number+Organism combination

feature_dfs_tree = []

entries = set(df_top_hundred_EC['Entry_string'])

#Takes about five minutes
for entry in entries:
    feature_dfs_tree.append(u.get_df(entry, limit=100))

df = pd.concat(feature_dfs_tree, axis=0)

In [60]:
#Make dataframe with only columns that are relevant for further work
columns_of_interest = ['Entry', 'Gene Names (primary)', 'Organism', 'Protein names']
df_features_topEC = df[columns_of_interest]

In [61]:
#Get InterPro ids for all features and extract EC numbers
#Takes about 20 minutes
df_features_topEC['Interpro_ids'] = df_features_topEC['Entry'].apply(lambda entry: retrieve_interpro_ids(entry))
df_features_topEC['EC numbers'] = df_features_topEC['Protein names'].apply(lambda string: extract_EC(string))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features_topEC['Interpro_ids'] = df_features_topEC['Entry'].apply(lambda entry: retrieve_interpro_ids(entry))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features_topEC['EC numbers'] = df_features_topEC['Protein names'].apply(lambda string: extract_EC(string))


In [63]:
#Make dataframe with only wanted columns
df_features_topEC_small = df_features_topEC[['Entry', 'Organism', 'EC numbers', 'Interpro_ids']]

#Explode dataframe to get one EC number per row
df_features_topEC_exp = df_features_topEC_small.explode('EC numbers')

#Clean up EC numbers and organism names
df_features_topEC_exp['EC numbers'] = df_features_topEC_exp['EC numbers'].str.replace('EC ', '')
df_features_topEC_exp['Org'] = df_features_topEC_exp['Organism'].apply(lambda x: ' '.join(x.split()[:2]))

In [67]:
#Save file to csv
df_features_topEC_exp.to_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/features/features_for_ECs_in_tree.txt')