Match list of inherently disordered proteins to official gene symbols

In [8]:
# Uniprot for protein name and gene name matching
# youtbe tutorial: https://www.youtube.com/watch?v=AisOJydPxpE
import pandas as pd
import requests
import sys
import re
import strsimpy
from strsimpy.damerau import Damerau


website_api = "https://rest.uniprot.org/"

# helper function to download data
def get_url(url, **kwargs):
    response = requests.get(url, **kwargs)
    
    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()
    
    return response


In [2]:
# test by querying the whole database
r = get_url(f"{website_api}/uniprotkb/search?query=*")
data = r.json()
# get the number of results in the payload
n_results = len(data["results"])
print(f"Number of results: {n_results}\n")

for (key, value) in r.headers.items():
    print(f"{key}: {value}")

Number of results: 25

Vary: accept,accept-encoding,x-uniprot-release,x-api-deployment-date, User-Agent
Cache-Control: public, max-age=43200
x-cache: miss cached
Content-Type: application/json
Content-Encoding: gzip
Access-Control-Allow-Credentials: true
Access-Control-Expose-Headers: Link, X-Total-Results, X-UniProt-Release, X-UniProt-Release-Date, X-API-Deployment-Date
X-API-Deployment-Date: 24-July-2024
Strict-Transport-Security: max-age=31536000; includeSubDomains
Date: Tue, 30 Jul 2024 17:47:19 GMT
Access-Control-Max-Age: 1728000
X-UniProt-Release: 2024_04
Link: <https://rest.uniprot.org/uniprotkb/search?query=%2A&cursor=1mkycb2xwxboutz8d45gm98ungtu7jgs6b01&size=25>; rel="next"
X-Total-Results: 245896766
Transfer-Encoding: chunked
Access-Control-Allow-Origin: *
Accept-Ranges: bytes
Connection: keep-alive
Access-Control-Allow-Methods: GET, PUT, POST, DELETE, PATCH, OPTIONS
Access-Control-Allow-Headers: DNT,Keep-Alive,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Conte

In [4]:
# test by quereing for alpha-synuclein in homo sapiens
syn_r = get_url(f"{website_api}/uniprotkb/search?query=(protein_name:Alpha-synuclein) AND (organism_id:9606)")
syn_data = syn_r.json()


# get the number of results on the first page
page_total = len(syn_data['results'])
print(page_total)

# get the total number of results
overall_total = r.headers.get('x-total-results')

# !!! - Can run through pages here, but it seems like the search algo would handle finding the best matches to start
if page_total != overall_total:
    print(overall_total - page_total)


24


TypeError: unsupported operand type(s) for -: 'str' and 'int'

In [5]:
# another more complex query
syn_r2 = get_url(f"{website_api}/uniprotkb/search?query=(protein_name:Alpha-synuclein) AND (organism_id:9606) AND (reviewed:true)&format=tsv")

print(syn_r2.text)

#syn_data2 = syn_r2.json()

Entry	Entry Name	Reviewed	Protein names	Gene Names	Organism	Length
Q9Y6H5	SNCAP_HUMAN	reviewed	Synphilin-1 (Sph1) (Alpha-synuclein-interacting protein)	SNCAIP	Homo sapiens (Human)	919
P37840	SYUA_HUMAN	reviewed	Alpha-synuclein (Non-A beta component of AD amyloid) (Non-A4 component of amyloid precursor) (NACP)	SNCA NACP PARK1	Homo sapiens (Human)	140
Q8ND56	LS14A_HUMAN	reviewed	Protein LSM14 homolog A (Protein FAM61A) (Protein SCD6 homolog) (Putative alpha-synuclein-binding protein) (AlphaSNBP) (RNA-associated protein 55A) (hRAP55) (hRAP55A)	LSM14A C19orf13 FAM61A RAP55 RAP55A	Homo sapiens (Human)	463



Above is testing for UniProt querying
Below is the function and cleaning used to match gene and protein names

In [12]:
# function to reference uniprot on protein names and then compare the results to the given name, choosing the most similar
def protein_lookup(given_name):
    website_api = "https://rest.uniprot.org/"

    # helper function for querying
    def get_url(url, **kwargs):
        response = requests.get(url, **kwargs)
        # if query fails, print a message
        if not response.ok:
            print(response.text)
            response.raise_for_status()
            sys.exit()
        return response
    
    # query uniprot via its website API specifying protein name, homo sapien, and manually review proteins
    query = get_url(f"{website_api}/uniprotkb/search?query=(protein_name:{given_name}) AND (organism_id:9606) AND (reviewed:true)&format=tsv")
    data = query.text
    # split the results into a list of lists by row
    row_split = data.split('\n')
    data_split = [row.split('\t') for row in row_split]
    # remove rows that are not equal in length to the column names of results
    data_clean = [row  for row in data_split if len(row) == len(data_split[0])]
    # assign the first list as column names and remove it from the response
    colnames = data_clean[0]
    data_clean.pop(0)

    df = pd.DataFrame(data=data_clean, columns=colnames)
  
    # if there are no results (df length 0) then clean then organize the data in the same way and fill with None
    if len(df) == 0:

        df = df.drop(['Reviewed', 'Organism'], axis = 1)
        df = df.rename(columns={'Protein names' : 'UniProt Name'})
        df = df[['UniProt Name', 'Gene Names', 'Entry', 'Entry Name', 'Length']]
        # set each column as no match or zero for length
        df.loc[0, ['UniProt Name', 'Gene Names', 'Entry', 'Entry Name']] = 'No match'
        df.loc[0, 'Length'] = 0
        # set distance very high to recognize later
        df['Distance'] = 100
        # add the given protein name to the data frame
        df.insert(loc=0, column='Protein Name', value=given_name)
        df.reset_index()

        print(f"No UniProt results for {given_name}.")
        return df

    # split the protein names to just include the first entry
    if any(["(" in names for names in df['Protein names']]):
        df[['Protein names', 'Other protein names']] = df['Protein names'].str.split('(', n=1, expand=True)
    else:
        df['Other protein names'] = None
    # add delimiter back onto the front of other names
    #df['Other protein names'] = ["("+names for names in df['Other protein names']]

    # compute the optimal string alignment (Demerau-Levenshtein distance) between given protein name and each result
    damerau = Damerau()
    df['Distance'] = [damerau.distance(given_name, result__name, ) for result__name in df['Protein names']]

    # take just the row of information with the lowest similarity score and set index to 0
    most_similar = df.loc[df['Distance'] == min(df['Distance'])]
    most_similar = most_similar.reset_index()
    # if multiple results have the same distance, select the first
    if len(most_similar) > 1:
        most_similar = most_similar.head(1)

    # delete reviewed, other names, and distance columns that are no longer needed
    most_similar_clean = most_similar.drop(['Reviewed', 'Other protein names', 'Organism'], axis = 1)
    most_similar_clean = most_similar_clean.rename(columns={'Protein names' : 'UniProt Name'})
    # reorder columns
    most_similar_clean = most_similar_clean[['UniProt Name', 'Gene Names', 'Entry', 'Entry Name', 'Length', 'Distance']]
    # add column with the given name of the protein
    most_similar_clean.insert(loc=0, column='Protein Name', value=given_name)
    # print the distance and the two matched protein names to print as a sanity check
    to_test_match = most_similar_clean.loc[0, 'UniProt Name']
    to_test_distance = most_similar_clean.loc[0, 'Distance']
    print(f"{given_name} matched with {to_test_match} by {to_test_distance}", end='\n')

    return most_similar_clean



In [13]:
test_query = protein_lookup("G2/mitotic-specific cyclin-B1")
display(test_query)

G2/mitotic-specific cyclin-B1 matched with G2/mitotic-specific cyclin-B1 by 0.0


Unnamed: 0,Protein Name,UniProt Name,Gene Names,Entry,Entry Name,Length,Distance
0,G2/mitotic-specific cyclin-B1,G2/mitotic-specific cyclin-B1,CCNB1 CCNB,P14635,CCNB1_HUMAN,433,0.0


In [15]:
# load the protein list csv file
proteins = pd.read_csv("WGCNA_gene_lists/")
# remove any square brackets or parentheses from the protein names, they interrupt querying UniProt
proteins['name'] = proteins['name'].str.replace('[', '')
proteins['name'] = proteins['name'].str.replace(']', '')
proteins['name'] = proteins['name'].str.replace('(', '')
proteins['name'] = proteins['name'].str.replace(')', '')

#display(proteins.loc[490:495, 'name'])


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/bs1250/Box/LAB/Lab Folder/WGCNA_Ben/IDP homo sapiens final list.csv'

In [14]:
#results_list = [protein_lookup(protein) for protein in proteins.loc[0:55, 'name']]
results_list = []
count = 0
for protein in proteins['name']:
    count += 1
    print(count, end=': ')
    results = protein_lookup(protein)
    results_list.append(results)

uniprot_df = pd.concat(results_list)


NameError: name 'proteins' is not defined

In [139]:
# assign the UniProt results to a seperate variable
df_backup = uniprot_df 
df_backup = df_backup.reset_index()

# check the lengths of both to make sure they match
print(len(df_backup))
print(len(proteins))

# remove the index column
df_backup = df_backup.drop('index', axis=1)

# check how many of the results had a distance larger than 1 - which should be manually checked
need2check = df_backup[df_backup['Distance'] >= 2]
print(len(need2check) / len(df_backup) * 100)
# ~10% of the results should be checked

# save the final formatted df as a csv file to work on by hand
df_backup.to_csv('protein_gene_matches_V1.csv', index=False) 



1216
1216
9.950657894736842


In [14]:
import pandas as pd
# read in the hand-edited csv file to remove commas from gene lists
# pandas is not working so will read in without it
import csv
with open('C:/Users/bs1250/Box/LAB/Lab Folder/WGCNA_Ben/IDP Gene List/protein_gene_matches_V2_names_split.csv', mode ='r')as file:
  csv_file = csv.reader(file)
  file_data = [line for line in csv_file]

csv_df = pd.DataFrame(file_data[-(len(file_data) - 1):], columns=file_data[0])
# pull just the gene names from the file
idp_gene_list = csv_df['Gene Names'].tolist()
idp_gene_list_formatted = ', '.join(idp_gene_list)
print(idp_gene_list_formatted)
# create a txt file with all of the formatted gene names
output = open('IDP Gene List', 'w')
output.write(idp_gene_list_formatted)
output.close()

FMR1, CACYBP, CRK, CSTB, DDX4, MAX, UAP1, SULT2B1, PTMA, SEPTIN4, DLP1a, RTN4, NHERF1, COL7A1, ABCC9, RCOR3, PUS10, PAICS, MASP1, COL4A2, COL4A4, ABL1, COL10A1, DUT, COL17A1, COL9A1, PIN4, PIN4, UNG, PTEN, ARPP19, QKI6, FHIT, MAPT, CPEB3, MMACHC, FGF2, OGT, CLPB, CHKA, RAP1GDS1, LAT, PTMA, FGF12, RASSF1, PLOD3, MAPT, HBEGF, EIF4G1, BAALC, GPHN, CDKN2A, NUFIP1, MDM2, UPF1, QKI, MBP, CMTR2, GPC1, SERBP1, LDB3, OGG1, WRN, CDKN1B, PTPN2, DNMT3A, API5, IGHG1 , PPP3CA, WAS, LMNA, CSN1S1, SMG7, BLM, ALOX5, WASF1, FUT8, RANBP3, SDC4, SLC25A24, DIABLO, SMG5, TAF-1beta, Vinculin, RPLP2, RPLP1, SMO, NCK1, NRG1, RPL4, ATP2B1, NUPR1 , RPL10L, IBSP, EPB41, NUPR2, RPL24, UBTD2, FOS, CAD, CYBRD1, MAP4K3, GNAQ, TTC8, FGA, SOD1, MICAL3, SOD3, DAG1, GMPS, CPS1, MAOA, ALDH1A3, GPR179, HSD17B1, KCNE1, TGFB1, MSL3, KCNE3, SFTPC, SLC12A2, CAMP, SULT1A3, CFTR, CGB3, CDKN1A, CDKN1C, CDKN1B, EIF4EBP1, NR3C1, GHR, HMGA1, PIP4K2B, RPA1, RXRA, VAMP2, SNCA, TTN, TYMS, ESR1, TOP1, ESR2, MAX, RELA, TP53, KCNAB1, GATM