Jupyter notebook for extracting two sets of 5000 random Asn residues from the Swiss-Prot human proteome (swissprot_human_proteome.xlsx, from https://pubmed.ncbi.nlm.nih.gov/8594581/). It generates two Excel output files, each containing 5000 randomly selected Asn residues.

The output files can be used as input to fetch_pdb_data.ipynb to obtain PDB codes for the selected residues.

In [None]:
# --- Input ---
# Location of the reference workbook (swissprot_human_proteome.xlsx).
INPUT_FILEPATH = ""

# --- Sample 1 ---
# Destination Excel file for the first set of 5000 random Asn residues.
OUTPUT_FILEPATH_1 = ""
# Seed for the first random draw; keep it unchanged to reproduce the same selection.
RANDOM_SEED_1 = 42

# --- Sample 2 ---
# Destination Excel file for the second set of 5000 random Asn residues.
OUTPUT_FILEPATH_2 = ""
# Seed for the second random draw; keep it unchanged to reproduce the same selection.
RANDOM_SEED_2 = 24

In [None]:
# Load the Human Swissprot reference table; the ASN occurrence table is derived from it in later cells.
import pandas as pd

# Read the reference Excel file at INPUT_FILEPATH into a dataframe.
# Later cells read its 'Sequence' column and rename its 'Entry' column.
df = pd.read_excel(INPUT_FILEPATH)

In [None]:
def find_aa_occurrences(aa_letter_code, sequence):
    """Return every position at which an amino acid occurs in a sequence.

    Parameters
    ----------
    aa_letter_code : str
        One-letter amino acid code to search for (e.g. "N" for Asn).
    sequence : str
        Amino acid sequence to scan. Matching is case-sensitive, so the
        sequence must be capitalized for an upper-case letter code to match.

    Returns
    -------
    list of int
        1-based positions of the matching residues, in ascending order.
        Empty when the residue does not occur or when ``sequence`` is
        missing (``None`` or NaN).
    """
    # An empty spreadsheet cell is loaded by pandas as a float NaN, which is
    # not iterable. Treat a missing sequence as "no occurrences" rather than
    # raising TypeError. (NaN is the only float that is unequal to itself.)
    if sequence is None or (isinstance(sequence, float) and sequence != sequence):
        return []

    return [
        position
        for position, residue in enumerate(sequence, start=1)
        if residue == aa_letter_code
    ]

In [None]:
# Initiate tqdm: tqdm.pandas() registers tqdm's pandas integration, which
# provides the progress_apply method used on df['Sequence'] in the next cell.
from tqdm import tqdm
tqdm.pandas()

In [None]:
# For every row, collect the positions of all Asn ("N") residues in its sequence
# (progress_apply shows a tqdm progress bar while doing so).
df['ASN Occurrences'] = df['Sequence'].progress_apply(lambda seq: find_aa_occurrences("N", seq))

exploded_df = (
    df
    # One row per Asn occurrence; rows whose list is empty get a missing value.
    .explode('ASN Occurrences')
    .rename(columns={'ASN Occurrences': 'ASN Position'})
    .reset_index(drop=True)
    # Discard rows without an ASN position (i.e. no ASN in the sequence).
    .dropna(subset=['ASN Position'])
    # Column names expected by fetch_pdb_data.ipynb.
    .rename(columns={'ASN Position': 'aa_position', 'Entry': 'uniprot_id'})
)

exploded_df

In [None]:
# Draw two random samples of 5000 rows each and renumber their rows from 0.
# Each draw is seeded (random_state) so it can be reproduced. Both draws are
# taken from the full exploded_df, so the same row may appear in both samples.

df_5000_first = (
    exploded_df
    .sample(n=5000, random_state=RANDOM_SEED_1)
    .reset_index(drop=True)
)
df_5000_second = (
    exploded_df
    .sample(n=5000, random_state=RANDOM_SEED_2)
    .reset_index(drop=True)
)

In [None]:
# Write each sample to its own Excel file. to_excel is called with its defaults,
# so the dataframe index is written as the first column.
for sample_df, output_path in (
    (df_5000_first, OUTPUT_FILEPATH_1),
    (df_5000_second, OUTPUT_FILEPATH_2),
):
    sample_df.to_excel(output_path)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=08806ca9-3319-4bb4-9ddc-71a137575411' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>