In [None]:
# %% ============================================================================
## Extraction and Calculation of Protein Properties from Uniprot Database
# Author: Alexa Canchola
# Advisor: Wei-Chun Chou
# Date: April 22, 2025
# ==============================================================================
"""
This Python script provides a comprehensive workflow for extracting molecular weight (MW) and amino acid sequence information from the UniProt Knowledgebase (UniProtKB).
It also calculates additional protein properties from each sequence, including:
    - Isoelectric point (pI)
    - Grand Average of Hydropathicity (GRAVY)
using the Biopython (v1.85) package.
"""
# %% ============================================================================
# Install Required Libraries
# ==============================================================================
!pip install biopython



In [None]:
# %% ============================================================================
# Import Libraries
# ==============================================================================
# Standard Libraries for Data Handling
import pandas as pd
import numpy as np


# Libraries for Uniprot Query
import requests as r
import re
from urllib.request import urlopen
from ast import AnnAssign
import warnings
warnings.filterwarnings("ignore")

# Bio.SeqUtils Package
# Used to calculate protein information from amino acid sequence
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [None]:
# %% ============================================================================
#Import & Preprocess Data
# ============================================================================
from google.colab import files
import pandas as pd
import io

# Load Data
# Prompt for a file upload in Colab; the result maps each file name to its raw bytes.
uploaded = files.upload()  # proteinIDs.csv
# Only the first uploaded file is used.
file_name = list(uploaded)[0]
# Wrap the raw bytes in an in-memory buffer so pandas can parse them as CSV.
proteins = pd.read_csv(io.BytesIO(uploaded[file_name]))

Mounted at /content/drive


In [None]:
# The UniProt lookup loop further down needs the accessions as a plain Python list.
if 'ID' not in proteins.columns:
    # Report the headers that were actually read so a mislabeled CSV is easy to diagnose.
    raise KeyError(
        f"Expected an 'ID' column in the uploaded file; found columns: {list(proteins.columns)}"
    )
proteinids = proteins['ID'].to_list() #requires list format
print(proteinids)

KeyError: 'ID'

# 1. Extracting MW and Sequence Data from UniProt


---


  Calculation of pI and GRAVY from peptide sequence using Bio.SeqUtils.ProtParam

In [None]:
# %% ============================================================================
# Define Data Retrieval Functions
# ============================================================================
#Function to query MW and AA sequence for proteins in list
def get_mw_and_seq_from_uniprot_id(ID, timeout=30):
    """Fetch molecular weight, sequence and length for one UniProt accession.

    Downloads the UniProtKB flat-text entry for ``ID`` and reads the ``SQ``
    header line (which carries the ``<n> AA`` and ``<n> MW`` fields) together
    with the sequence lines that follow it.

    Parameters
    ----------
    ID : str
        UniProt accession. Leading/trailing whitespace is ignored.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    tuple
        ``(mw, sequence, AA)``. ``mw`` and ``AA`` are ints, or ``None`` when
        the corresponding field is absent from the ``SQ`` line; ``sequence``
        is the residue string with all whitespace removed.
        ``(None, None, None)`` is returned when the ID is unusable, the
        request fails, or the entry has no ``SQ`` line. Problems are reported
        with ``print`` rather than raised, so a batch loop keeps running.
    """
    # Non-string values (e.g. NaN from an empty CSV cell) and blank strings
    # cannot be looked up; report them instead of crashing on .strip().
    if not isinstance(ID, str) or not ID.strip():
        print(f"Invalid UniProt ID: {ID!r}")
        return None, None, None

    ID = ID.strip()
    # HTTPS REST endpoint for the flat-text entry (replaces the legacy
    # http://www.uniprot.org/uniprot/ path).
    url = f"https://rest.uniprot.org/uniprotkb/{ID}.txt"
    try:
        # The context manager closes the connection even if read/decode fails.
        with urlopen(url, timeout=timeout) as response:
            data = response.read().decode()

        # Optional: Print the raw data for inspection
        # print(f"Raw data for {ID}:\n{data}\n{'-'*40}")

        return _parse_uniprot_flat_text(data, ID)

    except Exception as e:
        # Deliberate best-effort: one bad accession must not abort the batch.
        print(f"Error fetching data for UniProt ID {ID}: {e}")
        return None, None, None


def _parse_uniprot_flat_text(data, ID):
    """Extract ``(mw, sequence, AA)`` from the text of a UniProt flat-file entry."""
    # The SQ header is the line that starts with the "SQ" line code.
    sq_header = re.search(r"^SQ .*$", data, flags=re.MULTILINE)
    if sq_header is None:
        print(f"Sequence data not found for UniProt ID {ID}")
        return None, None, None

    header = sq_header.group(0)

    # Use regex to find molecular weight (MW) on the SQ header line
    mw_match = re.search(r"(\d+)\s*MW", header)
    if mw_match:
        mw = int(mw_match.group(1))
    else:
        print(f"Molecular weight not found for {ID}")
        mw = None

    # Use regex to find amino acid length (AA) on the SQ header line
    AA_match = re.search(r"(\d+)\s*AA", header)
    if AA_match:
        AA = int(AA_match.group(1))
    else:
        print(f"Amino acid length not found for {ID}")
        # Explicit None so a missing length no longer discards MW and sequence.
        AA = None

    # Everything after the SQ header is sequence text, up to the "//" terminator line.
    sequence_lines = data[sq_header.end():].split('\n')
    sequence = ''.join(
        line.strip().replace(' ', '')
        for line in sequence_lines
        if line and not line.startswith('//')
    )

    print(f"Extracted sequence for ID {ID}: {sequence}")
    return mw, sequence, AA

#Function to clean formatting of extracted AA sequences
def clean_sequence(sequence):
  """Normalize an amino acid sequence string and validate its residues.

  All whitespace is removed and the result is upper-cased. A ValueError is
  raised if any remaining character is not one of the 20 standard
  one-letter amino acid codes; otherwise the normalized string is returned.
  """
  normalized = ''.join(sequence.split()).upper()
  standard_residues = frozenset("ARNDCEQGHILKMFPSTWYV")

  # Any character outside the standard alphabet makes the sequence unusable
  unexpected = set(normalized) - standard_residues
  if unexpected:
      raise ValueError("Sequence contains invalid characters.")

  return normalized

In [None]:
# %% ============================================================================
# Define Functions for pI and GRAVY calculation
# ============================================================================

# Define functions for calculating pI and GRAVY from a protein sequence
def calculate_isoelectric_point(sequence):
  """Return the isoelectric point (pI) for a protein sequence, or None on failure.

  The sequence is first normalized and validated with clean_sequence, then
  passed to ProteinAnalysis. Any exception along the way is printed and
  converted into a None result instead of being raised.
  """
  try:
    protein = ProteinAnalysis(clean_sequence(sequence))
    return protein.isoelectric_point()
  except Exception as e:
    print(f"Error calculating pI for sequence: {e}")
    return None

def calculate_gravy(sequence):
    """Return the GRAVY (grand average of hydropathicity) score, or None on failure.

    The sequence is first normalized and validated with clean_sequence, then
    passed to ProteinAnalysis. Any exception along the way is printed and
    converted into a None result instead of being raised.
    """
    try:
        protein = ProteinAnalysis(clean_sequence(sequence))
        # GRAVY according to Kyte and Doolittle, 1982
        return protein.gravy()
    except Exception as e:
        print(f"Error calculating GRAVY for sequence: {e}")
        return None



In [None]:
#create dataframe for calculated properties
# One row per protein ID; the property columns start as None and are filled in
# by the extraction loop. Building the frame in a single expression keeps this
# cell idempotent and also works when the ID list is empty.
ProteinInfo = pd.DataFrame({'ID': proteinids}).assign(
    MW=None,
    AA=None,
    pI=None,
    gravy=None,
)

In [None]:
# %% ============================================================================
# Extract and Calculate Protein Data
# ============================================================================

# Loop through protein ID list to extract relevant data; if data is available, calculate specified properties
for index, protein_id in ProteinInfo['ID'].items():
    mw, seq, AA = get_mw_and_seq_from_uniprot_id(protein_id)
    if not (mw and seq):
        # Nothing usable was retrieved; leave this row's placeholder values as they are.
        continue
    ProteinInfo.at[index, 'MW'] = mw
    ProteinInfo.at[index, 'AA'] = AA
    ProteinInfo.at[index, 'pI'] = calculate_isoelectric_point(seq)
    ProteinInfo.at[index, 'gravy'] = calculate_gravy(seq)

# Check dataframe
print(ProteinInfo.head(30))


NameError: name 'ProteinInfo' is not defined

In [None]:
# Show the first 30 rows of the results table (ID, MW, AA, pI, gravy).
print(ProteinInfo.head(30))

         ID      MW    AA        pI     gravy
0    A1A4J1   85292   780  7.012363 -0.088205
1    A1L595   48712   441  5.082907 -0.582766
2    A2I7M9   46237   411  5.666075 -0.122871
3    A2I7N0   46311   411  5.922759 -0.194647
4    A2I7N3   46942   417   5.89775 -0.127338
5    A2VDY3   24948   222  4.614042 -0.818919
6    A2VE23   50549   510  6.641204 -0.752157
7    A3KMV5  117830  1058  5.495842 -0.263705
8    A3KN27   59952   550  8.151012 -0.402182
9    A4FUA8   32932   286  5.532787 -0.666783
10   A4FUZ0   53989   493  5.430989 -0.309128
11   A4FV54   23684   207  9.137056 -0.400483
12  A4FV61     None  None      None      None
13   A4FV68   54699   467  6.509508 -0.300642
14   A4IF97   19692   171  4.723627 -0.778363
15  A5D785     None  None      None      None
16   A5D7B7   87734   760  7.535907 -0.324605
17   A5D7D1  104928   911  5.267065 -0.642151
18  A5D989    31142   280   4.94456     -0.65
19   A5PJM7   51847   447  6.502744 -0.477405
20   A5PK45   71547   623  7.04351