In [1]:
import requests
import os
import json
import pandas as pd
import numpy as np
from Bio.PDB import PDBParser
import py3Dmol



# Defining Functions

In [2]:
# Combined function to get protein data and export files
def fetch_protein_data(uniprot_id, output_dirs):
    """Fetch the AlphaFold prediction record for one accession and save its files.

    Requests the prediction metadata for ``uniprot_id`` from the AlphaFold API,
    then writes up to three files:

    * ``<uniprot_id>.pdb`` into ``output_dirs['pdb']`` (when the record has a ``pdbUrl``)
    * ``<uniprot_id>_image.png`` into ``output_dirs['images']`` (when it has a ``paeImageUrl``)
    * ``<uniprot_id>_details.txt`` into ``output_dirs['text']`` (the record as indented JSON)

    Parameters
    ----------
    uniprot_id : str
        Accession inserted into the API URL and used as the file-name stem.
    output_dirs : dict
        Must provide the keys ``'pdb'``, ``'images'`` and ``'text'``; the
        directories are expected to exist already.

    Returns
    -------
    dict or None
        The first prediction record, or ``None`` when the metadata or the PDB
        file could not be retrieved (a message is printed in that case).
    """
    api_url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}"
    request_timeout = 10  # seconds, applied to every request so no download can hang the loop

    try:
        # Get protein details
        response = requests.get(api_url, timeout=request_timeout)
        response.raise_for_status()
        predictions = response.json()
        # The first list element is used; guard against an empty or non-list payload.
        if not isinstance(predictions, list) or not predictions:
            print(f"Error fetching data for {uniprot_id}: no prediction entries returned")
            return None
        protein_data = predictions[0]

        # Export PDB file. A failed download must not be written to disk as if it
        # were a structure, so the status is checked before the file is opened.
        pdb_url = protein_data.get('pdbUrl')
        if pdb_url:
            pdb_response = requests.get(pdb_url, timeout=request_timeout)
            pdb_response.raise_for_status()
            with open(os.path.join(output_dirs['pdb'], f"{uniprot_id}.pdb"), 'wb') as pdb_file:
                pdb_file.write(pdb_response.content)

        # Export image file. The image is auxiliary, so a failure here is reported
        # but does not discard the protein.
        image_url = protein_data.get('paeImageUrl')
        if image_url:
            try:
                image_response = requests.get(image_url, timeout=request_timeout)
                image_response.raise_for_status()
            except requests.RequestException as e:
                print(f"Could not download image for {uniprot_id}: {e}")
            else:
                with open(os.path.join(output_dirs['images'], f"{uniprot_id}_image.png"), 'wb') as image_file:
                    image_file.write(image_response.content)

        # Save protein data as a text file
        with open(os.path.join(output_dirs['text'], f"{uniprot_id}_details.txt"), 'w') as text_file:
            json.dump(protein_data, text_file, indent=2)

        return protein_data
    except requests.RequestException as e:
        print(f"Error fetching data for {uniprot_id}: {e}")
        return None

In [3]:
# Extract helical details for each protein
def extract_helical_regions(uniprot_id):
    """Return the helical transmembrane regions UniProt lists for an accession.

    Queries the UniProt REST API for the topological-domain and transmembrane
    features of ``uniprot_id`` and keeps the features whose type is
    ``'Transmembrane'`` and whose description contains ``'Helical'``.

    Parameters
    ----------
    uniprot_id : str
        Accession inserted into the request URL.

    Returns
    -------
    list of (start, end) tuples, or None
        One tuple per helical transmembrane feature, in the order the API
        returns them. ``None`` when the request fails or does not answer with
        HTTP 200 (a message is printed in that case).
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json?fields=ft_topo_dom%2Cft_transmem"
    try:
        # Timeout matches the one used for the AlphaFold metadata request.
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        # Connection problems are reported the same way as a bad status code,
        # so callers only have to handle the None return.
        print(f"Error fetching data for {uniprot_id}: {e}")
        return None
    if response.status_code != 200:
        print(f"Error fetching data for {uniprot_id}")
        return None

    data = response.json()
    tm_regions = []
    for feature in data.get('features', []):
        if feature.get('type') != 'Transmembrane' or 'Helical' not in feature.get('description', ''):
            continue
        location = feature.get('location', {})
        start = location.get('start', {}).get('value')
        end = location.get('end', {}).get('value')
        # Skip features without both position values; a None bound cannot be
        # compared against residue numbers later on.
        if start is None or end is None:
            continue
        tm_regions.append((start, end))
    return tm_regions

In [4]:
# Generate a dataframe with start and end positions of TMHs
def create_helical_df(uniprot_ids, max_helices=7):
    """Build a table of transmembrane-helix start/end positions, one row per protein.

    Calls ``extract_helical_regions`` for every accession. Proteins for which it
    returns ``None`` are left out of the table.

    Parameters
    ----------
    uniprot_ids : iterable of str
        Accessions to look up.
    max_helices : int, default 7
        Number of helix slots in the table. Proteins with fewer regions are
        padded with ``None``; regions beyond this count are dropped with a
        printed warning.

    Returns
    -------
    pandas.DataFrame
        Columns are ``'protein'``, then ``'TMH 1 Start'`` .. ``'TMH <n> Start'``,
        then ``'TMH 1 End'`` .. ``'TMH <n> End'``.
    """
    start_columns = [f'TMH {i+1} Start' for i in range(max_helices)]
    end_columns = [f'TMH {i+1} End' for i in range(max_helices)]
    column_names = ['protein'] + start_columns + end_columns

    data = []
    for uniprot_id in uniprot_ids:
        tm_regions = extract_helical_regions(uniprot_id)
        if tm_regions is None:
            # The lookup failed and was already reported; do not claim success.
            continue
        if len(tm_regions) > max_helices:
            print(f"Warning: {uniprot_id} has {len(tm_regions)} helical regions; keeping the first {max_helices}")
            tm_regions = tm_regions[:max_helices]

        # Pad to maintain consistent column count
        padding = [None] * (max_helices - len(tm_regions))
        # All starts first, then all ends, so each value lands under the
        # column whose name describes it.
        starts = [start for start, _ in tm_regions] + padding
        ends = [end for _, end in tm_regions] + padding
        data.append([uniprot_id] + starts + ends)
        print(f"Successfully extracted helical details for {uniprot_id}")
    return pd.DataFrame(data, columns=column_names)

In [5]:
# Parse C-alpha coordinates and create final dataframe
def parse_coordinates(pdb_file, tmh_positions):
    """Collect C-alpha coordinates of each transmembrane helix into a one-row frame.

    The structure in ``pdb_file`` is read with ``PDBParser``. For every
    ``(start, end)`` pair in ``tmh_positions``, residues whose sequence number
    lies in the inclusive range and which contain a ``"CA"`` atom contribute
    that atom's three coordinate values.

    Parameters
    ----------
    pdb_file : str
        Path handed to the parser; also stored unchanged in the ``'protein'`` column.
    tmh_positions : iterable of (start, end)
        Inclusive residue-number ranges; the n-th pair is labelled ``'TMH n'``.

    Returns
    -------
    pandas.DataFrame
        A single row with ``'protein'`` followed by columns named
        ``'TMH <n> Atom <k> X'`` / ``'... Y'`` / ``'... Z'``.
    """
    structure = PDBParser(QUIET=True).get_structure("protein", pdb_file)
    axis_labels = ["X", "Y", "Z"]

    # Seven helix slots are registered up front so they keep their position in
    # the column order even when fewer ranges are supplied.
    coords = {f'TMH {slot}': [] for slot in range(1, 8)}

    for helix_number, (start, end) in enumerate(tmh_positions, start=1):
        flat_values = []
        for model in structure:
            for chain in model:
                for residue in chain.get_residues():
                    inside_helix = start <= residue.id[1] <= end
                    if not inside_helix or "CA" not in residue:
                        continue
                    # coord holds the atom's three values; they are appended in order.
                    flat_values.extend(residue["CA"].coord)
        coords[f'TMH {helix_number}'] = flat_values

    # Flatten coordinates into dataframe format
    record = {'protein': pdb_file}
    for helix_label, values in coords.items():
        for position, value in enumerate(values):
            # Every run of three consecutive values belongs to one atom.
            atom_index, axis_index = divmod(position, 3)
            record[f'{helix_label} Atom {atom_index + 1} {axis_labels[axis_index]}'] = value
    return pd.DataFrame([record])

In [6]:
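# # 3D visualization (disabled: earlier variant that re-downloads every PDB; superseded by the local-file version in the next cell)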
# def visualize_proteins(uniprot_ids, output_dirs):
#     os.makedirs(output_dirs['pdb'], exist_ok=True)
#     os.makedirs(output_dirs['images'], exist_ok=True)
#     os.makedirs(output_dirs['text'], exist_ok=True)

#     for uniprot_id in uniprot_ids:
#         protein_data = fetch_protein_data(uniprot_id, output_dirs)
        
#         if protein_data:
#             # Show 3D visualization
#             pdb_url = protein_data.get('pdbUrl')
#             if pdb_url:
#                 pdb_data = requests.get(pdb_url).text
#                 view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js')
#                 view.addModel(pdb_data, 'pdb')
#                 view.setStyle({'cartoon': {'color': 'spectrum'}})
#                 view.zoomTo()
#                 display(view)

In [7]:
# 3D visualization
def visualize_proteins(uniprot_ids, output_dirs):
    """Display an interactive 3D cartoon view for each protein's saved PDB file.

    Reads ``<uniprot_id>.pdb`` from ``output_dirs['pdb']`` (the same directory
    the download step writes to) and displays it with py3Dmol. Accessions
    without a file are reported and skipped.

    Parameters
    ----------
    uniprot_ids : iterable of str
        Accessions whose structures should be shown.
    output_dirs : dict
        Must provide the key ``'pdb'``; the directory is created if missing.

    Returns
    -------
    None
    """
    pdb_dir = output_dirs['pdb']
    os.makedirs(pdb_dir, exist_ok=True)

    for uniprot_id in uniprot_ids:
        # Look in the configured directory rather than a fixed folder name, so
        # the viewer follows whatever location the caller passed in.
        pdb_file_path = os.path.join(pdb_dir, f"{uniprot_id}.pdb")

        if not os.path.exists(pdb_file_path):
            print(f"PDB file for {uniprot_id} not found in {pdb_dir} folder.")
            continue

        with open(pdb_file_path, 'r') as pdb_file:
            pdb_data = pdb_file.read()

        # Show 3D visualization
        view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js')
        view.addModel(pdb_data, 'pdb')
        view.setStyle({'cartoon': {'color': 'spectrum'}})
        view.zoomTo()
        display(view)


In [8]:
# Final dataframe
def process_proteins(uniprot_ids, output_dirs):
    """Download every protein and combine their helix C-alpha coordinates.

    For each accession this calls ``fetch_protein_data`` (which saves the
    files), ``extract_helical_regions`` and ``parse_coordinates``, and stacks
    the resulting one-row frames. Proteins are skipped, with a printed reason,
    when the download fails, no helical regions are found, or no PDB file is
    present in ``output_dirs['pdb']``.

    Parameters
    ----------
    uniprot_ids : iterable of str
        Accessions to process.
    output_dirs : dict
        Must provide the keys ``'pdb'``, ``'images'`` and ``'text'``; the
        directories are created if missing.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed protein, with the accession in the
        ``'protein'`` column. An empty frame with only a ``'protein'`` column
        when nothing could be processed.
    """
    for key in ('pdb', 'images', 'text'):
        os.makedirs(output_dirs[key], exist_ok=True)

    # Final dataframe for all proteins
    final_coord_dfs = []
    for uniprot_id in uniprot_ids:
        protein_data = fetch_protein_data(uniprot_id, output_dirs)
        if not protein_data:
            print(f"Skipped {uniprot_id}: prediction data could not be fetched")
            continue

        # Extract and parse TMH coordinates
        tmh_positions = extract_helical_regions(uniprot_id)
        if not tmh_positions:
            print(f"Skipped {uniprot_id}: no helical transmembrane regions available")
            continue

        pdb_file = os.path.join(output_dirs['pdb'], f"{uniprot_id}.pdb")
        # The prediction record may lack a PDB link, in which case no file was saved.
        if not os.path.exists(pdb_file):
            print(f"Skipped {uniprot_id}: no PDB file at {pdb_file}")
            continue

        coord_df = parse_coordinates(pdb_file, tmh_positions)
        # Label the row with the accession directly instead of recovering it
        # from the file path, which depends on the platform's path separator.
        final_coord_dfs.append(coord_df.assign(protein=uniprot_id))
        print(f"Successfully processed {uniprot_id}!")

    # pd.concat rejects an empty list, so return an empty table instead.
    if not final_coord_dfs:
        return pd.DataFrame(columns=['protein'])

    # Combine all parsed coordinates into a single dataframe
    return pd.concat(final_coord_dfs, ignore_index=True)

# Data Extraction

In [9]:
# Load the GPCR target table from a CSV file in the working directory.
file_path = 'GPCRTargets.csv'
# header=1: the second line of the file supplies the column names; the line above it is not used.
gpcr_targets_df = pd.read_csv(file_path, header=1)
# Preview the first rows only.
gpcr_targets_df.head()

Unnamed: 0,Type,Family id,Family name,Target id,Target name,Subunits,Target systematic name,Target abbreviated name,synonyms,HGNC id,...,Rat SwissProt,Rat Entrez Gene,MGI id,MGI symbol,MGI name,Mouse genetic localisation,Mouse nucleotide RefSeq,Mouse protein RefSeq,Mouse SwissProt,Mouse Entrez Gene
0,gpcr,16,Class A Orphans,83,<i>GPR3</i>,,,,Gpcr21|GPCR3|ACCA orphan receptor|adenylate cy...,4484.0,...,Q8K1Q3,266769.0,MGI:101908,Gpr3,G-protein coupled receptor 3,4 D2.3,NM_008154,NP_032180,P35413,14748.0
1,gpcr,16,Class A Orphans,84,<i>GPR4</i>,,,,GPR19|G-protein coupled receptor 19,4497.0,...,Q4KLH9,308408.0,MGI:2441992,Gpr4,G protein-coupled receptor 4,7 A3,NM_175668,NP_783599,Q8BUD0,319197.0
2,gpcr,16,Class A Orphans,228,<i>GPR42</i>,,,,FFAR1L|GPR41L|FFAR3L|G protein-coupled recepto...,4500.0,...,,,,,,,,,,
3,gpcr,16,Class A Orphans,85,<i>GPR6</i>,,,,Sphingosine 1-phosphate receptor GPR6,4515.0,...,P51651,83683.0,MGI:2155249,Gpr6,G protein-coupled receptor 6,10 22.08 cM,NM_199058,NP_951013,Q6YNI2,140741.0
4,gpcr,16,Class A Orphans,86,<i>GPR12</i>,,,,Gpcr01|Gpcr20|GPCR21|GPCR12|R334,4466.0,...,P30951,80840.0,MGI:101909,Gpr12,G-protein coupled receptor 12,5 G3,NM_008151,NP_032177,P35412,14738.0


In [10]:
# Take the non-missing entries of the 'Human SwissProt' column as a plain list;
# this list is what the fetch/extract/visualize functions receive as accessions.
uniprot_ids = gpcr_targets_df['Human SwissProt'].dropna().tolist()
# NOTE(review): this echoes the entire list into the cell output — consider showing a slice and len() instead.
uniprot_ids

['P46089',
 'P46093',
 'O15529',
 'P46095',
 'P47775',
 'P49685',
 'Q13304',
 'Q15760',
 'Q99678',
 'Q99679',
 'Q99680',
 'O00155',
 'Q8NDV2',
 'Q9NS67',
 'O00270',
 'O75388',
 'Q49SQ1',
 'Q9UPC5',
 'Q9HC97',
 'O15354',
 'O60883',
 'O43194',
 'Q9Y5Y3',
 'Q13585',
 'Q9Y2T5',
 'Q9BZJ8',
 'Q9BZJ7',
 'Q9BZJ6',
 'Q8IYL9',
 'Q15743',
 'O95800',
 'Q96P69',
 'Q96P67',
 'Q9NYM4',
 'Q9NQS5',
 'P60893',
 'Q9BY21',
 'Q9GZN0',
 'Q96P66',
 'Q9UNW8',
 'Q8IZ08',
 'Q6DWJ6',
 'Q7Z602',
 'Q7Z601',
 'Q96CH1',
 'Q8TDV2',
 'Q86SP6',
 'Q8NGU9',
 'Q8TDV0',
 'Q8TDT2',
 'Q6NV75',
 'Q9UJ42',
 'Q8N6U8',
 'Q16538',
 'O14626',
 'Q9NS66',
 'Q9BXC1',
 'Q14439',
 'O15218',
 'P32249',
 'Q9BXB1',
 'O75473',
 'Q9HBX8',
 'P04201',
 'P35410',
 'Q8TDS7',
 'Q86SM8',
 'Q96AM1',
 'Q86SM5',
 'Q96LB2',
 'Q96LB1',
 'Q96LB0',
 'Q96LA9',
 'Q86VZ1',
 'O00398',
 'Q9P1P5',
 'Q9P1P4',
 'O14804',
 'Q96RI8',
 'Q969N4',
 'Q96RI9',
 'Q8NFN8',
 'Q5T848',
 'Q6PRD1',
 'Q8NFJ5',
 'Q9NZH0',
 'Q9NQ84',
 'Q9NZD1',
 'Q5T6X5',
 'P04000',
 'P04001',

In [12]:
# Define output directories for files.
# Keys are the ones the functions above look up: 'pdb' receives the .pdb files,
# 'images' the *_image.png files and 'text' the *_details.txt files.
output_dirs = {
    'pdb': 'PDB_Files',
    'images': 'Images',
    'text': 'Text_Files'
}

# Generate TMH dataframe (one UniProt request per accession, so this cell needs network access).
tmh_df = create_helical_df(uniprot_ids)

Successfully extracted helical details for P46089
Successfully extracted helical details for P46093
Successfully extracted helical details for O15529
Successfully extracted helical details for P46095
Successfully extracted helical details for P47775
Successfully extracted helical details for P49685
Successfully extracted helical details for Q13304
Successfully extracted helical details for Q15760
Successfully extracted helical details for Q99678
Successfully extracted helical details for Q99679
Successfully extracted helical details for Q99680
Successfully extracted helical details for O00155
Successfully extracted helical details for Q8NDV2
Successfully extracted helical details for Q9NS67
Successfully extracted helical details for O00270
Successfully extracted helical details for O75388
Successfully extracted helical details for Q49SQ1
Successfully extracted helical details for Q9UPC5
Successfully extracted helical details for Q9HC97
Successfully extracted helical details for O15354


In [13]:
# Generate final parsed coordinates dataframe.
# Downloads the files for every accession into output_dirs and stacks the per-protein coordinate rows.
final_coord_df = process_proteins(uniprot_ids, output_dirs)

Successfully processed P46089!
Successfully processed P46093!
Successfully processed O15529!
Successfully processed P46095!
Successfully processed P47775!
Successfully processed P49685!
Successfully processed Q13304!
Successfully processed Q15760!
Successfully processed Q99678!
Successfully processed Q99679!
Successfully processed Q99680!
Successfully processed O00155!
Successfully processed Q8NDV2!
Successfully processed Q9NS67!
Successfully processed O00270!
Successfully processed O75388!
Successfully processed Q49SQ1!
Successfully processed Q9UPC5!
Successfully processed Q9HC97!
Successfully processed O15354!
Successfully processed O60883!
Successfully processed O43194!
Successfully processed Q9Y5Y3!
Successfully processed Q13585!
Successfully processed Q9Y2T5!
Successfully processed Q9BZJ8!
Successfully processed Q9BZJ7!
Successfully processed Q9BZJ6!
Successfully processed Q8IYL9!
Successfully processed Q15743!
Successfully processed O95800!
Successfully processed Q96P69!
Successf

In [None]:
# Visualize each protein from its saved PDB file (one py3Dmol view is displayed per accession found on disk).
visualize_proteins(uniprot_ids, output_dirs)

# Data Preprocessing