In [1]:
# !pip install biopython
# !pip install nglview
# !conda install -c conda-forge nglview

In [2]:
import warnings
# Install a global filter that suppresses every warning raised from here on
# (this also hides deprecation notices from the libraries imported below).
warnings.filterwarnings('ignore')

### The CASP12 dataset is a single huge (12 GB) ASCII text file in which each protein's record follows the previous one. Open the `test` file if you want to inspect the format, as it is only a few MB. Here is a parser that loads the first `n` records into an initial DataFrame.

In [3]:
import pandas as pd

def parse_protein_data(file_path, n=None):
    """Parse up to ``n`` protein records from a bracket-sectioned text file.

    The file is streamed line by line, so only the requested records are held
    in memory. A line starting with ``[`` opens a section named by the text
    between the brackets (``[ID]``, ``[PRIMARY]``, ``[EVOLUTIONARY]``,
    ``[TERTIARY]``, ``[MASK]``); an ``[ID]`` line additionally starts a new
    record. Every following non-blank line is stored under the open section.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to read.
    n : int or None, optional
        Maximum number of records to return. ``None`` (the default) reads the
        whole file; a value <= 0 returns an empty frame without opening the
        file.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``ID``, ``Sequence``, ``Evolutionary``,
        ``Tertiary`` and ``Mask``. ``ID``, ``Sequence`` and ``Mask`` hold the
        list of raw text lines of their section; ``Evolutionary`` and
        ``Tertiary`` hold one list of floats per line of their section.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a line inside a numeric section contains a non-numeric token.
    """
    numeric_sections = ('EVOLUTIONARY', 'TERTIARY')

    # Column accumulators for the resulting DataFrame
    ids = []
    sequences = []
    evolutionary = []
    tertiary = []
    masks = []

    def _store(protein):
        # Append one finished record to the column accumulators.
        ids.append(protein.get('ID', ''))
        sequences.append(protein.get('PRIMARY', ''))
        evolutionary.append(protein.get('EVOLUTIONARY', []))
        tertiary.append(protein.get('TERTIARY', []))
        masks.append(protein.get('MASK', ''))

    unlimited = n is None
    current_protein = {}
    key = None
    count = 0

    if unlimited or n > 0:
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                if not line:
                    # Blank separator lines carry no data; storing them would add
                    # empty strings / empty float rows to the open section.
                    continue

                if line.startswith('[ID]') and current_protein:
                    # A new record begins: flush the one collected so far.
                    _store(current_protein)
                    current_protein = {}
                    count += 1
                    # Stop reading as soon as the requested number is reached
                    if not unlimited and count >= n:
                        break

                if line.startswith('['):
                    # Section header: start an empty bucket for its lines
                    key = line[1:line.find(']')]
                    current_protein[key] = []
                elif current_protein:
                    if key in numeric_sections:
                        # Whitespace-separated numbers -> one list of floats per line
                        current_protein[key].append([float(x) for x in line.split()])
                    else:
                        current_protein[key].append(line)

        # The last record of the file has no following [ID] line to flush it.
        # (After an early break current_protein is empty, so nothing is added.)
        if current_protein:
            _store(current_protein)

    # Create DataFrame
    return pd.DataFrame({
        'ID': ids,
        'Sequence': sequences,
        'Evolutionary': evolutionary,
        'Tertiary': tertiary,
        'Mask': masks
    })

In [4]:
# Usage example
# Location of the text file to parse (relative to the notebook's working directory)
file_path = './data/validation'
number_of_proteins = 300  # Set how many proteins you want to load
# Raw frame: one row per protein, each cell still holds the section's list of lines/rows
protein_df = parse_protein_data(file_path, number_of_proteins)
# Preview the first two parsed records
protein_df.head(2)

Unnamed: 0,ID,Sequence,Evolutionary,Tertiary,Mask
0,[90#2WXZ_2_C],[DRVYIHPFHLLYYSKSTCAQLENPSVETLPEPTFEPVPIQAKTSP...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[-----+++++++++++++++-------++++++++++++++++++...
1,[30#3U88_2_M],[SRWRFPARPGTGRRGLGGAPRQRVPALLRVGPGFDAALQVSAAIG...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.785860237024...","[[0.0, 0.0, 0.0, -6941.7, -7005.4, -6965.1, -6...",[-++++++++++++++++++++++++++++++++++++++++++++...


### We need to do some further extraction to get the right format:

In [5]:
def process_dataframe(df):
    """Reshape the raw parsed frame into flat per-axis / per-amino-acid columns.

    The input frame is left untouched: the work is done on a copy, so the
    function can be called repeatedly on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns ``ID``, ``Sequence``, ``Mask`` (each cell an iterable
        of strings) and ``Tertiary``, ``Evolutionary`` (each cell a list of rows).

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``ID``, ``Sequence``, ``Mask``, ``x``, ``y``,
        ``z``, then ``A`` ... ``Y`` and ``Info``:

        * ``ID`` / ``Sequence`` - the cell's strings joined into one string.
        * ``Mask`` - joined the same way, with ``+`` replaced by ``1`` and
          ``-`` replaced by ``0``.
        * ``x``, ``y``, ``z`` - rows 0, 1 and 2 of ``Tertiary``.
        * ``A`` ... ``Info`` - rows 0 to 20 of ``Evolutionary``.

        A row that is missing in the source cell yields ``None``.
    """
    coord_columns = ['x', 'y', 'z']
    aa_columns = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y','Info']

    # Work on a copy so the caller's frame keeps its raw columns
    out = df.copy()

    # Convert ID, Sequence, and Mask to plain strings
    out['ID'] = df['ID'].apply(lambda x: ''.join(x))
    out['Sequence'] = df['Sequence'].apply(lambda x: ''.join(x))
    out['Mask'] = df['Mask'].apply(lambda x: ''.join(x).replace('+', '1').replace('-', '0'))

    # Expand tertiary coordinates: one column per axis
    for i, col in enumerate(coord_columns):
        out[col] = df['Tertiary'].apply(lambda x: x[i] if len(x) > i else None)

    # Expand evolutionary data: one column per row of the section
    for i, col in enumerate(aa_columns):
        out[col] = df['Evolutionary'].apply(lambda x: x[i] if len(x) > i else None)

    # Return the frame without the original nested columns
    return out.drop(columns=['Tertiary', 'Evolutionary'])

In [6]:
# `protein_df` is the raw frame produced by parse_protein_data in the cell above
# Example usage:
processed_df = process_dataframe(protein_df)
processed_df.head(2)

Unnamed: 0,ID,Sequence,Mask,x,y,z,A,C,D,E,...,N,P,Q,R,S,T,V,W,Y,Info
0,90#2WXZ_2_C,DRVYIHPFHLLYYSKSTCAQLENPSVETLPEPTFEPVPIQAKTSPV...,0000011111111111111100000001111111111111111111...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.012...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.237...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.012...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.040...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.021...","[0.0, 0.0, 0.9633596392333709, 0.0, 0.48795669...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.9787755102040815, 0.0, 0.0, ...","[0.9999999988691709, 0.9999999988691709, 0.939..."
1,30#3U88_2_M,SRWRFPARPGTGRRGLGGAPRQRVPALLRVGPGFDAALQVSAAIGT...,0111111111111111111111111111111111111111111111...,"[0.0, 0.0, 0.0, -6941.7, -7005.4, -6965.1, -68...","[0.0, 0.0, 0.0, 5551.7, 5538.0, 5409.6, 5380.7...","[0.0, 0.0, 0.0, 2541.6, 2410.3, 2334.6, 2325.8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7858602370249...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.01074413...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.76283326...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2141397629750...","[0.0, 0.0, 0.06993736951983298, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.9300626304801669, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.9999999988691709, 0.9999999988691709, 0.915..."


## Data description:
- ID is the protein name
- Sequence is the amino acid sequence using the single letter code
- Mask (same length as sequence) is a binary flag per amino acid that indicates whether coordinates are available for that residue: `1` (originally `+`) means the residue's atoms were resolved in the structure, whereas `0` (originally `-`) means they are missing, in which case the corresponding x/y/z entries are zero-filled placeholders (see the leading zeros in the rows shown above).
- XYZ are the coordinates of each atom (note there are 3 atoms per amino acid, Nitrogen, alpha-Carbon, and Carbon, so there are 3 times as many coordinates as amino acids in the sequence)
- The 20 columns A-Y are normalized PSSM information for each amino acid. This is essentially a sparse matrix where there is a row for each position in the protein sequence, and a column for each of the 20 amino acids.
- Info contains the information content for that residue (unclear what this actually is)
## Note:
Proteins are chains of amino acids, and each amino acid has a backbone of three atoms, N-C-C, which are always linked in the N to C direction: 

`Beginning of protein sequence amino acids:       1       2       3    ... etc`

`Beginning of protein sequence backbone atoms: (N-C-C)-(N-C-C)-(N-C-C)-... etc`

So in our final output, we need to have xyz coordinates for each of the three atoms, in order, for every amino acid.

## NGL - load a sample protein to get the parser working 

https://files.rcsb.org/download/5YHX.pdb

In [7]:
import nglview as nv
from Bio.PDB import PDBParser

# Load the downloaded sample PDB file into a Biopython structure named 'Sample'
parser = PDBParser()
structure = parser.get_structure('Sample', './data/5yhx.pdb')

# Create NGLView widget for the structure and remove its default representations
view = nv.show_biopython(structure)
view.clear_representations()

# Add molecular graphics: blue cartoon for the protein selection, ball+stick for the ligand selection
view.add_representation('cartoon', selection='protein', color='blue')
view.add_representation('ball+stick', selection='ligand')

# Last expression of the cell: display the widget
view



NGLWidget()

## Here is a parser to turn a row of our dataframe into a PDB file that can be viewed with NGL. Note that our dataset doesn't have info on the secondary structure, so these structures will not appear bolded in the output. The PDB specification is available here:

https://www.biostat.jhsph.edu/~iruczins/teaching/260.655/links/pdbformat.pdf

In [8]:
import pandas as pd

def format_pdb_from_df(row, skip_masked=False):
    """Render the backbone of one processed protein row as PDB ``ATOM`` records.

    Three records (N, CA, C) are written per residue, all on chain ``A`` with
    fixed occupancy 1.00 and temperature factor 50.00. Residue numbers are the
    1-based positions in the sequence.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Sequence'`` (string of one-letter codes) and ``'x'``,
        ``'y'``, ``'z'`` (flat sequences with three values per residue, in
        N, CA, C order). ``'Mask'`` is read only when ``skip_masked`` is true.
    skip_masked : bool, optional
        When true, residues whose mask character is ``'0'`` or ``'-'`` are
        left out, so they are not written as atoms at the origin. Residue
        numbers of the remaining residues are unchanged; atom serial numbers
        stay consecutive. Default ``False`` writes every residue.

    Returns
    -------
    str
        The concatenated, newline-terminated ``ATOM`` lines (empty string for
        an empty sequence).

    Raises
    ------
    IndexError
        If a coordinate list holds fewer than three values per written residue.
    """
    chain_id = 'A'  # Assuming a single chain for simplicity
    occupancy = 1.00
    t_factor = 50.00

    atom_types = ['N', 'CA', 'C']  # Backbone atoms
    element_types = {'N': 'N', 'CA': 'C', 'C': 'C'}  # Elements for atoms
    residue_mapping = {'A': 'ALA','R': 'ARG','N': 'ASN','D': 'ASP','C': 'CYS','E': 'GLU','Q': 'GLN','G': 'GLY',
                       'H': 'HIS','I': 'ILE','L': 'LEU','K': 'LYS','M': 'MET','F': 'PHE','P': 'PRO','S': 'SER',
                       'T': 'THR','W': 'TRP','Y': 'TYR','V': 'VAL' } # PDB requires the 3 letter codes for amino acids

    sequence = row['Sequence']
    xs, ys, zs = row['x'], row['y'], row['z']
    mask = row['Mask'] if skip_masked else ''

    lines = []
    atom_count = 1  # Serial number of the next ATOM record
    for i, letter in enumerate(sequence):
        res_num = i + 1  # Residue number follows the sequence position
        if skip_masked and i < len(mask) and mask[i] in ('0', '-'):
            continue

        # Letters outside the 20 standard codes are written as an unknown residue
        # instead of aborting the whole export with a KeyError.
        residue = residue_mapping.get(letter, 'UNK')

        # Loop over each backbone atom type
        for j, atom_type in enumerate(atom_types):
            # Index to pull the correct coordinates from flattened list
            idx = 3 * i + j
            # Stored values are scaled down by 100 for the PDB coordinate fields
            # (presumably picometres -> angstroms; confirm against the dataset docs).
            x = xs[idx]/100
            y = ys[idx]/100
            z = zs[idx]/100
            element = element_types[atom_type]

            lines.append(f"ATOM  {atom_count:>5}  {atom_type:<2}  {residue:>3} {chain_id}{res_num:>4}    {x:>8.3f}{y:>8.3f}{z:>8.3f}{occupancy:>6.2f}{t_factor:>6.2f}          {element:>2}  \n")
            atom_count += 1

    # Joining once avoids rebuilding the growing string for every atom
    return ''.join(lines)

In [9]:
# Take the first processed protein as the example record
protein_row = processed_df.iloc[0]

# Render its backbone as PDB ATOM records and print them for inspection
pdb_content = format_pdb_from_df(protein_row)
print(pdb_content)

# Optionally, write to a file (overwrites any existing sample_protein.pdb)
with open("./data/sample_protein.pdb", "w") as file:
    file.write(pdb_content)

ATOM      1  N   ASP A   1       0.000   0.000   0.000  1.00 50.00           N  
ATOM      2  CA  ASP A   1       0.000   0.000   0.000  1.00 50.00           C  
ATOM      3  C   ASP A   1       0.000   0.000   0.000  1.00 50.00           C  
ATOM      4  N   ARG A   2       0.000   0.000   0.000  1.00 50.00           N  
ATOM      5  CA  ARG A   2       0.000   0.000   0.000  1.00 50.00           C  
ATOM      6  C   ARG A   2       0.000   0.000   0.000  1.00 50.00           C  
ATOM      7  N   VAL A   3       0.000   0.000   0.000  1.00 50.00           N  
ATOM      8  CA  VAL A   3       0.000   0.000   0.000  1.00 50.00           C  
ATOM      9  C   VAL A   3       0.000   0.000   0.000  1.00 50.00           C  
ATOM     10  N   TYR A   4       0.000   0.000   0.000  1.00 50.00           N  
ATOM     11  CA  TYR A   4       0.000   0.000   0.000  1.00 50.00           C  
ATOM     12  C   TYR A   4       0.000   0.000   0.000  1.00 50.00           C  
ATOM     13  N   ILE A   5  

In [10]:
import nglview as nv
from Bio.PDB import PDBParser

# Load the PDB file written by the previous cell into a Biopython structure named 'Sample'
parser = PDBParser()
structure = parser.get_structure('Sample', './data/sample_protein.pdb')

# Create NGLView widget for the structure and remove its default representations
view = nv.show_biopython(structure)
view.clear_representations()

# Add molecular graphics: blue cartoon for the protein selection, ball+stick for the ligand selection
view.add_representation('cartoon', selection='protein', color='blue')
view.add_representation('ball+stick', selection='ligand')

# Last expression of the cell: display the widget
view

NGLWidget()