# Setup

In [None]:
# installation of required libraries
!pip install biopandas

In [None]:
# Start from a clean slate: recursively delete every (non-hidden) entry under
# /content. This is destructive and cannot be undone.
# NOTE(review): /content is presumably the Colab working directory - confirm
# before running this anywhere else.
!rm -rf /content/*

# Data exploration

# Load the protein data

In [None]:
from biopandas.pdb import PandasPdb
import pandas as pd

Note: Why is `biopandas` useful for protein analysis?

If you work with PDB files and want to analyze protein structures efficiently in Python, biopandas makes it easier by converting structural data into Pandas DataFrames.

**Key benefits for a computational protein workshop**

- **Easy Data Handling**: Instead of looping through PDB files manually, you can use biopandas to extract atomic coordinates, residue information, and chain details into a structured DataFrame.
- **Fast Filtering**: Need only backbone atoms, a specific chain, or only hydrophobic residues? Use simple pandas filtering instead of complex parsing.
- **Compatible with ML & Visualization**: Since biopandas works with DataFrames, it’s easier to integrate protein structural data into machine learning pipelines or use matplotlib for visualization.
- **No Complex Parsing**: Unlike `Bio.PDB` (which requires object-oriented access), biopandas lets you query PDB data like a spreadsheet, making it more beginner-friendly.


In [None]:
PDB_FILE_LOCATION = 'https://github.com/enveda/modbioterp-enveda/raw/refs/heads/main/workshop_data/cotb2_pp_mg.pdb'
!wget $PDB_FILE_LOCATION -O /content/cotb2_pp_mg.pdb

In [None]:
# Sanity check: list /content to confirm the PDB file was downloaded.
!ls /content

In [None]:
# Parse the downloaded structure. The record tables used below are then
# available as DataFrames via `pdb_file.df[...]` (e.g. "ATOM", "HETATM").
pdb_file = (
    PandasPdb()
    .read_pdb("/content/cotb2_pp_mg.pdb")
)

In [None]:
# Preview the first three rows of the ATOM and HETATM record tables.
display(
    *(pdb_file.df[record_type].head(3) for record_type in ("ATOM", "HETATM"))
)

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

pdb_file.df["ATOM"].groupby("residue_name").size().plot(kind="bar", figsize=(10, 5), color='green')
_ = plt.xticks(rotation=90)

# Extract the sequence from the protein file and write it into a fasta file

In [None]:
# Mapping from three-letter residue names to one-letter amino-acid codes.
# Residue names that are not listed here (ions, ligands, ...) have no
# one-letter code and are therefore absent from the mapping.
aa_dict = {
    # the 20 standard amino acids
    'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
    'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
    'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
    'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
    # force-field specific histidine protonation-state names, all histidine:
    # CHARMM-style (HSD/HSE/HSP) and AMBER-style (HID/HIE/HIP)
    'HSE': 'H', 'HSP': 'H', 'HSD': 'H', 'HIE': 'H', 'HID': 'H',
    'HIP': 'H',
}

In [None]:
# Collapse the per-atom ATOM table to one row per residue.
# A residue number is only unique within a chain (and insertion code), so all
# three are used as the grouping key; grouping on `residue_number` alone would
# merge residues from different chains that share a number.
# `sort=False` keeps residues in the order they appear in the file, which is
# the sequence order. `residue_number` is restored as the index afterwards so
# the table keeps the same index as before.
grouped_amino_acids = (
    pdb_file.df["ATOM"]
    .groupby(["chain_id", "residue_number", "insertion"], sort=False)
    .first()
    .reset_index()
    .set_index("residue_number")
)
grouped_amino_acids.head()

In [None]:
# Translate every residue name to its one-letter code and concatenate them.
# Residue names without an entry in `aa_dict` become NaN here, and NaN cannot
# be joined into a string.
one_letter_codes = grouped_amino_acids["residue_name"].map(aa_dict)
sequence = "".join(one_letter_codes)
display(sequence)

### Answer 🏗

There are `MG` and `POP` (lipids) entries in the pandas DataFrame, and these are not amino acids, so they have no one-letter code in `aa_dict`. If you do

```python
set(grouped_amino_acids["residue_name"]) - set(aa_dict.keys())
```
you should see
```python
{'MG', 'POP'}
```

indicating that you must get rid of these somehow. One easy way is to simply do
```python
sequence = "".join(grouped_amino_acids['residue_name'].map(aa_dict).dropna())
```

In [None]:
# Map residue names to one-letter codes, discard the entries that have no
# code (they come back as NaN), and concatenate the rest into the sequence.
residue_letters = grouped_amino_acids["residue_name"].map(aa_dict)
sequence = "".join(residue_letters.dropna())
display(sequence)

---

In [None]:
# How to write a FASTA file: a ">" header line followed by the sequence.

fasta_location = "cotb2_mg.fasta"
with open(fasta_location, "w", encoding="utf-8") as f:
    # End the record with a newline so the file is a well-formed text file;
    # without it, concatenating FASTA files would glue the next ">" header
    # onto the end of this sequence.
    f.write(f">cotb2_mg\n{sequence}\n")

# Find the amino acid composition (AAC)