# Downloads PDB - Pfam Domain Mappings
**[Work in progress]**

This notebook downloads Pfam domain information for PDB structures.

Data source: [Pfam](https://pfam.xfam.org/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
from pathlib import Path

In [2]:
# Render frames in full: no limit on the number of rows or columns displayed
pd.options.display.max_rows = pd.options.display.max_columns = None

In [3]:
# Resolve the Neo4j import directory from the NEO4J_IMPORT environment variable.
# os.getenv returns None when the variable is unset, and Path(None) would fail
# with an unhelpful TypeError, so check explicitly and name what is missing.
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if neo4j_import_dir is None:
    raise EnvironmentError(
        "Environment variable NEO4J_IMPORT is not set; "
        "set it to the Neo4j import directory before running this notebook."
    )
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Download data

In [4]:
# Location of the PDB-to-Pfam mapping file on the EBI FTP server
pdb_pfam_url = "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/mappings/pdb_pfam_mapping.txt"

In [8]:
# Load the tab-separated PDB-to-Pfam mapping with every column read as a string.
# skiprows=1 drops the first line of the file, so the following line supplies
# the column names.
# keep_default_na=False: dtype=str alone does not switch off pandas' default
# missing-value parsing, so a literal value such as a chain ID of "NA" would be
# read as NaN and the row would later be lost when the pdbChainId key is built
# and merged. With this flag no string is parsed as NaN; empty fields are read
# as '' (the same value the later fillna('') would produce).
df = pd.read_csv(
    pdb_pfam_url,
    sep='\t',
    dtype=str,
    skiprows=1,
    keep_default_na=False,
)

In [9]:
# Preview the first five rows of the downloaded mapping
df.head(n=5)

Unnamed: 0,PDB,CHAIN,PDB_START,PDB_END,PFAM_ACCESSION,PFAM_NAME
0,101m,A,7,113,PF00042,Globin
1,102m,A,7,113,PF00042,Globin
2,103m,A,7,113,PF00042,Globin
3,104m,A,6,112,PF00042,Globin
4,105m,A,6,112,PF00042,Globin


### Assign CURIEs and standard names

In [10]:
# Build the Pfam CURIE from the part of the accession before the first '.'
accession_parts = df['PFAM_ACCESSION'].str.split('.', expand=True)
df['accession'] = 'pfam:' + accession_parts[0]

In [11]:
# Build the chain-level CURIE in the form pdb:<PDB>.<CHAIN>
pdb_ids = df['PDB']
chain_ids = df['CHAIN']
df['pdbChainId'] = 'pdb:' + pdb_ids + '.' + chain_ids

In [12]:
# Rename the Pfam columns to the standard property names in a single pass
standard_names = {
    'PFAM_NAME': 'name',
    'PDB_START': 'start',
    'PDB_END': 'end',
}
df.rename(columns=standard_names, inplace=True)

In [13]:
# Preview the first five rows after adding the CURIEs and renaming columns
df.head(n=5)

Unnamed: 0,PDB,CHAIN,start,end,PFAM_ACCESSION,name,accession,pdbChainId
0,101m,A,7,113,PF00042,Globin,pfam:PF00042,pdb:101m.A
1,102m,A,7,113,PF00042,Globin,pfam:PF00042,pdb:102m.A
2,103m,A,7,113,PF00042,Globin,pfam:PF00042,pdb:103m.A
3,104m,A,6,112,PF00042,Globin,pfam:PF00042,pdb:104m.A
4,105m,A,6,112,PF00042,Globin,pfam:PF00042,pdb:105m.A


### Add the PDB entity ID to each domain to identify identical chains (Pfam annotates only one chain of each set of identical chains in a PDB entry)

In [14]:
# Read the PDB chain table from the Neo4j import directory (all columns as
# strings) and keep only the join key and the entity id.
chain_columns = ['pdbChainId', 'entityId']
chains = pd.read_csv(NEO4J_IMPORT / "01f-PDBChain.csv", dtype='str')[chain_columns]

In [15]:
# Attach the entity id of each annotated chain. This is an inner join, so Pfam
# rows whose pdbChainId is absent from `chains` are dropped; report how many
# rows have a match so that a key mismatch cannot go unnoticed.
# NOTE(review): the head() output recorded after this merge shows a single row,
# for an all-digit PDB ID (1914). Possibly the letter case of PDB IDs differs
# between the two tables - confirm against 01f-PDBChain.csv.
has_chain = df['pdbChainId'].isin(chains['pdbChainId'])
print(f"{has_chain.sum()} of {len(df)} Pfam domain rows have a matching PDB chain")

# Replace any remaining missing values with empty strings.
df = df.merge(chains, on='pdbChainId').fillna('')

In [16]:
# Preview the first five rows of the merged table
df.head(n=5)

Unnamed: 0,PDB,CHAIN,start,end,PFAM_ACCESSION,name,accession,pdbChainId,entityId
0,1914,A,23,113,PF02290,SRP14,pfam:PF02290,pdb:1914.A,1


In [17]:
# Write the table to the Neo4j import directory, omitting the index column
pfam_domain_csv = NEO4J_IMPORT / "01g-PfamDomainPDB.csv"
df.to_csv(pfam_domain_csv, index=False)