# Downloads PDB - Pfam Domain Mappings
**[Work in progress]**

This notebook downloads Pfam domain information for PDB structures.

Data source: [Pfam](https://pfam.xfam.org/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
import dask
import dask.dataframe as dd
from pathlib import Path

In [2]:
# Directory that receives the CSV files for Neo4j import. It is taken from the
# NEO4J_IMPORT environment variable so no machine-specific path is hardcoded.
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if not neo4j_import_dir:
    # Fail early with an actionable message: Path(None) would raise an opaque
    # TypeError, and an empty value would silently resolve to the current directory.
    raise EnvironmentError(
        "Environment variable NEO4J_IMPORT is not set; "
        "it must point to the Neo4j import directory."
    )
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Get list of unique UniProt accession numbers

In [3]:
# Lazily read just the 'accession' column of the UniProt protein CSV
# located in the Neo4j import directory.
uniprot_protein_csv = NEO4J_IMPORT / '01a-UniProtProtein.csv'
unp = dd.read_csv(uniprot_protein_csv, usecols=['accession'])

In [4]:
# Split each accession once at the first ':' and keep the part after it.
# NOTE(review): presumably this strips a CURIE prefix such as 'uniprot:' -
# confirm against the contents of 01a-UniProtProtein.csv.
accession_parts = unp['accession'].str.split(':', n=1, expand=True)
unp = unp.assign(accession=accession_parts[1])

In [5]:
# Reduce to one row per accession. 'accession' is the only column that was
# loaded, so de-duplicating on it is the same as de-duplicating whole rows.
unp = unp.drop_duplicates(subset=['accession'])

### Read InterPro mapping file

In [6]:
# Lazily read the tab-separated protein-to-Pfam mapping from the cache directory.
# header=0 together with `names` replaces the file's own header row with the
# column names below; dtype=str keeps every field (including start/end) as text.
mapping_columns = ['uniprotId', 'interproId', 'name', 'pfamId', 'start', 'end']
ipro = dd.read_csv(
    NEO4J_IMPORT / 'cache' / 'protein2pfam.tsv',
    sep='\t',
    header=0,
    names=mapping_columns,
    usecols=['uniprotId', 'pfamId', 'start', 'end'],
    dtype=str,
)

In [7]:
# Show the Dask DataFrame's structure (columns, dtypes, partition count);
# this displays the lazy frame as-is and does not call compute().
ipro

Unnamed: 0_level_0,uniprotId,pfamId,start,end
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,object,object,object
,...,...,...,...


Keep only entries with Pfam IDs (rows whose `pfamId` starts with `PF`)

In [8]:
# Keep only rows whose pfamId starts with 'PF'.
# na=False makes rows with a missing pfamId evaluate to False (and be dropped);
# without it the mask would contain NaN, which cannot be used for boolean indexing.
has_pfam_id = ipro['pfamId'].str.startswith('PF', na=False)
ipro = ipro[has_pfam_id]

### Merge the two dataframes

In [9]:
# Inner-join the unique accessions with the Pfam mapping so that only
# mapping rows whose uniprotId matches a known accession are kept.
df = dd.merge(
    unp,
    ipro,
    how='inner',
    left_on='accession',
    right_on='uniprotId',
)

### Assign CURIEs

In [10]:
# Prefix the identifiers to form CURIEs. Note that 'accession' is overwritten
# with the Pfam CURIE ('pfam:' + pfamId), while 'uniprotId' receives the
# 'uniprot:' prefix; the unprefixed 'pfamId' column is left unchanged.
df = df.assign(
    accession='pfam:' + df['pfamId'],
    uniprotId='uniprot:' + df['uniprotId'],
)

In [11]:
# Materialize the lazy Dask result as a pandas DataFrame, then write it
# (without the index column) to the Neo4j import directory.
pfam_uniprot = df.compute()
output_csv = NEO4J_IMPORT / "01g-PfamDomainUniProt.csv"
pfam_uniprot.to_csv(output_csv, index=False)