# Downloads PDB Structure Information
**[Work in progress]**

This notebook downloads 3D-structure information from the Worldwide Protein Data Bank.

Data sources: 
[Protein Data Bank Japan](https://pdbj.org/), 
[PDBe Protein Data Bank Europe](https://www.ebi.ac.uk/pdbe/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import re
import numpy as np
import pandas as pd
import dateutil
import urllib
from pathlib import Path

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Get PDB summary data

In [4]:
URL = 'https://pdbj.org/rest/newweb/search/sql?format=csv&q='

In [5]:
sql_query = 'SELECT pdbid, release_date AS releasedate, pdbx_descriptor AS description, struct_title AS title, exptl_method AS methods, resolution FROM brief_summary'
encoded_query = urllib.parse.quote(sql_query)

In [6]:
summary_url = URL + encoded_query
df_summary = pd.read_csv(summary_url, dtype=str)

In [7]:
df_summary.head()

Unnamed: 0,pdbid,releasedate,description,title,methods,resolution
0,3q7d,2011-11-09,(2R)-2-(6-methoxynaphthalen-2-yl)propanoic aci...,Structure of (R)-naproxen bound to mCOX-2.,X-RAY DIFFRACTION,2.4
1,1wgi,1997-11-19,"INORGANIC PYROPHOSPHATASE, MANGANESE (II) ION,...",STRUCTURE OF INORGANIC PYROPHOSPHATASE,X-RAY DIFFRACTION,2.2
2,5z3f,2019-05-15,"CITRIC ACID, GLYCEROL, Glycoside hydrolase 15-...",Glycosidase E335A in complex with glucose,X-RAY DIFFRACTION,1.1
3,1wgm,2004-11-28,Ubiquitin conjugation factor E4A,Solution structure of the U-box in human ubiqu...,SOLUTION NMR,
4,1wiu,1996-12-23,TWITCHIN 18TH IGSF MODULE,TWITCHIN IMMUNOGLOBULIN SUPERFAMILY DOMAIN (IG...,SOLUTION NMR,


In [8]:
df_summary.rename(columns={'pdbid': 'pdbId'}, inplace=True)
df_summary['pdbId'] = df_summary['pdbId'].str.upper()
df_summary.rename(columns={'releasedate': 'releaseDate'}, inplace=True)

In [9]:
print('Number of records:', df_summary.shape[0])
df_summary.head()

Number of records: 178451


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution
0,3Q7D,2011-11-09,(2R)-2-(6-methoxynaphthalen-2-yl)propanoic aci...,Structure of (R)-naproxen bound to mCOX-2.,X-RAY DIFFRACTION,2.4
1,1WGI,1997-11-19,"INORGANIC PYROPHOSPHATASE, MANGANESE (II) ION,...",STRUCTURE OF INORGANIC PYROPHOSPHATASE,X-RAY DIFFRACTION,2.2
2,5Z3F,2019-05-15,"CITRIC ACID, GLYCEROL, Glycoside hydrolase 15-...",Glycosidase E335A in complex with glucose,X-RAY DIFFRACTION,1.1
3,1WGM,2004-11-28,Ubiquitin conjugation factor E4A,Solution structure of the U-box in human ubiqu...,SOLUTION NMR,
4,1WIU,1996-12-23,TWITCHIN 18TH IGSF MODULE,TWITCHIN IMMUNOGLOBULIN SUPERFAMILY DOMAIN (IG...,SOLUTION NMR,


### Get refinement data

In [10]:
sqlQuery = 'SELECT "ls_R_factor_R_free", entry_id FROM refine'
encoded_query = urllib.parse.quote(sqlQuery)
refine_url = URL + encoded_query

In [11]:
#df_refine = pd.read_csv(refine_url, usecols=['ls_R_factor_R_free', 'entry_id'], dtype=str)
df_refine = pd.read_csv(refine_url, dtype=str)

In [12]:
df_refine.head()

Unnamed: 0,ls_R_factor_R_free,entry_id
0,,100D
1,0.252,101D
2,0.202,101M
3,,102D
4,,102L


In [13]:
df_refine.rename(columns={'ls_R_factor_R_free': 'rFree', 'entry_id': 'pdbId'}, inplace=True)
df_refine.fillna('', inplace=True)

In [14]:
print('Number of records:', df_refine.shape[0])
df_refine.sample(5)

Number of records: 160748


Unnamed: 0,rFree,pdbId
90063,0.216,4O6Y
28993,0.223,2DKG
20615,0.244,1UR0
59883,0.24293,3MFF
45689,0.285,2ZYK


In [15]:
structures = df_summary.merge(df_refine, on='pdbId', how='left')
structures.fillna('', inplace=True)

In [16]:
structures.drop_duplicates(subset=['pdbId'], inplace=True)

In [17]:
print('Number of structures:', structures.shape[0])
structures.sample(10)

Number of structures: 178451


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
44883,5R7W,2020-04-22,"Interleukin-1 beta, SULFATE ION, water",PanDDA analysis group deposition of ground-sta...,X-RAY DIFFRACTION,1.27,0.2026
157446,3BED,2007-11-27,"PTS system, IIA component, water",Mannose/sorbose specific IIA subunit of phosph...,X-RAY DIFFRACTION,1.45,0.1967
111002,7AEJ,2021-05-26,"2H10, Envelope glycoprotein gp160,Envelope gly...",Crystal structure of asymmetric HIV-1 gp41 con...,X-RAY DIFFRACTION,3.8,0.3077
113828,5P8V,2016-08-03,"Endothiapepsin, water",Automated refinement of diffraction data obtai...,X-RAY DIFFRACTION,1.579,0.1735
125444,1HG8,2001-11-10,"2-acetamido-2-deoxy-beta-D-glucopyranose, 2-ac...",Endopolygalacturonase from the phytopathogenic...,X-RAY DIFFRACTION,1.73,0.203
7148,3BHP,2007-12-11,"UPF0291 protein ynzC, water",Crystal structure of UPF0291 protein ynzC from...,X-RAY DIFFRACTION,2.01,0.2623
72835,1U1L,2004-09-21,"5'-D(*TP*AP*GP*GP*GP*TP*TP*(PRN)P*GP*GP*G)-3',...",Crystal Structure of UP1 Complexed With d(TTAG...,X-RAY DIFFRACTION,2.0,0.274
104563,5KL8,2016-08-17,"Maternal protein pumilio, Protein nanos, RNA (...",Crystal structure of the Pumilio-Nos-CyclinB R...,X-RAY DIFFRACTION,4.0,0.3119
111008,3GMO,2009-11-10,"1,2-ETHANEDIOL, 2-acetamido-2-deoxy-beta-D-glu...",Structure of mouse CD1d in complex with C8PhF,X-RAY DIFFRACTION,1.6,0.2156
107698,7JSI,2020-12-16,"DNA (5'-D(P*TP*TP*TP*TP*TP*TP*TP*T)-3'), Prote...",Adeno-Associated Virus 2 Rep68 HD Hexamer-ssDN...,ELECTRON MICROSCOPY,5.01,


In [18]:
structures['pdbId'] = 'pdb:' + structures['pdbId']

# convert methods string to a semicolon separated list (our default one-to-many representation in CSV files)
structures['methods'] = structures['methods'].str.replace('[', '', regex=False)
structures['methods'] = structures['methods'].str.replace(']', '', regex=False)
structures['methods'] = structures['methods'].str.replace("'", '', regex=False)
structures['methods'] = structures['methods'].str.replace(',', ';', regex=False)
structures['methods'] = structures['methods'].str.replace('; ', ';', regex=False)

In [19]:
structures['methods'].unique()

array(['X-RAY DIFFRACTION', 'SOLUTION NMR', 'SOLID-STATE NMR',
       'ELECTRON MICROSCOPY', 'NEUTRON DIFFRACTION',
       'ELECTRON CRYSTALLOGRAPHY',
       'ELECTRON MICROSCOPY;SOLID-STATE NMR;SOLUTION SCATTERING;HYBRID',
       'SOLID-STATE NMR;SOLUTION NMR;HYBRID',
       'SOLUTION NMR;SOLUTION SCATTERING;HYBRID', 'SOLUTION SCATTERING',
       'FIBER DIFFRACTION',
       'NEUTRON DIFFRACTION;X-RAY DIFFRACTION;HYBRID',
       'EPR;X-RAY DIFFRACTION;HYBRID',
       'SOLUTION NMR;THEORETICAL MODEL;HYBRID',
       'SOLUTION SCATTERING;X-RAY DIFFRACTION;HYBRID',
       'NEUTRON DIFFRACTION;SOLUTION NMR;HYBRID', 'INFRARED SPECTROSCOPY',
       'POWDER DIFFRACTION', 'EPR;SOLUTION NMR;HYBRID',
       'ELECTRON MICROSCOPY;SOLID-STATE NMR;HYBRID',
       'FLUORESCENCE TRANSFER',
       'ELECTRON MICROSCOPY;SOLUTION SCATTERING;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION NMR;HYBRID',
       'FIBER DIFFRACTION;SOLID-STATE NMR;HYBRID',
       'SOLUTION NMR;X-RAY DIFFRACTION;HYBRID',
       'EL

In [20]:
structures.head()

Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
0,pdb:3Q7D,2011-11-09,(2R)-2-(6-methoxynaphthalen-2-yl)propanoic aci...,Structure of (R)-naproxen bound to mCOX-2.,X-RAY DIFFRACTION,2.4,0.2329
1,pdb:1WGI,1997-11-19,"INORGANIC PYROPHOSPHATASE, MANGANESE (II) ION,...",STRUCTURE OF INORGANIC PYROPHOSPHATASE,X-RAY DIFFRACTION,2.2,0.211
2,pdb:5Z3F,2019-05-15,"CITRIC ACID, GLYCEROL, Glycoside hydrolase 15-...",Glycosidase E335A in complex with glucose,X-RAY DIFFRACTION,1.1,0.12089
3,pdb:1WGM,2004-11-28,Ubiquitin conjugation factor E4A,Solution structure of the U-box in human ubiqu...,SOLUTION NMR,,
4,pdb:1WIU,1996-12-23,TWITCHIN 18TH IGSF MODULE,TWITCHIN IMMUNOGLOBULIN SUPERFAMILY DOMAIN (IG...,SOLUTION NMR,,


In [21]:
structures.to_csv(NEO4J_IMPORT / "01f-PDBStructure.csv", index=False)

### Get PDB Entity info

In [22]:
sqlQuery = "select pdbid, id, pdbx_description from entity"
encodedSQL = urllib.parse.quote(sqlQuery)
entity_url = URL + encodedSQL

In [23]:
df_entity = pd.read_csv(entity_url, dtype=str)

In [24]:
df_entity.rename(columns={'pdbid': 'pdbId', 'id': 'entityId', 'pdbx_description': 'description'}, inplace=True)

In [25]:
df_entity['pdbId'] = df_entity['pdbId'].str.upper()

In [26]:
print('Number of entities:', df_entity.shape[0])

Number of entities: 810321


In [27]:
df_entity.head()

Unnamed: 0,pdbId,entityId,description
0,4CN3,1,RETINOIC ACID RECEPTOR RXR-ALPHA
1,4CN3,2,RETINOIC ACID RECEPTOR RXR-ALPHA
2,4CN3,3,5'-D(*CP*TP*AP*GP*TP*TP*CP*AP*AP*AP*GP*TP*TP*C...
3,4CN3,4,5'-D(*TP*GP*TP*GP*AP*AP*CP*TP*TP*TP*GP*AP*AP*C...
4,4CN3,5,ZINC ION


In [28]:
def categorize(description):
    words = re.split('[ ,\-()]', str(description).lower())
    
    keywords_ab = ['antibody', 'fab', 'nanobody']
    keywords_hc = ['heavy', 'h', 'hc']
    keywords_lc = ['light', 'l', 'lc']
    
    category = ''
    
    if any(w in words for w in keywords_ab):
        category = 'antibody'
        if any(w in words for w in keywords_hc):
            category = 'antibody heavy chain'
            
        if any(w in words for w in keywords_lc):
            category = 'antibody light chain'
    
    return category

In [29]:
df_entity['category'] = df_entity['description'].apply(categorize)

In [30]:
df_entity.query("category != ''").head()

Unnamed: 0,pdbId,entityId,description,category
2425,4D3C,2,SFN68 FAB,antibody
2426,4D3C,3,SFN68 FAB,antibody
2769,4D9L,1,Light chain of Fab fragment of anti-HIV1 gp120...,antibody light chain
2770,4D9L,2,Heavy chain of Fab fragment of anti-HIV1 gp120...,antibody heavy chain
3559,4D9R,2,Fab light chain,antibody light chain


### Get PDB Polymer Entities

In [31]:
sqlQuery = "select pdbid, entity_id, pdbx_strand_id, type, pdbx_seq_one_letter_code_can from entity_poly"
encodedSQL = urllib.parse.quote(sqlQuery)
poly_url = URL + encodedSQL

In [32]:
df_poly = pd.read_csv(poly_url, dtype=str)

In [33]:
df_poly.rename(columns={'pdbid': 'pdbId', 'entity_id': 'entityId', 'pdbx_seq_one_letter_code_can': 'sequence'}, inplace=True)

In [34]:
df_poly['pdbId'] = df_poly['pdbId'].str.upper()
df_poly['chainId'] = df_poly['pdbx_strand_id'].str.split(',')
df_poly = df_poly.explode('chainId')
df_poly = df_poly[['pdbId', 'entityId', 'chainId', 'type', 'sequence']]

In [35]:
print("Number of polymer entities:", df_poly.shape[0])

Number of polymer entities: 636117


In [36]:
df_poly.head()

Unnamed: 0,pdbId,entityId,chainId,type,sequence
0,100D,1,A,polydeoxyribonucleotide/polyribonucleotide hybrid,CCGGCGCCGG
0,100D,1,B,polydeoxyribonucleotide/polyribonucleotide hybrid,CCGGCGCCGG
1,101D,1,A,polydeoxyribonucleotide,CGCGAATTCGCG
1,101D,1,B,polydeoxyribonucleotide,CGCGAATTCGCG
2,101M,1,A,polypeptide(L),MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...


In [37]:
df_chain = pd.merge(df_entity, df_poly, on=['pdbId', 'entityId'])

In [38]:
print('Number of polymer chains:', df_chain.shape[0])

Number of polymer chains: 636117


In [39]:
df_chain.head()

Unnamed: 0,pdbId,entityId,description,category,chainId,type,sequence
0,4CN3,1,RETINOIC ACID RECEPTOR RXR-ALPHA,,A,polypeptide(L),GSHMFTKHICAICGDRSSGKHYGVYSCEGCKGFFKRTVRKDLTYTC...
1,4CN3,1,RETINOIC ACID RECEPTOR RXR-ALPHA,,B,polypeptide(L),GSHMFTKHICAICGDRSSGKHYGVYSCEGCKGFFKRTVRKDLTYTC...
2,4CN3,1,RETINOIC ACID RECEPTOR RXR-ALPHA,,C,polypeptide(L),GSHMFTKHICAICGDRSSGKHYGVYSCEGCKGFFKRTVRKDLTYTC...
3,4CN3,2,RETINOIC ACID RECEPTOR RXR-ALPHA,,D,polypeptide(L),GSHMFTKHICAICGDRSSGKHYGVYSCEGCKGFFKRTVRKDLTYTC...
4,4CN3,3,5'-D(*CP*TP*AP*GP*TP*TP*CP*AP*AP*AP*GP*TP*TP*C...,,E,polydeoxyribonucleotide,CTAGTTCAAAGTTCACA


### Get PDB Chain - UniProt sequence mappings

In [40]:
sifts_url = 'http://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz'

In [41]:
chains = pd.read_csv(sifts_url, sep='\t', skiprows=1, dtype=str)
print("Number of chains:", chains.shape[0])
chains.head()

Number of chains: 839171


Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,123l,A,P00720,1,162,1,162,1,162
1,113l,A,P00720,1,162,1,162,1,162
2,105m,A,P02185,1,153,1,153,2,154
3,128l,A,P00720,1,162,1,162,1,162
4,156l,A,P00720,1,162,1,162,1,162


In [42]:
chains.rename(columns={'PDB': 'pdbId', 'CHAIN': 'chainId', 'SP_PRIMARY': 'accession'}, inplace=True)
chains.rename(columns={'RES_BEG': 'seqresStart', 'RES_END': 'seqresEnd'}, inplace=True)
chains.rename(columns={'PDB_BEG': 'pdbStart', 'PDB_END': 'pdbEnd'}, inplace=True)
chains.rename(columns={'SP_BEG': 'uniprotStart', 'SP_END': 'uniprotEnd'}, inplace=True)

In [43]:
chains['pdbId'] = chains['pdbId'].str.upper()

In [44]:
chains.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
0,123L,A,P00720,1,162,1,162,1,162
1,113L,A,P00720,1,162,1,162,1,162
2,105M,A,P02185,1,153,1,153,2,154
3,128L,A,P00720,1,162,1,162,1,162
4,156L,A,P00720,1,162,1,162,1,162


### Sort chains by uniprot residue number

In [45]:
chains['uniprotStart'] = chains['uniprotStart'].astype(int)
chains['uniprotEnd'] = chains['uniprotEnd'].astype(int)
chains['length'] = chains['uniprotEnd'] - chains['uniprotStart'] + 1
# TODO length seems to be string??
chains['length'] = chains['length'].astype(int)
#
chains.sort_values(by='uniprotStart', inplace=True)
chains['uniprotStart'] = chains['uniprotStart'].astype(str)
chains['uniprotEnd'] = chains['uniprotEnd'].astype(str)

In [46]:
chains.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length
0,123L,A,P00720,1,162,1,162,1,162,162
324839,1Y5N,B,P11349,1,509,1,509,1,509,509
324840,1Y5N,C,P11350,1,72,1,72,1,72,72
324847,1Y5I,B,P11349,1,509,1,509,1,509,509
324848,1Y5I,C,P11350,1,72,1,72,1,72,72


In [47]:
chains.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 839171 entries, 0 to 345771
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   pdbId         839171 non-null  object
 1   chainId       839129 non-null  object
 2   accession     839171 non-null  object
 3   seqresStart   839171 non-null  object
 4   seqresEnd     839171 non-null  object
 5   pdbStart      839171 non-null  object
 6   pdbEnd        839171 non-null  object
 7   uniprotStart  839171 non-null  object
 8   uniprotEnd    839171 non-null  object
 9   length        839171 non-null  int64 
dtypes: int64(1), object(9)
memory usage: 70.4+ MB


### Group data by PDB chains

In [48]:
chains = chains.groupby(['pdbId','chainId','accession']).agg(list).reset_index()

### Create semicolon separated string of residue numbers so they can be represented in a csv file

In [49]:
chains['uniprotStart'] = chains['uniprotStart'].apply(lambda x: ';'.join(x))
chains['uniprotEnd'] = chains['uniprotEnd'].apply(lambda x: ';'.join(x))
chains['seqresStart'] = chains['seqresStart'].apply(lambda x: ';'.join(x))
chains['seqresEnd'] = chains['seqresEnd'].apply(lambda x: ';'.join(x))
chains['pdbStart'] = chains['pdbStart'].apply(lambda x: ';'.join(x))
chains['pdbEnd'] = chains['pdbEnd'].apply(lambda x: ';'.join(x))

In [50]:
chains['residues'] = chains['length'].apply(lambda x: sum(x))

In [51]:
print("Number of chains with UniProt mapping:", chains.shape[0])
chains.tail()

Number of chains with UniProt mapping: 555057


Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length,residues
555052,9XIA,A,P24300,1,387,1,387,1,387,[387],387
555053,9XIM,A,P12851,2,393,3,394,3,394,[392],392
555054,9XIM,B,P12851,2,393,3,394,3,394,[392],392
555055,9XIM,C,P12851,3,393,4,394,4,394,[391],391
555056,9XIM,D,P12851,2,393,3,394,3,394,[392],392


In [52]:
chains.query('pdbId == "6XDG"')

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length,residues
512864,6XDG,E,P0DTC2,15,208,333,526,333,526,[194],194


In [53]:
chains = df_chain.merge(chains, on=['pdbId', 'chainId'], how='left')
chains.fillna('', inplace=True)

### Assign CURIES

In [54]:
chains['accession'] = chains['accession'].apply(lambda x: x if x == '' else 'uniprot:' + x)
chains['pdbId'] = 'pdb:' + chains['pdbId']
chains['pdbChainId'] = chains['pdbId'] + '.' + chains['chainId']

In [55]:
print("Total number of polymer chains:", chains.shape[0])

Total number of polymer chains: 641041


In [56]:
chains.sample(10)

Unnamed: 0,pdbId,entityId,description,category,chainId,type,sequence,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length,residues,pdbChainId
150218,pdb:5KR2,1,Protease PR5-SQV,,A,polypeptide(L),PQITLWKRPLVTIKVGGQLKEALLNTGADDTVIEDMNLPGKWKPKM...,uniprot:V5YAB1,1,99,1,99,1,99,[99],99.0,pdb:5KR2.A
190697,pdb:5V3Z,1,Polyketide synthase Pks13 (Termination polyket...,,B,polypeptide(L),SNAQIDGFVRTLRARPEAGGKVPVFVFHPAGGSTVVYEPLLGRLPA...,uniprot:I6X8D2,4;145,142;279,1451;1592,1589;1726,1451;1592,1589;1726,"[139, 135]",274.0,pdb:5V3Z.B
317530,pdb:6S5J,1,Strictosidine synthase,,A,polypeptide(L),MASSPEFFEFIEAPSYGPNAYAFDSDGELYASVEDGRIIKYDKPSN...,uniprot:Q94LW9,4,306,4,306,26,328,[303],303.0,pdb:6S5J.A
309274,pdb:6QZ0,1,Major capsid protein,,5J,polypeptide(L),MRITFNDVKTSLGITESYDIVNAIRNSQGDNFKSYVPLATANNVAE...,uniprot:P13849,4,438,4,438,4,438,[435],435.0,pdb:6QZ0.5J
242328,pdb:6FML,11,Histone H2A type 1,,O,polypeptide(L),SGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERVGAG...,uniprot:P0C0S8,16,119,16,119,17,120,[104],104.0,pdb:6FML.O
306314,pdb:6QN1,2,BMC domain-containing protein,,AP,polypeptide(L),MKEALGLIETKGLVACIEAADAMCKAANVELIGYENVGSGLVTAMV...,uniprot:A0A0J4R4X1,3,87,3,87,3,87,[85],85.0,pdb:6QN1.AP
116839,pdb:5DDP,1,RNA (61-MER),,A,polyribonucleotide,CGUUGACCCAGGAAACUGGGCGGAAGUAAGGUCCAUUGCACUCCGG...,,,,,,,,,,pdb:5DDP.A
76153,pdb:4V74,31,50S ribosomal protein L11,,BI,polypeptide(L),MAKKVQAYVKLQVAAGMANPSPPVGPALGQQGVNIMEFCKAFNAKT...,uniprot:P0A7J7,2,142,1,141,2,142,[141],141.0,pdb:4V74.BI
488050,pdb:2CXI,1,Phenylalanyl-tRNA synthetase beta chain,,C,polypeptide(L),MPKFDVSKSDLERLIGRSFSIEEWEDLVLYAKCELDDVWEENGKVY...,uniprot:O73984,2,348,2,348,2,348,[347],347.0,pdb:2CXI.C
322583,pdb:6T68,1,MORN repeat-containing protein 1,,B,polypeptide(L),SEKYDGEWNEGRMQGWGKYFYADGGVYEGEWVDGRMHGRGTYVFPN...,uniprot:Q587D3,13,209,155,351,155,351,[197],197.0,pdb:6T68.B


In [57]:
chains.to_csv(NEO4J_IMPORT / "01f-PDBChain.csv", index=False)