In [2]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# %%bash
# MINICONDA_INSTALLER_SCRIPT=Miniconda3-4.5.4-Linux-x86_64.sh
# MINICONDA_PREFIX=/usr/local
# wget https://repo.continuum.io/miniconda/$MINICONDA_INSTALLER_SCRIPT
# chmod +x $MINICONDA_INSTALLER_SCRIPT
# ./$MINICONDA_INSTALLER_SCRIPT -b -f -p $MINICONDA_PREFIX

In [4]:
import gzip
import time
from multiprocessing import Pool
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import utils.gcs_utils as gcs
import utils.spark_utils as sprk
from utils.proteins import *

In [5]:
import numpy as np
from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder  

In [5]:
# Storage keys found under the "UP000005640_9606_HUMAN/cif" prefix. Each key is
# later passed to gcs.download_gzip_to_string, so every entry is presumably one
# gzipped CIF file — TODO confirm against gcs.list_keys.
files = gcs.list_keys("UP000005640_9606_HUMAN/cif")

## Parse sequences and atom sites
*Sequence records that come from multipart files are reduced afterwards (see `reduce_sequence_df`).*

In [52]:
def parse_cif_file_in_cloud(key):
    """Download one gzipped CIF file and extract its sequence and atom-site tables.

    Args:
        key: Storage key of the file. The last "/"-separated component, with a
            trailing ".cif.gz" removed, is used as the protein file name.

    Returns:
        A ``(sequence, atoms)`` tuple:

        * ``sequence`` is the result of ``get_protein_sequence_from_cif`` with
          ``protein_filename``, ``protein_id`` and ``confidence_pLDDT`` added.
        * ``atoms`` is the result of ``get_atom_sites_from_cif`` with
          ``protein_filename`` and ``protein_id`` added, joined with the local
          confidence and ordered by ``sort_by_file_number_and_index``.
    """
    content = gcs.download_gzip_to_string(key)
    parsed_cif = parse_cif(content)

    # Remove the extension as a suffix. str.strip(".cif.gz") would instead strip
    # any of the characters ".cifgz" from both ends and could truncate names
    # that happen to end in one of those characters.
    suffix = ".cif.gz"
    file_name = key.split("/")[-1]
    if file_name.endswith(suffix):
        file_name = file_name[:-len(suffix)]

    # Both tables carry the same identifier, so derive it once.
    protein_id = get_protein_id_from_filename(file_name)

    sequence = get_protein_sequence_from_cif(parsed_cif)
    sequence['protein_filename'] = file_name
    sequence['protein_id'] = protein_id
    sequence['confidence_pLDDT'] = get_global_confidence_from_cif(parsed_cif)

    atoms = get_atom_sites_from_cif(parsed_cif)
    atoms['protein_filename'] = file_name
    atoms['protein_id'] = protein_id
    local_confidence = get_local_confidence_from_cif(parsed_cif)
    atoms = sort_by_file_number_and_index(join_atoms_with_confidence(atoms, local_confidence))
    return sequence, atoms

In [53]:
# Number of files parsed per batch; each batch produces one atom-site parquet part.
BATCH_SIZE = 1000

sequences = []

# Ceiling division: no empty trailing batch when len(files) is an exact multiple
# of BATCH_SIZE (or zero). An empty batch would make `zip(*structures)` yield
# nothing and the tuple unpacking below fail.
n_batches = (len(files) + BATCH_SIZE - 1) // BATCH_SIZE

# Spark's wholeTextFiles reader could be used here, but that is not necessary for our use case.
for i in tqdm(range(n_batches)):
    batch = files[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
    with Pool() as p:
        structures = p.map(parse_cif_file_in_cloud, batch)
    sequences_part, atom_sites_part = zip(*structures)
    sequences.append(pd.concat(sequences_part))

    # The atom-site tables grow quickly, so each batch is written to its own part
    # file instead of being accumulated in memory.
    pd.concat(atom_sites_part).to_parquet(path=f"/home/jupyter/pss/structure_files/atom_sites/atom_sites_part_{str(i).zfill(2)}.parquet")

  0%|          | 0/24 [00:00<?, ?it/s]

In [55]:
# Stack the per-batch sequence tables, run them through reduce_sequence_df,
# and persist the result as a single parquet file.
sequences_df = pd.concat(sequences).pipe(reduce_sequence_df)
sequences_df.to_parquet("/home/jupyter/pss/structure_files/sequences/sequences.parquet")

In [56]:
# Read the persisted sequence table back from disk and preview its first rows.
sequences_df = pd.read_parquet("/home/jupyter/pss/structure_files/sequences/sequences.parquet")
sequences_df.head(5)

Unnamed: 0,pdbx_db_accession,db_code,db_name,protein_id,pdbx_seq_one_letter_code,protein_filename
0,A0A024R1R8,A0A024R1R8_HUMAN,UNP,A0A024R1R8,MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAK...,AF-A0A024R1R8-F1-model_v1
1,A0A024RBG1,NUD4B_HUMAN,UNP,A0A024RBG1,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,AF-A0A024RBG1-F1-model_v1
2,A0A024RCN7,A0A024RCN7_HUMAN,UNP,A0A024RCN7,MERSFVWLSCLDSDSCNLTFRLGEVESHACSPSLLWNLLTQYLPPG...,AF-A0A024RCN7-F1-model_v1
3,A0A075B6H5,A0A075B6H5_HUMAN,UNP,A0A075B6H5,METVVTTLPREGGVGPSRKMLLLLLLLGPGSGLSAVVSQHPSRVIC...,AF-A0A075B6H5-F1-model_v1
4,A0A075B6H7,KV37_HUMAN,UNP,A0A075B6H7,MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRAS...,AF-A0A075B6H7-F1-model_v1


In [57]:
# Read the first atom-site part file (part 00) and preview its first rows.
atom_sites_df = pd.read_parquet("/home/jupyter/pss/structure_files/atom_sites/atom_sites_part_00.parquet")
atom_sites_df.head(5)

Unnamed: 0,group_PDB,id,type_symbol,label_atom_id,label_alt_id,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,...,auth_asym_id,auth_atom_id,pdbx_PDB_model_num,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res,protein_filename,protein_id,confidence_pLDDT
ALA11,ATOM,75,N,N,,ALA,A,1,11,,...,A,N,1,A0A024R1R8,UNP,11,A,AF-A0A024R1R8-F1-model_v1,A0A024R1R8,58.65
ALA11,ATOM,76,C,CA,,ALA,A,1,11,,...,A,CA,1,A0A024R1R8,UNP,11,A,AF-A0A024R1R8-F1-model_v1,A0A024R1R8,58.65
ALA11,ATOM,77,C,C,,ALA,A,1,11,,...,A,C,1,A0A024R1R8,UNP,11,A,AF-A0A024R1R8-F1-model_v1,A0A024R1R8,58.65
ALA11,ATOM,78,C,CB,,ALA,A,1,11,,...,A,CB,1,A0A024R1R8,UNP,11,A,AF-A0A024R1R8-F1-model_v1,A0A024R1R8,58.65
ALA11,ATOM,79,O,O,,ALA,A,1,11,,...,A,O,1,A0A024R1R8,UNP,11,A,AF-A0A024R1R8-F1-model_v1,A0A024R1R8,58.65


In [10]:
# for i in range(24):
#     path=f"/home/jupyter/pss/structure_files/atom_sites/old/atom_sites_part_{str(i).zfill(2)}.parquet"
#     new_path=f"/home/jupyter/pss/structure_files/atom_sites/atom_sites_part_{str(i).zfill(2)}.parquet"
#     atom_sites_df = pd.read_parquet(path)
#     sorted_df = sort_by_file_number_and_index(atom_sites_df)
#     print(sorted_df.head())
#     sorted_df.to_parquet(new_path)

  group_PDB  id type_symbol label_atom_id label_alt_id label_comp_id  \
0      ATOM   1           N             N                        MET   
1      ATOM   2           C            CA                        MET   
2      ATOM   3           C             C                        MET   
3      ATOM   4           C            CB                        MET   
4      ATOM   5           O             O                        MET   

  label_asym_id label_entity_id label_seq_id pdbx_PDB_ins_code  ...  \
0             A               1            1              None  ...   
1             A               1            1              None  ...   
2             A               1            1              None  ...   
3             A               1            1              None  ...   
4             A               1            1              None  ...   

  auth_asym_id auth_atom_id pdbx_PDB_model_num pdbx_sifts_xref_db_acc  \
0            A            N                  1             A0A024R1