In [None]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell is executed.
%autoreload 2

In [None]:
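# Optional one-time setup, kept disabled: downloads the Miniconda installer and
# runs it non-interactively into MINICONDA_PREFIX. To use it, uncomment the
# lines below and delete this note so that `%%bash` is the first line of the cell.
# %%bash
# MINICONDA_INSTALLER_SCRIPT=Miniconda3-4.5.4-Linux-x86_64.sh
# MINICONDA_PREFIX=/usr/local
# wget https://repo.continuum.io/miniconda/$MINICONDA_INSTALLER_SCRIPT
# chmod +x $MINICONDA_INSTALLER_SCRIPT
# ./$MINICONDA_INSTALLER_SCRIPT -b -f -p $MINICONDA_PREFIX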

In [2]:
import gzip
import time
from multiprocessing import Pool
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import utils.gcs_utils as gcs
import utils.spark_utils as sprk
from utils.proteins import *

In [7]:
# Base directory for every parquet file this notebook writes or reads
# (it is interpolated into the paths passed to to_parquet / read_parquet below).
home_path = "~/mids/pss"

In [None]:
# Keys listed under the human-proteome CIF prefix; each key is later handed to
# parse_cif_file_in_cloud. Presumably these are the gzipped .cif.gz objects in
# the project's GCS bucket -- confirm against utils.gcs_utils.list_keys.
files = gcs.list_keys("UP000005640_9606_HUMAN/cif")

## Parse sequences and atom sites
*Sequences that come from multipart files are reduced after parsing.*

In [None]:
def parse_cif_file_in_cloud(key):
    """Download one gzipped CIF file and parse it into sequence and atom-site tables.

    Args:
        key: Storage key of the file; the last ``/``-separated component is
            used as the file name and a trailing ``.cif.gz`` is removed from it.

    Returns:
        A ``(sequence, atoms)`` tuple.

        * ``sequence`` is the result of ``get_protein_sequence_from_cif`` with
          ``protein_filename``, ``protein_id`` and ``confidence_pLDDT`` added.
        * ``atoms`` is the result of ``get_atom_sites_from_cif`` with
          ``protein_filename`` and ``protein_id`` added, joined with the shape
          and local-confidence data and passed through
          ``sort_by_file_number_and_index``.

        Both are presumably pandas DataFrames -- confirm against
        ``utils.proteins``.
    """
    content = gcs.download_gzip_to_string(key)
    parsed_cif = parse_cif(content)

    # Keep only the last path component and drop the literal ".cif.gz" suffix.
    # str.strip(".cif.gz") is NOT a suffix removal: it strips any run of the
    # characters ".", "c", "i", "f", "g", "z" from both ends and therefore
    # truncates names that start or end with one of those characters.
    file_name = key.split("/")[-1]
    suffix = ".cif.gz"
    if file_name.endswith(suffix):
        file_name = file_name[:-len(suffix)]
    protein_id = get_protein_id_from_filename(file_name)

    sequence = get_protein_sequence_from_cif(parsed_cif)
    sequence['protein_filename'] = file_name
    sequence['protein_id'] = protein_id
    sequence['confidence_pLDDT'] = get_global_confidence_from_cif(parsed_cif)

    atoms = get_atom_sites_from_cif(parsed_cif)
    atoms['protein_filename'] = file_name
    atoms['protein_id'] = protein_id

    # Enrich the atom sites first with the shape data, then with the
    # local confidence, before sorting.
    struct_shape_explode = get_protein_shapes_from_cif(parsed_cif)
    atoms_w_shape = join_atoms_with_shape(atoms, struct_shape_explode)

    local_confidence = get_local_confidence_from_cif(parsed_cif)
    atoms = sort_by_file_number_and_index(join_atoms_with_confidence(atoms_w_shape, local_confidence))
    return sequence, atoms

In [None]:
# Number of files parsed per batch; each batch also becomes one atom-sites parquet part.
BATCH_SIZE = 1000

sequences = []

# could potentially use spark here with wholeTextFile reader but that is not necessary in our usecase
# Step through `files` in BATCH_SIZE strides so that no empty trailing batch is
# produced when len(files) is an exact multiple of BATCH_SIZE (an empty batch
# makes the `zip(*structures)` unpacking below raise ValueError).
batch_starts = range(0, len(files), BATCH_SIZE)
for i, start in enumerate(tqdm(batch_starts)):
    # A fresh worker pool per batch, as in the original cell
    # (presumably so worker memory is released between batches -- TODO confirm).
    with Pool() as p:
        structures = p.map(parse_cif_file_in_cloud, files[start:start + BATCH_SIZE])
    # `structures` is a list of (sequence, atoms) pairs; transpose it into two tuples.
    sequences_part, atom_sites_part = zip(*structures)
    sequences.append(pd.concat(sequences_part))

    # this grows quickly, so we will write these in parts
    pd.concat(atom_sites_part).to_parquet(path=f"{home_path}/structure_files/atom_sites/atom_sites_part_{str(i).zfill(2)}.parquet")

In [None]:
# Stack the per-batch sequence tables, reduce them with reduce_sequence_df,
# and persist the result as a single parquet file.
combined_sequences = pd.concat(sequences)
sequences_df = reduce_sequence_df(combined_sequences)
sequences_df.to_parquet(
    path=f"{home_path}/structure_files/sequences/sequences.parquet"
)

In [None]:
# Reload the persisted sequences table from disk and preview its first rows.
sequences_df = pd.read_parquet(
    path=f"{home_path}/structure_files/sequences/sequences.parquet"
)
sequences_df.head()

In [10]:
# Load the first atom-sites part and list its columns.
# NOTE(review): this reads from "{home_path}/atom_sites/", whereas the batch
# loop writes its parts to "{home_path}/structure_files/atom_sites/" --
# confirm which location is the intended one.
atom_sites_df = pd.read_parquet(
    path=f"{home_path}/atom_sites/atom_sites_part_00.parquet"
)
atom_sites_df.columns

Index(['group_PDB', 'id', 'type_symbol', 'label_atom_id', 'label_alt_id',
       'label_comp_id', 'label_asym_id', 'label_entity_id', 'label_seq_id',
       'pdbx_PDB_ins_code', 'Cartn_x', 'Cartn_y', 'Cartn_z', 'occupancy',
       'B_iso_or_equiv', 'pdbx_formal_charge', 'auth_seq_id', 'auth_comp_id',
       'auth_asym_id', 'auth_atom_id', 'pdbx_PDB_model_num',
       'pdbx_sifts_xref_db_acc', 'pdbx_sifts_xref_db_name',
       'pdbx_sifts_xref_db_num', 'pdbx_sifts_xref_db_res', 'protein_filename',
       'protein_id', 'shape.beg_label_comp_id', 'shape.beg_label_seq_id',
       'shape.conf_type_id', 'shape.end_label_comp_id',
       'shape.end_label_seq_id', 'shape.id', 'confidence_pLDDT'],
      dtype='object')

In [14]:
# Preview the Cartesian coordinates of the first 100 atom sites as an array.
# NOTE(review): the recorded output shows these values as strings; they
# presumably need a cast (e.g. .astype(float)) before any numeric use.
coordinate_columns = ['Cartn_x', 'Cartn_y', 'Cartn_z']
atom_sites_df.iloc[:100][coordinate_columns].to_numpy()

array([['-52.339', '-6.285', '37.051'],
       ['-52.217', '-6.121', '35.586'],
       ['-51.273', '-4.965', '35.324'],
       ['-53.569', '-5.788', '34.933'],
       ['-51.503', '-3.922', '35.912'],
       ['-54.554', '-6.956', '34.939'],
       ['-56.176', '-6.468', '34.307'],
       ['-57.069', '-8.042', '34.423'],
       ['-50.236', '-5.171', '34.510'],
       ['-49.535', '-4.155', '33.697'],
       ['-48.042', '-4.478', '33.581'],
       ['-49.730', '-2.697', '34.140'],
       ['-47.219', '-4.033', '34.374'],
       ['-49.019', '-1.843', '33.272'],
       ['-47.745', '-5.248', '32.536'],
       ['-46.608', '-5.043', '31.636'],
       ['-45.203', '-5.058', '32.240'],
       ['-46.877', '-3.811', '30.766'],
       ['-44.460', '-4.081', '32.191'],
       ['-48.163', '-3.941', '30.180'],
       ['-44.776', '-6.246', '32.660'],
       ['-43.463', '-6.703', '32.213'],
       ['-43.632', '-7.090', '30.745'],
       ['-43.027', '-7.931', '33.029'],
       ['-44.460', '-7.953', '30.501'],
