In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `utils` package) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# %%bash
# MINICONDA_INSTALLER_SCRIPT=Miniconda3-4.5.4-Linux-x86_64.sh
# MINICONDA_PREFIX=/usr/local
# wget https://repo.continuum.io/miniconda/$MINICONDA_INSTALLER_SCRIPT
# chmod +x $MINICONDA_INSTALLER_SCRIPT
# ./$MINICONDA_INSTALLER_SCRIPT -b -f -p $MINICONDA_PREFIX

In [None]:
import gzip
import os
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import utils.gcs_utils as gcs
import utils.spark_utils as sprk
from utils.proteins import *

In [None]:
# Root directory under which this notebook writes and reads its parquet files.
# Set the PSS_HOME_PATH environment variable to run on another machine; the
# default is the original hard-coded location.
home_path = os.environ.get("PSS_HOME_PATH", "/home/jupyter/pss")

In [None]:
# Storage keys of the CIF files to parse; each key is later handed to
# parse_cif_file_in_cloud.
cif_prefix = "UP000005640_9606_HUMAN/cif"
files = gcs.list_keys(cif_prefix)

## Parse sequences and atom sites
*Multipart entries (split across several files) are reduced after parsing; see `reduce_sequence_df` below.*

In [None]:
def parse_cif_file_in_cloud(key):
    """Download one gzipped CIF file from cloud storage and parse it.

    Parameters
    ----------
    key : str
        Storage key of the file, expected to end in ``.cif.gz``.

    Returns
    -------
    tuple
        ``(sequence, atoms)``. ``sequence`` is the sequence record tagged with
        ``protein_filename``, ``protein_id`` and ``confidence_pLDDT``.
        ``atoms`` is the atom-site table tagged with ``protein_filename`` and
        ``protein_id``, joined with the shape and local-confidence data and
        sorted by ``sort_by_file_number_and_index``.
        (Presumably both are pandas DataFrames -- confirm in utils.proteins.)
    """
    content = gcs.download_gzip_to_string(key)
    parsed_cif = parse_cif(content)

    # NOTE: str.strip(".cif.gz") would remove any of the characters ".cifgz"
    # from BOTH ends of the key (e.g. "x-config.cif.gz" -> "x-con"), so the
    # extension is removed explicitly as a suffix of the basename instead.
    file_name = key.split("/")[-1]
    extension = ".cif.gz"
    if file_name.endswith(extension):
        file_name = file_name[:-len(extension)]
    protein_id = get_protein_id_from_filename(file_name)

    sequence = get_protein_sequence_from_cif(parsed_cif)
    sequence['protein_filename'] = file_name
    sequence['protein_id'] = protein_id
    sequence['confidence_pLDDT'] = get_global_confidence_from_cif(parsed_cif)

    atoms = get_atom_sites_from_cif(parsed_cif)
    atoms['protein_filename'] = file_name
    atoms['protein_id'] = protein_id

    # Enrich the atom sites with shape information, then with per-site confidence.
    struct_shape_explode = get_protein_shapes_from_cif(parsed_cif)
    atoms_w_shape = join_atoms_with_shape(atoms, struct_shape_explode)

    local_confidence = get_local_confidence_from_cif(parsed_cif)
    atoms = sort_by_file_number_and_index(join_atoms_with_confidence(atoms_w_shape, local_confidence))
    return sequence, atoms

In [None]:
sequences = []

# Number of files parsed per batch; each batch's atom sites go to their own parquet part.
batch_size = 1000
# Ceiling division: `len(files) // batch_size + 1` would add a trailing EMPTY batch
# whenever len(files) is an exact multiple of batch_size (or zero), and
# `zip(*[])` then fails to unpack into two names.
n_batches = (len(files) + batch_size - 1) // batch_size

# could potentially use spark here with wholeTextFile reader but that is not necessary in our usecase
for i in tqdm(range(n_batches)):
    batch_keys = files[i * batch_size:(i + 1) * batch_size]
    # A fresh pool per batch, as in the original implementation.
    with Pool() as p:
        structures = p.map(parse_cif_file_in_cloud, batch_keys)
    # structures is a list of (sequence, atoms) pairs; split it into two tuples.
    sequences_part, atom_sites_part = zip(*structures)
    sequences.append(pd.concat(sequences_part))

    # this grows quickly, so we will write these in parts
    pd.concat(atom_sites_part).to_parquet(path=f"{home_path}/structure_files/atom_sites/atom_sites_part_{str(i).zfill(2)}.parquet")

In [None]:
# Combine the per-batch sequence frames, reduce them, and persist the result.
sequences_all = pd.concat(sequences)
sequences_df = reduce_sequence_df(sequences_all)
sequences_path = f"{home_path}/structure_files/sequences/sequences.parquet"
sequences_df.to_parquet(path=sequences_path)

In [None]:
# Reload the saved sequences from disk and preview the first rows.
sequences_path = f"{home_path}/structure_files/sequences/sequences.parquet"
sequences_df = pd.read_parquet(path=sequences_path)
sequences_df.head()

In [None]:
# Load the first atom-sites part written by the batch loop and preview it.
atom_sites_path = f"{home_path}/structure_files/atom_sites/atom_sites_part_00.parquet"
atom_sites_df = pd.read_parquet(path=atom_sites_path)
atom_sites_df.head()