In [1]:
# import all necessary packages

import pandas as pd
import requests
import time
import ast
from collections import defaultdict
import urllib.parse
import urllib.request
import sys
import shutil
import glob
import logging
import tempfile
import os
import os.path as op
# Import the GEM-PRO class
from ssbio.pipeline.gempro import GEMPRO
# Printing multiple outputs per cell
from IPython.core.interactiveshell import InteractiveShell
from NozPiker_Funcs import main as NZ

# Define all functions

In [56]:
def gempro_analysis(GENES_AND_SEQUENCES):
    InteractiveShell.ast_node_interactivity = "all"

    # Create logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)  # SET YOUR LOGGING LEVEL HERE #

    # Other logger stuff for Jupyter notebooks
    handler = logging.StreamHandler(sys.stderr)
    formatter = logging.Formatter('[%(asctime)s] [%(name)s] %(levelname)s: %(message)s', datefmt="%Y-%m-%d %H:%M")
    handler.setFormatter(formatter)
    logger.handlers = [handler]

    # SET FOLDERS AND DATA HERE
    ROOT_DIR = tempfile.gettempdir()
    
    PROJECT = 'proteins_human_ortho_Frac15'
    PDB_FILE_TYPE = 'pdb'

    # Create the GEM-PRO project
    my_gempro = GEMPRO(gem_name=PROJECT, root_dir=ROOT_DIR, genes_and_sequences=GENES_AND_SEQUENCES, pdb_file_type=PDB_FILE_TYPE)

    # Mapping using BLAST
    my_gempro.blast_seqs_to_pdb(all_genes=True, seq_ident_cutoff=.9, evalue=0.00001)
    my_gempro.df_pdb_blast.head(10)
    
    # Download all mapped PDBs and gather the metadata
    my_gempro.pdb_downloader_and_metadata()
    my_gempro.df_pdb_metadata.head(2)

    # Set representative structures
    my_gempro.set_representative_structure(pdb_file_type = 'pdb', engine='biopython',seq_ident_cutoff=0.7)
    my_gempro.df_representative_structures.head()

    # Prep I-TASSER model folders
    my_gempro.prep_itasser_modeling('~/I-TASSER5.1', '~/I-TASSER5.1/ITLIB/', runtype='local', all_genes=False)
    my_gempro.save_json(op.join(my_gempro.model_dir, '{}.json'.format(my_gempro.id)), compression=False)
    print('GEMPRO run has been saved in ' + my_gempro.model_dir)

In [None]:
def Main(uniprotID):
    # extract protein sequences and append to uniprot ID
    params = {"query": uniprotID, "format": "fasta"} 
    response = requests.get("http://www.uniprot.org/uniprot/", params)
    seq = response.text
    seqq = ''.join([x for x in seq.split('\n')[1:]])
    print('Preparing genes and sequences...')
    GENES_AND_SEQUENCES = dict(zip(cluster_of_interest, prot_seqs))
    print('Beginning GEMPRO analysis...')
    gempro_analysis(GENES_AND_SEQUENCES, cluster_number)
    print('GEMPRO analysis has finished...')


# Run protein structure extraction

In [2]:
# read in tett data table
# Define Data Path
DATA_DIR = os.path.join(os.path.dirname(os.getcwd()),'data/')
tett_data = pd.read_csv(DATA_DIR + "tetts_fractions_for_IFTA_cryoEM_06032020_annots.csv")



In [3]:
tett_data_mostabundant = tett_data[tett_data['Tetrahymena_fxn15_06032020'] > 30]

In [4]:
tett_data_mostabundant

Unnamed: 0,human_genename,human_uniprot_ID,euNOG_ID,tetts_uniprot,#N/A,total_counts,Tetrahymena_fxn14_06032020,Tetrahymena_fxn15_06032020
0,,,,I7M9Q5,Uncharacterized protein,348,206.0,142.0
1,,,ENOG410J2R0,Q23FB7,Uncharacterized protein,176,92.0,84.0
2,"HSP90B2P,GRP94B,GRP94P1,TRAP1,HSP90B1,GRP94,TR...","Q58FF3,P14625,Q58FF6,Q58FF8,Q58FF7,Q14568,Q58F...",KOG0020,Q22W82,HATPase_c domain-containing protein,164,88.0,76.0
3,,,ENOG410JP72,I7MJ89,Uncharacterized protein,138,86.0,52.0
4,"TUBB4B,TUBB2C,TUBB2B,TUBB8,TUBB1,TUBB,TUBB5,OK...","P68371,Q9BVA1,Q3ZCM7,Q9H4B7,P07437,Q13509,P043...",KOG1375,Q24B92,Tubulin beta chain,117,60.0,57.0
5,"DNHD1,C11orf47,CCDC35,DHCD1,DNHD1L,UNQ5781/PRO...","Q96M86,Q96DT5,Q9UFH2,Q6ZR08,Q9P225,Q8TE73,Q9NY...",KOG3595,I7M6H4,"Dynein-1-alpha heavy chain, flagellar inner ar...",96,46.0,50.0
6,"HSPD1,HSP60",P10809,KOG0356,Q23JZ7,Chaperonin CPN60-1,95,44.0,51.0
7,"DOP1A,DOPEY1,KIAA1117",Q5JWR5,KOG3613,Q23JK7,"Dopey, amine-terminal domain protein",91,38.0,53.0
8,PSMD1,Q99460,KOG2062,I7LTL6,"26S proteasome regulatory complex, subunit RPN2",87,39.0,48.0
9,"PSMD2,TRAP2",Q13200,KOG2005,I7M1R0,26S proteasome regulatory subunit,82,38.0,44.0


In [31]:
# makes list of human proteins
human_protein_list = tett_data_mostabundant['tetts_uniprot '].tolist()
# removes nans from list
human_protein_list = [x for x in human_protein_list if str(x) != 'nan']
# flattens list
human_protein_list = [x.split(",") for x in human_protein_list]
human_protein_list = [item for sublist in human_protein_list for item in sublist]

In [32]:
prot_seqs = []
for i in human_protein_list:
    params = {"query": i, "format": "fasta"} 
    response = requests.get("http://www.uniprot.org/uniprot/", params)
    seq = response.text
    seqq = ''.join([x for x in seq.split('\n')[1:]])
    prot_seqs.append(seqq)

In [33]:
GENES_AND_SEQUENCES = dict(zip(human_protein_list, prot_seqs))


In [34]:
InteractiveShell.ast_node_interactivity = "all"
# Create logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # SET YOUR LOGGING LEVEL HERE #

In [35]:
# Other logger stuff for Jupyter notebooks
handler = logging.StreamHandler(sys.stderr)
formatter = logging.Formatter('[%(asctime)s] [%(name)s] %(levelname)s: %(message)s', datefmt="%Y-%m-%d %H:%M")
handler.setFormatter(formatter)
logger.handlers = [handler]

In [36]:
# SET FOLDERS AND DATA HERE
ROOT_DIR = tempfile.gettempdir()

PROJECT = 'proteins_human_ortho_Frac15'
PDB_FILE_TYPE = 'pdb'

In [37]:
# Create the GEM-PRO project
my_gempro = GEMPRO(gem_name=PROJECT, root_dir=ROOT_DIR, genes_and_sequences=GENES_AND_SEQUENCES, pdb_file_type=PDB_FILE_TYPE)

[2020-06-07 18:51] [ssbio.pipeline.gempro] INFO: Creating GEM-PRO project directory in folder /tmp
[2020-06-07 18:51] [ssbio.pipeline.gempro] INFO: /tmp/proteins_human_ortho_Frac15: GEM-PRO project location
[2020-06-07 18:51] [ssbio.pipeline.gempro] INFO: Added 26 genes to GEM-PRO project
[2020-06-07 18:51] [ssbio.pipeline.gempro] INFO: Loaded in 26 sequences
[2020-06-07 18:51] [ssbio.pipeline.gempro] INFO: 26: number of genes


In [38]:
# Mapping using BLAST
my_gempro.blast_seqs_to_pdb(all_genes=True, seq_ident_cutoff=.9, evalue=0.00001)
my_gempro.df_pdb_blast.head(10)

A Jupyter Widget




[2020-06-07 18:53] [ssbio.pipeline.gempro] INFO: Completed sequence --> PDB BLAST. See the "df_pdb_blast" attribute for a summary dataframe.
[2020-06-07 18:53] [ssbio.pipeline.gempro] INFO: 1: number of genes with additional structures added from BLAST


Unnamed: 0_level_0,pdb_id,pdb_chain_id,hit_score,hit_evalue,hit_percent_similar,hit_percent_ident,hit_num_ident,hit_num_similar
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q24B92,5ubq,F,2248.0,0.0,0.927765,0.927765,411,411
Q24B92,5ubq,B,2248.0,0.0,0.927765,0.927765,411,411
Q24B92,5ubq,D,2248.0,0.0,0.927765,0.927765,411,411
Q24B92,5ucy,B,2248.0,0.0,0.927765,0.927765,411,411
Q24B92,6u0h,B,2252.0,0.0,0.930023,0.930023,412,412
Q24B92,6u0t,M,2252.0,0.0,0.930023,0.930023,412,412
Q24B92,6u0t,L,2252.0,0.0,0.930023,0.930023,412,412
Q24B92,6u0t,K,2252.0,0.0,0.930023,0.930023,412,412
Q24B92,6u0t,J,2252.0,0.0,0.930023,0.930023,412,412
Q24B92,6u0t,B,2252.0,0.0,0.930023,0.930023,412,412


In [39]:
massspec_pdbdf = my_gempro.df_pdb_blast

In [41]:
uniqueset_massspec_pdblist = set(massspec_pdbdf['pdb_id'].to_list())

In [42]:
len(uniqueset_massspec_pdblist)

5

In [43]:
uniqueset_massspec_pdblist = [x.upper() for x in uniqueset_massspec_pdblist]

In [44]:
uniqueset_massspec_pdblist

['6U0H', '5UCY', '6U0T', '6U0U', '5UBQ']

In [45]:
DATA_DIR = os.path.join(os.path.dirname(os.getcwd()),'data/build_data/')
DEST_DIR = '/home/cns-mccafferty/NozPiker/data/tetts_F15_massSpec/'

files = glob.glob(DATA_DIR + "*.mrcs")
madefiles = []
for i in files:
    shortname = i.split('/')[-1].split('_')[1]
    if shortname in uniqueset_massspec_pdblist:
        madefiles.append(shortname)
        shutil.copy(i, DEST_DIR)

In [46]:
len(madefiles)

0

In [47]:
l3 = [x for x in uniqueset_massspec_pdblist if x not in madefiles]

In [49]:
for i in l3:
    try:
        NZ.makeImages(DEST_DIR, i)
    except AttributeError:
        pass

TUBULINALPHACHAIN


('TUBULINALPHACHAIN', 'TUBULINALPHACHAIN_6U0H_proj.mrcs')

TUBULINALPHACHAIN


('TUBULINALPHACHAIN', 'TUBULINALPHACHAIN_5UCY_proj.mrcs')

TUBULINALPHACHAIN


('TUBULINALPHACHAIN', 'TUBULINALPHACHAIN_6U0T_proj.mrcs')

TUBULINALPHACHAIN


('TUBULINALPHACHAIN', 'TUBULINALPHACHAIN_6U0U_proj.mrcs')

TUBULINALPHACHAIN


('TUBULINALPHACHAIN', 'TUBULINALPHACHAIN_5UBQ_proj.mrcs')

In [30]:
tetts_protein_list = tett_data_mostabundant['tetts_uniprot '].tolist()

In [2]:
NZ.makeImages('/home/cns-mccafferty/NozPiker/data/tetts_F15_massSpec_only/', '2CG9')

ATPDEPENDENTMOLECULARCHAPERONEHSP82


('ATPDEPENDENTMOLECULARCHAPERONEHSP82',
 'ATPDEPENDENTMOLECULARCHAPERONEHSP82_2CG9_proj.mrcs')