### Loading GEM-PRO dataframe

In [9]:
import pandas as pd

In [10]:
# Gene ID columns must be parsed as text *while reading* the CSV. Letting
# pandas infer a numeric dtype and converting with .astype(str) afterwards is
# lossy: an ID such as '10005.10' is first parsed as the float 10005.1 and
# would then be indistinguishable from '10005.1'.
GENE_ID_COLUMNS = ['m_gene_original', 'm_gene_entrez', 'm_gene_isoform']
GP = pd.read_csv(
    'DF_GEMPRO.csv',
    index_col=0,
    dtype={column: str for column in GENE_ID_COLUMNS},
)
# NOTE: a missing ID now stays NaN instead of becoming the literal string 'nan'.
GP.head()

Unnamed: 0,m_gene_original,m_gene_entrez,m_gene_isoform,u_uniprot_acc,u_isoform_id,u_refseq,u_ensp,u_seq_len,u_seq,u_reviewed,...,ssb_p_aln_coverage,ssb_p_percent_seq_ident,ssb_p_no_deletions_in_pdb,ssb_p_aln_coverage_sim,ssb_si_score,ssb_rez_score,ssb_raw_score,ssb_above_cutoffs,ssb_rank,ssb_best_file
0,100.1,100,1,P00813,P00813-1,NP_000013,ENSP00000361965,363.0,MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGL...,True,...,359.0,0.988981,True,359.0,1.586721,1.213333,2.800054,True,1.0,3iar.pdb
1,10005.1,10005,1,O14734,O14734-1,NP_005460,ENSP00000217455,319.0,MSSPQAPEDGQGCGDRGDPPGDLRSVLVTTVLNLEPLDEDLFRGRH...,True,...,,,,,,,,,,NP_005460.2_model1_fix.pdb
2,10005.2,10005,2,,,,,,,,...,,,,,,,,,,
3,10005.3,10005,3,,,,,,,,...,,,,,,,,,,
4,10007.1,10007,1,P46926,P46926-1,NP_005462,ENSP00000311876,289.0,MKLIILEHYSQASEWAAKYIRNRIIQFNPGPEKYFTLGLPTGSTPL...,True,...,281.0,0.972318,True,281.0,1.559988,1.175,2.734988,True,1.0,1ne7.pdb


#### Note: the Excel file contains the sequence IDs

### Output the row for a gene of interest (most importantly, its best PDB structure)

In [15]:
# Look up every row whose Entrez gene ID matches the value typed by the user.
# The column can be swapped to search by gene original, uniprot, isoform id, refseq etc.
gene_id = raw_input("What is the gene ID?   ").strip()  # ignore stray whitespace around the ID

# A boolean mask selects the matching rows directly, so the result does not
# depend on a hand-kept counter lining up with the index labels, nor on the
# deprecated DataFrame.ix indexer.
matches = GP[GP['m_gene_entrez'] == gene_id]

if matches.empty:
    # Say so explicitly instead of silently printing nothing.
    print('No rows found for gene ID %r' % gene_id)
else:
    for _, row in matches.iterrows():
        # One single-column frame per matching row, same layout as before.
        print(pd.DataFrame(row))

What is the gene ID?   a


### Writing FASTA files

In [17]:
import os.path
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

def write_fasta_file(sequence, fileout):
    '''
    Write a SeqRecord to disk in FASTA format unless the target file is already there.

    Input:  sequence - Biopython SeqRecord object to write
            fileout  - path of the FASTA file to create
    Output: Filename of fasta file (returned whether it was just written or already existed)
    '''

    fasta_path = "%s" % fileout

    # Only write when nothing is at that path yet; an existing file is left untouched.
    if not os.path.isfile(fasta_path):
        SeqIO.write(sequence, fasta_path, "fasta")
        return fasta_path

    print('FASTA file already exists %s' % fasta_path)
    return fasta_path

In [21]:
# example: for gene 100.1
example_gene = '100.1'
gene_rows = GP[GP.m_gene_original == example_gene]  # select the gene's rows once, reuse below

# the isoform ID doubles as the FASTA record ID and as the output file name
# (written to the working directory; the '/tmp/' prefix for a temporary folder is left out for now)
seq_id = gene_rows.u_isoform_id.values[0]
seq_output = seq_id + '.faa'

# wrap the raw sequence string in a Biopython SeqRecord object
seq = gene_rows.u_seq.values[0]
protein_seq = Seq(seq, IUPAC.protein)
seq_biop = SeqRecord(protein_seq, id=seq_id, description='uniprot sequence')

# write the SeqRecord object out (formats it as FASTA file)
out_file = write_fasta_file(seq_biop, seq_output)

FASTA file already exists P00813-1.faa


In [19]:
# Print the FASTA file whose name write_fasta_file returned above. out_file has no '/tmp/'
# prefix (it was left out when seq_output was built), so the file is read from the working
# directory; all fasta files were already written in "sequence_files".
!cat $out_file

>P00813-1 uniprot sequence
MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGLLNVIGMDKPLTLPD
FLAKFDYYMPAIAGCREAIKRIAYEFVEMKAKEGVVYVEVRYSPHLLANSKVEPIPWNQA
EGDLTPDEVVALVGQGLQEGERDFGVKARSILCCMRHQPNWSPKVVELCKKYQQQTVVAI
DLAGDETIPGSSLLPGHVQAYQEAVKSGIHRTVHAGEVGSAEVVKEAVDILKTERLGHGY
HTLEDQALYNRLRQENMHFEICPWSSYLTGAWKPDTEHAVIRLKNDQANYSLNTDDPLIF
KSTLDTDYQMTKRDMGFTEEEFKRLNINAAKSSFLPEDEKRELLDLLYKAYGMPPSASAG
QNL


## Aligning two FASTA files

In [22]:
import os.path
from Bio.Emboss.Applications import NeedleCommandline

def run_alignment(fasta1_id, fasta1, fasta2_id, fasta2):
    '''
    Aligns two FASTA files with the needle program and returns the name of the alignment file.
    If the alignment file is already present, needle is not run again.
    Standard gap inputs are used (gapopen=10, gapextend=0.5).

    Input:  fasta1_id - ID of the first sequence, used in the output file name
            fasta1    - fasta file name ("reference" sequence)
            fasta2_id - ID of the second sequence, used in the output file name
            fasta2    - fasta file name (what you're interested in aligning)
    Output: alignment_file - file name of alignment, "<fasta1_id>_<fasta2_id>_align.txt"
    '''

    id_pair = (fasta1_id, fasta2_id)
    alignment_file = "%s_%s_align.txt" % id_pair

    if not os.path.isfile(alignment_file):
        print('**RUNNING ALIGNMENT FOR %s AND %s**' % id_pair)
        needle = NeedleCommandline(asequence=fasta1, bsequence=fasta2, gapopen=10, gapextend=0.5, outfile=alignment_file)
        needle()  # the returned stdout/stderr are not needed; the result is in alignment_file
        return alignment_file

    print('Alignment %s file already exists' % alignment_file)
    return alignment_file

In [11]:
# load Biopython PDB packages
from Bio.PDB.PDBList import PDBList      # PDBList to download PDBs
from Bio.PDB.PDBParser import PDBParser  # PDBParser to load and work with files

import urllib2
import uuid

pdbl = PDBList()
parser = PDBParser()

# download pdb
pdb_id = '3iar'
pdb_file_path = pdbl.retrieve_pdb_file(pdb_id)

Structure exists: '/Users/LAURENCE/Desktop/Senior Design/Untitled Folder/ia/pdb3iar.ent' 


In [13]:
from Bio.Emboss.Applications import NeedleCommandline

# I manually downloaded 3iar.faa from Nathan's dropbox
needle_options = dict(
    asequence="3iar.faa",
    bsequence="P00813-1.faa",
    gapopen=10,
    gapextend=0.5,
    outfile="needle.txt",
)
needle_cline = NeedleCommandline(**needle_options)
stdout, stderr = needle_cline()

ApplicationError: Non-zero return code 127 from 'needle -outfile=needle.txt -asequence=3iar.faa -bsequence=P00813-1.faa -gapopen=10 -gapextend=0.5', message '/bin/sh: needle: command not found'

In [10]:
from Bio import AlignIO

# parse needle.txt as an EMBOSS-format alignment and show it
alignment_path = "needle.txt"
align = AlignIO.read(alignment_path, "emboss")
print(align)

IOError: [Errno 2] No such file or directory: 'needle.txt'