# Bioinformatics Workshop
### Kavli Institute for Theoretical Physics
#### Gita Mahmoudabadi | Phillips Lab | Caltech | August 2015

In [22]:
import os
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Summarise protein lengths for every fasta file in `path`: one row of length
# statistics per file is written to a tab-separated table, and one histogram
# of protein lengths per file is saved as a PDF.

# changing the directory to the folder where the protein fasta files are stored,
# so that the bare file names used below (inputs and outputs) resolve there.
path = '/Users/octatig88/anaconda/notebooks/kitp2015/protfiles/'
os.chdir(path)

# we need to go through every fasta file to extract protein length information from each.
# os.listdir returns names in arbitrary order, so they are sorted to keep the
# rows of the statistics table in the same order from run to run.
protfiles = sorted(os.listdir(path))

# opening a file to record protein length statistics for each organism.
# The `with` block closes (and flushes) it even if a fasta file fails to parse.
with open('protein_length_stat_multiple_organisms.txt', 'w') as outputfile:
    # header row; the first column (the file name) is left untitled.
    outputfile.write('\tNumber of Records\tMean Length\tMedian Length\tStandard Deviation\tMaximum Length\n')

    for protfile in protfiles:
        # we need to only look into fasta files because this folder also
        # receives the PDFs and the statistics table generated here.
        if not protfile.endswith(".fasta"):
            continue

        # file name without the ".fasta" extension; labels the table row,
        # the plot annotation and the PDF.
        inputfilename = protfile[0:-6]

        # looping through each fasta record and collecting its sequence length.
        # The fasta file is closed as soon as parsing is finished.
        with open(protfile) as h1:
            length_list = [len(prot_record.seq)
                           for prot_record in SeqIO.parse(h1, 'fasta')]

        numrec = str(len(length_list))

        # A fasta file without records has no mean/median/maximum: np.mean
        # would return nan (which cannot be converted to int) and max() would
        # raise. Record the file with a count of 0 and "NA" placeholders, and
        # skip its histogram, instead of aborting the whole run.
        if not length_list:
            outputfile.write('\t'.join([inputfilename, numrec, 'NA', 'NA', 'NA', 'NA']) + '\n')
            continue

        # mean, median and standard deviation are rounded to whole amino acids.
        # np.std is called with its default settings.
        meanl = str(int(round(np.mean(length_list))))
        medl = str(int(round(np.median(length_list))))
        dev = str(int(round(np.std(length_list))))
        maxl = str(max(length_list))

        # writing the extracted protein length statistics as one complete row,
        # so a failure above can never leave a half-written line in the table.
        outputfile.write('\t'.join([inputfilename, numrec, meanl, medl, dev, maxl]) + '\n')

        # making a histogram of protein lengths for each fasta file examined.
        fig = plt.figure(dpi=600)
        try:
            ax = fig.add_subplot(111)
            ax.hist(length_list, bins=200)
            ax.set_xlabel("Protein length (Amino Acids)")
            ax.set_ylabel("Frequency")

            # summary annotation; transform=ax.transAxes makes (0.7, 0.7) a
            # position in axes-fraction coordinates rather than data units.
            figuretext = inputfilename + "\nMean= " + meanl + "\nStd= " + dev + "\nNo. Sequences=" + numrec
            ax.text(0.7, 0.7, figuretext, horizontalalignment='center',
                    verticalalignment='center', transform=ax.transAxes)

            # saving the plot as a pdf in the current directory (the fasta folder).
            fig.savefig(inputfilename + "protein length histogram.pdf")
        finally:
            # closing the plot even if drawing or saving failed, so figures
            # do not accumulate in memory across files.
            plt.close(fig)