## Notebook Outline

1. import statements
2. load expression data
3. ID mapping via a genbank parse
4. COG - get a functional annotatino and categorization for each protein
5. Gene essentiality


In [4]:
### Step 1 - import a bunch of libraries
from pandas import DataFrame, read_table
import pandas as pd
from Bio import SeqIO


In [14]:
#### Step 2 - id mapping
##read in the genbank file from E. coli to start out our data frame
## just with the basics of meta-data necessary

Ecoli_mapper = []
Ecoli_genbank_file = "W3110.gb" # the specific strain of E coli
Ecoli_chromosome = SeqIO.read(Ecoli_genbank_file, "genbank")

##this is a special one, because it's genbank and not RefSeq, so I have to do a bit more parsing
#to get the data that I want.
for feature in Ecoli_chromosome.features:
    #first switch out of a bunch of things that we don't really care about
    # must be a CDS, with a RefSeq accession and uniprot accession
    if not feature.type == "CDS":
        continue
    if not 'protein_id' in feature.qualifiers:
        continue
    location = feature.location
    refseq = feature.qualifiers['protein_id'][0] # stupid list.
    gene_sym = feature.qualifiers['gene'][0]
    note = feature.qualifiers['note'][0]
    #now parse the note which is apparently where they've chosen to hide the blattner number
    #  /note="ECK0101:JW5008:b0101"
    locus = note.split(":")[-1] # the last thing in the note is b0007
    #sometimes the locus contains a comment like 'b1011; predicted enzyme'
    if ';' in locus:
        locus = locus.split(';')[0] # the first thing before the semicolon
    Ecoli_mapper.append({'RefSeq':refseq, 'gene':gene_sym, 'locus':locus, 'start':location.start,
                        'stop':location.end, 'strand':location.strand})
    #break
#done with the loop, now we make a data frame.
# this data frame should have all e. coli genes
df = pd.DataFrame(data=Ecoli_mapper)
df


Unnamed: 0,RefSeq,gene,locus,start,stop,strand
0,BAE76026.1,thrL,b0001,189,255,1
1,BAB96579.2,thrA,b0002,336,2799,1
2,BAB96580.2,thrB,b0003,2800,3733,1
3,BAB96581.1,thrC,b0004,3733,5020,1
4,BAB96582.1,yaaX,b0005,5233,5530,1
5,BAB96584.1,yaaA,b0006,5682,6459,-1
6,BAB96585.1,yaaJ,b0007,6528,7959,-1
7,BAB96586.1,talB,b0008,8237,9191,1
8,BAB96587.2,mog,b0009,9305,9893,1
9,BAB96588.2,yaaH,b0010,9927,10494,-1


In [16]:
### Step 3 - COG
# Step 3.1 gather the COG data for E. coli and merge that into the table
# COG is the file that does not have full refseq (it lacks a decimal)
# so that is why we are merging on the 'matchid' column
COG_file = "E_coli_blattner_cog.txt"
df_COG = pd.read_table(COG_file, sep="\t")
df = df.merge(df_COG, left_on="locus", right_on="locus", how='left')

df.head(20)
out_filename = "E_coli_metadata.txt"
df.to_csv(out_filename, sep="\t", index=False)

In [17]:
### Step 3 - coordinates for the deletion series
#this is the start and stop of every point in the E. coli deletion series
# note that some steps have multiple pieces which were deleted, and so
# that means that there are multiple start,stop tuples
#list of beginning and end points of to-be-deleted sequences
step1 = [(1049227,1097311)]
step2 = [(3109247,3133480)]
step3 = [(4239495,4255870)]
step4 = [(1198444,1225484)]
step5 = [(1630054,1640180),(1641770,1649361)]
step6 = [(518367,533048),(535841,550551),(564277,608454)]
step7 = [(3486428,3496908),(3536156,3549686),(3556478,3576238)]
step8 = [(2984498,3032266)]
step9 = [(244301,253746),(262300,387867),(389475,404039)]
step10 = [(3697305,3709526),(3762909,3772816),(3774088,3785257)]
step11 = [(1963664,1992215),(1995801,2027348),(2032588,2043515),(2068289,2081066),(2103532,2115203)]
step12 = [(2754815,2788618),(2794312,2800028),(2824452,2829398)]
step13 = [(3361081,3370891)]
step14 = [(2145403,2186994),(2199808,2228989),(2255035,2262631)]
step15 = [(3076224,3078055)]
step16 = [(4501438,4514770),(4523114,4604874)]
step17 = [(729157,739929)]
step18 = [(4414768,4428387)]
step19 = [(4003005,4021346)]
step20 = [(832889,848426),(867975,883810),(891335,909716)]
step21 = [(3164770,3172081)]
step22 = [(675440,689710)]
step23 = [(2422504,2433237),(2471991,2481621)]
step24 = [(4170563,4186908)]
step25 = [(2385401,2394610)]
step26 = [(3411500,3421220)]
step27 = [(2909410,2909709)]
MGF01 = [(167401,173315)]
MGF02 = [(640669,659192),(2921194,2943638),(65857,78797),(4128291,4141015),
         (1853580,1864142),(3794688,3803458),(3224890,3240400),(4055372,4061465),
         (1875288,1888499),(2876630,2876783),(2599858,2613439)]
DGF298 = [(15445,16903),(19646,20508),(1098128,1117068),(1296003,1308864),(1359517,1368530),(1522387,1543782),
          (1592568,1599800),(1651508,1657863),(2091353,2091607),(2289724,2293603),(2411459,2412234),(2494688,2513688),
          (2519718,2523278),(2909410,2909709),(3183787,3188521),(3746428,3751980),(3871777,3914528),(3987068,3989225),
          (4447062,4453719),(1130992,1142564),(1396605,1520045),(1582557,1592251),(1607823,1624231),(2361574,2382929),
          (2836910,2855462),(3058023,3068464),(3258359,3269672),(3730114,3742029),(4281498,4291254),(4298072,4366613),
          (747145,753217),(3918091,3969123)]

steps = [step1,step2,step3,step4,step5,step6,step7,step8,step9,step10,step11,step12,step13,
         step14,step15,step16,step17,step18,step19,step20,step21,step22,step23,step24,step25,
         step26,step27,MGF01,MGF02,DGF298]

In [20]:
### Step 4 - looking at the coordinates to find out which genes
# are affected by the deletions

step_mutant = [] # array to gather stuff in, which later gets turned into a dataframe


## first a method that inputs the values for gene deletions
#which will be present in later steps
# we need this method, because a deletion present in step 1
# is also present in steps 2 and 3 and ... n. But they are only
# going to be hitby the loop in step 1. so we take care of all 
# the extra steps in this method.
def DeletedInLaterSteps (locus, step_count, gene, del_type):
    #step_count is what is passed in, representing the step where this deletion was first seen
    d = {'locus':locus,'gene':gene} # a dictionary that will grow in the loop
    for i in range (step_count, len(steps)+1):  # have to do len+1 because we're 1-based in naming steps
        step_label = "step{:02d}".format(i)
        d[step_label] = del_type
    #now after we've made this big dictionary, I have to add it to the array which becomes the dataframe
    step_mutant.append(d)

    
    
    
#### the actual loop through all the various deletion steps    
# now I need to cycle through the data frame and get the coordinates for each protein
# and see if they are within the various knockouts

counter = 0
for deletion_coords in steps:
    counter +=1 # at the begninning to start at 1, not zero
    step_label = "step%s"%counter
    #some steps have multiple deleted segments
    for i in range(len(deletion_coords)):
        (del_start, del_stop) = deletion_coords[i]
        for index, row in df.iterrows():
            start, stop = row['start'], row['stop']
            locus = row['locus']
            gene = row['gene']
            #check if deletion starts within the gene boundary
            if del_start in range(start, stop): 
                if row['strand'] == 1: #biopython encodes positive strand as 1
                    #this is a 5' partial gene disruption
                    del_type = "5' partial deletion"
                    #step_mutant.append({'locus':locus, step_label:del_type, 'gene':gene})
                    DeletedInLaterSteps(locus, counter, gene, del_type)
                else: # this is a 3' partial gene disruption
                    del_type = "3' partial deletion"
                    #step_mutant.append({'locus':locus, step_label:del_type, 'gene':gene})
                    DeletedInLaterSteps(locus, counter, gene, del_type)
            #check if deletion stops within the gene boundary
            elif del_stop in range (start, stop):
                if row['strand'] == 1: #biopython encodes positive strand as 1
                    #this is a 5' partial gene disruption
                    del_type = "3' partial deletion"
                    #step_mutant.append({'locus':locus, step_label:del_type, 'gene':gene})
                    DeletedInLaterSteps(locus, counter, gene, del_type)
                else: # this is a 3' partial gene disruption
                    del_type = "5' partial deletion"
                    #step_mutant.append({'locus':locus, step_label:del_type, 'gene':gene})
                    DeletedInLaterSteps(locus, counter, gene, del_type)
            #check if the gene is entirely within a gene boundary
            elif (start in range (del_start, del_stop)) and (stop in range(del_start, del_stop)):
                del_type = "complete deletion"
                DeletedInLaterSteps(locus, counter, gene, del_type)
                #step_mutant.append({'locus':locus, step_label:del_type, 'gene':gene})
            
df_temp = pd.DataFrame(data=step_mutant)
#making column names be step%s was easy, but a few had real names
df_temp.rename(index=str, columns={'step28':'MGF01', 'step29':'MGF02', 'step30':'DGF298'}, inplace=True)
#df_temp

In [21]:
## save out data frame, because I've done a lot of work to get here
#df = df.merge(df_temp, left_on='locus', right_on='locus', how='left')
out_filename = "E_coli_data_frame_knockout.txt"
df_temp.to_csv(out_filename, sep="\t", index=False)

In [22]:
df_temp.tail(20)

Unnamed: 0,gene,locus,step01,step02,step03,step04,step05,step06,step07,step08,...,step21,step22,step23,step24,step25,step26,step27,MGF01,MGF02,DGF298
1605,ldrD,b4453,,,,,,,,,...,,,,,,,,,,complete deletion
1606,bcsG,b3538,,,,,,,,,...,,,,,,,,,,complete deletion
1607,bcsF,b3537,,,,,,,,,...,,,,,,,,,,complete deletion
1608,bcsE,b3536,,,,,,,,,...,,,,,,,,,,complete deletion
1609,yhjR,b3535,,,,,,,,,...,,,,,,,,,,complete deletion
1610,bcsA,b3533,,,,,,,,,...,,,,,,,,,,complete deletion
1611,bcsB,b3532,,,,,,,,,...,,,,,,,,,,complete deletion
1612,bcsZ,b3531,,,,,,,,,...,,,,,,,,,,complete deletion
1613,bcsC,b3530,,,,,,,,,...,,,,,,,,,,complete deletion
1614,bcsC,b3530,,,,,,,,,...,,,,,,,,,,complete deletion
