In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import glob

# We should consider filtering our data a bit because it's a bit messy currently

**I've written this scaffold to make things easier but code similar to what I've written below should be written as a function or two and run for every host and every virus probably saving as a new "tsv" file called "yadayada.clean.tsv"**

**This currently works on E. coli but it's possible that there will be some weird genomes that something in here screws up on so we'll need to think about double checking things to make sure that we never remove too much data. Which is to say that df.shape shouldn't change *that* dramatically from beginning to end and if it does it's likely that there is a bug in the code that needs to be investigated**

In [18]:
# Load the raw annotation table for host genome 305 and renumber its rows
# 0..n-1 (the first file column is consumed as the index and then discarded).
# NOTE(review): this cell parses the file with sep=',' while later cells read
# the same directory with sep="\t" -- confirm which delimiter the files use.
df = (
    pd.read_csv('../Data/host_genomes/305.tsv', sep=',', index_col=0)
    .reset_index(drop=True)
)
print(df.shape)
df.head()

(3466, 12)


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,upstream_sequence,energy_binding
0,NC_003295.1,Protein Homology,CDS,486,620,.,+,0,ID=cds-WP_003262958.1;Parent=gene-RS_RS25220;D...,ATGAAACGTACCTATCAACCTTCCGTTACCCGTCGCAAGCGCACCC...,CATTAACCAGAGAGTGCATC,-3.18
1,NC_003295.1,Protein Homology,CDS,682,1113,.,+,0,ID=cds-WP_019717302.1;Parent=gene-RS_RS00005;D...,ATGGGCCCGCACGCCTACCCCAAGGCCGCAAGGCTTGTGAAAACGG...,GTGTTCGCGGCAATTCGGCG,-1.74
2,NC_003295.1,Protein Homology,CDS,1110,1424,.,+,0,ID=cds-WP_010999971.1;Parent=gene-RS_RS00010;D...,ATGACGCGCGTGCTGCTGTTCCTGCTGCGTGTCTACAAGGTGGCGT...,CCCTCCGACGGAGCCGCGCC,-5.82
3,NC_003295.1,Protein Homology,CDS,1431,3092,.,+,0,ID=cds-WP_010999972.1;Parent=gene-RS_RS00015;D...,ATGGATATCAAACGCACCATTCTCTGGGTGATCTTCTCGCTGTCGG...,TTCCCAGACCGTAATCCGAC,-0.58
4,NC_003295.1,Protein Homology,CDS,3222,4667,.,+,0,ID=cds-WP_010999973.1;Parent=gene-RS_RS00020;D...,ATGACCGCCTCTTCCCACGCCATGACTTCGCCGACTGTTTCCGATG...,GCTGCCGCACAATACCGCCC,-0.07


**First, make sure the table only uses each `locus_tag` once. These should be unique identifiers for the hosts and should therefore only appear once. If they appear more than once I have no idea what's happening. The virus dataframes won't be able to run this part of the code, however.**

In [19]:
# Parse the locus tag out of each qualifiers string: take everything after the
# first 'Parent=gene-' marker, then cut it at the next ';'.
after_parent_marker = df['qualifiers'].str.split('Parent=gene-', n=1, expand=True)[1]
df['locus_tag'] = after_parent_marker.str.split(';', n=1, expand=True)[0]

In [20]:
# Grab the qualifiers text of the row labelled 0 to eyeball its format.
test_string = df.loc[0, "qualifiers"]

In [21]:
# Display the full qualifiers string captured above.
test_string

'ID=cds-WP_003262958.1;Parent=gene-RS_RS25220;Dbxref=Genbank:WP_003262958.1,GeneID:31479960;Name=WP_003262958.1;gbkey=CDS;gene=rpmH;inference=COORDINATES: similar to AA sequence:RefSeq:WP_003816025.1;locus_tag=RS_RS25220;product=50S ribosomal protein L34;protein_id=WP_003262958.1;transl_table=11'

In [22]:
# Show the ten most frequent locus tags; any count above 1 means a repeated tag.
df['locus_tag'].value_counts()[:10]

RS_RS07500    2
RS_RS07225    2
RS_RS25525    2
RS_RS07245    2
RS_RS11380    2
RS_RS04135    2
RS_RS26580    2
RS_RS25540    2
RS_RS09230    2
RS_RS26610    2
Name: locus_tag, dtype: int64

**Oooof, let's get rid of those**

In [23]:
# Discard every row whose locus tag occurs more than once (no copy is kept).
repeated_tag = df.duplicated(subset=['locus_tag'], keep=False)
df = df[~repeated_tag]
print(df.shape)

(3432, 13)


In [24]:
# Re-check the most frequent locus tags; after de-duplication each should appear once.
df['locus_tag'].value_counts()[:10]

RS_RS11035    1
RS_RS08945    1
RS_RS16780    1
RS_RS00940    1
RS_RS14325    1
RS_RS10380    1
RS_RS06340    1
RS_RS25705    1
RS_RS00770    1
RS_RS16440    1
Name: locus_tag, dtype: int64

**Voila!**

**How about any weird length genes? This should be checked and removed for all genomes (host and virus)**

In [25]:
# Tally (stop - start) mod 3 across all rows. A remainder of 2 is the value the
# following filter cell keeps.
((df['stop']-df['start']) %3).value_counts()

2    3385
1      24
0      23
dtype: int64

**Clean them up too**

In [26]:
# Keep only rows whose (stop - start) leaves remainder 2 when divided by 3,
# i.e. (stop - start + 1) is a multiple of three.
coordinate_span = df['stop'] - df['start']
df = df[coordinate_span % 3 == 2]
print(df.shape)

(3385, 13)


**Remove phage/prophage genes from descriptors. This again should only be run for the host genomes so you can see maybe we'll want separate functions like "def clean_host(...)" and "def clean_virus(...)"**

In [30]:
# Collect the rows whose qualifiers mention the filter word so they can be
# inspected before being removed.
filter_word = 'phage'
mentions_filter_word = df['qualifiers'].str.contains(filter_word)
virus_genes = df[mentions_filter_word == True]
print(virus_genes.shape)

(31, 13)


In [34]:
# Keep only the rows whose qualifiers do not mention the filter word.
lacks_filter_word = df['qualifiers'].str.contains(filter_word) == False
df = df[lacks_filter_word]
df.shape

(3354, 13)

0.9676860934795153

In [33]:
# Spot-check one of the filtered rows (index label 93) to see why it matched.
virus_genes.loc[93]["qualifiers"]

'ID=cds-WP_011000059.1;Parent=gene-RS_RS00465;Dbxref=Genbank:WP_011000059.1,GeneID:1218895;Name=WP_011000059.1;gbkey=CDS;inference=COORDINATES: similar to AA sequence:RefSeq:WP_003261510.1;locus_tag=RS_RS00465;product=phage holin family protein;protein_id=WP_011000059.1;transl_table=11'

# Add in knowledge of codon usage bias

**This is an important control and you'll have to install this "iCUB" software to make this run. I worked on this project with a former graduate student in my PhD lab and the code is here: <https://github.com/amarallab/iCUB>**

**The major bug that I can anticipate cropping up is if the organism uses a "non-standard" genetic code but we'll cross that bridge when we get there. This will also probably cause an error if any "non-standard" nucleotides appear in a given genome which will almost certainly occur and need to be fixed**

In [40]:
import sys
sys.path.append('/Users/ChaseWeaver/Desktop/Projects/iCUB-master/')
import iCUB

In [84]:
# Score codon usage bias one gene at a time. Sequences whose length is not a
# multiple of three are skipped, so no iCUB value is written for them.
for index, nt_seq in df["coding_sequence"].items():
    if len(nt_seq) % 3 != 0:
        continue
    df.at[index, "iCUB"] = iCUB.iCUB_Calculator(nt_seq).get_iCUB()

KeyboardInterrupt: 

**If our genes are clean and good then we shouldn't have any null values in either of these categories and we can then write this new dataframe**

In [85]:
# Report how many rows are still missing a value in each computed column;
# the first number of each printed shape is that row count.
for checked_column in ('iCUB', 'energy_binding'):
    print(df[df[checked_column].isnull()].shape)

(3820, 14)
(0, 14)


# Final notes:

**As I mentioned, we aren't going to want to step through this code for every genome so your task is to write this up into some functions that you can then put into for loops and run for everyone. I'm imagining something like:**

<code>
for host_tsv_file in glob.glob('blahblah/\*.tsv'):
    clean_host_tsv(host_tsv_file)
</code>
    
**where `clean_host_tsv()` will read the tsv, clean it up, check that no problems occurred, and write the new tsv. Something along the lines of:**

<code>
def clean_host_tsv(tsv_location):

    df = pd.read_csv(tsv_location, sep='\t', ...)
    starting_shape = df.shape
    #Code that does the work goes here
    ending_shape = df.shape
    #Code that compares ending and starting shapes to make sure things look reasonable goes here
    df.to_csv(tsv_location.replace('.tsv', '.clean.tsv'))
</code>


**And then again we'll have something like this for the viruses:**

<code>
for virus_folder in glob.glob('blahblah/\*\_rep\_viruses/'):
    for virus_tsv_file in glob.glob(virus_folder + '\*.tsv'):
        clean_virus_tsv(virus_tsv_file)
</code>


In [None]:
# Load the raw table for host genome 562 (tab-separated, first column used as
# the index) and print its shape as a baseline before cleaning.
df = pd.read_csv('../Data/host_genomes/562.tsv', sep = "\t", index_col = 0)
print(df.shape)

In [57]:
def clean_host_tsv(df, codon_bias_fn=None):
    """
    Return a cleaned copy of a host-genome annotation table.

    The caller's frame is left untouched. The cleaning steps run in this order:

    1. Renumber the rows 0..n-1.
    2. Add a ``locus_tag`` column parsed from the ``locus_tag=...`` field of
       ``qualifiers`` and drop every row whose tag occurs more than once
       (all copies are dropped, none is kept).
    3. Keep only rows where ``(stop - start) % 3 == 2``.
    4. Drop rows whose ``qualifiers`` contain the substring ``phage``.
    5. Add an ``iCUB`` column holding the codon-usage-bias score of each
       ``coding_sequence`` whose length is a multiple of three; all other
       rows get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``qualifiers``, ``start``, ``stop`` and
        ``coding_sequence``.
    codon_bias_fn : callable, optional
        Maps a nucleotide string to its score. When omitted, the score is
        ``iCUB.iCUB_Calculator(seq).get_iCUB()``, which requires the ``iCUB``
        module to be imported in the notebook before this function is called.

    Returns
    -------
    pandas.DataFrame
        The filtered table with the extra ``locus_tag`` and ``iCUB`` columns.
        Surviving rows keep the index labels assigned in step 1, so the index
        may contain gaps.
    """
    if codon_bias_fn is None:
        def codon_bias_fn(nt_seq):
            return iCUB.iCUB_Calculator(nt_seq).get_iCUB()

    # reset_index returns a new frame, so nothing below mutates the caller's data
    df = df.reset_index(drop=True)

    # Ensure each locus tag is only used once. str.extract yields NaN for rows
    # without a "locus_tag=" field instead of failing when the field is absent
    # from the whole table.
    df['locus_tag'] = df['qualifiers'].str.extract(r'locus_tag=([^;]*)', expand=False)
    df = df[~df.duplicated(subset=['locus_tag'], keep=False)]

    # Ensure that only genes with a total length divisible by 3 are used:
    # (stop - start) % 3 == 2 is the same as (stop - start + 1) % 3 == 0
    df = df[(df['stop'] - df['start']) % 3 == 2]

    # Filter out prophage genes. na=True sends rows with missing qualifiers
    # down the "matches" path, so they are removed as well.
    filter_word = 'phage'
    mentions_phage = df['qualifiers'].str.contains(filter_word, na=True)
    # Take an explicit copy so adding the iCUB column below writes to an
    # independent frame rather than to a slice of the filtered one.
    df = df[~mentions_phage].copy()

    # Codon usage bias: built in one pass so the column always exists, even
    # when no sequence qualifies for scoring.
    df['iCUB'] = [
        codon_bias_fn(nt_seq) if len(nt_seq) % 3 == 0 else float('nan')
        for nt_seq in df['coding_sequence']
    ]

    return df

In [78]:
# Clean every raw host table and write the result beside it as "<name>.clean.tsv".
# Processing stops at the first file whose row count looks suspicious so it can
# be investigated before anything further is written.
for tsv_location in glob.glob("../Data/host_genomes/*.tsv"):
    # The pattern also matches ".clean.tsv" files written by an earlier run;
    # skip them so re-running this cell never re-cleans its own output.
    if tsv_location.endswith(".clean.tsv"):
        continue

    df = pd.read_csv(tsv_location, sep = "\t", index_col = 0)
    initial_shape = df.shape
    df = clean_host_tsv(df)
    final_shape = df.shape

    # An empty input has no meaningful ratio; score it 0 so it is flagged below
    # instead of raising ZeroDivisionError.
    gene_ratio = final_shape[0] / initial_shape[0] if initial_shape[0] else 0.0
    # Flag files that lost more than 20% of their rows, or none at all.
    if gene_ratio <= .8 or gene_ratio == 1:
        print(tsv_location)
        break

    # Swap only the trailing extension; str.replace would also rewrite any
    # earlier ".tsv" occurring elsewhere in the path.
    clean_tsv_loc = tsv_location[:-len(".tsv")] + ".clean.tsv"
    df.to_csv(clean_tsv_loc, sep = "\t")
    


(4059, 14)


SyntaxError: 'break' outside loop (cell_name, line 7)