In [None]:
%matplotlib inline

# Notebook summary

The purpose of this notebook is simply to clean up some of the files that were created in `Compile_data.ipynb`. The output of that notebook was a set of `.tsv` files organized into folders within `../Data/` for both host and virus genomes. Those `.tsv` files (potentially) contained some bugs so we're going to run some filters/tests and store cleaned versions. 

# Imports

The only custom import here is iCUB (a codon usage bias calculator) that can be found here:
https://github.com/amarallab/iCUB

In [1]:
import pandas as pd
import glob
import sys
sys.path.append('../../iCUB/') ###iCUB needs to be installed for this to work
import iCUB

# Custom functions

Despite some similarities, there are a few crucial differences between the host and virus `.tsv` files that dictate separate cleaning functions.

In [11]:
def common_cleaning(df):

    df = df[df['upstream_sequence'].isnull()==False]
    df = df[df['coding_sequence'].isnull()==False]
    
    ###Add a codon usage bias and GC percent columns
    for index in df.index:
        nt_seq = df.at[index, "coding_sequence"]
        if len(nt_seq) != nt_seq.count('A') + nt_seq.count('T') + nt_seq.count('C') + nt_seq.count('G'):
            continue
        if len(nt_seq)%3 != 0:
            continue
        df.at[index, 'iCUB'] = iCUB.iCUB_Calculator(nt_seq).get_iCUB()
        #
        df.at[index, 'GC_cds'] = (nt_seq.count('G') + nt_seq.count('C')) / len(nt_seq)
        #
        up_seq = df.at[index, 'upstream_sequence']
        df.at[index, 'GC_upstream'] = (up_seq.count('G') + up_seq.count('C')) / len(up_seq)
    
    ###Remove genes for which we were unable to calculate codon usage bias or RBS energies
    df = df[df['iCUB'].isnull()==False]
    df = df[df['GC_cds'].isnull()==False]
    df = df[df['GC_upstream'].isnull()==False]
    df = df[df['RBS_energy'].isnull()==False]
    df = df[df['RBS_energy_upstream'].isnull()==False]
    return df

def clean_host_tsv(df):
    """
    WRITE A BRIEF PURPOSE/SUMMARY
    
    Development notes: I could also think about testing for stop codons within coding sequences
    and filtering accordingly. Also a way to test for possible non-standard genetic code usage
    
    Input/s:
        df - a pandas dataframe with numeric indices, outputted from Compile_data.ipynb and read
                back in
        
    Output/s:
        df - a clean version of the dataframe with one new column (iCUB) and (potentially)
                several rows removed
    """
    df = df.reset_index(drop = True)

    ###Filter out possible prophage genes by removing anything involving the word phage
    ###Numerous possibilities/ways to do this and not all genomes might have any decent
    ###descriptions in the qualifiers.
    filter_word = 'phage'
    df = df[(df['qualifiers'].str.contains(filter_word)==False)]
    
    ###Run the main cleaning/additions
    df = common_cleaning(df)
    
    ###Now ensure that each locus tag is only used once and when in doubt remove them both
    df['locus_tag'] = df['qualifiers'].str.split('locus_tag=', n=1, expand=True)[1]\
                            .str.split(';', n=1, expand=True)[0]
    df = df.drop_duplicates(subset = ['locus_tag'], keep = False)
    return df

def clean_virus_tsv(df):
    """
    This is the same basic structure as the "clean_host_tsv" function. 
    
    Input/s:
        df - a pandas dataframe with numeric indices, outputted from Compile_data.ipynb and read
                back in
        
    Output/s:
        df - a clean version of the dataframe with one new column (iCUB) and (potentially)
                several rows removed
                
    """  
    df = df.reset_index(drop = True)
    
    ###Run the main cleaning/additions
    df = common_cleaning(df)

    ###Now ensure that each viral_id tag is only used once (and when in doubt remove them BOTH)
    df['viral_id'] = df['qualifiers'].str.split('ID=', n=1, expand=True)[1]\
                            .str.split(';', n=1, expand=True)[0]
    df = df.drop_duplicates(subset = ["viral_id"], keep = False)
    return df



# First, test the translation table assumption

In the future, if this analysis is extended to any genome where the `transl_table` qualifier is **not** 11 there will need to be extensive code re-factoring, starting with re-annotating the viral genomes to account for this fact

In [9]:
for host_tsv_file in glob.glob('../Data/host_genomes/' + '*.tsv')[:]:
    if '.clean.' in host_tsv_file:
        continue
    print(host_tsv_file)
    df = pd.read_csv(host_tsv_file, sep = "\t", index_col = 0)
    df['transl_table'] = df['qualifiers'].str.split('transl_table=', n=1, expand=True)[1]\
                        .str.split(';', n=1, expand=True)[0]
    print(df.shape[0], list(df['transl_table'].value_counts().items()))

../Data/host_genomes/28450.tsv
5727 [('11', 5727)]
../Data/host_genomes/1590.tsv
3013 [('11', 3013)]
../Data/host_genomes/562.tsv
4379 [('11', 4379)]
../Data/host_genomes/357276.tsv
4343 [('11', 4343)]
../Data/host_genomes/657318.tsv
3294 [('11', 3294)]
../Data/host_genomes/573.tsv
5316 [('11', 5316)]
../Data/host_genomes/1280.tsv
2767 [('11', 2767)]
../Data/host_genomes/305.tsv
3466 [('11', 3466)]
../Data/host_genomes/435591.tsv
3979 [('11', 3979)]
../Data/host_genomes/470.tsv
4327 [('11', 4327)]
../Data/host_genomes/287.tsv
5573 [('11', 5573)]
../Data/host_genomes/1314.tsv
1693 [('11', 1690)]
../Data/host_genomes/36809.tsv
4920 [('11', 4920)]
../Data/host_genomes/717959.tsv
3110 [('11', 3110)]
../Data/host_genomes/90371.tsv
4447 [('11', 4447)]
../Data/host_genomes/1639.tsv
2867 [('11', 2867)]
../Data/host_genomes/1428.tsv
5117 [('11', 5117)]


# Run cleaning function on all host `.tsv` files

This isn't fast but it's also not optimized for speed. At the moment I am only looking at < 20 host files and this code only needs to be run once so speed is taking a backseat to thoroughness. Ergo, for loops, etc. As it stands, my guess is that this cell takes ~45 mins or so to run

In [10]:
for host_tsv_file in glob.glob('../Data/host_genomes/' + '*.tsv'):
    if '.clean.' in host_tsv_file:
        continue
    print(host_tsv_file)
    ###
    df = pd.read_csv(host_tsv_file, sep = "\t", index_col = 0)
    initial_shape = df.shape
    df = clean_host_tsv(df)
    final_shape = df.shape
    ###
    gene_ratio = final_shape[0]/initial_shape[0]
    print(gene_ratio)
    if gene_ratio <= 0.8: #This basically shouldn't happen
        break
    ###
    clean_tsv_loc = host_tsv_file.replace(".tsv",".clean.tsv")
    df.to_csv(clean_tsv_loc, sep = "\t")

../Data/host_genomes/28450.tsv
0.9776497293521914
../Data/host_genomes/1590.tsv
0.9495519415864587
../Data/host_genomes/562.tsv
0.9271523178807947
../Data/host_genomes/357276.tsv
0.9689154962007829
../Data/host_genomes/657318.tsv
0.9875531268973892
../Data/host_genomes/573.tsv
0.9834462001504891
../Data/host_genomes/1280.tsv
0.9555475243946513
../Data/host_genomes/305.tsv
0.9676860934795153
../Data/host_genomes/435591.tsv
0.9856747926614727
../Data/host_genomes/470.tsv
0.9315923272475156
../Data/host_genomes/287.tsv
0.9971290148932352
../Data/host_genomes/1314.tsv
0.941523922031896
../Data/host_genomes/36809.tsv
0.9902439024390244
../Data/host_genomes/717959.tsv
0.9305466237942123
../Data/host_genomes/90371.tsv
0.947829997751293
../Data/host_genomes/1639.tsv
0.9895361004534357
../Data/host_genomes/1428.tsv
0.9976548759038499


# Run cleaning function on all phage `.tsv` files

Currently printing a lot but realistically there aren't to many files in my current database so it doesn't seem to be a problem to monitor ongoing progress in this manner. This runs fairly quickly, ~30 mins I think.

In [None]:
for virus_folder in glob.glob('../Data/*_rep_viruses/'):
    print('###', virus_folder)
    for virus_tsv_file in glob.glob(virus_folder + '*.tsv'):
        if ".clean." in virus_tsv_file:
            continue
        print(virus_tsv_file)
        ###
        df = pd.read_csv(virus_tsv_file, sep = "\t", index_col = 0)
        initial_shape = df.shape
        df = clean_virus_tsv(df)
        final_shape = df.shape
        ###
        gene_ratio = final_shape[0]/initial_shape[0]
        if gene_ratio <= 0.8:
            print('Strange case', gene_ratio)
            break
        ###
        clean_tsv_loc = virus_tsv_file.replace(".tsv",".clean.tsv")
        df.to_csv(clean_tsv_loc, sep = "\t")

### ../Data/1314_rep_viruses/
../Data/1314_rep_viruses/1022.tsv
../Data/1314_rep_viruses/3755.tsv
../Data/1314_rep_viruses/4398.tsv
../Data/1314_rep_viruses/3568.tsv
../Data/1314_rep_viruses/8429.tsv
../Data/1314_rep_viruses/628.tsv
../Data/1314_rep_viruses/6777.tsv
../Data/1314_rep_viruses/17895.tsv
../Data/1314_rep_viruses/5646.tsv
../Data/1314_rep_viruses/11570.tsv
../Data/1314_rep_viruses/6173.tsv
../Data/1314_rep_viruses/15413.tsv
../Data/1314_rep_viruses/7394.tsv
../Data/1314_rep_viruses/3899.tsv
../Data/1314_rep_viruses/7591.tsv
../Data/1314_rep_viruses/10794.tsv
../Data/1314_rep_viruses/6112.tsv
../Data/1314_rep_viruses/6488.tsv
../Data/1314_rep_viruses/5342.tsv
../Data/1314_rep_viruses/6716.tsv
../Data/1314_rep_viruses/5035.tsv
../Data/1314_rep_viruses/10417.tsv
../Data/1314_rep_viruses/13644.tsv
../Data/1314_rep_viruses/2597.tsv
../Data/1314_rep_viruses/10603.tsv
../Data/1314_rep_viruses/657.tsv
../Data/1314_rep_viruses/6321.tsv
../Data/1314_rep_viruses/7759.tsv
../Data/1314_

../Data/562_rep_viruses/2210.tsv
../Data/562_rep_viruses/4113.tsv
../Data/562_rep_viruses/7340.tsv
../Data/562_rep_viruses/6274.tsv
../Data/562_rep_viruses/7618.tsv
../Data/562_rep_viruses/1255.tsv
../Data/562_rep_viruses/12391.tsv
../Data/562_rep_viruses/5553.tsv
../Data/562_rep_viruses/3650.tsv
../Data/562_rep_viruses/504.tsv
../Data/562_rep_viruses/15763.tsv
../Data/562_rep_viruses/16295.tsv
../Data/562_rep_viruses/1872.tsv
../Data/562_rep_viruses/3081.tsv
../Data/562_rep_viruses/6462.tsv
../Data/562_rep_viruses/3240.tsv
../Data/562_rep_viruses/7973.tsv
../Data/562_rep_viruses/11920.tsv
../Data/562_rep_viruses/2822.tsv
../Data/562_rep_viruses/667.tsv
../Data/562_rep_viruses/4248.tsv
../Data/562_rep_viruses/5142.tsv
../Data/562_rep_viruses/8466.tsv
../Data/562_rep_viruses/5397.tsv
../Data/562_rep_viruses/1683.tsv
../Data/562_rep_viruses/2413.tsv
../Data/562_rep_viruses/1085.tsv
../Data/562_rep_viruses/12192.tsv
../Data/562_rep_viruses/9397.tsv
../Data/562_rep_viruses/7437.tsv
../Data

../Data/470_rep_viruses/3800.tsv
../Data/470_rep_viruses/11758.tsv
../Data/470_rep_viruses/17114.tsv
../Data/470_rep_viruses/5490.tsv
../Data/470_rep_viruses/7046.tsv
../Data/470_rep_viruses/7052.tsv
../Data/470_rep_viruses/612.tsv
../Data/470_rep_viruses/7053.tsv
../Data/470_rep_viruses/2513.tsv
../Data/470_rep_viruses/767.tsv
../Data/470_rep_viruses/1224.tsv
../Data/470_rep_viruses/10242.tsv
../Data/470_rep_viruses/9145.tsv
../Data/470_rep_viruses/15573.tsv
../Data/470_rep_viruses/5145.tsv
../Data/470_rep_viruses/4266.tsv
../Data/470_rep_viruses/9568.tsv
../Data/470_rep_viruses/5959.tsv
../Data/470_rep_viruses/2005.tsv
../Data/470_rep_viruses/18652.tsv
../Data/470_rep_viruses/11071.tsv
../Data/470_rep_viruses/10596.tsv
../Data/470_rep_viruses/706.tsv
../Data/470_rep_viruses/6106.tsv
../Data/470_rep_viruses/3917.tsv
../Data/470_rep_viruses/5625.tsv
../Data/470_rep_viruses/6885.tsv
../Data/470_rep_viruses/17607.tsv
../Data/470_rep_viruses/12179.tsv
../Data/470_rep_viruses/3137.tsv
../D

../Data/657318_rep_viruses/11858.tsv
../Data/657318_rep_viruses/18459.tsv
../Data/657318_rep_viruses/15169.tsv
../Data/657318_rep_viruses/10824.tsv
../Data/657318_rep_viruses/11905.tsv
../Data/657318_rep_viruses/12968.tsv
../Data/657318_rep_viruses/11134.tsv
../Data/657318_rep_viruses/14500.tsv
../Data/657318_rep_viruses/10410.tsv
../Data/657318_rep_viruses/19408.tsv
../Data/657318_rep_viruses/17752.tsv
../Data/657318_rep_viruses/15179.tsv
../Data/657318_rep_viruses/3470.tsv
../Data/657318_rep_viruses/7372.tsv
../Data/657318_rep_viruses/12749.tsv
../Data/657318_rep_viruses/11320.tsv
../Data/657318_rep_viruses/9410.tsv
../Data/657318_rep_viruses/14728.tsv
../Data/657318_rep_viruses/16164.tsv
../Data/657318_rep_viruses/11227.tsv
../Data/657318_rep_viruses/17454.tsv
../Data/657318_rep_viruses/12289.tsv
../Data/657318_rep_viruses/13380.tsv
../Data/657318_rep_viruses/12920.tsv
../Data/657318_rep_viruses/10477.tsv
../Data/657318_rep_viruses/7712.tsv
../Data/657318_rep_viruses/12062.tsv
../Da