In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import glob
import sys
sys.path.append('/Users/ChaseWeaver/Desktop/Projects/iCUB-master/') 
import iCUB

# Cleaning Host Data

In [3]:
def clean_host_tsv(df):
    """
    Filter a host gene table down to the rows usable for downstream analysis.

    Steps, applied in order:

    1. Extract the ``locus_tag=...`` value from ``qualifiers`` (text up to the
       next ``;``) into a new ``locus_tag`` column, then drop *every* row whose
       tag also appears on another row (``keep=False`` removes all copies).
    2. Keep rows where ``(stop - start) % 3 == 2``, i.e. ``stop - start + 1``
       is a multiple of three.
    3. Drop rows whose ``qualifiers`` contain the substring ``phage``
       (case-sensitive) to remove prophage genes.
    4. Compute ``iCUB`` via ``iCUB.iCUB_Calculator`` for each remaining row
       whose ``coding_sequence`` length is a multiple of three.
    5. Drop rows that have no ``iCUB`` value or no ``energy_binding`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``qualifiers``, ``start``, ``stop``,
        ``coding_sequence`` and ``energy_binding``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the caller's frame is not modified) with the added
        ``locus_tag`` and ``iCUB`` columns. It may be empty.
    """
    df = df.reset_index(drop=True)

    # Keep only genes whose locus tag is unambiguous. ``str.extract`` yields
    # NaN for rows without a tag instead of failing when no row has one.
    df['locus_tag'] = df['qualifiers'].str.extract(r'locus_tag=([^;]*)', expand=False)
    df = df.drop_duplicates(subset=['locus_tag'], keep=False)

    # Only genes with a total length divisible by 3 are used.
    df = df[(df['stop'] - df['start']) % 3 == 2]

    # Filter prophage genes out. Rows with missing qualifiers compare unequal
    # to False and are therefore dropped as well (same as before).
    filter_word = 'phage'
    not_phage = df['qualifiers'].str.contains(filter_word) == False
    # Explicit copy so the column writes below act on an independent frame.
    df = df[not_phage].copy()

    # Create the column up front so the final filter works even when no row
    # receives a value (previously this raised KeyError).
    if 'iCUB' not in df.columns:
        df['iCUB'] = float('nan')

    # NOTE(review): unlike clean_virus_tsv, sequences containing characters
    # other than A/T/C/G are not skipped here -- confirm that is intended.
    for index in df.index:
        nt_seq = df.at[index, 'coding_sequence']
        # Missing / non-string sequences get no iCUB and are dropped below.
        if isinstance(nt_seq, str) and len(nt_seq) % 3 == 0:
            df.at[index, 'iCUB'] = iCUB.iCUB_Calculator(nt_seq).get_iCUB()

    df = df[df['iCUB'].notnull()]
    df = df[df['energy_binding'].notnull()]

    return df

# Cleaning Virus Data

In [3]:
def clean_virus_tsv(df):
    """
    Filter a viral gene table down to the rows usable for downstream analysis.

    Steps, applied in order:

    1. Extract the ``ID=...`` value from ``qualifiers`` (text up to the next
       ``;``) into a new ``viral_id`` column, then drop *every* row whose id
       also appears on another row (``keep=False`` removes all copies).
    2. Keep rows where ``(stop - start) % 3 == 2``, i.e. ``stop - start + 1``
       is a multiple of three.
    3. Compute ``iCUB`` via ``iCUB.iCUB_Calculator`` for each remaining row
       whose ``coding_sequence`` consists solely of the upper-case characters
       A, T, C, G and has a length that is a multiple of three.
    4. Drop rows that have no ``iCUB`` value or no ``energy_binding`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``qualifiers``, ``start``, ``stop``,
        ``coding_sequence`` and ``energy_binding``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the caller's frame is not modified) with the added
        ``viral_id`` and ``iCUB`` columns. It may be empty.
    """
    df = df.reset_index(drop=True)

    # Keep only genes whose viral id is unambiguous. ``str.extract`` yields
    # NaN for rows without an id instead of failing when no row has one.
    df['viral_id'] = df['qualifiers'].str.extract(r'ID=([^;]*)', expand=False)
    df = df.drop_duplicates(subset=['viral_id'], keep=False)

    # Only genes with a total length divisible by 3 are used.
    # Explicit copy so the column writes below act on an independent frame.
    df = df[(df['stop'] - df['start']) % 3 == 2].copy()

    # Create the column up front so the final filter works even when no row
    # receives a value (previously this raised KeyError).
    if 'iCUB' not in df.columns:
        df['iCUB'] = float('nan')

    valid_bases = set('ATCG')
    for index in df.index:
        nt_seq = df.at[index, 'coding_sequence']

        # Missing / non-string sequences get no iCUB and are dropped below.
        if not isinstance(nt_seq, str):
            continue

        # Skip sequences containing anything other than A/T/C/G.
        if not set(nt_seq) <= valid_bases:
            continue

        # Skip sequences that are not a whole number of codons.
        if len(nt_seq) % 3 != 0:
            continue

        df.at[index, 'iCUB'] = iCUB.iCUB_Calculator(nt_seq).get_iCUB()

    df = df[df['iCUB'].notnull()]
    df = df[df['energy_binding'].notnull()]
    return df

In [5]:
# Clean every raw per-virus TSV found under ../Data/*_rep_viruses/ and write
# the result next to it as <name>.clean.tsv.
for virus_folder in glob.glob('../Data/*_rep_viruses/'):
    # NOTE(review): folders whose path contains '90371' are skipped; the
    # reason is not recorded in this notebook -- confirm it is still wanted.
    if '90371' in virus_folder:
        continue
    print(virus_folder)
    for virus_tsv_file in glob.glob(virus_folder + '*.tsv'):
        print(virus_tsv_file)

        # Outputs of a previous run match the same glob; never re-clean them.
        if "clean" in virus_tsv_file:
            continue

        df = pd.read_csv(virus_tsv_file, sep = "\t", index_col = 0)

        initial_shape = df.shape
        if initial_shape[0] == 0:
            # Nothing to clean, and the retention ratio below would divide
            # by zero.
            print('empty input, skipped: ' + virus_tsv_file)
            continue

        df = clean_virus_tsv(df)
        final_shape = df.shape

        # Fraction of genes that survived cleaning.
        gene_ratio = final_shape[0]/initial_shape[0]
        if gene_ratio <= .8:
            # Report the offending file (was an undefined name, which raised
            # NameError) and stop processing this folder, as before.
            print(virus_tsv_file)
            break

        clean_tsv_loc = virus_tsv_file.replace(".tsv",".clean.tsv")
        df.to_csv(clean_tsv_loc, sep = "\t")

../Data/1314_rep_viruses/
../Data/1314_rep_viruses/1022.tsv
../Data/1314_rep_viruses/3755.tsv
../Data/1314_rep_viruses/7591.clean.tsv
../Data/1314_rep_viruses/4398.tsv
../Data/1314_rep_viruses/5035.clean.tsv
../Data/1314_rep_viruses/3755.clean.tsv
../Data/1314_rep_viruses/8429.clean.tsv
../Data/1314_rep_viruses/1022.clean.tsv
../Data/1314_rep_viruses/3568.tsv
../Data/1314_rep_viruses/8429.tsv
../Data/1314_rep_viruses/628.tsv
../Data/1314_rep_viruses/6777.tsv
../Data/1314_rep_viruses/17895.tsv
../Data/1314_rep_viruses/15413.clean.tsv
../Data/1314_rep_viruses/5646.tsv
../Data/1314_rep_viruses/3568.clean.tsv
../Data/1314_rep_viruses/6579.clean.tsv
../Data/1314_rep_viruses/6777.clean.tsv
../Data/1314_rep_viruses/10921.clean.tsv
../Data/1314_rep_viruses/11570.tsv
../Data/1314_rep_viruses/6173.tsv
../Data/1314_rep_viruses/1738.clean.tsv
../Data/1314_rep_viruses/4398.clean.tsv
../Data/1314_rep_viruses/15421.clean.tsv
../Data/1314_rep_viruses/15413.tsv
../Data/1314_rep_viruses/657.clean.tsv
..

../Data/562_rep_viruses/4785.clean.tsv
../Data/562_rep_viruses/18961.tsv
../Data/562_rep_viruses/7916.tsv
../Data/562_rep_viruses/5461.clean.tsv
../Data/562_rep_viruses/170.tsv
../Data/562_rep_viruses/11993.tsv
../Data/562_rep_viruses/15277.tsv
../Data/562_rep_viruses/3071.clean.tsv
../Data/562_rep_viruses/6462.clean.tsv
../Data/562_rep_viruses/3247.clean.tsv
../Data/562_rep_viruses/1355.tsv
../Data/562_rep_viruses/5077.clean.tsv
../Data/562_rep_viruses/6903.clean.tsv
../Data/562_rep_viruses/1619.tsv
../Data/562_rep_viruses/2462.tsv
../Data/562_rep_viruses/1976.tsv
../Data/562_rep_viruses/206.tsv
../Data/562_rep_viruses/12861.tsv
../Data/562_rep_viruses/2264.tsv
../Data/562_rep_viruses/5852.clean.tsv
../Data/562_rep_viruses/992.tsv
../Data/562_rep_viruses/12529.clean.tsv
../Data/562_rep_viruses/6566.tsv
../Data/562_rep_viruses/2072.tsv
../Data/562_rep_viruses/9899.tsv
../Data/562_rep_viruses/5721.tsv
../Data/562_rep_viruses/5753.clean.tsv
../Data/562_rep_viruses/1223.tsv
../Data/562_re

../Data/562_rep_viruses/3964.clean.tsv
../Data/562_rep_viruses/4517.tsv
../Data/562_rep_viruses/7750.tsv
../Data/562_rep_viruses/5686.clean.tsv
../Data/562_rep_viruses/1137.tsv
../Data/562_rep_viruses/8521.clean.tsv
../Data/562_rep_viruses/11532.clean.tsv
../Data/562_rep_viruses/4852.tsv
../Data/562_rep_viruses/3557.clean.tsv
../Data/562_rep_viruses/2210.tsv
../Data/562_rep_viruses/7973.clean.tsv
../Data/562_rep_viruses/7697.clean.tsv
../Data/562_rep_viruses/4113.tsv
../Data/562_rep_viruses/7340.tsv
../Data/562_rep_viruses/601.clean.tsv
../Data/562_rep_viruses/747.clean.tsv
../Data/562_rep_viruses/7886.clean.tsv
../Data/562_rep_viruses/6274.tsv
../Data/562_rep_viruses/7618.tsv
../Data/562_rep_viruses/1255.tsv
../Data/562_rep_viruses/1856.clean.tsv
../Data/562_rep_viruses/12391.tsv
../Data/562_rep_viruses/7707.clean.tsv
../Data/562_rep_viruses/4312.clean.tsv
../Data/562_rep_viruses/5553.tsv
../Data/562_rep_viruses/3650.tsv
../Data/562_rep_viruses/7916.clean.tsv
../Data/562_rep_viruses/5

../Data/562_rep_viruses/5188.tsv
../Data/562_rep_viruses/609.clean.tsv
../Data/562_rep_viruses/924.clean.tsv
../Data/562_rep_viruses/849.tsv
../Data/562_rep_viruses/16060.tsv
../Data/562_rep_viruses/7990.tsv
../Data/562_rep_viruses/16295.clean.tsv
../Data/562_rep_viruses/2135.clean.tsv
../Data/562_rep_viruses/12010.tsv
../Data/562_rep_viruses/2856.clean.tsv
../Data/562_rep_viruses/2584.clean.tsv
../Data/562_rep_viruses/10658.clean.tsv
../Data/562_rep_viruses/15761.clean.tsv
../Data/562_rep_viruses/2121.clean.tsv
../Data/562_rep_viruses/1587.clean.tsv
../Data/562_rep_viruses/6720.tsv
../Data/562_rep_viruses/667.clean.tsv
../Data/562_rep_viruses/3582.clean.tsv
../Data/562_rep_viruses/337.clean.tsv
../Data/562_rep_viruses/4061.clean.tsv
../Data/562_rep_viruses/10203.tsv
../Data/562_rep_viruses/5946.clean.tsv
../Data/562_rep_viruses/13091.tsv
../Data/562_rep_viruses/631.clean.tsv
../Data/562_rep_viruses/3806.clean.tsv
../Data/562_rep_viruses/12790.clean.tsv
../Data/562_rep_viruses/5096.cle

../Data/562_rep_viruses/6999.tsv
../Data/562_rep_viruses/6999.clean.tsv
../Data/562_rep_viruses/4803.tsv
../Data/562_rep_viruses/11395.tsv
../Data/562_rep_viruses/5328.tsv
../Data/562_rep_viruses/2447.tsv
../Data/562_rep_viruses/8168.clean.tsv
../Data/562_rep_viruses/4546.tsv
../Data/562_rep_viruses/12529.tsv
../Data/562_rep_viruses/6386.tsv
../Data/562_rep_viruses/4591.tsv
../Data/562_rep_viruses/12861.clean.tsv
../Data/562_rep_viruses/17675.clean.tsv
../Data/562_rep_viruses/14846.clean.tsv
../Data/562_rep_viruses/177.clean.tsv
../Data/562_rep_viruses/197.tsv
../Data/562_rep_viruses/3151.clean.tsv
../Data/562_rep_viruses/4771.clean.tsv
../Data/562_rep_viruses/7516.tsv
../Data/562_rep_viruses/9275.tsv
../Data/562_rep_viruses/3758.tsv
../Data/562_rep_viruses/544.tsv
../Data/562_rep_viruses/587.tsv
../Data/562_rep_viruses/3659.clean.tsv
../Data/562_rep_viruses/1695.clean.tsv
../Data/562_rep_viruses/8023.tsv
../Data/562_rep_viruses/12110.tsv
../Data/562_rep_viruses/2413.clean.tsv
../Data/

../Data/435591_rep_viruses/12294.clean.tsv
../Data/435591_rep_viruses/12361.clean.tsv
../Data/435591_rep_viruses/13371.tsv
../Data/435591_rep_viruses/4029.tsv
../Data/435591_rep_viruses/14594.tsv
../Data/435591_rep_viruses/14583.clean.tsv
../Data/435591_rep_viruses/14583.tsv
../Data/435591_rep_viruses/19215.tsv
../Data/435591_rep_viruses/19349.tsv
../Data/435591_rep_viruses/17985.tsv
../Data/435591_rep_viruses/14336.clean.tsv
../Data/435591_rep_viruses/14336.tsv
../Data/435591_rep_viruses/6502.tsv
../Data/435591_rep_viruses/16646.tsv
../Data/435591_rep_viruses/5751.tsv
../Data/435591_rep_viruses/12018.tsv
../Data/435591_rep_viruses/17290.clean.tsv
../Data/435591_rep_viruses/14695.tsv
../Data/435591_rep_viruses/6649.tsv
../Data/435591_rep_viruses/6502.clean.tsv
../Data/435591_rep_viruses/11199.clean.tsv
../Data/435591_rep_viruses/12361.tsv
../Data/435591_rep_viruses/12018.clean.tsv
../Data/435591_rep_viruses/17135.clean.tsv
../Data/435591_rep_viruses/5751.clean.tsv
../Data/435591_rep_vi

../Data/1639_rep_viruses/2271.clean.tsv
../Data/1639_rep_viruses/397.tsv
../Data/1639_rep_viruses/5303.tsv
../Data/1639_rep_viruses/5451.clean.tsv
../Data/1639_rep_viruses/383.tsv
../Data/1639_rep_viruses/6452.clean.tsv
../Data/1639_rep_viruses/8287.clean.tsv
../Data/1639_rep_viruses/6031.tsv
../Data/1639_rep_viruses/3214.clean.tsv
../Data/1639_rep_viruses/5024.clean.tsv
../Data/1639_rep_viruses/8239.clean.tsv
../Data/1639_rep_viruses/3214.tsv
../Data/1639_rep_viruses/1371.tsv
../Data/1639_rep_viruses/5719.clean.tsv
../Data/657318_rep_viruses/
../Data/657318_rep_viruses/8774.tsv
../Data/657318_rep_viruses/10410.clean.tsv
../Data/657318_rep_viruses/15169.clean.tsv
../Data/657318_rep_viruses/12749.clean.tsv
../Data/657318_rep_viruses/5535.tsv
../Data/657318_rep_viruses/13374.tsv
../Data/657318_rep_viruses/11775.tsv
../Data/657318_rep_viruses/11274.clean.tsv
../Data/657318_rep_viruses/12527.tsv
../Data/657318_rep_viruses/12094.tsv
../Data/657318_rep_viruses/6808.tsv
../Data/657318_rep_vir

../Data/1280_rep_viruses/4823.clean.tsv
../Data/1280_rep_viruses/2717.tsv
../Data/1280_rep_viruses/3347.tsv
../Data/1280_rep_viruses/2717.clean.tsv
../Data/1280_rep_viruses/1187.clean.tsv
../Data/1280_rep_viruses/5521.clean.tsv
../Data/1280_rep_viruses/4809.tsv
../Data/1280_rep_viruses/3288.clean.tsv
../Data/1280_rep_viruses/11376.clean.tsv
../Data/1280_rep_viruses/375.tsv
../Data/1280_rep_viruses/1187.tsv
../Data/1280_rep_viruses/7079.tsv
../Data/1280_rep_viruses/5849.tsv
../Data/1280_rep_viruses/4809.clean.tsv
../Data/1280_rep_viruses/16552.tsv
../Data/1280_rep_viruses/7325.clean.tsv
../Data/1280_rep_viruses/16229.clean.tsv
../Data/1280_rep_viruses/7079.clean.tsv
../Data/1280_rep_viruses/5965.clean.tsv
../Data/1280_rep_viruses/5542.clean.tsv
../Data/1280_rep_viruses/5890.clean.tsv
../Data/1280_rep_viruses/338.clean.tsv
../Data/1280_rep_viruses/6389.clean.tsv
../Data/1280_rep_viruses/2175.clean.tsv
../Data/1280_rep_viruses/4823.tsv
../Data/1280_rep_viruses/7325.tsv
../Data/1280_rep_vi

../Data/1280_rep_viruses/4962.tsv
../Data/1280_rep_viruses/381.tsv
../Data/1280_rep_viruses/4037.tsv
../Data/1280_rep_viruses/7882.clean.tsv
../Data/1280_rep_viruses/6026.tsv
../Data/1280_rep_viruses/1006.clean.tsv
../Data/1280_rep_viruses/10195.clean.tsv
../Data/1280_rep_viruses/5972.clean.tsv
../Data/1280_rep_viruses/7977.clean.tsv
../Data/1280_rep_viruses/5143.clean.tsv
../Data/1280_rep_viruses/16299.clean.tsv
../Data/1280_rep_viruses/6218.tsv
../Data/1428_rep_viruses/
../Data/1428_rep_viruses/9495.tsv
../Data/1428_rep_viruses/2649.tsv
../Data/1428_rep_viruses/1816.tsv
../Data/1428_rep_viruses/6917.clean.tsv
../Data/1428_rep_viruses/5549.clean.tsv
../Data/1428_rep_viruses/2437.clean.tsv
../Data/1428_rep_viruses/7696.clean.tsv
../Data/1428_rep_viruses/6968.tsv
../Data/1428_rep_viruses/2071.tsv
../Data/1428_rep_viruses/7135.tsv
../Data/1428_rep_viruses/2071.clean.tsv
../Data/1428_rep_viruses/10917.tsv
../Data/1428_rep_viruses/4580.clean.tsv
../Data/1428_rep_viruses/3512.clean.tsv
../D

../Data/573_rep_viruses/8737.tsv
../Data/573_rep_viruses/6530.tsv
../Data/573_rep_viruses/4331.tsv
../Data/573_rep_viruses/11441.tsv
../Data/573_rep_viruses/4762.clean.tsv
../Data/573_rep_viruses/9205.tsv
../Data/573_rep_viruses/8464.clean.tsv
../Data/573_rep_viruses/3207.clean.tsv
../Data/573_rep_viruses/4046.tsv
../Data/573_rep_viruses/2329.clean.tsv
../Data/573_rep_viruses/13900.tsv
../Data/573_rep_viruses/4575.clean.tsv
../Data/573_rep_viruses/6245.tsv
../Data/573_rep_viruses/4040.clean.tsv
../Data/573_rep_viruses/10148.tsv
../Data/573_rep_viruses/9206.clean.tsv
../Data/573_rep_viruses/2787.clean.tsv
../Data/573_rep_viruses/3180.clean.tsv
../Data/573_rep_viruses/14461.tsv
../Data/573_rep_viruses/5584.clean.tsv
../Data/573_rep_viruses/2787.tsv
../Data/573_rep_viruses/3499.tsv
../Data/573_rep_viruses/5317.clean.tsv
../Data/573_rep_viruses/1503.tsv
../Data/573_rep_viruses/4309.tsv
../Data/573_rep_viruses/13767.clean.tsv
../Data/573_rep_viruses/157.clean.tsv
../Data/573_rep_viruses/577

../Data/287_rep_viruses/680.clean.tsv
../Data/287_rep_viruses/1378.tsv
../Data/287_rep_viruses/6030.clean.tsv
../Data/287_rep_viruses/7566.clean.tsv
../Data/287_rep_viruses/10108.tsv
../Data/287_rep_viruses/5297.tsv
../Data/287_rep_viruses/9863.tsv
../Data/287_rep_viruses/4351.clean.tsv
../Data/287_rep_viruses/5482.clean.tsv
../Data/287_rep_viruses/11113.tsv
../Data/287_rep_viruses/728.tsv
../Data/287_rep_viruses/9753.clean.tsv
../Data/287_rep_viruses/1674.clean.tsv
../Data/287_rep_viruses/4321.clean.tsv
../Data/287_rep_viruses/1109.tsv
../Data/287_rep_viruses/4065.tsv
../Data/287_rep_viruses/1089.clean.tsv
../Data/287_rep_viruses/2616.tsv
../Data/287_rep_viruses/10356.clean.tsv
../Data/287_rep_viruses/5839.tsv
../Data/287_rep_viruses/4299.tsv
../Data/287_rep_viruses/6257.clean.tsv
../Data/287_rep_viruses/4855.clean.tsv
../Data/287_rep_viruses/10191.tsv
../Data/287_rep_viruses/2312.clean.tsv
../Data/287_rep_viruses/3070.clean.tsv
../Data/287_rep_viruses/12433.tsv
../Data/287_rep_viruse

../Data/287_rep_viruses/2646.clean.tsv
../Data/287_rep_viruses/10116.tsv
../Data/287_rep_viruses/787.clean.tsv
../Data/287_rep_viruses/9892.clean.tsv
../Data/287_rep_viruses/1010.tsv
../Data/287_rep_viruses/2008.clean.tsv
../Data/287_rep_viruses/5839.clean.tsv
../Data/287_rep_viruses/6219.tsv
../Data/287_rep_viruses/16451.clean.tsv
../Data/287_rep_viruses/9300.tsv
../Data/287_rep_viruses/10672.tsv
../Data/287_rep_viruses/7265.tsv
../Data/287_rep_viruses/5314.tsv
../Data/287_rep_viruses/800.tsv
../Data/287_rep_viruses/13110.clean.tsv
../Data/287_rep_viruses/9863.clean.tsv
../Data/287_rep_viruses/9922.tsv
../Data/287_rep_viruses/800.clean.tsv
../Data/287_rep_viruses/8463.clean.tsv
../Data/287_rep_viruses/1538.clean.tsv
../Data/287_rep_viruses/2485.tsv
../Data/287_rep_viruses/10418.clean.tsv
../Data/287_rep_viruses/1952.tsv
../Data/287_rep_viruses/6410.clean.tsv
../Data/287_rep_viruses/2763.clean.tsv
