In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import glob
import sys
sys.path.append('/Users/ChaseWeaver/Desktop/Projects/iCUB-master/') 
import iCUB

# Cleaning Host Data

In [3]:
def clean_host_tsv(df):
    """
    Filter a host gene table down to the rows usable for downstream analysis.

    Steps, applied in order:

    1. Extract ``locus_tag`` from ``qualifiers`` (the text between
       ``locus_tag=`` and the next ``;``) and drop *every* row whose tag
       occurs more than once (``keep=False`` keeps none of the copies).
    2. Keep rows where ``(stop - start) % 3 == 2``.
    3. Drop rows whose ``qualifiers`` contain ``phage``; rows with missing
       ``qualifiers`` are dropped as well.
    4. Add an ``iCUB`` column holding
       ``iCUB.iCUB_Calculator(seq).get_iCUB()`` for every string
       ``coding_sequence`` whose length is a multiple of 3, NaN otherwise.
    5. Drop rows with a missing ``iCUB`` or ``energy_binding``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``qualifiers``, ``start``, ``stop``,
        ``coding_sequence`` and ``energy_binding``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the original columns
        plus ``locus_tag`` and ``iCUB``. Index labels are the positional row
        numbers of the input; they are not renumbered after filtering.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    filter_word = 'phage'

    # Work on a fresh, positionally indexed copy so the caller's frame is untouched.
    df = df.reset_index(drop=True)

    # Ensure each locus tag is only used once. str.extract yields NaN when the
    # tag is absent instead of failing when no row carries one at all.
    df['locus_tag'] = df['qualifiers'].str.extract(r'locus_tag=([^;]*)', expand=False)
    df = df.drop_duplicates(subset=["locus_tag"], keep=False)

    # Keep only genes whose total length is divisible by 3.
    # NOTE(review): the "== 2" presumably reflects inclusive coordinates
    # (length == stop - start + 1) -- confirm against the TSV producer.
    df = df[(df['stop'] - df['start']) % 3 == 2]

    # Filter out prophage genes. na=True treats missing qualifiers as a hit,
    # so those rows are removed too; .copy() makes the column write below safe.
    has_filter_word = df['qualifiers'].str.contains(filter_word, regex=False, na=True).astype(bool)
    df = df[~has_filter_word].copy()

    # Compute iCUB per gene. The column is always created, so the null filter
    # below works even when no sequence qualifies or the frame is empty.
    # NOTE(review): unlike clean_virus_tsv, sequences containing characters
    # other than A/C/G/T are not skipped here -- confirm that is intended.
    icub_values = []
    for nt_seq in df['coding_sequence']:
        if isinstance(nt_seq, str) and len(nt_seq) % 3 == 0:
            icub_values.append(iCUB.iCUB_Calculator(nt_seq).get_iCUB())
        else:
            icub_values.append(float('nan'))
    df['iCUB'] = icub_values

    df = df[df['iCUB'].notna()]
    df = df[df['energy_binding'].notna()]

    return df

# Cleaning Virus Data

In [3]:
def clean_virus_tsv(df):
    """
    Filter a viral gene table down to the rows usable for downstream analysis.

    Steps, applied in order:

    1. Extract ``viral_id`` from ``qualifiers`` (the text between the first
       ``ID=`` and the next ``;``) and drop *every* row whose id occurs more
       than once (``keep=False`` keeps none of the copies).
    2. Keep rows where ``(stop - start) % 3 == 2``.
    3. Add an ``iCUB`` column holding
       ``iCUB.iCUB_Calculator(seq).get_iCUB()`` for every string
       ``coding_sequence`` that consists solely of the upper-case characters
       A, C, G, T and whose length is a multiple of 3; NaN otherwise.
    4. Drop rows with a missing ``iCUB`` or ``energy_binding``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``qualifiers``, ``start``, ``stop``,
        ``coding_sequence`` and ``energy_binding``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the original columns
        plus ``viral_id`` and ``iCUB``. Index labels are the positional row
        numbers of the input; they are not renumbered after filtering.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    valid_bases = set('ACGT')

    # Work on a fresh, positionally indexed copy so the caller's frame is untouched.
    df = df.reset_index(drop=True)

    # Ensure each viral id is only used once. str.extract yields NaN when the
    # id is absent instead of failing when no row carries one at all.
    df['viral_id'] = df['qualifiers'].str.extract(r'ID=([^;]*)', expand=False)
    df = df.drop_duplicates(subset=["viral_id"], keep=False)

    # Keep only genes whose total length is divisible by 3.
    # NOTE(review): the "== 2" presumably reflects inclusive coordinates
    # (length == stop - start + 1) -- confirm against the TSV producer.
    # .copy() makes the column write below safe on the filtered frame.
    df = df[(df['stop'] - df['start']) % 3 == 2].copy()

    # Compute iCUB per gene. The column is always created, so the null filter
    # below works even when no sequence qualifies or the frame is empty.
    icub_values = []
    for nt_seq in df['coding_sequence']:
        usable = (
            isinstance(nt_seq, str)
            and set(nt_seq) <= valid_bases      # skip ambiguous / non-ACGT characters
            and len(nt_seq) % 3 == 0            # skip partial codons
        )
        if usable:
            icub_values.append(iCUB.iCUB_Calculator(nt_seq).get_iCUB())
        else:
            icub_values.append(float('nan'))
    df['iCUB'] = icub_values

    df = df[df['iCUB'].notna()]
    df = df[df['energy_binding'].notna()]
    return df

In [5]:
# Clean every raw per-virus TSV in the selected "*_rep_viruses" folder and
# write the result next to it as "<name>.clean.tsv".
selected_folder_tag = '90371'   # only folders whose path contains this tag are processed
                                # (presumably a host taxon ID -- confirm)
min_kept_fraction = .8          # processing of a folder stops at the first file that
                                # retains this fraction of its genes or less
raw_suffix = '.tsv'
clean_suffix = '.clean.tsv'

for virus_folder in glob.glob('../Data/*_rep_viruses/'):
    if selected_folder_tag not in virus_folder:
        continue
    print(virus_folder)
    for virus_tsv_file in glob.glob(virus_folder + '*' + raw_suffix):
        print(virus_tsv_file)

        # "*.tsv" also matches the "*.clean.tsv" files written by an earlier run.
        if virus_tsv_file.endswith(clean_suffix):
            continue

        df = pd.read_csv(virus_tsv_file, sep="\t", index_col=0)

        initial_shape = df.shape
        if initial_shape[0] == 0:
            # Nothing to clean, and the retention ratio below would divide by zero.
            print('No genes in {}; skipping'.format(virus_tsv_file))
            continue

        df = clean_virus_tsv(df)
        final_shape = df.shape
        gene_ratio = final_shape[0] / initial_shape[0]
        if gene_ratio <= min_kept_fraction:
            # Too many genes were filtered out: report the file and abandon the
            # remaining files in this folder without writing a cleaned copy.
            print('Only {:.0%} of genes kept in {}'.format(gene_ratio, virus_tsv_file))
            break

        # Swap only the trailing ".tsv" so other parts of the path are never altered.
        clean_tsv_loc = virus_tsv_file[:-len(raw_suffix)] + clean_suffix
        df.to_csv(clean_tsv_loc, sep="\t")

../Data/90371_rep_viruses/
../Data/90371_rep_viruses/10087.tsv
../Data/90371_rep_viruses/18747.tsv
../Data/90371_rep_viruses/17138.tsv
../Data/90371_rep_viruses/760.tsv
../Data/90371_rep_viruses/995.tsv
../Data/90371_rep_viruses/5524.tsv
../Data/90371_rep_viruses/1622.tsv
../Data/90371_rep_viruses/10069.tsv
../Data/90371_rep_viruses/14297.tsv
../Data/90371_rep_viruses/4933.tsv
../Data/90371_rep_viruses/3085.tsv
../Data/90371_rep_viruses/9036.tsv
../Data/90371_rep_viruses/5618.tsv
../Data/90371_rep_viruses/6846.tsv
../Data/90371_rep_viruses/6271.tsv
../Data/90371_rep_viruses/4466.tsv
../Data/90371_rep_viruses/15006.tsv
../Data/90371_rep_viruses/894.tsv
../Data/90371_rep_viruses/13728.tsv
../Data/90371_rep_viruses/3111.tsv
../Data/90371_rep_viruses/1060.tsv
../Data/90371_rep_viruses/7388.tsv
../Data/90371_rep_viruses/1061.tsv
../Data/90371_rep_viruses/13929.tsv
../Data/90371_rep_viruses/9826.tsv
../Data/90371_rep_viruses/5417.tsv
../Data/90371_rep_viruses/12955.tsv
../Data/90371_rep_viru