In [1]:
# Display matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension so edited modules can be re-imported without
# restarting the kernel.
%load_ext autoreload

# Mode 2: reload modules automatically before running each cell.
%autoreload 2

# Notebook summary

The purpose of this notebook is to create a set of `.tsv` files from various sources for both host and viral genomes. It assumes a project layout in which all input files are located under the relative path `../Data`.

# Imports

In [2]:
import json
import gff3_parsing ###A separate .py library that should be available in the python path
import pandas as pd
import glob

# Custom function definitions

In [3]:
def add_RBS_energy(dataframe, energy_dict, col_name="RBS_energy",\
                   gaps=(4,10), expected_len=30, RBS_len=6):
    """
    Add a ribosome binding site (RBS) energy column to the dataframe, using
    energy values pre-computed and stored in energy_dict.

    For every row, the "upstream_sequence" value is converted to RNA letters
    (T -> U). For each gap size in the inclusive range given by gaps, the
    window of RBS_len bases that ends `gap` bases before the end of the
    sequence is looked up in energy_dict. The minimum of the looked-up values
    is written to col_name for that row.

    A row is skipped (its col_name value is left untouched, NaN if the column
    is new) when its upstream sequence
        * is not a string (e.g. None / NaN),
        * does not have exactly expected_len characters, or
        * contains any character other than A, C, G, U after the T -> U swap.

    Inputs:
        dataframe - pandas DataFrame with an "upstream_sequence" column;
                    it is modified in place
        energy_dict - mapping from window strings (RBS_len characters drawn
                      from A/C/G/U) to comparable energy values
        col_name - name of the column that receives the minimum energy
        gaps - (first, last) gap sizes to test, both inclusive
        expected_len - required length of each upstream sequence
        RBS_len - length of the window looked up in energy_dict

    Outputs:
        dataframe - the same dataframe object, now containing col_name

    Raises:
        KeyError - if a window is not a key of energy_dict
    """
    ###Guarantee the output column exists even when every row ends up skipped,
    ###so that all tables written downstream share the same columns
    if col_name not in dataframe.columns:
        dataframe[col_name] = float("nan")

    valid_bases = set("ACGU")
    for index in dataframe.index:
        upstream = dataframe.loc[index,"upstream_sequence"]
        ###Missing values (None/NaN) cannot be scanned
        if not isinstance(upstream, str):
            continue
        test_string = upstream.replace("T", "U")
        ###Ensure that the sequence is the proper expected length
        if len(test_string) != expected_len:
            continue
        ###Ensure that the sequence has no abnormal bases
        if not set(test_string) <= valid_bases:
            continue

        ###Calculate the energy for the indicated gap offsets
        energy_list = []
        for gap in range(gaps[0], gaps[1]+1):
            ###`-gap or None`: a gap of 0 must slice to the end of the string;
            ###a literal -0 end index would yield an empty window instead
            window = test_string[-gap - RBS_len: -gap or None]
            energy_list.append(energy_dict[window])

        ###An empty gap range gives nothing to minimise; leave the row unset
        if energy_list:
            dataframe.at[index, col_name] = min(energy_list)
    return dataframe

# Notebook-wide parameters 

In [4]:
###Column separator used when writing every output table
sep = "\t"
###Upstream length argument passed to gff3_parsing.compile_sequences
upstream_len = 30

###Load the pre-computed energy lookup that is passed to add_RBS_energy
energy_path = '../Data/energy_files/energyRef_CCUCCU_ensemble_noneConstraint.json'
with open(energy_path, 'r') as energy_handle:
    energy_dict = json.load(energy_handle)

# Creating `.tsv` files for host genomes

In [5]:
###Identifiers of the hosts handled by this cell; each one names a
###gff3/fasta pair inside ../Data/host_genomes
host_ids = [
    36809, 717959, 305, 1590,
    435591, 90371, 1314, 357276,
    657318, 1639, 1428, 470,
    573, 1280, 287, 562,
]

for host_id in host_ids:
    ###Build the dataframe from this host's gff3 and fasta files
    gff3_path = "../Data/host_genomes/{}.gff3".format(host_id)
    fasta_path = "../Data/host_genomes/{}.fasta".format(host_id)
    host_df, host_genome = gff3_parsing.compile_sequences([gff3_path], [fasta_path], upstream_len)

    ###Add both RBS energy columns, one per gap range
    for energy_col, gap_range in (("RBS_energy", (4,10)),
                                  ("RBS_energy_upstream", (11,17))):
        host_df = add_RBS_energy(host_df, energy_dict, col_name=energy_col, gaps=gap_range)

    ###Write the finished table next to the input files
    host_df.to_csv("../Data/host_genomes/{}.tsv".format(host_id), sep=sep)

## Treat the two-chromosome hosts slightly differently to ensure no errors occur

(There is currently only one of these)

In [6]:
###Hosts whose genome is split across two gff3/fasta pairs (suffixes .1 and .2)
host_ids_2chrom = [28450]

for host_id in host_ids_2chrom:
    ###One annotation file and one sequence file per chromosome, in matching order
    gffs = [template.format(host_id) for template in ("../Data/host_genomes/{}.1.gff3",
                                                      "../Data/host_genomes/{}.2.gff3")]
    fastas = [template.format(host_id) for template in ("../Data/host_genomes/{}.1.fasta",
                                                        "../Data/host_genomes/{}.2.fasta")]

    ###Both chromosomes are compiled into a single dataframe
    host_df, host_genome = gff3_parsing.compile_sequences(gffs, fastas, upstream_len)

    ###Add both RBS energy columns, one per gap range
    for energy_col, gap_range in (("RBS_energy", (4,10)),
                                  ("RBS_energy_upstream", (11,17))):
        host_df = add_RBS_energy(host_df, energy_dict, col_name=energy_col, gaps=gap_range)

    ###A single output table per host, regardless of chromosome count
    host_df.to_csv("../Data/host_genomes/{}.tsv".format(host_id), sep=sep)

# Creating `.tsv` files for viral genomes

Note that, somewhat annoyingly, the viral annotation files end in `.gff` whereas the host annotation files end in `.gff3`.

In [7]:
###Concatenate the two existing host lists. Only IDs not already present are
###appended, so re-running this cell does not keep growing `host_ids` (which
###would process the two-chromosome hosts' viruses repeatedly).
host_ids = host_ids + [extra_id for extra_id in host_ids_2chrom if extra_id not in host_ids]

gff_suffix = ".gff"
for host_id in host_ids:
    ###sorted() gives a reproducible processing order; glob's own order is arbitrary
    for gff_file in sorted(glob.glob("../Data/{}_rep_viruses/*.gff".format(host_id))):
        ###Swap only the trailing extension. str.replace would also rewrite any
        ###other ".gff" occurring earlier in the path.
        file_stem = gff_file[:-len(gff_suffix)]
        fasta_file = file_stem + ".fasta"
        tsv_file = file_stem + ".tsv"

        ###Build the dataframe from this virus's gff and fasta files
        viral_df, viral_genome = gff3_parsing.compile_sequences([gff_file], [fasta_file], upstream_len)
        ###Add both RBS energy columns, using the same gap ranges as for the hosts
        viral_df = add_RBS_energy(viral_df, energy_dict, col_name="RBS_energy", gaps=(4,10))
        viral_df = add_RBS_energy(viral_df, energy_dict, col_name="RBS_energy_upstream", gaps=(11,17))
        ###Write the table alongside the source gff file
        viral_df.to_csv(tsv_file, sep=sep)