In [1]:
%matplotlib inline

In [12]:
import json
import gff3_parsing
# from matplotlib import pyplot as plt
# import pandas as pd
import numpy as np
from scipy import stats
import glob

In [6]:
def analyze_genome(dataframe, energy_dict, gaps = (4,10), expected_len = 20):
    """Annotate each row with the lowest binding energy found upstream.

    For every row the ``upstream_sequence`` string is converted to RNA
    (``T`` -> ``U``).  For each ``gap`` in the inclusive range
    ``gaps[0]..gaps[1]`` the 6-character window that ends ``gap`` characters
    before the end of the string is looked up in ``energy_dict``; the minimum
    of those values is written to the ``energy_binding`` column.

    Rows are skipped (their ``energy_binding`` is left untouched) when the
    sequence is not a string, is not exactly ``expected_len`` characters
    long, or contains anything other than A/C/G/U after conversion.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an ``upstream_sequence`` column.  It is modified in
        place and also returned.
    energy_dict : mapping
        Maps 6-character RNA strings to numeric energies.
    gaps : (int, int)
        Inclusive range of distances between the window and the end of the
        sequence.
    expected_len : int
        Required sequence length; pass the same length the sequences were
        extracted with, otherwise every row is skipped.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If a window is not a key of ``energy_dict``.
    """
    window_len = 6  # length of the slices looked up in energy_dict

    # Create the column up front so it exists even when no row qualifies;
    # otherwise a later dataframe["energy_binding"] raises KeyError.
    if "energy_binding" not in dataframe.columns:
        dataframe["energy_binding"] = float("nan")

    valid_bases = set("ACGU")

    for index in dataframe.index:
        upstream = dataframe.loc[index, "upstream_sequence"]
        if not isinstance(upstream, str):
            # Missing values (e.g. NaN) have no .replace(); skip them.
            continue
        test_string = upstream.replace("T", "U")
        if len(test_string) != expected_len:
            continue
        if not set(test_string) <= valid_bases:
            continue

        energy_list = []
        for gap in range(gaps[0], gaps[1] + 1):
            # Explicit indices instead of test_string[-gap - 6:-gap]: with
            # gap == 0 the negative form degenerates to [-6:0], i.e. "".
            stop = max(len(test_string) - gap, 0)
            start = max(stop - window_len, 0)
            energy_list.append(energy_dict[test_string[start:stop]])

        dataframe.at[index, "energy_binding"] = min(energy_list)
    return dataframe

In [7]:
# Load the energy lookup table used by analyze_genome from its JSON file.
with open('../Data/examples_for_testing/energyRef_CCUCCU_mfe.json', 'r') as infile:
    energy_dict = json.load(infile)

# Build the host table and genome from the E. coli annotation and sequence files.
# NOTE(review): the third argument (20) is presumably the upstream window
# length, matching analyze_genome's default expected_len -- confirm in gff3_parsing.
host_df, host_genome = gff3_parsing.compile_sequences(
    "../Data/examples_for_testing/ecoli.gff3",
    "../Data/examples_for_testing/ecoli.fasta",
    20,
)

# Add the "energy_binding" column using the default gap range and length.
host_df = analyze_genome(host_df, energy_dict)

In [13]:
# Compare the binding energies of every viral genome against the host genome.
mean_difs = []  # per genome: mean viral energy_binding minus mean host energy_binding
p_values = []   # per genome: rank-sum p-value, host vs. virus

# Rows that analyze_genome skipped carry NaN; remove them so they cannot
# distort the ranks (or turn the whole test result into NaN).
host_energies = host_df["energy_binding"].dropna()

# sorted() makes the order of the result lists -- and the "last" viral_df /
# p_value that later cells look at -- independent of directory listing order.
for gff_file in sorted(glob.glob("../Data/ecoli_viruses/562_rep_viruses_concat/*.gff")):
    # Swap only the extension; str.replace("gff", "fasta") would also rewrite
    # any other "gff" occurring inside the file name.
    gff_name = gff_file.split("/")[-1]
    fasta_file = "../Data/ecoli_viruses/562_rep_viruses/" + gff_name.rsplit(".", 1)[0] + ".fasta"

    viral_df, viral_genome = gff3_parsing.compile_sequences(gff_file, fasta_file, 20)
    viral_df = analyze_genome(viral_df, energy_dict)

    # The column is absent when analyze_genome found no usable sequence.
    if "energy_binding" in viral_df.columns:
        viral_energies = viral_df["energy_binding"].dropna()
    else:
        viral_energies = []

    if len(viral_energies) == 0 or len(host_energies) == 0:
        # Nothing to compare: record NaN so both lists stay aligned with the files.
        p_value = np.nan
        mean_dif = np.nan
    else:
        rstat, p_value = stats.ranksums(host_energies, viral_energies)
        mean_dif = viral_energies.mean() - host_energies.mean()

    p_values.append(p_value)
    mean_difs.append(mean_dif)
    

In [16]:
# (number of genomes tested, number with an uncorrected rank-sum p-value below 0.01)
len(p_values), sum(1 for p in p_values if p < 0.01)

(373, 157)

# Information-theory analysis of upstream sequences

In [21]:
def get_info_theory_init(sequence_list, randomize=False, rng=None):
    """Return the per-position information content (bits) of aligned sequences.

    The sequences are stacked into a character matrix (one row per sequence,
    one column per position).  For every column the frequencies of A, T, G
    and C are computed and the result is ``log2(4) - H`` where ``H`` is the
    base-2 Shannon entropy of those frequencies.  ``scipy.stats.entropy``
    normalises the frequencies, so characters other than A/T/G/C are
    effectively ignored; a column containing none of the four bases yields NaN.

    Parameters
    ----------
    sequence_list : iterable of str
        Equal-length DNA sequences.
    randomize : bool
        If True, the characters of every sequence are shuffled independently
        before counting, which destroys the positional alignment.
    rng : object with a ``shuffle`` method, optional
        E.g. ``numpy.random.Generator`` or ``numpy.random.RandomState``.
        When omitted the global ``numpy.random`` state is used, as before.

    Returns
    -------
    numpy.ndarray
        One value per sequence position.

    Raises
    ------
    ValueError
        If ``sequence_list`` is empty or the sequences differ in length.
    """
    sequences = list(sequence_list)
    if not sequences:
        raise ValueError("sequence_list must contain at least one sequence")
    if len({len(seq) for seq in sequences}) != 1:
        raise ValueError("all sequences must have the same length")

    matrix = np.array([list(seq) for seq in sequences])
    if randomize:
        shuffle = np.random.shuffle if rng is None else rng.shuffle
        # Iterating a 2-D array yields row views, so this shuffles in place.
        for row in matrix:
            shuffle(row)

    n_seqs = float(matrix.shape[0])
    # Same base order as before (A, T, G, C); entropy does not depend on it.
    frequencies = [(matrix == base).sum(axis=0) / n_seqs for base in "ATGC"]
    return np.log2(4) - stats.entropy(frequencies, base=2)

def get_delta_I(sequence_list, randomizations=20, rng=None):
    """Return total information and its excess over shuffled sequences.

    Parameters
    ----------
    sequence_list : iterable of str
        Equal-length DNA sequences.
    randomizations : int
        Number of shuffled replicates averaged to form the baseline (>= 1).
    rng : object with a ``shuffle`` method, optional
        Forwarded to ``get_info_theory_init``; pass a seeded generator for
        reproducible results.

    Returns
    -------
    tuple
        ``(I_actual, I_actual - mean(I_shuffled))`` where each ``I`` is the
        sum of the per-position information values.

    Raises
    ------
    ValueError
        If ``randomizations`` is smaller than 1, or the sequences are empty
        or of unequal length.
    """
    if randomizations < 1:
        raise ValueError("randomizations must be at least 1")

    # Materialise once so a one-shot iterable can be reused for every replicate.
    sequences = list(sequence_list)
    I_actual = np.sum(get_info_theory_init(sequences))

    randomized_list = [
        np.sum(get_info_theory_init(sequences, randomize=True, rng=rng))
        for _ in range(randomizations)
    ]
    return I_actual, I_actual - np.mean(randomized_list)

In [22]:
# (total information, excess over the shuffled baseline) for the host upstream sequences
get_delta_I(host_df['upstream_sequence'].tolist())

(2.1997334813704663, 1.3776474156696548)

In [24]:
# Same measure for viral_df, i.e. whichever genome the preceding loop processed last.
get_delta_I(viral_df['upstream_sequence'].tolist())

(2.855252839806277, 0.8252228309332739)

In [25]:
# p_value still holds the rank-sum p-value assigned in the last iteration of the loop that fills p_values.
p_value

0.05440718457820569

In [43]:
# Re-load the energy lookup table (same file as the earlier cell).
with open('../Data/examples_for_testing/energyRef_CCUCCU_mfe.json', 'r') as infile:
    energy_dict = json.load(infile)

# Rebuild the host table, this time passing 30 instead of 20 to compile_sequences.
host_df, host_genome = gff3_parsing.compile_sequences(
    "../Data/examples_for_testing/ecoli.gff3",
    "../Data/examples_for_testing/ecoli.fasta",
    30,
)

# analyze_genome skips every row whose sequence length differs from
# expected_len (default 20), so the 30-nt length has to be passed explicitly.
host_df = analyze_genome(host_df, energy_dict, expected_len=30)

# Keep only full-length sequences, mirroring the filter applied to each viral
# genome, so get_delta_I always receives an aligned set.
host_upstream = host_df.loc[host_df['upstream_sequence'].str.len() == 30, 'upstream_sequence']
host_I, host_Idiff = get_delta_I(list(host_upstream))

In [44]:
# Information content of every viral genome's 30-nt upstream sequences.
# a[k] / b[k] are the two values returned by get_delta_I for the k-th genome:
# total information and its excess over the shuffled baseline.
a, b = [], []

# sorted() keeps the order of a and b independent of directory listing order.
for gff_file in sorted(glob.glob("../Data/ecoli_viruses/562_rep_viruses_concat/*.gff")):
    # Swap only the extension; str.replace("gff", "fasta") would also rewrite
    # any other "gff" occurring inside the file name.
    gff_name = gff_file.split("/")[-1]
    fasta_file = "../Data/ecoli_viruses/562_rep_viruses/" + gff_name.rsplit(".", 1)[0] + ".fasta"

    viral_df, viral_genome = gff3_parsing.compile_sequences(gff_file, fasta_file, 30)
    # expected_len must match the 30-nt sequences, otherwise analyze_genome
    # skips every row (its default is 20).
    viral_df = analyze_genome(viral_df, energy_dict, expected_len=30)
    viral_df = viral_df[viral_df['upstream_sequence'].str.len() == 30]

    if len(viral_df) == 0:
        # No full-length sequence: record NaN so a and b stay aligned with the files.
        temp_a, temp_b = np.nan, np.nan
    else:
        temp_a, temp_b = get_delta_I(list(viral_df['upstream_sequence']))
    a.append(temp_a)
    b.append(temp_b)
    

In [45]:
# (genomes whose total information exceeds the host's, genomes whose excess over shuffled exceeds the host's)
sum(1 for info in a if info > host_I), sum(1 for delta in b if delta > host_Idiff)

(367, 165)

In [35]:
# NOTE(review): this cell's execution count (35) predates In [43]-In [45], so the output shown below likely comes from an earlier, shorter run -- re-run to refresh.
b

[1.1896242714773755,
 1.009867843670647,
 0.7806178579410419,
 0.9676170612992911,
 1.1714052951356086]