In [14]:
import pandas as pd
import os
import glob
import numpy as np
import statistics as stats
import scipy
import src.mpra_tools.predicted_occupancy as po
import src.mpra_tools.fasta_utils as fu
import math
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import random
from sklearn.metrics import f1_score
from src.grammar.sentences import *

In [15]:
# Load both tables; the first CSV column becomes the index of each frame.
activity_df = pd.read_csv("Data/activity.csv", index_col=0)
retinopathy_df = pd.read_csv("Data/retinopathy.csv", index_col=0)

# Held-out rows are those flagged in either 'test_set' or 'cnn_validation_set';
# every remaining row of activity_df is used for training.
held_out_mask = activity_df['test_set'] | activity_df['cnn_validation_set']
test_labels = activity_df.loc[held_out_mask].index
train_mask = ~activity_df.index.isin(test_labels)
train_labels = activity_df.loc[train_mask].index

# NOTE(review): L is not used by the cells visible here; presumably a sequence length — confirm.
L = 164
print(len(train_labels), "samples")

114527 samples


### Transform Each Sequence into a sentence

In [17]:
# Settings forwarded unchanged to po_sentences for both tables.
k = 5
divs = 4

# Build one sentence per row from each frame's index, reading the motif file paired with it.
# NOTE(review): po_sentences presumably comes from the star import of src.grammar.sentences — confirm.
for frame, motif_path in (
    (activity_df, "Data/Motifs/summarized_motifs.parquet"),
    (retinopathy_df, "Data/Motifs/retinopathy_motifs.parquet"),
):
    frame['sentence'] = po_sentences(frame.index.to_list(), motif_path, k=k, divs=divs)




118364 sentences to create
	 29591 / 118364 sentences created
	 59182 / 118364 sentences created
	 88773 / 118364 sentences created
All sentences created
1723 sentences to create
	 430 / 1723 sentences created
	 860 / 1723 sentences created
	 1290 / 1723 sentences created
	 1720 / 1723 sentences created
All sentences created


### Get word counts for each document class

In [18]:
# Activity-bin labels handled by the classifier; this order is reused for the
# prior/score Series index and for the `labels=` argument of f1_score.
classes = [
    'Silencer',
    'Inactive',
    'WeakEnhancer',
    'StrongEnhancer',
]

In [19]:
# Per-class word tallies and document counts, computed on the training rows only.

class_counts = {}
priors = {}

training_groups = activity_df.loc[train_labels].groupby('activity_bin')['sentence']
for c, class_sentences in training_groups:
    # Count every whitespace-separated word across all sentences of this class.
    class_counts[c] = Counter(
        word for s in class_sentences.to_list() for word in s.split()
    )
    # Number of training sentences (documents) in this class.
    priors[c] = len(class_sentences)

# Total number of training sentences over all classes.
total = sum(priors.values())

# Fraction of training sentences belonging to each class (plain probabilities, not logs).
class_priors = {c: v / total for c, v in priors.items()}


In [20]:


# Use uniform class priors: a log-prior of 0 (= log(1)) for every class adds nothing
# to the per-class scores. This rebinds class_priors to a Series indexed by class label.
class_priors = pd.Series(np.zeros(len(classes)), index=classes)

# Determine the entire vocabulary seen or to be seen (activity and retinopathy sentences).
# Deliberately not named `all`: that would shadow the builtin all() for the rest of the
# kernel session.
all_sentences = activity_df['sentence'].to_list() + retinopathy_df['sentence'].to_list()

# Create a set with all known words
alphabet = set()
for sentence_text in all_sentences:
    alphabet.update(sentence_text.split())
V = len(alphabet)

# Convert raw counts to log probabilities with add-one smoothing: every word in the
# alphabet gets one extra count in every class, so no word has zero probability.
class_word_logs = {}
for c in classes:
    # Smooth a copy so class_counts keeps the raw counts and re-running this cell
    # does not add another pseudo-count each time. A class with no training
    # sentences falls back to an empty tally instead of raising KeyError.
    smoothed_counts = Counter(class_counts.get(c, ()))
    smoothed_counts.update(alphabet)

    # Total is computed once per class rather than once per word.
    class_total = sum(smoothed_counts.values())
    class_word_logs[c] = {
        word: math.log(count / class_total)
        for word, count in smoothed_counts.items()
    }
    
    

### Get class probabilities for all docs in the testing set

In [21]:
# CNN Mutagenesis test
# For every held-out sentence, add the per-word log-probabilities of each class to that
# class's prior and keep the label with the highest total score.

preds_test = []
# Single .loc[rows, column] lookups avoid materialising the full row subset twice.
truths_test = activity_df.loc[test_labels, 'activity_bin'].to_list()


for sentence in activity_df.loc[test_labels, 'sentence']:
    words = sentence.split()  # tokenise once and reuse for every class
    preds = class_priors.copy()
    for c in classes:
        preds.loc[c] += sum(class_word_logs[c][w] for w in words)
    # idxmax returns the label of the first maximum in the Series.
    preds_test.append(preds.idxmax())
    
    

In [22]:
# Retinopathy Test
# Same scoring as for the test set: per-class prior plus the summed per-word
# log-probabilities; the predicted label is the class with the highest score.

truths_ret = retinopathy_df['activity_bin'].to_list()
preds_ret = []


for sentence in retinopathy_df['sentence']:
    tokens = sentence.split()
    preds = class_priors.copy()
    for c in classes:
        word_log_total = sum(class_word_logs[c][w] for w in tokens)
        preds.loc[c] += word_log_total
    preds_ret.append(preds.idxmax())
    
    

In [23]:
# F1 averaging modes to report; they also become the column names of the results table.
averages = ['micro', 'macro', 'weighted']

# One list of F1 scores (one entry per averaging mode) for each evaluation set,
# computed for the retinopathy set first and the test set second.
f1_ret, f1_test = (
    [f1_score(truths, predicted, labels=classes, average=mode) for mode in averages]
    for truths, predicted in ((truths_ret, preds_ret), (truths_test, preds_test))
)

In [24]:
# Results table: one row per evaluation set, one column per F1 averaging mode.
bayes_preformance = pd.DataFrame.from_dict(
    {'Retinopathy Set': f1_ret, 'Test Set': f1_test},
    orient='index',
    columns=averages,
)
bayes_preformance

Unnamed: 0,micro,macro,weighted
Retinopathy Set,0.413233,0.296562,0.382855
Test Set,0.379984,0.329694,0.375722


In [70]:
# Import standard CLF performance metrics (tab-separated; the first two columns form the index).
Clf_performance = pd.read_csv(
    "Data/cnn_dataset_performance_metrics.txt",
    sep='\t',
    index_col=[0, 1],
)

# For each evaluation split, keep only the F1 columns and take the median within
# each value of the first index level.
evaluated_on = Clf_performance['test_set']
clf_test = Clf_performance.loc[evaluated_on == 'test_set', averages].groupby(level=0).median()
clf_ret = Clf_performance.loc[evaluated_on == 'test_retinopathy', averages].groupby(level=0).median()


In [81]:
# Re-display of the naive-Bayes F1 table; the author's note for this run is "No allostery".
# NOTE(review): the values shown below differ slightly from the earlier display of the same
# variable, so upstream cells were re-run with different settings in between — presumably
# sentences built without allostery; confirm and make that setting explicit in code so a
# Restart & Run All reproduces both tables.
bayes_preformance #No allostery

Unnamed: 0,micro,macro,weighted
Retinopathy Set,0.412652,0.296304,0.382456
Test Set,0.379203,0.328883,0.375029
