In [4]:
import pandas as pd
import os
import glob
import numpy as np
import statistics as stats
import scipy
import src.mpra_tools.predicted_occupancy as po
import src.mpra_tools.fasta_utils as fu
import math
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import random
from sklearn.metrics import f1_score
from src.grammar.sentences import po_sentences

In [9]:
activity_df = pd.read_csv("Data/activity.csv", index_col=0)
retinopathy_df = pd.read_csv("Data/retinopathy.csv", index_col=0)
test_labels = set(activity_df[activity_df['test_set'] | activity_df['cnn_validation_set']].index)
train_labels = set(activity_df[~activity_df.index.isin(test_labels)].index)
L = 164
print(len(activity_df), "samples")

118364 samples


In [3]:
# fimo_df = pd.concat([pd.read_csv('Data/Motifs/fimo_eLife_activity/fimo.tsv', sep='\t'),pd.read_csv('Data/Motifs/fimo_eLife_retinopathy/fimo.tsv', sep='\t')], ignore_index=True)
# del fimo_df['motif_alt_id']
# fimo_df.dropna(inplace=True)
# fimo_df['motif_id'] = fimo_df['motif_id'].map(lambda x: x.split('_')[0])
# fimo_df = fimo_df.astype({'start':int, 'stop':int})
# print(len(fimo_df),"motifs")
# list(set(fimo_df['motif_id']))

In [5]:
po_df = pd.concat([pd.read_parquet("Data/Motifs/summarized_motifs.parquet"),pd.read_parquet("Data/Motifs/retinopathy_motifs.parquet")])
print(len(po_df), "motifs")
set(po_df['motif'])

856599 motifs


{'CRX', 'GFI1', 'MAZ', 'MEF2D', 'NDF1', 'NRL', 'RAX', 'RORB'}

### Transform Each Sequence into a sentence

In [15]:
k=5
divs=4

activity_df['sentence'] = po_sentences(activity_df.index.to_list(), "Data/Motifs/summarized_motifs.parquet", k=k, divs=divs)
retinopathy_df['sentence'] = po_sentences(retinopathy_df.index.to_list(), "Data/Motifs/retinopathy_motifs.parquet", k=k, divs=divs)




118364 sentences to create
	 29591 / 118364 sentences created
	 59182 / 118364 sentences created
	 88773 / 118364 sentences created
All sentences created
1723 sentences to create
	 430 / 1723 sentences created
	 860 / 1723 sentences created
	 1290 / 1723 sentences created
	 1720 / 1723 sentences created
All sentences created


In [16]:
activity_df.columns

Index(['expression', 'expression_std', 'expression_reps', 'mu', 'sigma',
       'pval', 'qval', 'expression_log2', 'library', 'sequence',
       'standard_seq', 'scrambled', 'immediate_precursor', 'original_seq',
       'activity_bin', 'rational_mutagenesis', 'ic_scan', 'original_genomic',
       'mut_all_crx', 'mut_shape', 'entropy_sampling', 'margin_sampling',
       'random_sampling', 'high_conf_pilot', 'high_conf_cnn', 'l9_controls',
       'l9_repeat_l8', 'test_set', 'derived_from_test_set_seq',
       'cnn_validation_set', 'data_batch_name', 'svm_train', 'cnn_train',
       'sentence'],
      dtype='object')

In [17]:
# FIMO motifs
# # main dataset
# labels = fimo_df.groupby(by="sequence_name")
# sentences = dict()
# IVAL = 5.0

# for name, df in labels:
#     sdf = df.sort_values(by="start")
#     i = 1
#     s = []
#     for index, row in sdf.iterrows():
#         d = row['start']-i
#         if d > 0:
#             xIVALmer = math.ceil(d/IVAL)
#             s.append(xIVALmer)
#         s.append(row['motif_id']+row['strand'])
#         i = row['stop']+1
#     if i < L:
#         xIVALmer = math.ceil((L-i)/IVAL)
#         s.append(xIVALmer)
#     sentences[name] = s


### Get word counts for each document class

In [23]:
classes = ['Silencer','Inactive','WeakEnhancer','StrongEnhancer']

In [24]:
#Total word counts for each class

class_counts = dict()
priors = dict()
total = 0

for c, sentences in activity_df[activity_df.index.isin(train_labels)].groupby('activity_bin')['sentence']:
    word_counter = Counter()
    [word_counter.update(s.split()) for s in sentences.to_list()]
    class_counts[c] = word_counter
    priors[c] = len(sentences)
    total += len(sentences)
    
class_priors = dict([(c, v / total) for c,v in priors.items()])


In [28]:


#If no priors are passed then set all class priors to zero = log(1)
class_priors = pd.Series(np.zeros(len(classes)), index = classes)


#Determine the entire vocabulary seen or to be seen
all = activity_df['sentence'].to_list()+retinopathy_df['sentence'].to_list()
alphabet = set([str(s).split() for s in all])
V = len(alphabet)

#Convert raw counts to log probs use smoothing by adding 1 to each word in alphabet for each class
[class_counts[c].update(alphabet) for c in classes]
class_word_logs = dict([
    (c,
    dict([
        (word,
        math.log(count / class_counts[c].total()))
        for word,count in class_counts[c].items()
    ]))
    for c in classes
])    
    
    

TypeError: unhashable type: 'list'

### Get class probabilities for all docs in the testing set

In [89]:
preds_test = dict()
truths_test = dict()

test_labels = test_labels[test_labels.isin(sentences.keys())]
test_bins = activity_df.loc[test_labels]['activity_bin']

for i in test_labels:
    #Inititial probabilities for each doc P(c_i)
    probs = dict([(b,p_c[b]) for b in bins])
    for b in bins:
        for w in sentences[i]:
            # Probability that a word appears in the doc.  Log transform means we can add
            probs[b] = probs[b] * (counts[b][w]+1)/(counts[b].total()+V)
    norm = sum(probs.values())
    preds_test[i] = np.array(list(probs.values())) / norm
    truths_test[i] = bins.index(test_bins.loc[i])





In [90]:
preds_ret = dict()
truths_ret = dict()

ret_labels = retinopathy_df[retinopathy_df.index.isin(sentences.keys())].index
ret_bins = retinopathy_df.loc[ret_labels]['activity_bin']

for i in ret_labels:
    #Inititial probabilities for each doc P(c_i)
    probs = dict([(b,p_c[b]) for b in bins])
    for b in bins:
        for w in sentences[i]:
            # Probability that a word appears in the doc.  Log transform means we can add
            probs[b] = probs[b] + math.log((counts[b][w]+1)/(counts[b].total()+V))
    #norm = sum(probs.values())
    preds_ret[i] = np.array(list(probs.values())) #/ norm
    truths_ret[i] = bins.index(ret_bins.loc[i])




In [91]:
averages = ['micro', 'macro','weighted']

print("retinopathy test")
t = list(truths_ret.values())
p = [a.argmax() for a in preds_ret.values()]

correct = 0
for truth, pred in zip(t,p):
    if truth == pred:
        correct += 1
        
correct = correct/len(t)


f1_ret = [f1_score(t,p, labels=range(len(bins)), average=a) for a in averages] + [correct]

retinopathy test


In [92]:

t = list(truths_test.values())
p = [a.argmax() for a in preds_test.values()]

correct = 0
for truth, pred in zip(t,p):
    if truth == pred:
        correct += 1
        
correct = correct/len(t)

f1_test = [f1_score(t,p, labels=range(len(bins)), average=a) for a in averages] + [correct]


In [93]:
bayes_preformance = pd.DataFrame(
    data = [f1_ret,f1_test],
    columns=averages+['correct'],
    index=['Retinopathy Set', 'Test Set']
)

In [5]:
Clf_performance = pd.read_csv("Data/cnn_dataset_performance_metrics.txt", sep ='\t', index_col=[0,1])
clf_test = Clf_performance[Clf_performance['test_set']=='test_set'].copy()
clf_ret = Clf_performance[Clf_performance['test_set']=='test_retinopathy'].copy()
clf_test.drop(columns=['test_set','nseqs_train','nseqs_test'],inplace=True)
clf_ret.drop(columns=['test_set','nseqs_train','nseqs_test'],inplace=True)


In [6]:
clf_test

Unnamed: 0_level_0,Unnamed: 1_level_0,micro,macro,weighted,Si,In,WE,SE
dataset,fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GenomicOnly,1,0.340366,0.135456,0.190666,0.0,0.015748,0.508004,0.018072
GenomicOnly,2,0.340366,0.143296,0.200819,0.0,0.028986,0.508696,0.035503
GenomicOnly,3,0.354430,0.134794,0.188082,0.0,0.016260,0.522917,0.000000
GenomicOnly,4,0.338959,0.152656,0.226782,0.0,0.000000,0.489888,0.120735
GenomicOnly,5,0.353024,0.139779,0.201786,0.0,0.000000,0.516949,0.042169
...,...,...,...,...,...,...,...,...
Round3aNoRound2,6,0.412096,0.281765,0.398091,0.0,0.212963,0.469256,0.444840
Round3aNoRound2,7,0.395218,0.280363,0.390207,0.0,0.237918,0.453427,0.430108
Round3aNoRound2,8,0.424754,0.282919,0.409341,0.0,0.188119,0.440433,0.503125
Round3aNoRound2,9,0.406470,0.285823,0.400776,0.0,0.240000,0.435714,0.467577


In [95]:
bayes_preformance

Unnamed: 0,micro,macro,weighted,correct
Retinopathy Set,0.334301,0.288965,0.342043,0.334301
Test Set,0.363368,0.323388,0.371521,0.363368


In [47]:
bayes_preformance #Without Prior IVAL=5

Unnamed: 0,micro,macro,weighted,correct
Retinopathy Set,0.346051,0.306759,0.369134,0.346051
Test Set,0.399625,0.333877,0.413703,0.399625
