In [4]:
%load_ext autoreload
%autoreload 2

%pylab inline
import numpy as np
import os
import pandas as pd

import seaborn as sns
sns.set_style('ticks')
sns.set_context('paper')

import eternabench as eb
from RiboGraphViz import RGV

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Populating the interactive namespace from numpy and matplotlib


In [5]:
# Read the full chemical-mapping table from the directory named by the
# ETERNABENCH_PATH environment variable (KeyError if it is not set).
chem_mapping_json = os.environ['ETERNABENCH_PATH'] + '/data/EternaBench_ChemMapping_Full_10Jul2021.json.zip'
df = pd.read_json(chem_mapping_json)

In [6]:
# Per-construct summary columns derived from the raw sequence / reactivity data.

# Fraction of G and C characters in each sequence.
df['GC content'] = [(seq.count('G') + seq.count('C')) / len(seq) for seq in df['sequence']]

# NOTE(review): the fixed offset of 21 presumably discounts a constant region
# shared by every construct -- TODO confirm and name the constant.
df['Length'] = [len(seq) - 21 for seq in df['sequence']]

# The numeric value is the last ':'-separated field. Going through str() keeps
# this cell safe to re-run: once the column already holds floats, the original
# `x.split(':')` raised AttributeError on the second execution.
df['signal_to_noise'] = [float(str(raw).split(':')[-1]) for raw in df['signal_to_noise']]

# Only the 'reactivity' column is needed, so iterate over it directly instead
# of building a full row object per construct with df.apply(axis=1).
df['Max. reactivity'] = [np.max(profile) for profile in df['reactivity']]
df['Median reactivity'] = [np.median(profile) for profile in df['reactivity']]

def _rgv_stats_for_structure(struct):
    """Run RiboGraphViz on one structure string and return its loop counts as a 4-tuple."""
    mdl = RGV(struct)
    mdl.run_structure_properties()
    # Number of integer-labelled graph nodes, minus one, is reported as the total loop count.
    loop_count = sum(isinstance(node, int) for node in mdl.G.nodes) - 1
    n_multiloops = mdl.n_3WJs + mdl.n_4WJs + mdl.n_5WJs_up
    return loop_count, np.clip(mdl.n_hairpins, 0, 100), mdl.n_internal_loops, n_multiloops


def get_RGV_stats(row):
    """Compute loop statistics for the structure stored in ``row['structure']``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'structure'`` (e.g. a DataFrame row) whose value
        is a structure string accepted by ``RGV``.

    Returns
    -------
    tuple
        ``(total loops, hairpins clipped to [0, 100], internal loops,
        3-way + 4-way + 5-way-and-up junctions)``.

    Notes
    -----
    If the first attempt raises, the structure is retried once with the leading
    ``'))'`` of one specific trailing motif replaced by ``'..'``. Any error from
    that second attempt propagates to the caller.
    """
    try:
        return _rgv_stats_for_structure(row['structure'])
    except Exception:
        # `except Exception` (not a bare `except`) so that KeyboardInterrupt /
        # SystemExit still stop a long DataFrame.apply instead of being retried.
        struct = row['structure'].replace('))(((((((....))))))).....................','..(((((((....))))))).....................')
        return _rgv_stats_for_structure(struct)
    
# Analyse each distinct target structure once, then attach the results to
# every construct that shares that structure.
rgv_stat_columns = [
    'Target structure, total loops',
    'Target structure, # hairpins',
    'Target structure, # Internal loops',
    'Target structure, # Multiloops',
]
unique_struct_df = pd.DataFrame({'structure': list(set(df.structure))})
unique_struct_df[rgv_stat_columns] = unique_struct_df.apply(
    get_RGV_stats, axis=1, result_type='expand'
)
# NOTE: `df` is rebound here, so re-running this cell merges the columns a second time.
df = df.merge(unique_struct_df, on='structure')

# Saved relative to the current working directory.
df.to_json('CloudLabMetadata.json.zip')

In [7]:
# Per-(project, package) Pearson scores; the two printed numbers are the row
# counts before and after dropping rows with pearson_std >= 0.05.
scores_csv = os.environ['ETERNABENCH_PATH'] + '/scoring_data/EB_projects_-efold_pearson_zscores_by_project_name.csv'
project_scores = pd.read_csv(scores_csv)
print(len(project_scores))
project_scores = project_scores.loc[project_scores.pearson_std < 0.05]
print(len(project_scores))

pearson_by_project = project_scores.groupby(['project_name'])['pearson_mean']

# Spread of pearson_mean within each project.
project_variance = (
    pearson_by_project.std()
    .rename('Stddev_of_package_correlations')
    .reset_index()
)

# Average of pearson_mean within each project.
project_mean = (
    pearson_by_project.mean()
    .rename('Mean_of_package_correlations')
    .reset_index()
)


21864
13583


In [8]:
def create_project_stats(df):
    """Summarise constructs per (Dataset, project_name) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Dataset``, ``project_name`` and ``sequence``.
        Every numeric column is averaged within each group.

    Returns
    -------
    pandas.DataFrame
        One row per (Dataset, project_name) group, with rows of the ``RYOS_I``
        dataset removed. Columns are the group keys, the group means of the
        numeric columns, ``# constructs`` (group size) and ``Sequence Entropy``
        (NaN for projects whose sequences do not all share one length).
    """
    grouped = df.groupby(['Dataset', 'project_name'])
    # numeric_only=True: string/list columns cannot be averaged, and pandas >= 2.0
    # raises on them instead of silently dropping them.
    project_stats = grouped.mean(numeric_only=True)
    # Attach sizes while both objects still carry the group index, so values are
    # matched by key rather than by row position after reset_index().
    project_stats['# constructs'] = grouped['sequence'].size()
    project_stats = project_stats.reset_index()

    project_stats = project_stats.loc[~project_stats.Dataset.isna()]
    project_stats = project_stats.loc[project_stats.Dataset != 'RYOS_I']

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and was quadratic when used inside a loop.
    entropy_records = []
    for proj_name, proj_seqs in df.groupby('project_name', sort=False)['sequence']:
        seqs = list(proj_seqs)
        # Positional entropy is only computed when every sequence has the same length.
        if len({len(seq) for seq in seqs}) == 1:
            entropy = eb.sequence_analysis.positional_entropy(seqs)
            # Divided by ln(4); presumably normalises for a 4-letter alphabet -- TODO confirm.
            entropy_records.append({'project_name': proj_name, 'Sequence Entropy': entropy / np.log(4)})

    # Explicit columns keep the merge key present even when no project qualified.
    proj_entropies = pd.DataFrame(entropy_records, columns=['project_name', 'Sequence Entropy'])

    project_stats = project_stats.merge(proj_entropies, on='project_name', how='left')
    return project_stats

# Project summaries over every construct.
proj_stats = create_project_stats(df)

# Same summaries restricted to constructs flagged passed_CDHIT_filter, with the
# per-project mean and standard deviation of pearson_mean attached.
cdhit_passed = df.loc[df.passed_CDHIT_filter == True]
proj_stats_filt = (
    create_project_stats(cdhit_passed)
    .merge(project_mean, on='project_name', how='left')
    .merge(project_variance, on='project_name', how='left')
)

In [9]:
# Per-project z-score of the rows whose package is 'contrafold_2'.
# Selecting rows and columns in a single .loc call and renaming yields a new
# frame, so no column is assigned onto a slice of project_scores
# (the original triggered SettingWithCopyWarning).
cfold_zscores = project_scores.loc[
    project_scores.package == 'contrafold_2',
    ['project_name', 'pearson_zscore_by_project_name_mean'],
].rename(columns={'pearson_zscore_by_project_name_mean': 'cfold_zscore'})

# Inner merge: projects without a 'contrafold_2' row are dropped from proj_stats_filt.
# NOTE: proj_stats_filt is rebound, so re-running this cell merges the column again.
proj_stats_filt = proj_stats_filt.merge(cfold_zscores, on='project_name')

In [10]:
# Write both summaries under the directory named by ETERNABENCH_PATH, the same
# root the input files are read from, instead of a path hard-coded to one
# user's home directory.
# NOTE(review): assumes ETERNABENCH_PATH points at the same location the
# original '~/das/github/EternaBench' path did -- confirm.
analysis_dir = os.environ['ETERNABENCH_PATH'] + '/analysis'
proj_stats.to_json(analysis_dir + '/proj_stats.json.zip')
proj_stats_filt.to_json(analysis_dir + '/proj_stats_filt.json.zip')