# Dataset Statistics

This notebook plots statistics for our RNA inverse design dataset created using [RNASolo](https://rnasolo.cs.put.poznan.pl).
We visualise the diversity of the dataset in terms of sequence length, number of structures per sequence, and structural variation (pairwise RMSD) among the conformations of each sequence.

In [None]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../')

import os
import random
import argparse
import wandb
import numpy as np
from tqdm import tqdm
from functools import partial
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes, InsetPosition, mark_inset

import torch
import torch_geometric

from MDAnalysis.analysis import rms

from src.data import RNADesignDataset

In [None]:
# Start a wandb run in 'disabled' mode, using the default YAML file as the
# run config, so the project's hyperparameters are reachable via `wandb.config`.
wandb.init(
    project="gRNAde",
    entity="chaitjo",
    config="../configs/default.yaml",
    name="debug",
    mode='disabled',
)
config = wandb.config

# Echo every config entry so the settings used are visible in the notebook
for param, setting in config.items():
    print(f"  {param}: {setting}")

# Set device (GPU/CPU): use the GPU index from the config when CUDA is
# available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:{}".format(config.gpu))
else:
    device = torch.device("cpu")

In [None]:
# Load the pre-processed dataset from ../data/processed.pt. Later cells iterate
# over `data_list` and read the 'seq' and 'coords_list' entries of each item.
# NOTE(review): torch.load unpickles the file, so only load data you trust.
# Recent PyTorch releases default to weights_only=True, which may refuse to
# load non-tensor objects — TODO confirm against the installed torch version.
data_list = torch.load(os.path.join("../data/", "processed.pt"))

In [None]:
# Number of unique sequences
# (assumes `data_list` holds one entry per unique sequence, each entry carrying
# all structures of that sequence in 'coords_list' — TODO confirm against the
# data processing script)
len(data_list)

In [None]:
# Total number of structures: every entry keeps its structures in
# 'coords_list', so add up the lengths of those lists over the whole dataset
total = sum(len(entry['coords_list']) for entry in data_list)
print(total)

In [None]:
# Distribution of sequence length

# Length of every sequence in the dataset (reused by the following cells)
seq_lens = [len(entry['seq']) for entry in data_list]

# Summary statistics
mean_len, std_len = np.mean(seq_lens), np.std(seq_lens)
print(f"Distribution: {mean_len} +- {std_len}")
longest, shortest = np.max(seq_lens), np.min(seq_lens)
print(f"Max: {longest}, Min: {shortest}")

# Histogram over the full range of lengths, drawn on the current axes
ax = plt.gca()
ax.hist(seq_lens, bins=100)

# Labels and title
ax.set_xlabel('Sequence length')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of sequence lengths')

# Render the figure
plt.show()

In [None]:
# (Zoomed) Distribution of sequence length

# Same data as the previous histogram, with finer binning (1000 bins) so that
# the zoomed-in window below still shows detail
ax = plt.gca()
ax.hist(seq_lens, bins=1000)

# Labels and title
ax.set_xlabel('Sequence length')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of sequence lengths')

# Zoom window on the x-axis: only lengths between 0 and 100 are shown.
# Edit these limits to inspect a different range.
ax.set_xlim([0, 100])

# Render the figure
plt.show()

In [None]:
# Histogram of sequence lengths over the full range (main axes) with an inset
# that zooms in on lengths 0-200 using finer bins.
fig, ax1 = plt.subplots()

# Main histogram
ax1.hist(seq_lens, bins=100)

# Labels and title.
# The bold first line is mathtext, where "\ " is an explicit space. It is kept
# in a raw string because "\ " in a normal string literal is an invalid escape
# sequence (DeprecationWarning, and a SyntaxWarning from Python 3.12 onwards).
ax1.set_xlabel('Sequence length')
ax1.set_ylabel('Frequency')
ax1.set_title(
    r"$\bf{Histogram \ of \ sequence \ lengths}$"
    + f"\nDistribution: {np.mean(seq_lens):.1f} ± {np.std(seq_lens):.1f}, "
    + f"Max: {np.max(seq_lens)}, Min: {np.min(seq_lens)}"
)

# Create the inset axes. They are created spanning the whole figure; the
# locator set below then places them relative to ax1.
ax2 = plt.axes([0,0,1,1])
# Manually set the position and relative size of the inset axes within ax1
# NOTE(review): InsetPosition is reported as deprecated in recent matplotlib
# releases in favour of Axes.inset_axes — confirm against the installed version.
ip = InsetPosition(ax1, [0.4,0.4,0.5,0.5])
ax2.set_axes_locator(ip)
# Mark the region corresponding to the inset axes on ax1
# and draw lines in grey linking the two axes.
mark_inset(ax1, ax2, loc1=2, loc2=4, fc="none", ec='0.6')

# Inset histogram: finer bins, restricted to lengths 0-200
ax2.hist(seq_lens, bins=1000)
ax2.set_xlim([0,200])

# Display the plot
# plt.savefig('hist_seq_len.pdf', dpi=300)
plt.show()

In [None]:
# Distribution of number of structures per unique sequence

# Number of structures stored for each sequence (reused by the following cells)
num_struct_per_seq = [len(entry['coords_list']) for entry in data_list]

# Summary statistics
mean_structs, std_structs = np.mean(num_struct_per_seq), np.std(num_struct_per_seq)
print(f"Distribution: {mean_structs} +- {std_structs}")
most_structs, fewest_structs = np.max(num_struct_per_seq), np.min(num_struct_per_seq)
print(f"Max: {most_structs}, Min: {fewest_structs}")

# Histogram over the full range, drawn on the current axes
ax = plt.gca()
ax.hist(num_struct_per_seq, bins=100)

# Labels and title
ax.set_xlabel('Number of structures')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of no. of structures per unique sequence')

# Render the figure
plt.show()

In [None]:
# (Zoomed) Distribution of number of structures per unique sequence

# Same data as the previous histogram, with finer binning (500 bins) so that
# the zoomed-in window below still shows detail
ax = plt.gca()
ax.hist(num_struct_per_seq, bins=500)

# Labels and title
ax.set_xlabel('Number of structures')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of no. of structures per unique sequence')

# Zoom window on the x-axis: only counts between 0 and 20 are shown.
# Edit these limits to inspect a different range.
ax.set_xlim([0, 20])

# Render the figure
plt.show()

In [None]:
# Histogram of the number of structures per sequence over the full range (main
# axes) with an inset that zooms in on counts 0-20 using finer bins.
fig, ax1 = plt.subplots()

# Main histogram
ax1.hist(num_struct_per_seq, bins=100)

# Labels and title.
# The bold first line is mathtext, where "\ " is an explicit space. It is kept
# in a raw string because "\ " in a normal string literal is an invalid escape
# sequence (DeprecationWarning, and a SyntaxWarning from Python 3.12 onwards).
ax1.set_xlabel('Number of structures per sequence')
ax1.set_ylabel('Frequency')
ax1.set_title(
    r"$\bf{Histogram \ of \ no. \ of \ structures \ per \ unique \ sequence}$"
    + f"\nDistribution: {np.mean(num_struct_per_seq):.2f} ± {np.std(num_struct_per_seq):.2f}, "
    + f"Max: {np.max(num_struct_per_seq)}, Min: {np.min(num_struct_per_seq)}"
)

# Create the inset axes. They are created spanning the whole figure; the
# locator set below then places them relative to ax1.
ax2 = plt.axes([0,0,1,1])
# Manually set the position and relative size of the inset axes within ax1
# NOTE(review): InsetPosition is reported as deprecated in recent matplotlib
# releases in favour of Axes.inset_axes — confirm against the installed version.
ip = InsetPosition(ax1, [0.4,0.4,0.5,0.5])
ax2.set_axes_locator(ip)
# Mark the region corresponding to the inset axes on ax1
# and draw lines in grey linking the two axes.
mark_inset(ax1, ax2, loc1=2, loc2=4, fc="none", ec='0.6')

# Inset histogram: finer bins, restricted to 0-20 structures on the x-axis and
# a frequency of at most 1000 on the y-axis
ax2.hist(num_struct_per_seq, bins=400)
ax2.set_xlim([0,20])
ax2.set_ylim([0,1000])

# Display the plot
# plt.savefig('hist_num_struct_per_seq.pdf', dpi=300)
plt.show()

In [None]:
# Raw values behind the structures-per-sequence distribution: for each distinct
# number of structures, how many sequences have exactly that many.
# Only the first and last `num_shown` distinct values are printed, with "..."
# standing in for the omitted middle of the table.
num_shown = 10
# Named `struct_counts` rather than `bin` so the builtin `bin()` is not
# shadowed for the rest of the kernel session.
struct_counts, seq_counts = np.unique(num_struct_per_seq, return_counts=True)
num_distinct = len(struct_counts)
for i, (n_struct, n_seq) in enumerate(zip(struct_counts, seq_counts)):
    # `>=` so that the tail shows `num_shown` rows, matching the head
    if i < num_shown or i >= num_distinct - num_shown:
        print(f"{n_struct} structures -> {n_seq} sequences")
    elif i == num_distinct // 2:
        # Emitted once, at the middle index, which always lies in the omitted
        # range whenever anything is omitted at all
        print("...")

In [None]:
# Frequency of each base in the sequences
# Pyrimidine (C, U)
# Purine (A, G)

# One counter per canonical base, plus a catch-all bucket for anything else
base_counts = dict.fromkeys(['A', 'G', 'C', 'U', 'other'], 0)
for data in tqdm(data_list):
    for base in data['seq']:
        # Symbols without their own counter are tallied under 'other'
        bucket = base if base in base_counts else 'other'
        base_counts[bucket] += 1

# Report the tallies in the order the counters were declared
for base, n_occurrences in base_counts.items():
    print(f"{base}: {n_occurrences}")

In [None]:
# RMSD distribution among structures within each set
#
# For every sequence with more than one structure (and more than one
# nucleotide), compute the RMSD between all unordered pairs of its structures.
# Only one atom per nucleotide is used: index 1 along axis 1 of each coordinate
# array (C4' according to the original author's note).
# Sequences with a single structure, or of length 1, get the placeholder [0.0].

seq_to_rmsds = {}

for data in tqdm(data_list):

    seq, coords_list = data["seq"], data["coords_list"]

    if len(coords_list) > 1 and len(seq) > 1:
        # Compute pairwise RMSD among all pairs
        rmsds = []
        for i in range(len(coords_list)):
            for j in range(i+1, len(coords_list)):
                # To use all atoms instead, pass coords_list[k].reshape(-1, 3)
                c_i, c_j = coords_list[i][:, 1], coords_list[j][:, 1]  # C4' only
                try:
                    rmsds.append(rms.rmsd(
                        c_i, c_j,
                        center=True, superposition=True
                    ))
                except Exception:
                    # Very short sequences, where this fails.
                    # Catch `Exception` rather than using a bare `except`, so
                    # that interrupting this long loop (KeyboardInterrupt) is
                    # not silently swallowed.
                    # Fall back to RMSD after centring, w/out superposition.
                    c_i = c_i - np.mean(c_i, axis=0)
                    c_j = c_j - np.mean(c_j, axis=0)
                    # Sum the squared deviation over x/y/z first and average
                    # over atoms afterwards: RMSD = sqrt(1/N * sum_i |d_i|^2),
                    # the same definition rms.rmsd uses. Averaging over all 3N
                    # coordinate components instead would make the fallback
                    # values smaller by a factor of sqrt(3).
                    sq_dist_per_atom = ((c_i - c_j) ** 2).sum(axis=-1)
                    rmsds.append(np.sqrt(np.mean(sq_dist_per_atom)))
        seq_to_rmsds[seq] = rmsds
    else:
        seq_to_rmsds[seq] = [0.0]

# Optional caching of the result, since the loop above can be slow:
# torch.save(seq_to_rmsds, os.path.join("../data/", "seq_to_rmsds.pt"))
# seq_to_rmsds = torch.load(os.path.join("../data/", "seq_to_rmsds.pt"))

In [None]:
# Distribution of average RMSD per sequence
# Note: Omit avg. RMSD = 0. These are the sequences that were given the
# placeholder [0.0] (a single structure), plus any whose pairwise RMSDs are
# all exactly zero.

rmsd_per_seq = [np.mean(rmsds) for rmsds in seq_to_rmsds.values() if np.mean(rmsds) > 0.0]
print(f"Distribution: {np.mean(rmsd_per_seq)} +- {np.std(rmsd_per_seq)}")
print(f"Max: {np.max(rmsd_per_seq)}, Min: {np.min(rmsd_per_seq)}")

# Plot a histogram
plt.hist(rmsd_per_seq, bins=200)

# Add labels and title.
# The bold first line is mathtext, where "\ " is an explicit space. It is kept
# in a raw string because "\ " in a normal string literal is an invalid escape
# sequence (DeprecationWarning, and a SyntaxWarning from Python 3.12 onwards).
plt.xlabel('Avg. pairwise RMSD among structures per sequence (Å)')
plt.ylabel('Frequency')
plt.title(
    r"$\bf{Histogram \ of \ avg. \ pairwise \ RMSD \ per \ sequence}$"
    + f"\nDistribution: {np.mean(rmsd_per_seq):.2f}Å ± {np.std(rmsd_per_seq):.2f}, "
    + f"Max: {np.max(rmsd_per_seq):.2f}Å, Min: {np.min(rmsd_per_seq):.2f}Å"
)

# Display the plot
# plt.savefig('hist_rmsd_per_sequence.pdf', dpi=300)
plt.show()

In [None]:
# Distribution of average RMSD per sequence, with a zoomed inset on 0-5 Å
# Note: Omit avg. RMSD = 0. These are the sequences that were given the
# placeholder [0.0] (a single structure), plus any whose pairwise RMSDs are
# all exactly zero.

rmsd_per_seq = [np.mean(rmsds) for rmsds in seq_to_rmsds.values() if np.mean(rmsds) > 0.0]
print(f"Distribution: {np.mean(rmsd_per_seq)} +- {np.std(rmsd_per_seq)}")
print(f"Max: {np.max(rmsd_per_seq)}, Min: {np.min(rmsd_per_seq)}")

fig, ax1 = plt.subplots()

# Main histogram over the full range
ax1.hist(rmsd_per_seq, bins=200)

# Labels and title.
# The bold first line is mathtext, where "\ " is an explicit space. It is kept
# in a raw string because "\ " in a normal string literal is an invalid escape
# sequence (DeprecationWarning, and a SyntaxWarning from Python 3.12 onwards).
ax1.set_xlabel('Avg. pairwise RMSD among structures per sequence (Å)')
ax1.set_ylabel('Frequency')
ax1.set_title(
    r"$\bf{Histogram \ of \ avg. \ pairwise \ RMSD \ per \ sequence}$"
    + f"\nDistribution: {np.mean(rmsd_per_seq):.2f}Å ± {np.std(rmsd_per_seq):.2f}, "
    + f"Max: {np.max(rmsd_per_seq):.2f}Å, Min: {np.min(rmsd_per_seq):.2f}Å"
)

# Create the inset axes. They are created spanning the whole figure; the
# locator set below then places them relative to ax1.
ax2 = plt.axes([0,0,1,1])
# Manually set the position and relative size of the inset axes within ax1
# NOTE(review): InsetPosition is reported as deprecated in recent matplotlib
# releases in favour of Axes.inset_axes — confirm against the installed version.
ip = InsetPosition(ax1, [0.4,0.4,0.5,0.5])
ax2.set_axes_locator(ip)
# Mark the region corresponding to the inset axes on ax1
# and draw lines in grey linking the two axes.
mark_inset(ax1, ax2, loc1=2, loc2=4, fc="none", ec='0.6')

# Inset histogram: finer bins, restricted to average RMSDs between 0 and 5
ax2.hist(rmsd_per_seq, bins=400)
ax2.set_xlim([0,5])
# ax2.set_ylim([0,1000])

# Display the plot
# plt.savefig('hist_rmsd_per_sequence.pdf', dpi=300)
plt.show()

In [None]:
# Distribution of maximum RMSD per sequence
# Note: Omit max. RMSD = 0, i.e. sequences whose largest pairwise RMSD is zero
# (this includes the [0.0] placeholder stored for single-structure sequences)

rmsd_per_seq = [
    np.max(pair_rmsds)
    for pair_rmsds in seq_to_rmsds.values()
    if np.max(pair_rmsds) > 0.0
]

# Summary statistics
spread_mean, spread_std = np.mean(rmsd_per_seq), np.std(rmsd_per_seq)
print(f"Distribution: {spread_mean} +- {spread_std}")
largest, smallest = np.max(rmsd_per_seq), np.min(rmsd_per_seq)
print(f"Max: {largest}, Min: {smallest}")

# Histogram on the current axes
ax = plt.gca()
ax.hist(rmsd_per_seq, bins=200)

# Labels and title
ax.set_xlabel('Maximum RMSD among structures per sequence')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of maximum RMSD per sequence')

# Render the figure
plt.show()

In [None]:
# Distribution of minimum RMSD per sequence
# Note: Omit min. RMSD = 0. This drops the [0.0] placeholder stored for
# single-structure sequences, and also any sequence that has at least one pair
# of structures with an RMSD of exactly zero.

rmsd_per_seq = [
    np.min(pair_rmsds)
    for pair_rmsds in seq_to_rmsds.values()
    if np.min(pair_rmsds) > 0.0
]

# Summary statistics
spread_mean, spread_std = np.mean(rmsd_per_seq), np.std(rmsd_per_seq)
print(f"Distribution: {spread_mean} +- {spread_std}")
largest, smallest = np.max(rmsd_per_seq), np.min(rmsd_per_seq)
print(f"Max: {largest}, Min: {smallest}")

# Histogram on the current axes
ax = plt.gca()
ax.hist(rmsd_per_seq, bins=200)

# Labels and title
ax.set_xlabel('Minimum RMSD among structures per sequence')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of minimum RMSD per sequence')

# Render the figure
plt.show()

In [None]:
# Sequence length vs average RMSD distribution - are longer/shorter sequences
# more likely to have structural variation?
# Note: Omit avg. RMSD = 0 (the lower RMSD bound below is exclusive), which
# covers sequences with a single structure

# Modify parameters to change visualisation.
# Both ranges are open intervals: values equal to a bound are excluded.
len_range = (0, 5000)
rmsd_range = (0, 5)

min_len, max_len = len_range
min_rmsd, max_rmsd = rmsd_range

# Paired lists (reused by the seaborn cells below): the length and the average
# pairwise RMSD of every sequence that falls inside both ranges
seq_len, rmsd_per_seq = [], []
for seq, rmsds in seq_to_rmsds.items():
    if not (min_len < len(seq) < max_len):
        continue
    avg_rmsd = np.mean(rmsds)
    if not (min_rmsd < avg_rmsd < max_rmsd):
        continue
    seq_len.append(len(seq))
    rmsd_per_seq.append(avg_rmsd)

# Create a bivariate distribution plot on log-transformed values. Both inputs
# are strictly positive here because the lower bounds above are 0 and exclusive.
log_len = np.log(seq_len)
log_rmsd = np.log(rmsd_per_seq)
plt.hist2d(log_len, log_rmsd, bins=50, cmap='Reds')

# Colour scale for the bin counts
plt.colorbar()

# Labels and title
plt.xlabel('log(Sequence length)')
plt.ylabel('log(Average RMSD among structures)')
plt.title('Bivariate Distribution Plot for sequence length vs. average RMSD')

# Uncomment to change range
# ax = plt.gca()
# ax.set_ylim([0,10])
# ax.set_xlim([1400,1600])

# Render the figure
plt.show()

In [None]:
import pandas as pd
import seaborn as sns

# Tabulate the (sequence length, average RMSD) pairs collected in the previous
# cell so that seaborn can address them by column name
df = pd.DataFrame({'Sequence length': seq_len, 'Average RMSD': rmsd_per_seq})

# Joint 2D histogram with marginal histograms; the x-axis (sequence length) is
# log-scaled while the y-axis (average RMSD) stays linear
sns.jointplot(
    data=df,
    x='Sequence length',
    y='Average RMSD',
    kind='hist',
    log_scale=(True, False),
    marginal_kws={'element': 'step', 'fill': True},
)

# Relabel the current axes (presumably the central joint axes that seaborn
# leaves active — TODO confirm)
ax = plt.gca()
ax.set_xlabel('Sequence length (log scale)')
ax.set_ylabel('Avg. pairwise RMSD among structures (Å)')

# Display the plot
# plt.savefig('bivariate_seq_vs_rmsd.pdf', dpi=300)
plt.show()


In [None]:
# Same sequence-length vs. average-RMSD data as the previous cell, drawn as a
# kernel density estimate (joint and marginal) on linear axes
sns.jointplot(
    data=df,
    x='Sequence length',
    y='Average RMSD',
    kind='kde',
)

# Render the figure
plt.show()
