## Data exploration

In [1]:
# Silence every Python warning for the rest of the session. This is done
# before the remaining imports so that it is already active while they load.
# NOTE(review): a blanket "ignore" also hides deprecation/dtype warnings from
# pandas; presumably it is only meant to quiet import-time noise — confirm.
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import pandas as pd

# Project-local loader for the competition files.
from eda import DataLoader
from IPython.display import display

# Parent of the current working directory.
# NOTE(review): assumes the kernel is started from a directory exactly one
# level below the project root — TODO confirm.
project_root_path = Path.cwd().parent

2025-03-28 16:13:01.140101: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-28 16:13:01.140608: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-28 16:13:01.142670: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-28 16:13:01.147547: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743153181.155756  191849 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743153181.15

In [2]:

# Point the loader at the competition data directory under the project root.
data_loader = DataLoader(project_root_path / "kaggle/input/stanford-rna-3d-folding")
# NOTE(review): presumably reads the competition files into memory — the
# implementation lives in the project-local `eda` module; confirm there.
data_loader.load_data()
# Judging by how it is used in later cells, `data` is a mapping of
# file name -> table exposing `.shape`.
all_data = data_loader.data

In [3]:
# Quick sanity check: one line per loaded table with its name and shape.
for key, df in all_data.items():
    print(f"File name: {key}, shape: {df.shape}")

File name: sample_submission, shape: (2515, 18)
File name: validation_labels, shape: (2515, 123)
File name: train_labels, shape: (137095, 6)
File name: validation_sequences, shape: (12, 5)
File name: test_sequences, shape: (12, 5)
File name: train_sequences, shape: (844, 5)


## 1SCL_A - `target_id`

- from fasta file
```txt
>query
GGGUGCUCAGUACGAGAGGAACCGCACCC
>430D_A_1_55_f/1-29
GGGUGCUCAGUACGAGAGGAACCGCAGCC
>URS00021257C0_2904_2990_f/30-58
```

- sequence: `GGGUGCUCAGUACGAGAGGAACCGCACCC`
- target_id: 1SCL_A


## train



- train_sequences.csv: 
  - target_id: arbitrary identifier, formatted as `pdb_id_chain_id`
    - 844 unique
    - `pdb_id`: id of the entry in the Protein Data Bank
    - `chain_id`: chain id of the monomer in the pdb file
  - sequence: RNA sequence
    - 784 unique
    - Some sequences (45) appear more than once, under different `target_id`s. Open question: how can different targets share the same sequence?
  - 5 rows have a null `all_sequences`

### train_sequences

In [4]:
df_s = all_data['train_sequences']

# Show the first rows, the summary statistics and the per-column null count,
# each preceded by a banner naming the view.
for section_title, section_view in (
    ("=== train_sequences.head() ===", df_s.head()),
    ("=== train_sequences.describe() ===", df_s.describe()),
    ("=== train_sequences.isna().sum() ===", df_s.isna().sum()),
):
    print(section_title)
    display(section_view)

=== train_sequences.head() ===


Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences
0,1SCL_A,GGGUGCUCAGUACGAGAGGAACCGCACCC,1995-01-26,"THE SARCIN-RICIN LOOP, A MODULAR RNA",>1SCL_1|Chain A|RNA SARCIN-RICIN LOOP|Rattus n...
1,1RNK_A,GGCGCAGUGGGCUAGCGCCACUCAAAAGGCCCAU,1995-02-27,THE STRUCTURE OF AN RNA PSEUDOKNOT THAT CAUSES...,>1RNK_1|Chain A|RNA PSEUDOKNOT|null\nGGCGCAGUG...
2,1RHT_A,GGGACUGACGAUCACGCAGUCUAU,1995-06-03,24-MER RNA HAIRPIN COAT PROTEIN BINDING SITE F...,>1RHT_1|Chain A|RNA (5'-R(P*GP*GP*GP*AP*CP*UP*...
3,1HLX_A,GGGAUAACUUCGGUUGUCCC,1995-09-15,P1 HELIX NUCLEIC ACIDS (DNA/RNA) RIBONUCLEIC ACID,>1HLX_1|Chain A|RNA (5'-R(*GP*GP*GP*AP*UP*AP*A...
4,1HMH_E,GGCGACCCUGAUGAGGCCGAAAGGCCGAAACCGU,1995-12-07,THREE-DIMENSIONAL STRUCTURE OF A HAMMERHEAD RI...,">1HMH_1|Chains A, C, E|HAMMERHEAD RIBOZYME-RNA..."


=== train_sequences.describe() ===


Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences
count,844,844,844,844,839
unique,844,784,476,716,732
top,1SCL_A,UUUUUAAUUUCUACUCUUGUAGAUGUGAUAAGUGGAAUGCCAUGUGGA,2014-07-09,Structure of a mammalian 80S ribosome obtained...,>4V5Z_1|Chain A[auth AA]|18S Ribosomal RNA|Can...
freq,1,5,46,11,11


=== train_sequences.isna().sum() ===


target_id          0
sequence           0
temporal_cutoff    0
description        0
all_sequences      5
dtype: int64

#### sequences with multiple occurrences

In [5]:
# How often each distinct sequence occurs in the table.
occurrences_per_sequence = df_s['sequence'].value_counts()
# Flag every row whose sequence occurs more than once.
is_repeated = df_s['sequence'].map(occurrences_per_sequence) > 1
sequences_with_multiple_occurrences = df_s.loc[is_repeated, 'sequence'].unique()
print(len(sequences_with_multiple_occurrences))

# Show all rows sharing the first repeated sequence as an example.
df_s.loc[df_s['sequence'].eq(sequences_with_multiple_occurrences[0])]

45


Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences
17,1ZDI_S,ACAUGAGGAUUACCCAUGU,1997-04-21,RNA BACTERIOPHAGE MS2 COAT PROTEIN/RNA COMPLEX,">1ZDI_1|Chains A[auth R], B[auth S]|RNA (5'-R(..."
200,2BQ5_S,ACAUGAGGAUUACCCAUGU,2006-03-22,MS2 (N87AE89K mutant) - RNA hairpin complex,">2BQ5_1|Chains A, B, C|COAT PROTEIN|BACTERIOPH..."
202,2BQ5_R,ACAUGAGGAUUACCCAUGU,2006-03-22,MS2 (N87AE89K mutant) - RNA hairpin complex,">2BQ5_1|Chains A, B, C|COAT PROTEIN|BACTERIOPH..."
205,2B2E_R,ACAUGAGGAUUACCCAUGU,2006-05-09,RNA stemloop from bacteriophage MS2 complexed ...,">2B2E_1|Chains A[auth R], B[auth S]|5'-R(*AP*C..."
206,2B2E_S,ACAUGAGGAUUACCCAUGU,2006-05-09,RNA stemloop from bacteriophage MS2 complexed ...,">2B2E_1|Chains A[auth R], B[auth S]|5'-R(*AP*C..."


#### sequence lengths

In [28]:
# NOTE(review): this import would normally live in the notebook's first
# import cell; it is kept here so the cell stays self-contained.
import plotly.express as px

# Number of characters in each sequence string.
sequence_lengths = df_s['sequence'].str.len()

print("=== sequence_lengths.describe() ===")
display(sequence_lengths.describe())

histogram_options = dict(
    title='Distribution of Sequence Lengths, train_sequences',
    labels={'x': 'Sequence Length', 'y': 'Count'},
    nbins=50,  # tune the bin count as needed
)
# Last expression of the cell, so the figure is rendered as the cell output.
px.histogram(x=sequence_lengths, **histogram_options)

=== sequence_lengths.describe() ===


count     844.000000
mean      162.434834
std       515.031957
min         3.000000
25%        22.000000
50%        39.500000
75%        86.000000
max      4298.000000
Name: sequence, dtype: float64

#### null `all_sequences`

In [30]:
# Rows whose `all_sequences` entry is missing.
df_s.loc[df_s['all_sequences'].isnull()]

Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences
286,2ZJQ_Y,CACCCCCGUGCCCAUAGCACUGUGGAACCACCCCACCCCAUGCCGA...,2008-06-17,Interaction of L7 with L11 induced by Microcco...,
287,2ZJQ_X,GGUCAAGAUAGUAAGGGUCCACGGUGGAUGCCCUGGCGCUGGAGCC...,2008-06-17,Interaction of L7 with L11 induced by Microcco...,
435,4V65_A1,UGAAGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUA...,2014-07-09,Structure of the E. coli ribosome in the Pre-a...,
445,4V65_BB,GGUUAAGCGACUAAGCGUACACGGUGGAUGCCCUGGCAGUCAGAGG...,2014-07-09,Structure of the E. coli ribosome in the Pre-a...,
458,4V5F_CA,UGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGUGCCUA...,2014-07-09,The structure of the ribosome with elongation ...,


### train_labels

In [5]:
def remove_after_last_underscore(text):
    """Return ``text`` cut off just before its last underscore.

    A string that contains no underscore is returned unchanged, e.g.
    ``"1SCL_A_1"`` becomes ``"1SCL_A"`` and ``"abc"`` stays ``"abc"``.
    """
    head, separator, _tail = text.rpartition('_')
    # rpartition reports an empty separator when no underscore was found.
    return head if separator else text

In [84]:
df_l = all_data['train_labels']
# Derive the key shared with train_sequences by dropping the trailing
# "_<suffix>" from each row ID (e.g. "1SCL_A_1" -> "1SCL_A").
# Note: this adds the column to the frame stored in `all_data` itself.
df_l['target_id'] = df_l['ID'].apply(remove_after_last_underscore)
print("=== train_labels ===")
display(df_l.head(3))

print("=== train_labels.describe(include='object') ===")
display(df_l.describe(include='object'))
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() rendered a stray "None" underneath the report.
df_l.info()

print("=== train_labels.describe(include='number') ===")
display(df_l.describe(include='number'))


=== train_labels ===


Unnamed: 0,ID,resname,resid,x_1,y_1,z_1,target_id
0,1SCL_A_1,G,1,13.76,-25.974001,0.102,1SCL_A
1,1SCL_A_2,G,2,9.31,-29.638,2.669,1SCL_A
2,1SCL_A_3,G,3,5.529,-27.813,5.878,1SCL_A


=== train_labels.describe(include='object') ===


Unnamed: 0,ID,resname,target_id
count,137095,137095,137095
unique,137095,6,844
top,1SCL_A_1,G,4V6X_A5
freq,1,41450,4298


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137095 entries, 0 to 137094
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   ID         137095 non-null  object 
 1   resname    137095 non-null  object 
 2   resid      137095 non-null  int64  
 3   x_1        130950 non-null  float64
 4   y_1        130950 non-null  float64
 5   z_1        130950 non-null  float64
 6   target_id  137095 non-null  object 
dtypes: float64(3), int64(1), object(3)
memory usage: 7.3+ MB


None

=== train_labels.describe(include='number') ===


Unnamed: 0,resid,x_1,y_1,z_1
count,137095.0,130950.0,130950.0,130950.0
mean,897.255633,80.447315,84.040727,98.611226
std,1014.321987,147.422319,114.928902,119.410665
min,1.0,-821.085999,-449.414001,-333.403992
25%,61.0,-1.11575,-4.89075,2.2185
50%,481.0,62.651501,67.901501,72.938499
75%,1459.5,178.809753,170.451996,184.53175
max,4298.0,849.887024,889.507996,668.776978


## validation

In [33]:
# NOTE(review): this rebinds `df_s`, which the train section uses for
# train_sequences; re-running those earlier cells after this one would make
# them operate on the validation set instead.
df_s = all_data['validation_sequences']

df_s

Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences
0,R1107,GGGGGCCACAGCAGAAGCGUUCACGUCGCAGCCCCUGUCAGCCAUU...,2022-05-28,CPEB3 ribozyme\nHuman\nhuman CPEB3 HDV-like ri...,>7QR4_1|Chain A|U1 small nuclear ribonucleopro...
1,R1108,GGGGGCCACAGCAGAAGCGUUCACGUCGCGGCCCCUGUCAGCCAUU...,2022-05-27,CPEB3 ribozyme\nChimpanzee\nChimpanzee CPEB3 H...,">7QR3_1|Chains A, B|U1 small nuclear ribonucle..."
2,R1116,CGCCCGGAUAGCUCAGUCGGUAGAGCAGCGGCUAAAACAGCUCUGG...,2022-06-04,Cloverleaf RNA\nPoliovirus\nCrystal Structure ...,">8S95_1|Chain A[auth C]|Lysine tRNA scaffold,P..."
3,R1117v2,UUGGGUUCCCUCACCCCAAUCAUAAAAAGG,2022-06-03,PreQ1 class I type III riboswitch\nK. pneumoni...,">8FZA_1|Chains A, B|PreQ1 Riboswitch (30-MER)|..."
4,R1126,GGAAUCUCGCCCGAUGUUCGCAUCGGGAUUUGCAGGUCCAUGGAUU...,2022-06-11,Traptamer\nSynthetic\nAdditional Information: ...,>8TVZ_1|Chain A[auth C]|RNA (363-MER)|syntheti...
5,R1128,GGAAUAUCGUCAUGGUGAUUCGUCACCAUGAGGCUAGAUCUCAUAU...,2022-06-10,6WJ\nSingle-stranded Paranemic Crossover RNA T...,>8BTZ_1|Chain A|RNA Paranemic croosover triang...
6,R1136,GGAUACGUCUACGCUCAGUGACGGACUCUCUUCGGAGAGUCUGACA...,2022-06-18,Apta-FRET\nAdditional Information: Info...,>7ZJ4_1|Chain A[auth E]|brocolli-pepper aptame...
7,R1138,GGGAGAGUACUAUUCAGAUGCAGACCGCAAGUUCAGAGCGGUUUGC...,2022-06-24,6HBC-Young\nAdditional Information: Thi...,>7PTK_1|Chain A[auth B]|RNA|synthetic construc...
8,R1149,GGACACGAGUAACUCGUCUAUCUUCUGCAGGCUGCUUACGGUUUCG...,2022-07-02,SARS-CoV-2 SL5\nAdditional Information: ...,>8UYS_1|Chain A|SARS-CoV-2 RNA SL5 domain.|Sev...
9,R1156,GGAGCAUCGUGUCUCAAGUGCUUCACGGUCACAAUAUACCGUUUCG...,2022-07-07,BtCoV-HKU5 SL5\nBtCoV-HKU5 5 proximal stem-loo...,>8UYE_1|Chain A|BtCoV-HKU5 5' proximal stem-lo...


In [34]:
# NOTE(review): this rebinds `df_l`, which the train section uses for
# train_labels.
df_l = all_data['validation_labels']
# The rendered rows show -1e18 in the x_2.. coordinate columns; presumably a
# placeholder for "no coordinate available" — TODO confirm against the
# competition data description before using these columns.
df_l


Unnamed: 0,ID,resname,resid,x_1,y_1,z_1,x_2,y_2,z_2,x_3,...,z_37,x_38,y_38,z_38,x_39,y_39,z_39,x_40,y_40,z_40
0,R1107_1,G,1,-5.499000,8.520000,8.605000,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,...,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18
1,R1107_2,G,2,-5.826000,10.453000,14.010000,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,...,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18
2,R1107_3,G,3,-5.849000,14.768000,17.584999,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,...,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18
3,R1107_4,G,4,-5.784000,19.985001,18.666000,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,...,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18
4,R1107_5,G,5,-5.755000,25.533001,17.132999,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,...,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2510,R1190_114,U,114,87.870003,105.432999,115.183998,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,...,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18
2511,R1190_115,U,115,92.911003,105.394997,113.741997,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,...,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18
2512,R1190_116,U,116,99.012001,105.749001,113.073997,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,...,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18
2513,R1190_117,U,117,103.861000,103.453003,114.589996,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,...,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18,-1.000000e+18
