In [1]:
import pandas as pd 
dataset_clean = pd.read_csv("dataset/SMRT_dataset.csv")

## Random splitting

In [15]:
# Randomly split the dataset into training and validation sets (80:20)
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.model_selection import train_test_split
import plotly.express as px

# Split the dataset
train_set, valid_set = train_test_split(dataset_clean, test_size=0.2, random_state=42,shuffle=True)

print(f"Training set size: {len(train_set)}")
print(f"Validation set size: {len(valid_set)}")

# Create an overlapped histogram for RT values in training and validation sets
fig = px.histogram(
    train_set, x='rt', nbins=50, opacity=0.6, 
    #title='Overlapped RT Distribution: Training vs Validation',
    labels={'rt': 'RT (Retention Time, Seconds)', 'Count': 'Frequency'},
    color_discrete_sequence=['blue'],
    marginal=None
)

# Add validation set to the same figure
fig.add_trace(
    px.histogram(
        valid_set, x='rt', nbins=50, opacity=0.6, 
        color_discrete_sequence=['red']
    ).data[0]
)

# Update layout for better visualization
fig.update_layout(
    barmode='overlay',
    plot_bgcolor='rgba(229, 236, 246,1)',
    paper_bgcolor='rgba(229, 236, 246,1)',
    width=400,
    height=400,
    margin=dict(l=20, r=20, t=30, b=20),
    legend=dict(title='Dataset', itemsizing='constant')
)


# Combine training and validation sets for Tukey's HSD test
data_combined = train_set[['rt']].copy()
data_combined['set'] = 'Training'
validation_data = valid_set[['rt']].copy()
validation_data['set'] = 'Validation'
data_combined = pd.concat([data_combined, validation_data])

# Perform Tukey's HSD test
tukey_result = pairwise_tukeyhsd(endog=data_combined['rt'], groups=data_combined['set'], alpha=0.05)
print(tukey_result)

# Annotate the plot with Tukey's HSD results
fig.add_annotation(
    text=f"Tukey's HSD p-value: {tukey_result.pvalues[0]:.4f}",
    xref="paper", yref="paper",
    x=0.5, y=0.8, showarrow=False,
    font=dict(size=12, color="black"),
    align="center",
    bgcolor="rgba(255, 255, 255, 0.8)",
    bordercolor="black",
    borderwidth=1
)

fig.show()
fig.write_image("images/Figure-2.1-Random.svg", scale=2)

Training set size: 64030
Validation set size: 16008
  Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1    group2   meandiff p-adj   lower  upper reject
--------------------------------------------------------
Training Validation   2.6764 0.1428 -0.9032 6.256  False
--------------------------------------------------------


## Scaffold split

In [16]:
scaffold_sets = dataset_clean['scaffold'].value_counts()
train_scaffolds,valid_scaffolds = [],[]
train_cnt = 0
train_cutoff = int(0.8*len(dataset_clean))
for scaffold, count in scaffold_sets.items():
    if train_cnt + count > train_cutoff:
        valid_scaffolds.append(scaffold)
    else:
        train_scaffolds.append(scaffold)
        train_cnt += count
train_set_scaffold = dataset_clean[dataset_clean['scaffold'].isin(train_scaffolds)].copy()
valid_set_scaffold = dataset_clean[dataset_clean['scaffold'].isin(valid_scaffolds)].copy()
print(f"Training set size (scaffold split): {len(train_set_scaffold)}")
print(f"Validation set size (scaffold split): {len(valid_set_scaffold)}")

# Create an overlapped histogram for RT values in training and validation sets
fig = px.histogram(
    train_set_scaffold, x='rt', nbins=50, opacity=0.6, 
    #title='Overlapped RT Distribution: Training vs Validation',
    labels={'rt': 'RT (Retention Time, Seconds)', 'Count': 'Frequency'},
    color_discrete_sequence=['blue'],
    marginal=None
)

# Add validation set to the same figure
fig.add_trace(
    px.histogram(
        valid_set_scaffold, x='rt', nbins=50, opacity=0.6, 
        color_discrete_sequence=['red']
    ).data[0]
)

# Update layout for better visualization
fig.update_layout(
    barmode='overlay',
    plot_bgcolor='rgba(229, 236, 246,1)',
    paper_bgcolor='rgba(229, 236, 246,1)',
    width=400,
    height=400,
    margin=dict(l=20, r=20, t=30, b=20),
    legend=dict(title='Dataset', itemsizing='constant')
)

# Combine training and validation sets for Tukey's HSD test
data_combined = train_set_scaffold[['rt']].copy()
data_combined['set'] = 'Training'
validation_data = valid_set_scaffold[['rt']].copy()
validation_data['set'] = 'Validation'
data_combined = pd.concat([data_combined, validation_data])

# Perform Tukey's HSD test
tukey_result = pairwise_tukeyhsd(endog=data_combined['rt'], groups=data_combined['set'], alpha=0.05)
print(tukey_result)

# Annotate the plot with Tukey's HSD results
fig.add_annotation(
    text=f"Tukey's HSD p-value: {tukey_result.pvalues[0]:.4f}",
    xref="paper", yref="paper",
    x=0.5, y=0.8, showarrow=False,
    font=dict(size=12, color="black"),
    align="center",
    bgcolor="rgba(255, 255, 255, 0.8)",
    bordercolor="black",
    borderwidth=1
)

fig.show()
fig.write_image("images/Figure-2.2-Scaffold.svg", scale=2)

Training set size (scaffold split): 64030
Validation set size (scaffold split): 15975
   Multiple Comparison of Means - Tukey HSD, FWER=0.05    
 group1    group2   meandiff p-adj  lower    upper  reject
----------------------------------------------------------
Training Validation -87.8591   0.0 -91.3862 -84.332   True
----------------------------------------------------------


## Scaffold splitting add randomness

In [17]:
import numpy as np
scaffold_sets = dataset_clean['scaffold'].value_counts()
train_scaffolds,valid_scaffolds = [],[]
train_cnt,valid_cnt = 0,0
train_cutoff = int(0.8*len(dataset_clean))
for scaffold, count in scaffold_sets.items():
    
    if (valid_cnt + count <= len(dataset_clean) - train_cutoff) and (train_cnt + count <= train_cutoff):
        if np.random.rand() <= 0.2:
            valid_scaffolds.append(scaffold)
            valid_cnt += count
        else:
            train_scaffolds.append(scaffold)
            train_cnt += count
    elif train_cnt + count <= train_cutoff:
        train_scaffolds.append(scaffold)
        train_cnt += count
    else:
        valid_scaffolds.append(scaffold)
        valid_cnt += count
train_set_scaffold = dataset_clean[dataset_clean['scaffold'].isin(train_scaffolds)].copy()
valid_set_scaffold = dataset_clean[dataset_clean['scaffold'].isin(valid_scaffolds)].copy()
print(f"Training set size (scaffold split): {len(train_set_scaffold)}")
print(f"Validation set size (scaffold split): {len(valid_set_scaffold)}")

# Create an overlapped histogram for RT values in training and validation sets
fig = px.histogram(
    train_set_scaffold, x='rt', nbins=50, opacity=0.6, 
    #title='Overlapped RT Distribution: Training vs Validation',
    labels={'rt': 'RT (Retention Time, Seconds)', 'Count': 'Frequency'},
    color_discrete_sequence=['blue'],
    marginal=None
)

# Add validation set to the same figure
fig.add_trace(
    px.histogram(
        valid_set_scaffold, x='rt', nbins=50, opacity=0.6, 
        color_discrete_sequence=['red']
    ).data[0]
)

# Update layout for better visualization
fig.update_layout(
    barmode='overlay',
    plot_bgcolor='rgba(229, 236, 246,1)',
    paper_bgcolor='rgba(229, 236, 246,1)',
    width=400,
    height=400,
    margin=dict(l=20, r=20, t=30, b=20),
    legend=dict(title='Dataset', itemsizing='constant')
)

from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Combine training and validation sets for Tukey's HSD test
data_combined = train_set_scaffold[['rt']].copy()
data_combined['set'] = 'Training'
validation_data = valid_set_scaffold[['rt']].copy()
validation_data['set'] = 'Validation'
data_combined = pd.concat([data_combined, validation_data])

# Perform Tukey's HSD test
tukey_result = pairwise_tukeyhsd(endog=data_combined['rt'], groups=data_combined['set'], alpha=0.05)
print(tukey_result)

# Annotate the plot with Tukey's HSD results
fig.add_annotation(
    text=f"Tukey's HSD p-value: {tukey_result.pvalues[0]:.4f}",
    xref="paper", yref="paper",
    x=0.5, y=0.8, showarrow=False,
    font=dict(size=12, color="black"),
    align="center",
    bgcolor="rgba(255, 255, 255, 0.8)",
    bordercolor="black",
    borderwidth=1
)

fig.show()
fig.write_image("images/Figure-2.3-Scaffold-Random.svg", scale=2)

Training set size (scaffold split): 64030
Validation set size (scaffold split): 15975
    Multiple Comparison of Means - Tukey HSD, FWER=0.05    
 group1    group2   meandiff p-adj  lower    upper   reject
-----------------------------------------------------------
Training Validation -17.7345   0.0 -21.3117 -14.1574   True
-----------------------------------------------------------


## Chronological splitting

In [18]:
# Sort the dataset by date from old to new
dataset_clean_sorted = dataset_clean.sort_values(by='date')

# Split the dataset by time: older 80% for training, newer 20% for validation
split_index = int(len(dataset_clean_sorted) * 0.8)
train_set_time = dataset_clean_sorted.iloc[:split_index]
valid_set_time = dataset_clean_sorted.iloc[split_index:]

print(f"Training set size (time-based): {len(train_set_time)}")
print(f"Validation set size (time-based): {len(valid_set_time)}")

# Create an overlapped histogram for RT values in training and validation sets (time-based split)
fig = px.histogram(
    train_set_time, x='rt', nbins=50, opacity=0.6, 
    #title='Overlapped RT Distribution: Training vs Validation (Time-Based Split)',
    labels={'rt': 'RT (Retention Time, Seconds)', 'Count': 'Frequency'},
    color_discrete_sequence=['blue'],
    marginal=None
)

# Add validation set to the same figure
fig.add_trace(
    px.histogram(
        valid_set_time, x='rt', nbins=50, opacity=0.6, 
        color_discrete_sequence=['red']
    ).data[0]
)

# Update layout for better visualization
fig.update_layout(
    barmode='overlay',
    plot_bgcolor='rgba(229, 236, 246,1)',
    paper_bgcolor='rgba(229, 236, 246,1)',
    width=400,
    height=400,
    margin=dict(l=20, r=20, t=30, b=20),
    legend=dict(title='Dataset', itemsizing='constant')
)


from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Combine training and validation sets for Tukey's HSD test
data_combined = train_set_time[['rt']].copy()
data_combined['set'] = 'Training'
validation_data = valid_set_time[['rt']].copy()
validation_data['set'] = 'Validation'
data_combined = pd.concat([data_combined, validation_data])

# Perform Tukey's HSD test
tukey_result = pairwise_tukeyhsd(endog=data_combined['rt'], groups=data_combined['set'], alpha=0.05)
print(tukey_result)

# Annotate the plot with Tukey's HSD results
fig.add_annotation(
    text=f"Tukey's HSD p-value: {tukey_result.pvalues[0]:.4f}",
    xref="paper", yref="paper",
    x=0.5, y=0.8, showarrow=False,
    font=dict(size=12, color="black"),
    align="center",
    bgcolor="rgba(255, 255, 255, 0.8)",
    bordercolor="black",
    borderwidth=1
)

fig.show()
fig.write_image("images/Figure-2.4-Chronological.svg", scale=2)

Training set size (time-based): 64030
Validation set size (time-based): 16008
  Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1    group2   meandiff p-adj  lower  upper  reject
--------------------------------------------------------
Training Validation   0.6839 0.708 -2.8957 4.2636  False
--------------------------------------------------------
