### Importing Libraries

In [5]:
# Import Necessary Libraries
import msprime
import numpy as np
import random
import gc
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew, kurtosis
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

### Simulation Parameters

In [6]:
# --- Simulation constants ---
NUM_SAMPLES_PER_SIM = 50      # Number of diploid samples per simulation
SEQUENCE_LENGTH = 50_000      # Length of the genomic region in base pairs

# --- Data generation parameters ---
NUM_DATASETS = 10_000         # Total number of datasets to simulate
BATCH_SIZE = 32               # Batch size for training

# Grid of candidate population sizes (N): 20 integer values evenly spaced
# between MIN_N and MAX_N (inclusive).
MIN_N = 50_000
MAX_N = 200_000
N_values = np.linspace(MIN_N, MAX_N, num=20, dtype=int)

# Grid of candidate mutation rates (mu): 20 values evenly spaced on a log10
# scale between MIN_MU and MAX_MU (inclusive).
MIN_MU = 1e-8
MAX_MU = 1e-6
mu_values = np.logspace(np.log10(MIN_MU), np.log10(MAX_MU), num=20)

# For reproducibility: seed every random number generator the notebook draws
# from. TensorFlow keeps its own generator (weight initialisation, dropout,
# dataset shuffling), so seeding only `random` and NumPy is not enough.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Data Simulation & Feature Construction

In [7]:
# Per-dataset feature vectors, labels and simulation parameters, appended in
# simulation order. A skipped dataset contributes to none of these lists, so
# they always stay aligned.
inputs_basic = []
inputs_derived = []
inputs_combined = []
labels = []
population_sizes = []
mutation_rates = []

# Upper bound for the per-simulation msprime seeds. This must be an int:
# random.randint() rejects float bounds such as 1e6 on Python 3.12+.
MAX_SIM_SEED = 1_000_000

# Sites whose derived-allele frequency is at or below this value are "rare".
rare_variant_threshold = 0.05

for i in range(NUM_DATASETS):
    # Randomly select population size N and mutation rate mu from their grids
    N = random.choice(N_values)
    mu = random.choice(mu_values)
    theta = 4 * N * mu  # Regression target: theta = 4 * N * mu

    # Simulate ancestral history (no recombination)
    ts = msprime.sim_ancestry(
        samples=NUM_SAMPLES_PER_SIM,
        recombination_rate=0,
        sequence_length=SEQUENCE_LENGTH,
        population_size=N,
        random_seed=random.randint(1, MAX_SIM_SEED)
    )

    # Overlay mutations on the simulated ancestry
    mts = msprime.sim_mutations(ts, rate=mu, random_seed=random.randint(1, MAX_SIM_SEED))

    n = mts.num_samples

    # Collect, per variant site, the number of samples carrying a non-zero
    # genotype and the site position. Only these per-site summaries are kept,
    # so the full genotype matrix is never held in memory.
    allele_counts = []
    positions = []
    for variant in mts.variants():
        # Count samples, not allele indices: summing the genotype values would
        # over-count any site where a genotype value greater than 1 occurs.
        allele_counts.append(np.count_nonzero(variant.genotypes))
        positions.append(variant.site.position)

    # Proceed only if there are variants
    if len(allele_counts) == 0:
        continue  # Skip this iteration if no variants are present

    allele_counts = np.array(allele_counts)
    allele_frequencies = allele_counts / n
    positions = np.array(positions)

    # Compute Basic Statistics
    # NOTE(review): every variant site is counted in S, including any whose
    # non-zero count is 0 or n — confirm this is the intended definition.
    S = len(allele_frequencies)  # Number of variant sites
    # Mean number of pairwise differences. The n / (n - 1) factor turns the
    # per-site 2p(1-p) into the unbiased sample estimate used by Tajima's D.
    pi = 2 * np.sum(allele_frequencies * (1 - allele_frequencies)) * n / (n - 1)
    singleton_count = np.sum(allele_counts == 1)
    allele_freq_variance = np.var(allele_frequencies)
    allele_freq_skewness = skew(allele_frequencies)
    allele_freq_kurtosis = kurtosis(allele_frequencies)
    rare_variant_proportion = np.sum(allele_frequencies <= rare_variant_threshold) / S
    if S > 1:
        # Distances between neighbouring variant sites, computed once
        inter_snp_distances = np.diff(np.sort(positions))
        mean_inter_snp_distance = np.mean(inter_snp_distances)
        var_inter_snp_distance = np.var(inter_snp_distances)
    else:
        mean_inter_snp_distance = 0.0
        var_inter_snp_distance = 0.0

    # Prepare basic statistics feature vector
    features_basic = np.array([
        S,
        pi,
        singleton_count,
        allele_freq_variance,
        allele_freq_skewness,
        allele_freq_kurtosis,
        rare_variant_proportion,
        mean_inter_snp_distance,
        var_inter_snp_distance
    ], dtype=np.float32)

    # Compute Derived Statistics: Tajima's D and its normalising constants
    a1 = np.sum(1.0 / np.arange(1, n))
    a2 = np.sum(1.0 / (np.arange(1, n) ** 2))
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2))
    e1 = c1 / a1
    e2 = c2 / (a1 ** 2 + a2)
    variance_tajd = e1 * S + e2 * S * (S - 1)
    tajimas_d = (pi - (S / a1)) / np.sqrt(variance_tajd) if variance_tajd > 0 else 0.0

    # Prepare derived statistics feature vector
    features_derived = np.array([
        tajimas_d
        # Add other derived statistics if computed
    ], dtype=np.float32)

    # Combine features
    features_combined = np.concatenate([features_basic, features_derived])

    # skew()/kurtosis() return NaN when all frequencies are identical (for
    # example a single variant site). Skip such datasets so that NaN/inf never
    # reaches feature scaling or model training.
    if not np.all(np.isfinite(features_combined)):
        continue

    # Append features and labels
    inputs_basic.append(features_basic)
    inputs_derived.append(features_derived)
    inputs_combined.append(features_combined)
    labels.append(theta)
    population_sizes.append(N)
    mutation_rates.append(mu)

    # Optional: Print progress
    if (i + 1) % 100 == 0:
        print(f"Processed {i + 1}/{NUM_DATASETS} datasets")

KeyboardInterrupt: 

### Convert Lists to Arrays

In [None]:
# Materialise the per-dataset Python lists as NumPy arrays with fixed dtypes
inputs_basic_array = np.asarray(inputs_basic, dtype=np.float32)
inputs_derived_array = np.asarray(inputs_derived, dtype=np.float32)
inputs_combined_array = np.asarray(inputs_combined, dtype=np.float32)
labels_array = np.asarray(labels, dtype=np.float32)
population_sizes_array = np.asarray(population_sizes, dtype=np.int32)
mutation_rates_array = np.asarray(mutation_rates, dtype=np.float32)

# Multiply theta by a constant factor before transforming it; the same factor
# has to be divided out again to get back to the original theta scale.
scaling_factor = 1e6
labels_array_scaled = labels_array * scaling_factor

# Box-Cox transform of the scaled theta values. `lambda_` is the fitted
# transform parameter and is needed later to invert the transform.
labels_array_transformed, lambda_ = stats.boxcox(labels_array_scaled)

NameError: name 'np' is not defined

### Data Split -- Training, Testing, Validation

In [None]:
# One index per simulated dataset; splitting indices (rather than the arrays
# themselves) keeps all feature sets and the labels aligned on the same rows.
indices = np.arange(len(labels_array_transformed))

# First carve off the test set (15% of all samples) ...
train_val_indices, test_indices = train_test_split(
    indices, test_size=0.15, random_state=42)

# ... then split the remainder so that validation is also 15% of all samples
# (0.15 / 0.85 of what is left).
train_indices, val_indices = train_test_split(
    train_val_indices, test_size=0.15 / 0.85, random_state=42)

# Row selections in (train, validation, test) order
split_indices = (train_indices, val_indices, test_indices)

# Basic statistics
X_basic_train, X_basic_val, X_basic_test = (
    inputs_basic_array[rows] for rows in split_indices
)

# Derived statistics
X_derived_train, X_derived_val, X_derived_test = (
    inputs_derived_array[rows] for rows in split_indices
)

# Combined features
X_combined_train, X_combined_val, X_combined_test = (
    inputs_combined_array[rows] for rows in split_indices
)

# Box-Cox transformed labels
y_train, y_val, y_test = (
    labels_array_transformed[rows] for rows in split_indices
)

### Fit Scalers on Training Data and Transform

In [None]:
# One scaler per feature set, each fitted on the training split only so that
# no validation/test statistics enter the scaling parameters.
scaler_basic = StandardScaler().fit(X_basic_train)
scaler_derived = StandardScaler().fit(X_derived_train)
scaler_combined = StandardScaler().fit(X_combined_train)

# Apply the training-set scaling to every split: basic statistics
X_basic_train_scaled = scaler_basic.transform(X_basic_train)
X_basic_val_scaled = scaler_basic.transform(X_basic_val)
X_basic_test_scaled = scaler_basic.transform(X_basic_test)

# Derived statistics
X_derived_train_scaled = scaler_derived.transform(X_derived_train)
X_derived_val_scaled = scaler_derived.transform(X_derived_val)
X_derived_test_scaled = scaler_derived.transform(X_derived_test)

# Combined features
X_combined_train_scaled = scaler_combined.transform(X_combined_train)
X_combined_val_scaled = scaler_combined.transform(X_combined_val)
X_combined_test_scaled = scaler_combined.transform(X_combined_test)

### Define Features

In [None]:
# Column names for the basic-statistics feature vector, in the order in which
# the values are packed into each feature array.
feature_names_basic = [
    'S',                        # Number of segregating sites
    'pi',                       # Nucleotide diversity
    'singleton_count',
    'allele_freq_variance',
    'allele_freq_skewness',
    'allele_freq_kurtosis',
    'rare_variant_proportion',
    'mean_inter_snp_distance',
    'var_inter_snp_distance',
]

# Column names for the derived-statistics feature vector
# (extend this list if further derived statistics are added).
feature_names_derived = ['tajimas_d']

# Combined layout: basic statistics first, then derived statistics
feature_names_combined = [*feature_names_basic, *feature_names_derived]

### Examine Theta Distribution and Feature Statistics

In [None]:
# Histogram of the Box-Cox transformed regression target
plt.hist(labels_array_transformed, bins=50, edgecolor='k')
plt.title('Distribution of Box-Cox Transformed Theta Values')
plt.xlabel('Box-Cox Transformed Theta')
plt.ylabel('Frequency')
plt.show()

# Summary statistics of the transformed target and the fitted Box-Cox lambda
print(f"Transformed Theta Min: {labels_array_transformed.min()}")
print(f"Transformed Theta Max: {labels_array_transformed.max()}")
print(f"Transformed Theta Mean: {labels_array_transformed.mean()}")
print(f"Transformed Theta Std Dev: {labels_array_transformed.std()}")
print(f"Box-Cox Lambda: {lambda_}")

# Table of all (unscaled) combined features with the transformed target
# appended as the last column.
df_features = pd.DataFrame(
    inputs_combined_array, columns=feature_names_combined
).assign(theta_transformed=labels_array_transformed)

### Constructing and Visualizing the Correlation Matrix

In [None]:
# Pairwise correlations between every feature and the transformed theta
corr_matrix_transformed = df_features.corr()

# Heatmap settings: annotated cells, diverging palette centred on zero
cmap = sns.diverging_palette(220, 10, as_cmap=True)
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap=cmap,
    center=0,
    square=True,
    linewidths=.5,
)

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix_transformed, **heatmap_options)
plt.title('Correlation Matrix of Features and Transformed Theta')
plt.show()

### Defining Model

In [None]:
def create_model(input_dim, hidden_units=(512, 256, 128, 64), dropout_rate=0.5, learning_rate=0.0001):
    """Build and compile a fully connected regression network.

    The network is a stack of ReLU ``Dense`` layers, each followed by a
    ``Dropout`` layer, and ends in a single linear output unit. It is compiled
    with the Adam optimizer, mean-squared-error loss and MAE as a metric.

    Args:
        input_dim: Number of input features per sample.
        hidden_units: Width of each hidden ``Dense`` layer, in order. The
            default reproduces the original 512-256-128-64 architecture.
        dropout_rate: Dropout rate applied after every hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Declare the input with an explicit Input object instead of passing
    # `input_shape` to the first Dense layer, which newer Keras releases
    # warn against for Sequential models.
    network_layers = [tf.keras.Input(shape=(input_dim,))]
    for units in hidden_units:
        network_layers.append(Dense(units, activation='relu'))
        network_layers.append(Dropout(dropout_rate))
    network_layers.append(Dense(1))  # Output layer for regression

    model = Sequential(network_layers)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mean_squared_error',
        metrics=['mae']
    )
    return model

### Model Training and Evaluation

#### Model with Basic Statistics

In [None]:
# Input pipelines for the basic-statistics model. Only the training pipeline
# is shuffled; validation and test keep their order.
train_dataset_basic = (
    tf.data.Dataset.from_tensor_slices((X_basic_train_scaled, y_train))
    .shuffle(buffer_size=1024)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset_basic = (
    tf.data.Dataset.from_tensor_slices((X_basic_val_scaled, y_val))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset_basic = (
    tf.data.Dataset.from_tensor_slices((X_basic_test_scaled, y_test))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Network sized to the number of basic-statistics features
model_basic = create_model(input_dim=X_basic_train_scaled.shape[1])

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen. This callback object is reused by the later models.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Train for at most 200 epochs
history_basic = model_basic.fit(
    train_dataset_basic,
    validation_data=val_dataset_basic,
    epochs=200,
    callbacks=[early_stopping],
)

# Test-set loss and MAE, both on the Box-Cox transformed target
test_loss_basic_transformed, test_mae_basic_transformed = model_basic.evaluate(test_dataset_basic)
print(f"Basic Statistics Model - Test Loss (MSE): {test_loss_basic_transformed:.4f}")
print(f"Basic Statistics Model - Test MAE: {test_mae_basic_transformed:.4f}")

# Test-set predictions (still on the transformed scale) as a 1-D array
y_pred_basic_transformed = model_basic.predict(test_dataset_basic).flatten()

#### Model with Derived Statistics

In [None]:
# Input pipelines for the derived-statistics model. Only the training pipeline
# is shuffled; validation and test keep their order.
train_dataset_derived = (
    tf.data.Dataset.from_tensor_slices((X_derived_train_scaled, y_train))
    .shuffle(buffer_size=1024)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset_derived = (
    tf.data.Dataset.from_tensor_slices((X_derived_val_scaled, y_val))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset_derived = (
    tf.data.Dataset.from_tensor_slices((X_derived_test_scaled, y_test))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Network sized to the number of derived-statistics features
model_derived = create_model(input_dim=X_derived_train_scaled.shape[1])

# Train for at most 200 epochs, using the `early_stopping` callback created
# in the basic-statistics cell.
history_derived = model_derived.fit(
    train_dataset_derived,
    validation_data=val_dataset_derived,
    epochs=200,
    callbacks=[early_stopping],
)

# Test-set loss and MAE, both on the Box-Cox transformed target
test_loss_derived_transformed, test_mae_derived_transformed = model_derived.evaluate(test_dataset_derived)
print(f"Derived Statistics Model - Test Loss (MSE): {test_loss_derived_transformed:.4f}")
print(f"Derived Statistics Model - Test MAE: {test_mae_derived_transformed:.4f}")

# Test-set predictions (still on the transformed scale) as a 1-D array
y_pred_derived_transformed = model_derived.predict(test_dataset_derived).flatten()

#### Model Combined

In [None]:
# Input pipelines for the combined-features model. Only the training pipeline
# is shuffled; validation and test keep their order.
train_dataset_combined = (
    tf.data.Dataset.from_tensor_slices((X_combined_train_scaled, y_train))
    .shuffle(buffer_size=1024)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset_combined = (
    tf.data.Dataset.from_tensor_slices((X_combined_val_scaled, y_val))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset_combined = (
    tf.data.Dataset.from_tensor_slices((X_combined_test_scaled, y_test))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Network sized to the number of combined features
model_combined = create_model(input_dim=X_combined_train_scaled.shape[1])

# Train for at most 200 epochs, using the `early_stopping` callback created
# in the basic-statistics cell.
history_combined = model_combined.fit(
    train_dataset_combined,
    validation_data=val_dataset_combined,
    epochs=200,
    callbacks=[early_stopping],
)

# Test-set loss and MAE, both on the Box-Cox transformed target
test_loss_combined_transformed, test_mae_combined_transformed = model_combined.evaluate(test_dataset_combined)
print(f"Combined Features Model - Test Loss (MSE): {test_loss_combined_transformed:.4f}")
print(f"Combined Features Model - Test MAE: {test_mae_combined_transformed:.4f}")

# Test-set predictions (still on the transformed scale) as a 1-D array
y_pred_combined_transformed = model_combined.predict(test_dataset_combined).flatten()

### Scale Back Theta (Predictions) and Evaluate

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error


def _to_original_theta(transformed_values):
    """Map Box-Cox transformed values back to the original theta scale.

    The labels were multiplied by `scaling_factor` before the Box-Cox
    transform was fitted, so inverting Box-Cox alone yields theta times
    `scaling_factor`. Dividing by `scaling_factor` undoes that second step.
    """
    return stats.inv_boxcox(transformed_values, lambda_) / scaling_factor


# True test values and each model's predictions on the original theta scale
y_test_original = _to_original_theta(y_test)

y_pred_basic = _to_original_theta(y_pred_basic_transformed)
y_pred_derived = _to_original_theta(y_pred_derived_transformed)
y_pred_combined = _to_original_theta(y_pred_combined_transformed)

# Compute Test MSE, MAE, R², and MAPE with original theta scale

# Basic Statistics Model
test_mse_basic = mean_squared_error(y_test_original, y_pred_basic)
test_mae_basic = mean_absolute_error(y_test_original, y_pred_basic)
r2_basic = r2_score(y_test_original, y_pred_basic)
mape_basic = mean_absolute_percentage_error(y_test_original, y_pred_basic)

# Derived Statistics Model
test_mse_derived = mean_squared_error(y_test_original, y_pred_derived)
test_mae_derived = mean_absolute_error(y_test_original, y_pred_derived)
r2_derived = r2_score(y_test_original, y_pred_derived)
mape_derived = mean_absolute_percentage_error(y_test_original, y_pred_derived)

# Combined Features Model
test_mse_combined = mean_squared_error(y_test_original, y_pred_combined)
test_mae_combined = mean_absolute_error(y_test_original, y_pred_combined)
r2_combined = r2_score(y_test_original, y_pred_combined)
mape_combined = mean_absolute_percentage_error(y_test_original, y_pred_combined)

### Gathering Results

In [None]:
# One row per model, one column per test metric
results = pd.DataFrame({
    'Model': [
        'Basic Statistics',
        'Derived Statistics',
        'Combined Features',
    ],
    'Test MSE': [test_mse_basic, test_mse_derived, test_mse_combined],
    'Test MAE': [test_mae_basic, test_mae_derived, test_mae_combined],
    'Test R2': [r2_basic, r2_derived, r2_combined],
    'Test MAPE': [mape_basic, mape_derived, mape_combined],
})

print(results)

### Prediction Statistics

In [None]:
# Summary statistics of each model's test-set predictions
models = ['Basic Statistics', 'Derived Statistics', 'Combined Features']
predictions = [y_pred_basic, y_pred_derived, y_pred_combined]

for model_name, y_pred in zip(models, predictions):
    print(f"\n{model_name} Predictions:")
    print(f"Min: {y_pred.min()}")
    print(f"Max: {y_pred.max()}")
    print(f"Mean: {y_pred.mean()}")
    print(f"Std Dev: {y_pred.std()}")

### Plot: Predicted vs True Theta

In [None]:
# One scatter panel per model: predicted against true theta
predictions = [y_pred_basic, y_pred_derived, y_pred_combined]
models = ['Basic Statistics', 'Derived Statistics', 'Combined Features']

# End points of the dashed red identity line (perfect prediction)
identity_line = [y_test_original.min(), y_test_original.max()]

plt.figure(figsize=(18, 5))

for panel, (model_name, y_pred) in enumerate(zip(models, predictions), start=1):
    plt.subplot(1, 3, panel)
    plt.scatter(y_test_original, y_pred, alpha=0.5)
    plt.plot(identity_line, identity_line, 'r--')
    plt.title(f'Predicted vs. True Theta ({model_name})')
    plt.xlabel('True Theta Values')
    plt.ylabel('Predicted Theta Values')

plt.tight_layout()
plt.show()

### Plot: Training History

In [None]:
# Training and validation loss curves, one panel per model.
# `models` (the display names) comes from the preceding cells.
histories = [history_basic, history_derived, history_combined]

plt.figure(figsize=(18, 5))

for panel, (model_name, history) in enumerate(zip(models, histories), start=1):
    loss_curves = history.history
    plt.subplot(1, 3, panel)
    plt.plot(loss_curves['loss'], label='Training Loss')
    plt.plot(loss_curves['val_loss'], label='Validation Loss')
    plt.title(f'Model Loss ({model_name})')
    plt.xlabel('Epoch')
    plt.ylabel('Loss (MSE)')
    plt.legend()

plt.tight_layout()
plt.show()

### Additional Model Comparison Metrics

#### Model Stats Comparison

In [None]:
# One bar chart per metric column of `results`, comparing the three models
metrics = ['Test MSE', 'Test MAE', 'Test R2', 'Test MAPE']

for metric in metrics:
    plt.figure(figsize=(6, 4))
    ax = sns.barplot(x='Model', y=metric, data=results)
    ax.set_title(f'Model Comparison - {metric}')
    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
    plt.show()