In [2]:
# docker run --rm -it --platform linux/amd64 -p 8888:8888 -v "/Users/em/code/DeepASM/notebooks:/notebooks" us-east1-docker.pkg.dev/hmh-em-deepasm/docker-repo/python:${SHORT_SHA} jupyter notebook --ip=0.0.0.0 --no-browser --allow-root --NotebookApp.notebook_dir=/notebooks

import sys
import os
import io

from google.cloud import bigquery
from google.cloud import storage

import numpy as np
import pandas as pd

from hmmlearn.hmm import GaussianHMM
from sklearn.utils import check_random_state

# Shared random state (seed 546); it is handed to every GaussianHMM fitted below.
rs = check_random_state(546)


In [3]:
# GCP authorization
# Point the Google client libraries at a credentials file; the path is relative,
# so it is resolved against the kernel's working directory.
# NOTE(review): presumably a service-account key — the filename is hardcoded here;
# confirm the file itself is kept out of version control.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "hmh-em-deepasm-404a5540e8c3.json"
# BigQuery client used by the query cell below.
bq_client = bigquery.Client()

In [4]:
# Variables
# GCP project and BigQuery dataset containing the `tabular` table queried below.
PROJECT_ID = "hmh-em-deepasm"
BQ_ML_DATASET = "hg19_250_ml_test"

## Download data from BQ

In [5]:
# Sample names assigned to each data split; the names are matched against the
# `sample` column when querying BigQuery.
# NOTE(review): 'righ_lobe_liver' looks like a typo for "right_lobe_liver" — it is
# left untouched because it must match the value stored in BigQuery; confirm there.
dic_data = {
    'train': {
        'samples': [
            'gm12878',
            'CD14',
            'fibroblast',
            'A549',
            'spleen_female_adult',
            'HeLa_S3',
        ],
    },
    'validation': {
        'samples': [
            'mammary_epithelial',
            'sk_n_sh',
            'CD34',
        ],
    },
    'test': {
        'samples': [
            'HepG2',
            'righ_lobe_liver',
            't_cell_male_adult',
        ],
    },
}

In [6]:
# Choose which split of `dic_data` to download.
dataset_name = 'train'
samples = dic_data[dataset_name]

# Render the sample names as a comma-separated list of quoted SQL string literals.
quoted_samples = ",".join(f"'{name}'" for name in samples['samples'])

# Select every row of the `tabular` table whose sample belongs to the chosen split.
query = f"SELECT * FROM {PROJECT_ID}.{BQ_ML_DATASET}.tabular WHERE sample IN ({quoted_samples})"

# Run the query and load the full result into a pandas DataFrame.
query_job = bq_client.query(query)
df = query_job.to_dataframe()

In [7]:
# Concatenate the per-row `directional_cpg_frac` values into one observation array
# (assumes each cell holds a 1-D array-like — TODO confirm).
# NOTE(review): row boundaries are discarded here, so the HMM below treats every row
# as part of one continuous sequence — confirm this is intended.
all_obs = np.array(np.concatenate(df['directional_cpg_frac'].values))

In [8]:
def prepare_data_for_hmm(sequence):
    """
    Prepare a single observation sequence for processing with a Hidden Markov Model (HMM).

    The input is coerced to a NumPy array and returned in the 2D layout
    (one observation per row) together with a one-element `lengths` list,
    i.e. the `(X, lengths)` pair that is passed to the HMM's `fit` call.

    Parameters:
    - sequence (array-like): Either a 1D sequence of scalar observations or a 2D
      sequence of vector observations where each row is one timestep. Lists and
      other array-likes are accepted and converted with `np.asarray`.

    Returns:
    - sequence (np.ndarray): 2D array with observations along rows. A 1D input of
      length N becomes an (N, 1) column; a 2D input is returned with its shape unchanged.
    - lengths (list of int): A list with a single integer, the number of rows of
      the returned array (the whole input is treated as one sequence).

    Raises:
    - ValueError: If `sequence` is 0-dimensional (a bare scalar) or has more than
      two dimensions.
    """
    # Accept lists / tuples / scalars as well as ndarrays; no copy is made for ndarrays.
    sequence = np.asarray(sequence)

    if sequence.ndim == 1:
        # Scalar observations: one single-feature row per timestep.
        sequence = sequence.reshape(-1, 1)
    elif sequence.ndim != 2:
        # Covers both 0-d input (which would otherwise fail on shape[0]) and >2-d input.
        raise ValueError(
            "Sequence must be 1D (for single float sequence) or 2D (for sequence of vectors)"
        )

    # A single sequence, so `lengths` holds exactly one entry: its number of timesteps.
    lengths = [sequence.shape[0]]

    return sequence, lengths

In [9]:
# Build the (observations, lengths) pair that is passed to GaussianHMM.fit below.
reshaped_data, lengths = prepare_data_for_hmm(all_obs)

In [None]:
# Model selection over the number of hidden states.
# For each candidate state count, fit several randomly initialised GaussianHMMs,
# keep the restart with the highest log-likelihood, and record its AIC, BIC and
# log-likelihood (one entry per element of `ns`, in the same order).
aic = []
bic = []
lls = []
ns = [2, 3, 4, 5, 6]
for n in ns:
    print(f"Number of states: {n}")
    best_ll = None
    best_model = None
    # 10 restarts; `rs` is a shared random state, so each restart starts from a
    # different initialisation.
    for _ in range(10):
        h = GaussianHMM(n, n_iter=200, tol=1e-4, random_state=rs)
        h.fit(reshaped_data, lengths)
        score = h.score(reshaped_data, lengths)
        # Keep the restart with the highest log-likelihood. Compare against None
        # explicitly: a log-likelihood of 0.0 is falsy but still a valid score.
        if best_ll is None or score > best_ll:
            best_ll = score
            best_model = h
    aic.append(best_model.aic(reshaped_data, lengths))
    bic.append(best_model.bic(reshaped_data, lengths))
    # `best_ll` is already the score of `best_model`, so it is not recomputed.
    lls.append(best_ll)

# NOTE: after the loop, `best_model` is the best restart for the LAST value in `ns`
# only; it is not the model selected by AIC/BIC.

Number of states: 2


Model is not converging.  Current: 4205335.523713576 is not greater than 4205335.52418127. Delta is -0.00046769436448812485
Model is not converging.  Current: 4205335.525052782 is not greater than 4205335.525366359. Delta is -0.00031357724219560623
Model is not converging.  Current: 4205335.526135785 is not greater than 4205335.526223325. Delta is -8.754059672355652e-05


In [None]:
import matplotlib.pyplot as plt

# Information criteria on the left y-axis, log-likelihood on a twin right y-axis,
# both against the number of hidden states.
fig, ax = plt.subplots()
ln1 = ax.plot(ns, aic, label="AIC", color="blue", marker="o")
ln2 = ax.plot(ns, bic, label="BIC", color="green", marker="o")

ax2 = ax.twinx()
ln3 = ax2.plot(ns, lls, label="LL", color="orange", marker="o")

# Single legend covering the curves drawn on both axes.
ax.legend(handles=[*ln1, *ln2, *ln3])

ax.set(
    title="Using AIC/BIC for Model Selection",
    ylabel="Criterion Value (lower is better)",
    xlabel="Number of HMM Components",
)
ax2.set(ylabel="LL (higher is better)")

fig.tight_layout()
plt.show()

In [None]:
# Heat-map of `best_model.transmat_` (requires `best_model` from the model-selection cell).
# The y-axis is labelled as the source state and the x-axis as the destination state.
fig, ax = plt.subplots()

# Draw the matrix and attach a colorbar keyed to the image.
cax = ax.imshow(best_model.transmat_, aspect='auto')
fig.colorbar(cax)

ax.set(
    title='Transition Matrix',
    xlabel='State To',
    ylabel='State From',
)

plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# Per-state emission parameters of `best_model`, flattened to one value per state
# (assumes a single observed feature, so each state has one mean and one variance —
# TODO confirm if the model is ever fitted on multi-feature data).
state_means = best_model.means_.flatten()
state_vars = best_model.covars_.flatten()

# Rounded copies are used for the legend text only. The curves are drawn from the
# unrounded parameters: rounding a small variance would distort the density and a
# variance that rounds to 0 would make norm.pdf return NaN.
means = np.round(state_means, 3)
covariances = np.round(state_vars, 3)

# Create the figure
fig, ax = plt.subplots(figsize=(10, 6))

# Value range for x-axis
# NOTE(review): [-1, 1] is presumably the range of `directional_cpg_frac` — confirm.
x = np.linspace(-1, 1, 1000)

# Plot one Gaussian density per hidden state
for mu, var, mean, cov in zip(state_means, state_vars, means, covariances):
    ax.plot(x, norm.pdf(x, mu, np.sqrt(var)), label=f'Mean: {mean}, Var: {cov}')

ax.set_title('Gaussian Distributions of HMM States')
ax.set_xlabel('Value')
ax.set_ylabel('Probability Density')
ax.legend()

plt.show()