In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats

from scipy.stats import zscore
from sklearn.mixture import GaussianMixture

from utils.samples import USED_SAMPLES, REPROGRAMMED_SAMPLES

In [2]:
df = pd.read_parquet("/app/data")

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [None]:
# Defaults: column names of the long-format expression table.
# NOTE(review): most cells below still spell these names as string literals.
COLUMN_GENE = "ID_REF"  # probe / gene identifier column
COLUMN_TISSUE_SAMPLE = "TISSUE_SAMPLE"  # sample identifier column (e.g. "GSM1215634")
COLUMN_TISSUE_NAME = "tissue_name"  # name merged in from USED_SAMPLES
COLUMN_NORM_VALUE = "VALUE"  # expression value column used by the analyses

# Basic stats

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Number of non-null ID_REF rows recorded for each tissue sample
df_counts = (
    df
    .groupby("TISSUE_SAMPLE")["ID_REF"]
    .count()
    .rename("sample_count")
    .reset_index()
)
df_counts.head(30)

Mouse embryonic tissues have more samples. Omitting them for the time being.

In [None]:
# Select tissues
mask_tissues = df["TISSUE_SAMPLE"].isin(list(USED_SAMPLES.keys()))
df = df[mask_tissues]

In [None]:
# Bring in samples
# Build a TISSUE_SAMPLE -> tissue_name lookup table from the USED_SAMPLES mapping
# (its keys become the TISSUE_SAMPLE column).
df_samples = (
    pd.DataFrame
    .from_dict(USED_SAMPLES, orient="index", columns=["tissue_name"])
    .reset_index()
    .rename(columns={"index": "TISSUE_SAMPLE"})
    )

# Left merge keeps every expression row and attaches its tissue name.
# NOTE(review): this rebinds `df`; re-running the cell merges a second time and pandas
# will suffix the duplicated column (tissue_name_x / tissue_name_y) — run once per load.
df = df.merge(
    df_samples,
    on="TISSUE_SAMPLE",
    how="left"
)

In [None]:
# Add reprogramming flag: 1 when the tissue name contains "_OSKM_", otherwise 0
df["reprogrammed"] = df["tissue_name"].str.contains("_OSKM_").astype(int)

In [None]:
# Get tissues and genes
tissues = df["TISSUE_SAMPLE"].unique()
genes = df["ID_REF"].unique()

In [None]:
# This should match the # of unique rows for non mouse samples
len(genes)

In [None]:
# `go` (plotly.graph_objects) is imported in the notebook's top import cell.

# Per-tissue summary statistics of VALUE (mean, median, std, min, max)
group_stats = df.groupby('tissue_name')['VALUE'].agg(['mean', 'median', 'std', 'min', 'max']).reset_index()

# Create Plotly plot
fig = go.Figure()

# One bar per tissue at the mean, with the standard deviation as error bar
fig.add_trace(go.Bar(
    x=group_stats['tissue_name'],
    y=group_stats['mean'],
    name='Mean',
    error_y=dict(type='data', array=group_stats['std'])
))

# Customize layout
fig.update_layout(title='Mean and Standard Deviation by tissue_name', xaxis_title='tissue_name', yaxis_title='VALUE')

fig.show()

In [None]:
# Marker plot of the per-tissue mean, median, min and max
fig = go.Figure()

# One marker trace per summary statistic, in legend order
for stat_column, trace_name in (
        ('mean', 'Mean'),
        ('median', 'Median'),
        ('min', 'Min'),
        ('max', 'Max'),
):
    fig.add_trace(go.Scatter(
        x=group_stats['tissue_name'],
        y=group_stats[stat_column],
        mode='markers',
        name=trace_name
    ))

# Tick labels: tissues whose name contains 'OSKM' are rendered in bold
tick_labels = []
for sample in group_stats['tissue_name']:
    if 'OSKM' in sample:
        tick_labels.append('<b>{}</b>'.format(sample))
    else:
        tick_labels.append(sample)

# Customize layout
fig.update_layout(
    title='Mean, Median, Min, and Max by tissue_name',
    xaxis_title='tissue_name',
    yaxis_title='VALUE',
    xaxis=dict(
        tickmode='array',
        tickvals=group_stats['tissue_name'],
        ticktext=tick_labels
    ),
    height=800
    )

fig.show()

Distributions not directly comparable based on the above plot

**NOTE** The above plot is quite similar to a box plot, but much faster to compute, at least when using plotly.

## Test for a single case

**Tissue:** GSM1215634  
**Gene:** A_24_P325046  

In [None]:
test_tissue = "GSM1215634"
test_gene = "A_24_P325046"

mask_test_tissue = df["TISSUE_SAMPLE"] == test_tissue
mask_test_gene = df["ID_REF"] == test_gene

df_test_tissue = df[mask_test_tissue].copy()
df_test_gene = df[mask_test_gene].copy()

In [None]:
# Creating a histogram using Plotly Express
fig = px.histogram(df_test_tissue, x='VALUE')

# Customize the layout
fig.update_layout(
    title_text=f'Normalized value gene histogram for tissue={test_tissue}',
    xaxis_title_text='Value', # xaxis label
    yaxis_title_text='Frequency', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
)

# Show the plot
fig.show()

We should probably run distribution checks per tissue and use appropriate measure to normalize: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-020-03892-w

In [None]:
# Assuming 'df' is your DataFrame and it has a 'VALUE' column for each 'TISSUE_SAMPLE'
def test_bimodal(sample_values, min_aic_gain=0.0):
    """
    Test if the distribution of the sample values is bimodal.

    Fits a one-component and a two-component Gaussian mixture to the values and
    compares their AIC.

    Parameters:
    sample_values (array-like): One-dimensional collection of numeric values
        (NumPy array, pandas Series or list).
    min_aic_gain (float, optional): Amount by which the two-component AIC must undercut
        the one-component AIC for the sample to count as bimodal. The default of 0.0
        flags the sample as soon as the two-component AIC is lower at all; pass a
        positive value to require a clearer improvement.

    Returns:
    bool: True if aic(1 component) - aic(2 components) > min_aic_gain.
    """
    # Coerce first so Series/lists work too; the mixture model expects a 2-D column.
    sample_values = np.asarray(sample_values, dtype=float).reshape(-1, 1)

    # random_state is fixed so repeated runs give the same fit
    one_mode = GaussianMixture(n_components=1, random_state=0).fit(sample_values)
    two_modes = GaussianMixture(n_components=2, random_state=0).fit(sample_values)

    # Lower AIC is better, so a positive gain favours the two-component model
    aic_gain = one_mode.aic(sample_values) - two_modes.aic(sample_values)
    return aic_gain > min_aic_gain


def test_distribution(sample_values, dist_name):
    """
    Test a sample against a specified distribution and return a goodness-of-fit measure.
    """
    if dist_name == 'normal':
        statistic, p_value = stats.shapiro(sample_values)
        return p_value > 0.05  # True if sample is likely normal
    elif dist_name == 'cauchy':
        # Fit to a Cauchy distribution and return goodness of fit (example: using KS test)
        params = stats.cauchy.fit(sample_values)
        statistic, p_value = stats.kstest(sample_values, 'cauchy', args=params)
        return p_value > 0.05
    elif dist_name == 'lognormal':
        # Similar approach as Cauchy
        params = stats.lognorm.fit(sample_values)
        statistic, p_value = stats.kstest(sample_values, 'lognorm', args=params)
        return p_value > 0.05
    elif dist_name == 'gamma':
        # Similar approach as Cauchy
        params = stats.gamma.fit(sample_values)
        statistic, p_value = stats.kstest(sample_values, 'gamma', args=params)
        return p_value > 0.05
    else:
        raise ValueError("Unknown distribution")


def analyze_distributions(df):
    """
    Classify the VALUE distribution of every tissue sample.

    For each TISSUE_SAMPLE the non-null VALUE entries are first checked with
    test_bimodal. Bimodal samples get False for every candidate distribution;
    otherwise each candidate is checked with test_distribution. Progress is printed
    per sample.

    Parameters:
    df (pd.DataFrame): Frame with 'TISSUE_SAMPLE' and 'VALUE' columns.

    Returns:
    pd.DataFrame: One row per tissue sample with the columns 'TISSUE_SAMPLE',
                  'normal', 'cauchy', 'lognormal', 'gamma' and 'bimodal'.
    """
    candidate_dists = ['normal', 'cauchy', 'lognormal', 'gamma']

    # Split the frame once by tissue instead of re-filtering every row per sample.
    # sort=False keeps the samples in order of first appearance.
    values_by_tissue = df.groupby('TISSUE_SAMPLE', sort=False, observed=True)['VALUE']
    total_samples = values_by_tissue.ngroups

    results = []
    for index, (tissue, tissue_values) in enumerate(values_by_tissue, start=1):
        print(f"Analyzing {index}/{total_samples}: TISSUE_SAMPLE = {tissue}")

        sample_values = tissue_values.dropna().to_numpy()

        result = {'TISSUE_SAMPLE': tissue}
        if test_bimodal(sample_values):
            # Unimodal fits are skipped for samples already flagged as bimodal
            result.update({dist: False for dist in candidate_dists})
            result['bimodal'] = True
            print('\tDistribution=bimodal')
        else:
            for dist in candidate_dists:
                print(f'\tFitting distribution: {dist}')
                result[dist] = test_distribution(sample_values, dist)
            result['bimodal'] = False

        results.append(result)

    return pd.DataFrame(results)

# Assuming you have a DataFrame 'df'
# df = pd.read_csv('your_data.csv')  # For example
results_df = analyze_distributions(df)

In [None]:
results_df[results_df["bimodal"] == False]

Only two samples, GSM1215652 and GSM1215769, do not seem bimodal. Then again, they do not seem to follow any of the chosen distributions...

In [None]:
# Creating a histogram using Plotly Express
fig = px.histogram(df[df['TISSUE_SAMPLE'] == "GSM1215652"], x='VALUE')

# Customize the layout
fig.update_layout(
    title_text=f'Normalized value gene histogram for tissue=GSM1215652',
    xaxis_title_text='Value', # xaxis label
    yaxis_title_text='Frequency', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
)

# Show the plot
fig.show()

In [None]:
# Creating a histogram using Plotly Express
fig = px.histogram(df[df['TISSUE_SAMPLE'] == "GSM1215769"], x='VALUE')

# Customize the layout
fig.update_layout(
    title_text=f'Normalized value gene histogram for tissue=GSM1215769',
    xaxis_title_text='Value', # xaxis label
    yaxis_title_text='Frequency', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
)

# Show the plot
fig.show()

However, the plots seem bimodalish with some peculiar properties..

What if we compare with the total distribution

In [None]:
# Creating a histogram using Plotly Express
fig = px.histogram(df_test_gene, x='VALUE')

# Customize the layout
fig.update_layout(
    title_text=f'Histogram for gene={test_gene}',
    xaxis_title_text='Value', # xaxis label
    yaxis_title_text='Frequency', # yaxis label
    #bargap=0.2, # gap between bars of adjacent location coordinates
)

# Show the plot
fig.show()

In [None]:
# Lets see about Dark and brightcorners
mask_control = df["ID_REF"].isin(["DarkCorner", "GE_BrightCorner"])
df_control = df[mask_control]

In [None]:
# Distribution of the DarkCorner / GE_BrightCorner control probes, coloured by probe
fig = px.histogram(df_control, x='VALUE', color="ID_REF")

# Customize the layout
fig.update_layout(
    # df_control holds only the two control probes, so the title must not name test_gene
    title_text='Normalized value histogram for control probes (DarkCorner, GE_BrightCorner)',
    xaxis_title_text='Value', # xaxis label
    yaxis_title_text='Frequency', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
)

# Show the plot
fig.show()

In [None]:
# Explode and scatter plot
df_control_pivoted = df_control.pivot(index='TISSUE_SAMPLE', columns='ID_REF', values='VALUE').reset_index()
df_control_pivoted["control_mean"] = (df_control_pivoted["DarkCorner"] + df_control_pivoted["GE_BrightCorner"]) / 2

df_control_pivoted.head()

In [None]:
# Creating the scatter plot
fig = px.scatter(df_control_pivoted, x='DarkCorner', y='GE_BrightCorner')

# Show the plot
fig.show()

In [None]:
def count_gene_expression_simple_threshold(
        df: pd.DataFrame,
        id_cols: list[str] = ["ID_REF"],
        value_col: str = "VALUE",
        threshold: float = 0.0
        ) -> pd.DataFrame:
    """
    Count, per group, how many rows have a value above a fixed threshold.

    Parameters:
    df (pd.DataFrame): Input frame; it is not modified.
    id_cols (list[str]): Columns to group by.
    value_col (str): Column holding the value compared against the threshold.
    threshold (float): A row counts as expressed when its value is strictly greater.

    Returns:
    pd.DataFrame: One row per group with the grouping columns, 'tissue_sample_count'
                  (rows in the group) and 'gene_expressed_sum' (rows above threshold).
    """
    flagged = df.copy()
    flagged["gene_expressed"] = (flagged[value_col] > threshold).astype(int)

    aggregations = {
        "tissue_sample_count": pd.NamedAgg(column="gene_expressed", aggfunc="count"),
        "gene_expressed_sum": pd.NamedAgg(column="gene_expressed", aggfunc="sum"),
    }
    return flagged.groupby(id_cols).agg(**aggregations).reset_index()


def count_gene_expression_column_threshold(
        df: pd.DataFrame,
        threshold_col: str,
        id_cols: list[str] = ["ID_REF"],
        value_col: str = "VALUE",
        ) -> pd.DataFrame:
    """
    Count, per group, how many rows have a value above a row-specific threshold.

    Parameters:
    df (pd.DataFrame): Input frame; it is not modified.
    threshold_col (str): Column holding the threshold each row is compared against.
    id_cols (list[str]): Columns to group by.
    value_col (str): Column holding the value to compare.

    Returns:
    pd.DataFrame: One row per group with the grouping columns, 'tissue_sample_count'
                  (rows in the group) and 'gene_expressed_sum' (rows whose value is
                  strictly greater than their threshold).
    """
    flagged = df.copy()
    above_own_threshold = flagged[value_col] > flagged[threshold_col]
    flagged["gene_expressed"] = above_own_threshold.astype(int)

    aggregations = {
        "tissue_sample_count": pd.NamedAgg(column="gene_expressed", aggfunc="count"),
        "gene_expressed_sum": pd.NamedAgg(column="gene_expressed", aggfunc="sum"),
    }
    return flagged.groupby(id_cols).agg(**aggregations).reset_index()


def scale_to_range(group):
    """
    Linearly rescale the values of a group so its minimum maps to -1 and its maximum to 1.

    A group whose minimum equals its maximum has a zero span, so the division is 0/0.
    """
    lowest = group.min()
    span = group.max() - lowest
    unit_scaled = (group - lowest) / span  # 0 at the minimum, 1 at the maximum
    return 2 * unit_scaled - 1

In [None]:
df.head()

In [None]:
# Bring in mean control values
df = df.merge(
    df_control_pivoted[["TISSUE_SAMPLE", "DarkCorner", "GE_BrightCorner", "control_mean"]],
    on="TISSUE_SAMPLE",
    how="left"
)

In [None]:
# Drop Dark and Bright corners
mask_control = df["ID_REF"].isin(["GE_BrightCorner", "DarkCorner"])
df = df[~mask_control]

In [None]:
# Apply scaling within each tissue sample.
# `df` was produced by boolean filtering in the previous cell, so the column is added
# with assign (which returns a new frame) rather than by item assignment, which can
# raise SettingWithCopyWarning on a filtered frame.
df = df.assign(
    scaled_value=df.groupby("TISSUE_SAMPLE")["VALUE"].transform(scale_to_range)
)

In [None]:
X_simple = count_gene_expression_simple_threshold(df, value_col="scaled_value", id_cols=["ID_REF"], threshold=0.0)
X_column = count_gene_expression_column_threshold(df, threshold_col="control_mean", id_cols=["ID_REF", "reprogrammed"])

In [None]:
# NOTE(review): identical to the X_simple computed in the previous cell; this re-run is redundant.
X_simple = count_gene_expression_simple_threshold(df, value_col="scaled_value", id_cols=["ID_REF"], threshold=0.0)

In [None]:
X_simple_not_scaled = count_gene_expression_simple_threshold(df, value_col="VALUE", id_cols=["ID_REF"], threshold=0.0)

In [None]:
X_simple.head()

In [None]:
fig = px.histogram(X_simple, x='gene_expressed_sum')

# Customize the layout
fig.update_layout(
    title_text=f'Gene expressed sum histogram based on simple filtering. # of tissues = {X_simple.iloc[0, 1]}. Threshold = 0.0',
    xaxis_title_text='Expressed sum', # xaxis label
    yaxis_title_text='Frequency', # yaxis label
    #bargap=0.2, # gap between bars of adjacent location coordinates
)

# Show the plot
fig.show()

In [None]:
fig = px.histogram(X_simple_not_scaled, x='gene_expressed_sum')

# Customize the layout
fig.update_layout(
    title_text=f'Gene expressed sum histogram based on simple filtering. # of tissues = {X_simple_not_scaled.iloc[0, 1]} Threshold = 0.0',
    xaxis_title_text='Expressed sum', # xaxis label
    yaxis_title_text='Frequency', # yaxis label
    #bargap=0.2, # gap between bars of adjacent location coordinates
)

# Show the plot
fig.show()

In [None]:
X_simple

In [None]:
X_simple["gene_expressed_sum"].unique()

In [None]:
X_column["gene_expressed_sum"].unique()

In [None]:
df.head()

# Tests for expression thresholds

In [None]:
# Set thresholds
# z-score cut-offs passed as `threshold` to z_score_analysis / the bimodal loop.
# NOTE(review): 1.64 / 1.96 / 2.58 look like the usual two-sided standard-normal critical
# values for 90% / 95% / 99% — confirm the "percentile" naming is what is intended.
score_thresholds_two_tailed = {
    "90th_percentile": 1.64,
    "95th_percentile": 1.96,
    "99th_percentile": 2.58
}

# Upper quantile per run; the quantile loop uses 1 - value as the lower quantile.
quantile_thresholds_two_tailed = {
    "90th_percentile": 0.90,
    "95th_percentile": 0.95,
    "99th_percentile": 0.99
}

# (column name, minimum value) pairs consumed by prune_results; rows are kept when the
# column is >= the minimum.
high_expression_filter = ('reprogrammed_share_highly_expressed', 0.99)
negative_expression_filter = ('normal_share_negatively_expressed', 0.80)

In [None]:
def prune_results(
        df: pd.DataFrame, 
        high_expression: tuple[str, float],
        negative_expression: tuple[str, float]
        ) -> pd.DataFrame:
    """
    Keep the rows that pass both the high-expression and the negative-expression filter.

    A row is retained when its value in the high-expression column is greater than or
    equal to that filter's threshold AND its value in the negative-expression column is
    greater than or equal to that filter's threshold (both bounds are inclusive).

    Parameters:
    df (pd.DataFrame): The input DataFrame to be pruned. It is not modified.
    high_expression (tuple[str, float]): Column name and minimum value for the high expression share.
    negative_expression (tuple[str, float]): Column name and minimum value for the negative expression share.

    Returns:
    pd.DataFrame: A new DataFrame with the rows that meet both criteria; the original
                  index labels are preserved.
    """
    high_col, high_min = high_expression[0], high_expression[1]
    negative_col, negative_min = negative_expression[0], negative_expression[1]

    keep = (df[high_col] >= high_min) & (df[negative_col] >= negative_min)

    # Copy only the surviving rows; copying the whole input before filtering is wasted work.
    return df.loc[keep].copy()


def _get_expression_shares(
        df: pd.DataFrame,
        reprogrammed_tissues: dict[str, list[str]],
        column_score: str,
        high_threshold: float
        ) -> pd.DataFrame:
    """
    Add per-gene expression-share columns to an already scored frame.

    For every key of reprogrammed_tissues a 0/1 column '<key>_highly_expressed' is
    written into the passed frame: 1 for every row of a gene whose score exceeds
    high_threshold in at least one of that key's tissue samples. The returned frame
    additionally carries:

    - 'normal_share_negatively_expressed': per gene, the mean of 'negatively_expressed'
      over rows with reprogrammed == 0.
    - 'reprogrammed_share_highly_expressed': row-wise mean of the per-key indicator columns.

    Parameters:
    df (pd.DataFrame): Frame with the gene, tissue-sample, 'reprogrammed',
                       'negatively_expressed' and column_score columns.
    reprogrammed_tissues (dict[str, list[str]]): Key -> tissue samples belonging to it.
    column_score (str): Column compared against high_threshold.
    high_threshold (float): Scores strictly above this count as highly expressed.

    Returns:
    pd.DataFrame: The joined frame with the share columns added.
    """
    # One indicator column per reprogrammed key
    dictkey_cols = []
    for key, tissue_samples in reprogrammed_tissues.items():
        indicator_col = f'{key}_highly_expressed'

        # Genes scoring above the threshold in any sample of this key
        in_key_samples = df[COLUMN_TISSUE_SAMPLE].isin(tissue_samples)
        above_threshold = df[column_score] > high_threshold
        genes_above = df[in_key_samples & above_threshold][COLUMN_GENE].unique()

        # Flag every row of those genes, whichever sample the row belongs to
        df[indicator_col] = df[COLUMN_GENE].isin(genes_above).astype(int)
        dictkey_cols.append(indicator_col)

    # Share of normal (non-reprogrammed) rows per gene flagged as negatively expressed
    normal_rows = df[df['reprogrammed'] == 0]
    normal_share = normal_rows.groupby(COLUMN_GENE)['negatively_expressed'].mean()

    # The joined series clashes with the existing 'negatively_expressed' column, so it
    # arrives with the suffix and is then renamed to its final name.
    joined = df.join(normal_share, on=COLUMN_GENE, rsuffix='_normal_share')
    df = joined.rename(
        columns={'negatively_expressed_normal_share': 'normal_share_negatively_expressed'}
    )

    df['reprogrammed_share_highly_expressed'] = df[dictkey_cols].mean(axis=1)

    return df


def z_score_analysis(
        df: pd.DataFrame,
        threshold: float = 1.96,
        reprogrammed_tissues: dict[str, list[str]] = REPROGRAMMED_SAMPLES,
        column_score: str = 'z_score'
        ) -> pd.DataFrame:
    """
    Performs z-score analysis on gene expression data to identify highly and negatively expressed genes.

    The normalized values are z-scored within each tissue sample. Rows whose z-score is
    above `threshold` are flagged as highly expressed and rows below `-threshold` as
    negatively expressed. The flagged frame is then passed to _get_expression_shares,
    which adds the per-key indicator columns and the two share columns.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing gene expression data. It is not modified.
    threshold (float, optional): The z-score threshold for defining high expression. Defaults to 1.96.
    reprogrammed_tissues (dict[str, list[str]], optional): A dictionary mapping keys to lists of tissue samples 
                                                            representing reprogrammed tissues.
    column_score (str, optional): Name of the column the z-scores are written to and read from.
                                  Defaults to 'z_score'.

    Returns:
    pd.DataFrame: The DataFrame with additional columns indicating high/negative expression and shares of expression in normal 
                  and reprogrammed samples.
    """
    
    df = df.copy()

    # Define thresholds for high and low expression
    high_threshold = threshold
    low_threshold = -1 * high_threshold

    # Write the scores to `column_score` (not a hard-coded name) so the flags below and
    # _get_expression_shares all read the same column.
    df[column_score] = df.groupby(COLUMN_TISSUE_SAMPLE)[COLUMN_NORM_VALUE].transform(zscore)

    df['highly_expressed'] = (df[column_score] > high_threshold).astype(int)
    df['negatively_expressed'] = (df[column_score] < low_threshold).astype(int)

    return _get_expression_shares(df, reprogrammed_tissues, column_score, high_threshold)


def bimodal_analysis(
        df: pd.DataFrame,
        threshold: float = 1.96,
        reprogrammed_tissues: dict[str, list[str]] = REPROGRAMMED_SAMPLES,
        column_score: str = 'bimodal_expression'
        ) -> pd.DataFrame:
    """
    Scores gene expression per tissue sample with classify_expression_bimodal and derives
    the high/negative expression flags and shares.

    Each tissue sample's values are scored separately; the score is 1.1 * threshold,
    -1.1 * threshold or 0.0. Rows scoring above `threshold` are flagged as highly expressed,
    rows below `-threshold` as negatively expressed. The flagged frame is then passed to
    _get_expression_shares.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing gene expression data. It is not modified.
    threshold (float, optional): Number of standard deviations from a mode's mean beyond which
                                 a value counts as high or low. Defaults to 1.96.
    reprogrammed_tissues (dict[str, list[str]], optional): A dictionary mapping keys to lists of tissue samples 
                                                            representing reprogrammed tissues.
    column_score (str, optional): Name of the column the scores are written to and read from.
                                  Defaults to 'bimodal_expression'.

    Returns:
    pd.DataFrame: The DataFrame with the score column, the high/negative expression flags and the
                  share columns added by _get_expression_shares.
    """
    df = df.copy()
    tissue_samples = df[COLUMN_TISSUE_SAMPLE].unique()

    # Define thresholds for high and low expression
    high_threshold = threshold
    low_threshold = -1 * high_threshold

    # Score gene expression with a separate two-component model per tissue sample
    for tissue in tissue_samples:
        tissue_rows = df[COLUMN_TISSUE_SAMPLE] == tissue
        sample_values = df.loc[tissue_rows, COLUMN_NORM_VALUE]
        df.loc[tissue_rows, column_score] = classify_expression_bimodal(sample_values, threshold)

    # Read back from `column_score` (not a hard-coded name) so a non-default name works
    df['highly_expressed'] = (df[column_score] > high_threshold).astype(int)
    df['negatively_expressed'] = (df[column_score] < low_threshold).astype(int)

    return _get_expression_shares(df, reprogrammed_tissues, column_score, high_threshold)


def classify_expression_bimodal(sample_values: pd.Series | np.ndarray, threshold: float) -> list[float]:
    """
    Scores each value of a sample by its position within its mode of a two-component Gaussian mixture.

    A Gaussian Mixture Model with two components is fitted to the sample. Each value is assigned
    to a component and compared with that component's mean +/- threshold * standard deviation.

    Parameters:
    sample_values (pd.Series | np.ndarray): One-dimensional gene expression values to be scored.
    threshold (float): The distance (in standard deviations from the component mean) beyond which
                       a value counts as high or low.

    Returns:
    list[float]: One score per value, in input order: 1.1 * threshold when the value is above its
                 component's upper bound, -1.1 * threshold when it is below the lower bound, and
                 0.0 otherwise. The 1.1 factor places the score strictly beyond +/- threshold so
                 callers can flag it with `> threshold` / `< -threshold`.
    """
    # Fit a GMM with 2 components on the values as a single-feature column
    values = np.asarray(sample_values, dtype=float)
    values_column = values.reshape(-1, 1)
    gmm = GaussianMixture(n_components=2, random_state=0).fit(values_column)

    # Component (mode) each value belongs to
    component_labels = gmm.predict(values_column)

    # Mean and standard deviation of each component
    means = gmm.means_.flatten()
    stds = np.sqrt(gmm.covariances_.flatten())

    # Per-value bounds, looked up from the value's own component
    upper_bounds = (means + threshold * stds)[component_labels]
    lower_bounds = (means - threshold * stds)[component_labels]

    # The first matching condition wins, mirroring an if / elif / else chain
    scores = np.select(
        [values > upper_bounds, values < lower_bounds],
        [1.1 * threshold, -1.1 * threshold],
        default=0.0,
    )
    return scores.tolist()


def quantile_analysis(
        df: pd.DataFrame,
        threshold: float = 1.96,
        reprogrammed_tissues: dict[str, list[str]] = REPROGRAMMED_SAMPLES,
        column_score:str = 'quantile_expression',
        lower_quantile: float = 0.05,
        upper_quantile: float = 0.95,
        ) -> pd.DataFrame:
    """
    Scores gene expression per tissue sample by quantile cut-offs and derives the
    high/negative expression flags and shares.

    Each tissue sample's values are scored separately with classify_quantiles, which returns
    1.1 * threshold for values above the sample's upper quantile, -1.1 * threshold for values
    below its lower quantile and 0.0 otherwise. Rows scoring above `threshold` are flagged as
    highly expressed, rows below `-threshold` as negatively expressed. The flagged frame is
    then passed to _get_expression_shares.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing gene expression data. It is not modified.
    threshold (float, optional): Magnitude used to encode the scores; it must be positive for the
                                 flags to separate high from low. Defaults to 1.96.
    reprogrammed_tissues (dict[str, list[str]], optional): A dictionary mapping keys to lists of tissue samples 
                                                            representing reprogrammed tissues.
    column_score (str, optional): Name of the column the scores are written to and read from.
    lower_quantile (float, optional): Quantile below which a value counts as low. Defaults to 0.05.
    upper_quantile (float, optional): Quantile above which a value counts as high. Defaults to 0.95.

    Returns:
    pd.DataFrame: The DataFrame with the score column, the high/negative expression flags and the
                  share columns added by _get_expression_shares.
    """
    df = df.copy()
    tissue_samples = df[COLUMN_TISSUE_SAMPLE].unique()

    # Score gene expression against each tissue sample's own quantiles
    for tissue in tissue_samples:
        tissue_rows = df[COLUMN_TISSUE_SAMPLE] == tissue
        sample_values = df.loc[tissue_rows, COLUMN_NORM_VALUE]
        df.loc[tissue_rows, column_score] = classify_quantiles(
            sample_values, threshold, lower_quantile, upper_quantile
        )

    # The scores are numeric (+/- 1.1 * threshold or 0.0), so the flags must be derived by
    # numeric comparison; comparing against string labels never matches.
    df['highly_expressed'] = (df[column_score] > threshold).astype(int)
    df['negatively_expressed'] = (df[column_score] < -1 * threshold).astype(int)

    return _get_expression_shares(df, reprogrammed_tissues, column_score, threshold)


def classify_quantiles(
        sample_values: pd.Series | np.ndarray,
        threshold: float,
        lower_quantile: float,
        upper_quantile: float
    ) -> pd.DataFrame:
    """
    Classifies each value in a sample as highly expressed, negatively expressed, or normally expressed based on quantiles.

    The classification uses the 25th and 75th quantiles (by default) to define thresholds for low and high expression. 
    Values above the upper quantile threshold are classified as 'highly_expressed', values below the lower quantile threshold 
    are classified as 'negatively_expressed', and values in between are classified as 'normally_expressed'.

    Parameters:
    sample_values (pd.Series | np.ndarray): An array or series of gene expression values to be classified.
    lower_quantile (float, optional): The lower quantile threshold for defining low expression. Defaults to 0.25.
    upper_quantile (float, optional): The upper quantile threshold for defining high expression. Defaults to 0.75.

    Returns:
    list[str]: A list of labels ('highly_expressed', 'negatively_expressed', 'normally_expressed') for each value in sample_values.
    """
    if isinstance(sample_values, pd.Series):
        sample_values = sample_values.to_numpy()
    # Define quantile thresholds
    high_threshold = np.quantile(sample_values, upper_quantile)
    low_threshold = np.quantile(sample_values, lower_quantile)

    # Classify each gene based on quantile thresholds
    expression_labels: list[float] = []
    for value in sample_values:
        if value > high_threshold:
            expression_labels.append(1.1 * threshold)
        elif value < low_threshold:
            expression_labels.append(-1.1 * threshold)
        else:
            expression_labels.append(0.0)

    return expression_labels


def grid_search(
        df: pd.DataFrame,
        lower_thresholds: np.ndarray,
        upper_thresholds: np.ndarray,
        reprogrammed_tissues: dict[str, list[str]] = REPROGRAMMED_SAMPLES,
        high_expression_filter: tuple[str, float] = ('reprogrammed_share_highly_expressed', 0.99),
        negative_expression_filter: tuple[str, float] = ('normal_share_negatively_expressed', 0.80),
        ) -> list[pd.DataFrame]:
    """
    Tries every (lower, upper) cut-off pair on the normalized values and collects the pruned results.

    For each pair, rows with a value above the upper cut-off are flagged as highly expressed and
    rows below the lower cut-off as negatively expressed. The shares are computed with
    _get_expression_shares and the result is filtered with prune_results. Progress is printed
    for every pair.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing gene expression data. It is not modified.
    lower_thresholds (np.ndarray): Candidate cut-offs for negative expression.
    upper_thresholds (np.ndarray): Candidate cut-offs for high expression.
    reprogrammed_tissues (dict[str, list[str]], optional): A dictionary mapping keys to lists of tissue samples 
                                                            representing reprogrammed tissues.
    high_expression_filter (tuple[str, float], optional): Column name and minimum value passed to prune_results.
    negative_expression_filter (tuple[str, float], optional): Column name and minimum value passed to prune_results.

    Returns:
    list[pd.DataFrame]: The pruned frames of every pair that left at least one row, in iteration
                        order (lower cut-off outermost).
    """
    df = df.copy()

    potential_gene_dfs: list[pd.DataFrame] = []
    for low_threshold in lower_thresholds:
        # Depends on the lower cut-off only, so it is computed once per outer iteration
        df['negatively_expressed'] = (df[COLUMN_NORM_VALUE] < low_threshold).astype(int)

        for high_threshold in upper_thresholds:
            df['highly_expressed'] = (df[COLUMN_NORM_VALUE] > high_threshold).astype(int)

            df_current = prune_results(
                _get_expression_shares(df, reprogrammed_tissues, COLUMN_NORM_VALUE, high_threshold),
                high_expression_filter,
                negative_expression_filter
                )
            print(f'Iteration: ({low_threshold}, {high_threshold}). Df shape: {df_current.shape}')

            if df_current.shape[0] > 0:
                unique_gene_count = df_current[COLUMN_GENE].nunique()
                print(
                    f'Lower/Upper thresholds set at: ({low_threshold}, {high_threshold}). '
                    f'# of unique genes: {unique_gene_count}'
                    )
                # Only list the genes when the set is small enough to read
                if unique_gene_count < 20:
                    print(f'Unique genes: {df_current[COLUMN_GENE].unique()}')
                potential_gene_dfs.append(df_current)
    return potential_gene_dfs

## Z-score based filtering of highly and negatively expressed genes

In [None]:
# Filtering
# NOTE: df_z_scores / df_pruned_z_scores are rebound on every iteration, so after the
# loop they hold the result for the last threshold in score_thresholds_two_tailed only.
for key, threshold in score_thresholds_two_tailed.items():
    print(f'Running z-score analysis for {key}~={threshold}')
    df_z_scores = z_score_analysis(df, threshold)
    df_pruned_z_scores = prune_results(df_z_scores, high_expression_filter, negative_expression_filter)

    print(f'Left with {df_pruned_z_scores[COLUMN_GENE].nunique()} genes after pruning.')

In [None]:
df_pruned_z_scores[COLUMN_GENE].unique()

## Bimodal distribution based filtering of highly and negatively expressed genes

In [None]:
df_bimodal = bimodal_analysis(df)

In [None]:
# Filtering
# The analysis has to be re-run for every threshold; pruning one precomputed frame
# would print the same gene count for each threshold.
# NOTE: every iteration refits the per-tissue mixture models.
for key, threshold in score_thresholds_two_tailed.items():
    print(f'Running bimodal analysis for {key}~={threshold}')
    df_bimodal = bimodal_analysis(df, threshold)
    df_pruned_bimodal = prune_results(df_bimodal, high_expression_filter, negative_expression_filter)

    print(f'Left with {df_pruned_bimodal[COLUMN_GENE].nunique()} genes after pruning.')

## Quantile based filtering of highly and negatively expressed genes

In [None]:
# Filtering
for key, threshold in quantile_thresholds_two_tailed.items():
    print(f'Running quantile analysis for {key}~={threshold}')
    df_quantiles = quantile_analysis(df, threshold, lower_quantile=1-threshold, upper_quantile=threshold)
    # Prune the quantile result itself, not the frame left over from the z-score section
    df_pruned_quantiles = prune_results(df_quantiles, high_expression_filter, negative_expression_filter)

    print(f'Left with {df_pruned_quantiles[COLUMN_GENE].nunique()} genes after pruning.')

In [None]:
df_pruned_quantiles[COLUMN_GENE].unique()

## Grid search based filtering of highly and negatively expressed genes

In [None]:
# Candidate cut-offs on the VALUE scale, in steps of 0.1.
# NOTE(review): np.arange with a fractional step can include or drop the stop value
# depending on floating-point rounding (see the NumPy docs); np.linspace with an explicit
# count would make the grid size deterministic — confirm the intended end points.
lower_thresholds = np.arange(-4.7, -3.5, 0.1)
upper_thresholds = np.arange(5.5, 7.1, 0.1)
# Evaluates every (lower, upper) pair; the result is the list of non-empty pruned frames.
df_grid = grid_search(df, lower_thresholds, upper_thresholds)