In [1]:
import dash
from dash import dcc, html, Input, Output, callback, State, dash_table, dependencies
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import os

In [2]:
# Base directory for data
BASE_DIR = '/home/pdutta/Data/Cancer_wiseGDC/New_data'

# Listing the main categories
#categories = [d for d in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, d)) and not d.startswith('.')]

In [3]:
#categories

In [4]:
# Create a Dash app
app = dash.Dash(__name__)

In [5]:
# App layout
app.layout = html.Div([
    html.H1("Analysis of somatic variants across various GDC cancer patients", style={'text-align': 'center'}),
    dcc.Dropdown(
        id='cancer-type-dropdown',
        options=[
            {'label': 'Select Cancer Type', 'value': ''},
            {'label': 'Lung', 'value': 'Lung'},
            {'label': 'Brain', 'value': 'Brain'},
            {'label': 'Breast', 'value': 'Breast'}
        ],
        value='',  # Default value prompting user selection
        style={'width': '50%', 'margin': 'auto'}
    ),
    html.H3("Barplot of the Number of Unique Variants across various DNABERT fine-tuned models", style={'text-align': 'center'}),
    dcc.Graph(id='overall-variants-bar-plot'),
    html.H3("Patients statistics", style={'text-align': 'center'}),
    dcc.Graph(id='variant-statistics-plot'),
    html.H3("Variant wise statistics", style={'text-align': 'center'}),
    html.Div(id='subgraphs-container', style={'display': 'flex', 'flex-direction': 'row', 'flex-wrap': 'wrap'}),  # Container for subgraphs and sliders
])

In [6]:
def load_data(category):
    """Load df_log_odd_score.tsv files from specified category."""
    path = os.path.join(BASE_DIR, category, 'Generated_files', 'Variant_Analysis')
    data = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if 'df_log_odd_score.tsv' in file:
                subfolder = os.path.basename(root)
                full_path = os.path.join(root, file)
                df = pd.read_csv(full_path, sep='\t')
                data.append((subfolder, df))
    return data

In [7]:
def calculation(df):
    # Calculate the max value in each column (excluding the first column which we assume is not variant data)
    column_max = df.iloc[:, 1:].max()
    
    # Sort these max values in descending order
    sorted_max = column_max.sort_values(ascending=False)
    
    # Extract the column names for the top values
    top_columns = sorted_max.index
    
    # Count the number of non-NaN values in each of these columns
    non_nan_counts = [df[col].notna().sum() for col in top_columns]
    
    # Create a DataFrame from the sorted max values
    top_df = sorted_max.reset_index()
    top_df.columns = ['Column', 'Max Value']
    
    # Add the non-NaN counts as a new column
    top_df['Non-NaN Count'] = non_nan_counts
    
    # Drop any rows where the max value is NaN
    top_df = top_df.dropna(subset=['Max Value'])
    
    # Sort the DataFrame by the 'Non-NaN Count' column in descending order
    top_df = top_df.sort_values(by='Non-NaN Count', ascending=False)
    
    # Calculate the percentage of non-NaN counts for each column
    top_df['Percentage'] = top_df['Non-NaN Count'] / df.shape[0] * 100
    
    # Define the bins for percentage ranges
    bins = [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    
    # Bin the percentage values and count how many fall into each bin without adding it to top_df
    binned_counts = pd.cut(top_df['Percentage'], bins=bins).value_counts().sort_index()

    # Return just the binned counts, which is a Series object where the index represents bins
    # and the values represent the count of entries in each bin
    return binned_counts, top_df


In [8]:
@app.callback(
    Output('sliders-container', 'children'),
    Input('cancer-type-dropdown', 'value'))
def update_sliders(selected_category):
    if not selected_category:
        return html.Div()  # Return an empty div if no category is selected

    cat_data = load_data(selected_category)
    sliders = []
    for subfolder, df in cat_data:
        num_columns = df.shape[1] - 1  # Assuming the first column isn't a variant column
        slider = dcc.RangeSlider(
            id=f'slider-{subfolder}',
            min=0,
            max=num_columns,
            step=1,
            value=[0, 20],  # Default to first half
            marks={i: str(i) for i in range(0, num_columns + 1, max(1, num_columns // 10))},
            tooltip={"placement": "bottom", "always_visible": True}
        )
        sliders.append(html.Div([html.H5(subfolder), slider], style={'margin-top': '20px'}))
    return sliders

In [9]:
@app.callback(
    Output('overall-variants-bar-plot', 'figure'),
    Input('cancer-type-dropdown', 'value')
)
def update_overall_variants_plot(selected_category):
    if not selected_category:
        return px.bar()  # Return an empty plot if no category is selected

    cat_data = load_data(selected_category)
    data = []
    for subfolder, df in cat_data:
        num_variants = df.shape[1] - 1  # Assuming the first column is not a variant
        num_patients = df.shape[0]
        data.append({'Non-coding regions': subfolder, 'Unique Variants': num_variants, 'Patients': num_patients})
    
    df_plot = pd.DataFrame(data)
    fig = px.bar(
        df_plot, 
        x='Non-coding regions', 
        y='Unique Variants', 
        text='Unique Variants',
        hover_data=['Patients'],
        color='Non-coding regions',  # Use the custom legend entry for coloring
        color_continuous_scale='Viridis'  # Using Viridis color scale
    )
    fig.update_traces(texttemplate='%{text}', textposition='outside')
    return fig

In [10]:
# @app.callback(
#     Output('variant-statistics-plot', 'figure'),
#     [Input('cancer-type-dropdown', 'value')] +
#     [dependencies.Input(f'slider-{subfolder}', 'value') for subfolder, _ in load_data('Lung')]  # Example category for initialization
# )
# def update_variant_statistics(selected_category, *slider_ranges):
#     if not selected_category:
#         return go.Figure()  # Return an empty figure if no category is selected

#     # Load the data based on the selected cancer type
#     cat_data = load_data(selected_category)
#     calculated_data = []
#     subfolder_titles = []

#     # Process each subfolder's data
#     for (subfolder, df), slider_range in zip(cat_data, slider_ranges):
#         start_index, end_index = slider_range
#         # Apply the calculation function to the subset of columns indicated by the slider
#         calculated_df = calculation(df.iloc[:, start_index:end_index + 1])
#         calculated_data.append((subfolder, calculated_df))
#         subfolder_titles.append(subfolder)

#     # Create subplots, one for each subfolder, using the calculated data
#     fig_subplots = make_subplots(rows=1, cols=len(subfolder_titles), subplot_titles=subfolder_titles)
#     for i, (subfolder, data) in enumerate(calculated_data, start=1):
#         fig_subplots.add_trace(
#             go.Bar(
#                 x=data['Column'],
#                 y=data['Non-NaN Count'],
#                 name=subfolder
#             ),
#             row=1, col=i
#         )
#         fig_subplots.update_xaxes(title_text='Variant Position', row=1, col=i)
#         fig_subplots.update_yaxes(title_text='Non-NaN Count', row=1, col=i)

#     # Update the layout to enhance readability and presentation
#     fig_subplots.update_layout(title_text='Variant Wise Statistics', showlegend=False)
#     return fig_subplots

In [11]:
@app.callback(
    [Output('subgraphs-container', 'children')],
    [Input('cancer-type-dropdown', 'value')]
)
def update_subgraphs(selected_category):
    if not selected_category:
        return [[]]  # Return an empty list if no category is selected

    cat_data = load_data(selected_category)
    children = []
    for i, (subfolder, df) in enumerate(cat_data):
        num_columns = df.shape[1] - 1  # Assuming the first column isn't a variant column
        slider = dcc.RangeSlider(
            id=f'slider-{subfolder}',
            min=0,
            max=num_columns,
            step=1,
            value=[0, 20],
            marks={i: str(i) for i in range(0, num_columns + 1, max(1, num_columns // 10))},
            tooltip={"placement": "bottom", "always_visible": True}
        )
        graph = dcc.Graph(id=f'graph-{subfolder}', style={'height': '900px'})
        # Set flex to 1 to evenly distribute the space and maxWidth to 25% of the container width
        children.append(html.Div([
            html.H5(subfolder, style={'textAlign': 'center'}),
            slider,
            graph
        ], style={'flex': '1', 'maxWidth': '25%', 'padding': '0 10px'}))

    return [children]


In [12]:
# This callback should update the figure in each subgraph based on the slider values.
@app.callback(
    [Output(f'graph-{subfolder}', 'figure') for subfolder, _ in load_data('Lung')],  # We initialize with 'Lung' just to get the number of callbacks needed
    [Input(f'slider-{subfolder}', 'value') for subfolder, _ in load_data('Lung')] + [Input('cancer-type-dropdown', 'value')]
)
def update_subgraph_figures(*args):
    # Since we have an input for the cancer type dropdown at the end, all slider values come before it
    slider_values = args[:-1]
    selected_category = args[-1]

    if not selected_category:
        # If no category is selected, we return empty figures
        return [go.Figure() for _ in slider_values]

    cat_data = load_data(selected_category)
    figures = []

    # We match each subfolder's data with the corresponding slider value
    for (subfolder, df), slider_value in zip(cat_data, slider_values):
        start_index, end_index = slider_value
        binned_values, calculated_df = calculation(df.iloc[:, start_index:end_index + 1])
        
        # Create a bar chart for the current subfolder's data
        fig = go.Figure(data=[
            go.Bar(
                x=calculated_df['Column'],
                y=calculated_df['Non-NaN Count'],
                name=subfolder
            )
        ])
        
        fig.update_layout(
            title={'text': subfolder, 'x': 0.5, 'xanchor': 'center'},
            xaxis_title="Variant",
            yaxis_title="Non-NaN Count"
        )
        
        figures.append(fig)

    return figures

In [13]:
@app.callback(
    Output('variant-statistics-plot', 'figure'),
    [Input('cancer-type-dropdown', 'value')] +
    [dependencies.Input(f'slider-{subfolder}', 'value') for subfolder, _ in load_data('Lung')]  # Example category for initialization
)
def update_variant_statistics(selected_category, *slider_ranges):
    if not selected_category:
        return go.Figure()  # Return an empty figure if no category is selected

    cat_data = load_data(selected_category)
    fig_subplots = make_subplots(rows=1, cols=len(cat_data), subplot_titles=[subfolder for subfolder, _ in cat_data], specs=[[{"type": "bar"}]*len(cat_data)])

    for (subfolder, df), slider_range in zip(cat_data, slider_ranges):
        start_index, end_index = slider_range
        binned_values = calculation(df.iloc[:, start_index:end_index + 1])
        # Convert bin intervals to strings for plotting
        bin_intervals = [str(interval) for interval in binned_values.index]

        fig_subplots.add_trace(
            go.Bar(
                x=bin_intervals,  # Bin ranges as strings
                y=binned_values.values,  # Count of entries in each bin
                name=subfolder
            ),
            row=1, col=cat_data.index((subfolder, df))+1
        )

    fig_subplots.update_layout(
        title_text='Variant Wise Statistics with Percentage Bins', 
        showlegend=True,
        xaxis_title="Percentage Bins",
        yaxis_title="Variant Counts",
        barmode='group'
    )
    return fig_subplots

In [14]:
if __name__ == '__main__':
    app.run_server(host='0.0.0.0', port=9757, mode='external')

[2024-04-18 20:17:35,010] ERROR in app: Exception on /_dash-update-component [POST]
Traceback (most recent call last):
  File "/home/pdutta/anaconda3/envs/GDC_VCF/lib/python3.10/site-packages/flask/app.py", line 1473, in wsgi_app
    response = self.full_dispatch_request()
  File "/home/pdutta/anaconda3/envs/GDC_VCF/lib/python3.10/site-packages/flask/app.py", line 882, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/home/pdutta/anaconda3/envs/GDC_VCF/lib/python3.10/site-packages/flask/app.py", line 880, in full_dispatch_request
    rv = self.dispatch_request()
  File "/home/pdutta/anaconda3/envs/GDC_VCF/lib/python3.10/site-packages/flask/app.py", line 865, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)  # type: ignore[no-any-return]
  File "/home/pdutta/anaconda3/envs/GDC_VCF/lib/python3.10/site-packages/dash/dash.py", line 1352, in dispatch
    ctx.run(
  File "/home/pdutta/anaconda3/envs/GDC_VCF/lib/python3.10