In [1]:
import dash
from dash import dcc, html, Input, Output, callback, State
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import os

In [2]:
# Base directory for data. Set the GDC_DATA_DIR environment variable to point the
# notebook at a different copy of the dataset; the original location stays the default.
BASE_DIR = os.environ.get('GDC_DATA_DIR', '/home/pdutta/Data/Cancer_wiseGDC/New_data')

# Listing the main categories
#categories = [d for d in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, d)) and not d.startswith('.')]

In [3]:
#categories

In [4]:
# Create the Dash application instance; the layout and the callback defined
# further down in this notebook are attached to this object.
app = dash.Dash(__name__)

In [5]:
# App layout: page title, cancer-type dropdown, the unique-variants bar chart,
# and a range slider feeding the per-variant statistics chart.
app.layout = html.Div([
    html.H1("Analysis of somatic variants across various GDC cancer patients", style={'text-align': 'center'}),
    # Cancer-type selector; the empty-string option doubles as the "nothing selected" entry
    dcc.Dropdown(
        id='cancer-type-dropdown',
        options=[
            {'label': 'Select Cancer Type', 'value': ''},
            {'label': 'Lung', 'value': 'Lung'},
            {'label': 'Brain', 'value': 'Brain'},
            {'label': 'Breast', 'value': 'Breast'}
        ],
        value='',  # Default value prompting user selection (falsy, so the callback returns empty figures)
        style={'width': '50%', 'margin': 'auto'}
    ),
    html.H3("Barplot of the Number of Unique Variants across various DNABERT fine-tuned models", style={'text-align': 'center'}),
    dcc.Graph(id='variants-bar-plot'),
    html.H3("Variant wise statistics", style={'text-align': 'center'}),
    # The two slider handles are used as the [start, end) row slice of the variant statistics
    dcc.RangeSlider(
        id='index-range-slider',
        min=0,
        max=100,  # Adjust based on your dataset size
        step=1,
        value=[0, 10],
        marks={i: str(i) for i in range(0, 101, 10)},  # a labelled tick every 10 positions
        tooltip={"placement": "bottom", "always_visible": True}
    ),
    dcc.Graph(id='variant-statistics-plot'),
    # Passed to the callback as State; the callback in this notebook does not read it
    dcc.Store(id='variant-index', data={'start_index': 0})  # To store current index for pagination
])

In [1]:
def calculation(df):
    """Summarise every variant column of ``df`` by its maximum and its coverage.

    The first column of ``df`` is skipped (the callers treat it as an
    identifier/index column); every remaining column is summarised.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns from the second one onwards hold the values to
        summarise.

    Returns
    -------
    pandas.DataFrame
        One row per summarised column, with the columns ``'Column'`` (the
        original column name), ``'Max Value'`` (column maximum, NaN ignored) and
        ``'Non-NaN Count'`` (number of non-missing entries). Columns whose
        maximum is NaN (no usable values) are dropped. Rows are ordered by
        ``'Non-NaN Count'`` descending, ties broken by ``'Max Value'``
        descending.
    """
    # Everything except the leading identifier column
    variant_values = df.iloc[:, 1:]

    # Column maxima, largest first; a stable sort keeps equal maxima in column order
    sorted_max = variant_values.max().sort_values(ascending=False, kind='stable')

    # Non-missing entries per column, computed in one vectorized pass and then
    # aligned with the max-sorted column order
    non_nan_counts = variant_values.notna().sum().reindex(sorted_max.index)

    top_df = sorted_max.reset_index()
    top_df.columns = ['Column', 'Max Value']
    top_df['Non-NaN Count'] = non_nan_counts.to_numpy()
    top_df = top_df.dropna(subset=['Max Value'])

    # Stable sort so that rows with equal counts keep the max-value ordering
    # established above instead of ending up in an arbitrary order
    return top_df.sort_values(by='Non-NaN Count', ascending=False, kind='stable')

In [7]:
def load_data(category):
    """Load every ``df_log_odd_score.tsv`` table found for one cancer category.

    The search starts at ``BASE_DIR/<category>/Generated_files/Variant_Analysis``
    and descends into all sub-directories. Any file whose name contains
    ``df_log_odd_score.tsv`` is read as a tab-separated table.

    Parameters
    ----------
    category : str
        Name of the category directory directly under ``BASE_DIR``.

    Returns
    -------
    list of (str, pandas.DataFrame)
        ``(subfolder, table)`` pairs, where ``subfolder`` is the name of the
        directory that contains the file. The list is ordered by a sorted
        directory walk so repeated calls return the same order; it is empty
        when the category directory does not exist or holds no matching file.
    """
    path = os.path.join(BASE_DIR, category, 'Generated_files', 'Variant_Analysis')
    data = []
    for root, dirs, files in os.walk(path):
        # os.walk reports entries in arbitrary filesystem order. Sorting `dirs`
        # in place steers the (top-down) traversal, and sorting the file names
        # fixes the order inside each directory, so plots keep a stable order.
        dirs.sort()
        for file in sorted(files):
            if 'df_log_odd_score.tsv' in file:
                subfolder = os.path.basename(root)
                full_path = os.path.join(root, file)
                df = pd.read_csv(full_path, sep='\t')
                data.append((subfolder, df))
    return data

In [8]:
def get_variant_statistics_subplots(calculated_data, start_index, end_index, subfolder_titles):
    """Build a one-row figure with one bar chart of non-NaN counts per dataset.

    Parameters
    ----------
    calculated_data : iterable of (str, pandas.DataFrame)
        ``(subfolder, summary)`` pairs; each summary must provide the columns
        ``'Column'`` and ``'Non-NaN Count'``.
    start_index, end_index : int
        Positional slice ``[start_index:end_index]`` applied to each summary
        after ordering it by ``'Non-NaN Count'`` descending.
    subfolder_titles : sequence of str
        Titles for the subplots, in the same order as ``calculated_data``.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with as many columns as there are datasets (one empty column
        when there are none).
    """
    calculated_data = list(calculated_data)

    # Size the grid from the data: a fixed column count breaks as soon as more
    # datasets arrive and leaves blank panels when there are fewer.
    n_cols = max(1, len(calculated_data))
    subplots = make_subplots(rows=1, cols=n_cols, subplot_titles=subfolder_titles)

    for i, (subfolder, data) in enumerate(calculated_data, start=1):
        # Stable sort keeps the incoming order of rows that share the same count
        data = data.sort_values(
            by='Non-NaN Count', ascending=False, kind='stable'
        ).iloc[start_index:end_index]
        subplots.add_trace(
            go.Bar(
                x=data['Column'],
                y=data['Non-NaN Count']
            ),
            row=1, col=i
        )
        subplots.update_xaxes(title_text='Variant Position', row=1, col=i)
        subplots.update_yaxes(title_text='Non-NaN Count', row=1, col=i)

    subplots.update_layout(title_text='Variant Wise Statistics', showlegend=False)
    return subplots

In [9]:
@app.callback(
    [Output('variants-bar-plot', 'figure'),
     Output('variant-statistics-plot', 'figure')],
    [Input('cancer-type-dropdown', 'value'),
     Input('index-range-slider', 'value')],
    [State('variant-index', 'data')]
)
def update_graphs(selected_category, slider_range, stored_data):
    """Redraw both charts when the cancer type or the slider range changes.

    Parameters
    ----------
    selected_category : str
        Value of the cancer-type dropdown; a falsy value means nothing is selected.
    slider_range : sequence of two ints
        ``[start, end]`` positions from the range slider, used to slice the
        per-variant statistics.
    stored_data : dict
        Content of the ``variant-index`` store. Received as State but not used.

    Returns
    -------
    tuple
        ``(bar_figure, statistics_figure)``. Two empty figures are returned when
        no category is selected or when no data files exist for the category.
    """
    # Return empty plots if no category is selected
    if not selected_category:
        return px.bar(), go.Figure()

    # Load data based on the selected cancer type
    cat_data = load_data(selected_category)

    # Without any table the bar chart has no columns to plot from and px.bar
    # would raise, so fall back to the same empty figures as above.
    if not cat_data:
        return px.bar(), go.Figure()

    start_index, end_index = slider_range

    calculated_data = []
    subfolder_titles = []
    bar_rows = []

    # Single pass over the tables: per-variant summary plus one bar-chart row each
    for subfolder, df in cat_data:
        calculated_data.append((subfolder, calculation(df)))
        subfolder_titles.append(subfolder)

        num_patients = df.shape[0]
        num_variants = df.shape[1] - 1  # Subtract 1 to account for the index column
        bar_rows.append({
            'Non-coding regions': subfolder,
            'Unique Variants': num_variants,
            'label': f"{subfolder} ({num_patients} patients)"
        })

    # Generate the first bar plot for unique variants
    df_plot = pd.DataFrame(bar_rows)
    fig_bar = px.bar(
        df_plot,
        x='Non-coding regions',
        y='Unique Variants',
        text='Unique Variants',
        labels={'Unique Variants': 'Number of Unique Variants', 'Non-coding regions': 'Non-coding Regions'},
        color='label'
    )
    fig_bar.update_traces(texttemplate='%{text}', textposition='outside')

    # Generate subplots for variant statistics using the selected range
    fig_subplots = get_variant_statistics_subplots(calculated_data, start_index, end_index, subfolder_titles)

    return fig_bar, fig_subplots

In [10]:
if __name__ == '__main__':
    # `mode=` belongs to the separate JupyterDash class; dash.Dash does not accept
    # it (it is forwarded to Flask or ignored). The built-in notebook support uses
    # `jupyter_mode`, and `run` replaces the deprecated `run_server`.
    # host='0.0.0.0' binds every network interface, so the app is reachable from
    # other machines that can connect to this port.
    app.run(host='0.0.0.0', port=7951, jupyter_mode='external')