In [1]:
import dash
from dash import dcc, html, dash_table, Input, Output, callback
import plotly.express as px
import pandas as pd
import os

In [2]:
# Base directory for data.
# Set the GDC_BASE_DIR environment variable to point the notebook at another
# copy of the data; when it is unset or empty the original location is used.
BASE_DIR = os.environ.get('GDC_BASE_DIR') or '/home/pdutta/Data/Cancer_wiseGDC/New_data'

# Listing the main categories
#categories = [d for d in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, d)) and not d.startswith('.')]

In [3]:
#categories

In [4]:
# Create the Dash application object. The layout is attached to it and the
# server is started from it in the cells that follow.
app = dash.Dash(__name__)

In [5]:
# Page layout: a centred title, the cancer-type selector, a sub-heading and the
# bar chart that is filled in by the callback listening on the selector.
app.layout = html.Div(
    children=[
        html.H1(
            children="Analysis of somatic variants across various GDC cancer patients",
            style={'text-align': 'center'},
        ),
        dcc.Dropdown(
            id='cancer-type-dropdown',
            # The leading empty-valued entry is the prompt shown by default;
            # the real choices use the same text for label and value.
            options=[
                {'label': 'Select Cancer Type', 'value': ''},
                *(
                    {'label': cancer_type, 'value': cancer_type}
                    for cancer_type in ('Lung', 'Brain', 'Breast')
                ),
            ],
            value='',
            style={'width': '50%', 'margin': 'auto'},
        ),
        html.H3(
            children="Barplot of the Number of Unique Variants across various DNABERT fine-tuned models",
            style={'text-align': 'center'},
        ),
        dcc.Graph(id='variants-bar-plot'),
    ]
)

In [6]:
def load_data(category):
    """Load every ``df_log_odd_score.tsv`` table found for one category.

    Walks ``BASE_DIR/<category>/Generated_files/Variant_Analysis`` recursively
    and reads each file whose name contains ``df_log_odd_score.tsv`` as a
    tab-separated table.

    Args:
        category: Path component (relative to ``BASE_DIR``) naming the category
            to load.

    Returns:
        A list of ``(subfolder, DataFrame)`` tuples, where ``subfolder`` is the
        name of the directory that holds the file. Directories and files are
        visited in sorted name order, so repeated calls return the entries in
        the same order. The list is empty when the directory does not exist or
        contains no matching file.

    Raises:
        ValueError: If ``category`` resolves to a location outside ``BASE_DIR``.
    """
    base = os.path.abspath(BASE_DIR)
    path = os.path.abspath(
        os.path.join(base, category, 'Generated_files', 'Variant_Analysis')
    )
    # The category is supplied by a web callback, so refuse values such as
    # '../..' or an absolute path that would make the walk leave BASE_DIR.
    if os.path.commonpath([base, path]) != base:
        raise ValueError(f"category {category!r} resolves outside the data directory")

    data = []
    for root, dirs, files in os.walk(path):
        # os.walk yields entries in filesystem order; sorting `dirs` in place
        # (honoured by the top-down walk) and the file names keeps the result
        # order stable between runs and machines.
        dirs.sort()
        for file in sorted(files):
            if 'df_log_odd_score.tsv' in file:
                subfolder = os.path.basename(root)
                full_path = os.path.join(root, file)
                df = pd.read_csv(full_path, sep='\t')
                data.append((subfolder, df))
    return data

In [7]:
@callback(
    Output('variants-bar-plot', 'figure'),
    Input('cancer-type-dropdown', 'value'),
    prevent_initial_call=True
)
def update_graph(selected_category):
    """Build the bar chart for the cancer type chosen in the dropdown.

    For every table returned by ``load_data`` one bar is drawn: its height is
    the table's column count (shown as "Unique Variants") and its legend entry
    carries the table's row count (shown as "patients").

    Args:
        selected_category: Value of the ``cancer-type-dropdown`` component; a
            falsy value means nothing has been selected.

    Returns:
        A Plotly figure. It is an empty bar figure when no category is selected,
        and an empty figure with an explanatory title when the selected
        category has no tables to plot.
    """
    if not selected_category:
        return px.bar()  # Returns an empty plot if no category is selected

    # NOTE(review): the tables are read without an index column, so any
    # identifier column is included in the column count — confirm that the
    # count is the intended number of variants.
    records = [
        {
            'Non-coding regions': subfolder,
            'Unique Variants': df.shape[1],
            'label': f"{subfolder} ({df.shape[0]} patients)"  # Custom legend entry
        }
        for subfolder, df in load_data(selected_category)
    ]

    if not records:
        # An empty record list yields a DataFrame without columns, and asking
        # px.bar for the named x/y columns would then raise; show a message
        # in the figure instead of failing the callback.
        empty_fig = px.bar()
        empty_fig.update_layout(
            title_text=f"No df_log_odd_score.tsv files found for {selected_category}"
        )
        return empty_fig

    df_plot = pd.DataFrame(records)
    # 'label' holds strings, so Plotly Express colours the bars with its
    # discrete colour sequence; a continuous colour scale has no effect on a
    # categorical column and is therefore not passed.
    fig = px.bar(
        df_plot,
        x='Non-coding regions',
        y='Unique Variants',
        text='Unique Variants',
        labels={'Unique Variants': 'Number of Unique Variants', 'Non-coding regions': 'Non-coding Regions'},
        color='label'  # Use the custom legend entry for coloring
    )
    fig.update_traces(texttemplate='%{text}', textposition='outside', opacity=1)  # Set opacity to 1 for exact color match
    return fig

In [8]:
# Start the Dash server only when this code runs as the main module.
# host='0.0.0.0' makes the server listen on all network interfaces, and port
# 5123 must be free: the recorded output of this cell shows the port was
# already in use on the last run.
# NOTE(review): `mode='external'` looks like a JupyterDash-style option —
# confirm that the installed Dash version accepts it through run_server.
if __name__ == '__main__':
    app.run_server(host='0.0.0.0', port=5123, mode='external')

Address already in use
Port 5123 is in use by another program. Either identify and stop that program, or start the server with a different port.


AttributeError: 'tuple' object has no attribute 'tb_frame'