In [1]:
import os, io
from collections import defaultdict
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd
from jupyter_dash import JupyterDash
import dash

In [2]:
# Root directory that is scanned for VCF downloads. The VCF_PARENT_FOLDER
# environment variable overrides the default, so the notebook can run against
# a different data location without editing this cell.
parent_folder_path = os.environ.get(
    "VCF_PARENT_FOLDER",
    "/data/projects/VCF_files/BRCA/Download_files",
)

In [3]:
def get_vcf_gz_files_except_logs(root_folder):
    """Collect the paths of every ``.vcf.gz`` file found under ``root_folder``.

    Directories named ``logs`` are pruned from the walk, so nothing inside
    them is returned.

    Args:
        root_folder: Directory at which the recursive walk starts.

    Returns:
        A list of paths, each built by joining the directory being visited
        with the matching file name, in the order ``os.walk`` yields them.
    """
    vcf_paths = []

    for current_dir, subdirs, names in os.walk(root_folder):
        # os.walk only descends into what is left in ``subdirs``, so editing
        # the list in place keeps the walk out of any "logs" directory.
        subdirs[:] = [sub for sub in subdirs if sub != 'logs']

        vcf_paths.extend(
            os.path.join(current_dir, name)
            for name in names
            if name.endswith('.vcf.gz')
        )

    return vcf_paths

In [4]:
files = get_vcf_gz_files_except_logs(parent_folder_path)

# One counter per tag category; each maps a tag value to the number of files
# whose name carries it.
tag_counts = {
    'genome sequencing': defaultdict(int),
    'tools': defaultdict(int),
    'variant type': defaultdict(int)
}

# NOTE(review): tags are counted case-sensitively, so spellings such as
# "mutect2" and "MuTect2" are tallied separately — confirm whether they
# should be merged.
for file in files:
    # Split only the file name: splitting the whole path would let dots in
    # directory names be mistaken for tag fields.
    parts = os.path.basename(file).split('.')
    # A usable name ends in <genome_seq>.<tool>.<variant_type>.vcf.gz.
    if len(parts) < 5:
        continue
    genome_seq, tool, variant_type = parts[-5], parts[-4], parts[-3]
    # Skip names where any of the three fields is empty (e.g. consecutive dots).
    if genome_seq and tool and variant_type:
        tag_counts['genome sequencing'][genome_seq] += 1
        tag_counts['tools'][tool] += 1
        tag_counts['variant type'][variant_type] += 1

In [5]:
# Print one section per tag category: a capitalised heading preceded by a
# blank line, then a "tag: count" line for every tag seen in that category.
for section_name, section_counts in tag_counts.items():
    heading = section_name.capitalize()
    print(f"\n{heading}:")
    for label in section_counts:
        print(f"{label}: {section_counts[label]}")


Genome sequencing:
wxs: 2533

Tools:
mutect2: 318
muse: 318
pindel: 330
MuTect2: 331
Pindel: 311
varscan2: 307
MuSE: 290
VarScan2: 328

Variant type:
raw_somatic_mutation: 1273
somatic_annotation: 1260


In [6]:
# Placeholder records: swap these for the real file names and the tags
# extracted from them.
_placeholder_columns = ("file", "genome sequencing", "tools", "variant type")
_placeholder_rows = (
    ("file1.vcf.gz", "wxs", "mutect2", "raw_somatic_mutation"),
    ("file2.vcf.gz", "wxs", "muse", "raw_somatic_mutation"),
    # Append further rows here as required...
)

# One dict per file, keyed by column name, in the column order given above.
data = [dict(zip(_placeholder_columns, row)) for row in _placeholder_rows]

df = pd.DataFrame(data)

In [7]:
# Display the DataFrame so its columns and rows can be checked by eye.
df

Unnamed: 0,file,genome sequencing,tools,variant type
0,file1.vcf.gz,wxs,mutect2,raw_somatic_mutation
1,file2.vcf.gz,wxs,muse,raw_somatic_mutation


In [9]:
# The app is created with JupyterDash so it can be served from the notebook.
# NOTE(review): a plain dash.Dash(...) with the same arguments was the
# previous, commented-out alternative — presumably for use outside Jupyter.
app = JupyterDash(__name__, external_stylesheets=["https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css"])


def _tag_checklist(component_id, column):
    """Build an inline checklist offering every distinct value of ``df[column]``.

    Args:
        component_id: Dash component id that the callback inputs refer to.
        column: Name of the ``df`` column whose unique values become options.

    Returns:
        A ``dcc.Checklist`` with nothing selected.
    """
    return dcc.Checklist(
        id=component_id,
        options=[{"label": tag, "value": tag} for tag in df[column].unique()],
        # Start from an explicit empty selection so callbacks receive a list
        # rather than None before the user has ticked anything.
        value=[],
        inline=True,
        labelStyle={"margin-right": "20px"},
    )


# Page structure: a heading, one checklist per tag category, and the
# container that the callback fills with the matching file names.
app.layout = html.Div([
    html.H2("Venn Diagram Dashboard"),
    html.Div([
        _tag_checklist("genome-sequencing-checklist", "genome sequencing"),
        _tag_checklist("tools-checklist", "tools"),
        _tag_checklist("variant-type-checklist", "variant type"),
    ], style={"margin-bottom": "20px"}),
    html.Div(id="file-output")
])

@app.callback(
    Output("file-output", "children"),
    Input("genome-sequencing-checklist", "value"),
    Input("tools-checklist", "value"),
    Input("variant-type-checklist", "value")
)
def update_output(genome_vals, tools_vals, variant_vals):
    """List the files whose tags are ticked in all three checklists.

    Args:
        genome_vals: Selected "genome sequencing" values, or None when the
            checklist has no selection yet.
        tools_vals: Selected "tools" values, or None.
        variant_vals: Selected "variant type" values, or None.

    Returns:
        A list with one ``html.Div`` per matching row of ``df``, containing
        that row's file name. The list is empty when any checklist has
        nothing selected.
    """
    # A checklist with no ``value`` set passes None to the callback, and
    # Series.isin rejects None with a TypeError. Treat "nothing ticked" as an
    # empty selection, which matches no rows.
    genome_vals = genome_vals or []
    tools_vals = tools_vals or []
    variant_vals = variant_vals or []

    matches = (
        df["genome sequencing"].isin(genome_vals) &
        df["tools"].isin(tools_vals) &
        df["variant type"].isin(variant_vals)
    )

    # Only the file name is rendered, so iterate that column directly
    # instead of materialising every row with iterrows().
    return [html.Div(f"{file_name}") for file_name in df.loc[matches, "file"]]

# Start the Dash server on port 6194, but only when this code runs as the main module.
# NOTE(review): debug=True together with host='0.0.0.0' serves the app in debug
# mode on every network interface — confirm that this exposure is intended.
# NOTE(review): mode='inline' presumably embeds the app in the cell output
# (JupyterDash option) — confirm.
if __name__ == "__main__":
    app.run_server(debug=True, mode='inline', host='0.0.0.0', port=6194)

In [23]:
!pip install dash dash-bootstrap-components plotly pandas jupyter-dash

Collecting jupyter-dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl (23 kB)
Installing collected packages: jupyter-dash
Successfully installed jupyter-dash-0.4.2
