# In [3]:
import pandas as pd
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.graph_objects as go
import plotly.express as px

import warnings

# Silence every warning category for the whole process so that library
# notices do not clutter the console/notebook output.
warnings.filterwarnings(action="ignore")

# Load the dataset from the current working directory; the layout and both
# callbacks read from this module-level frame.
data = pd.read_csv("data.csv")


# Instantiate the Dash application
app = dash.Dash(__name__)

# One dropdown entry per feature column. The slice skips the first two
# columns and the last one (presumably id/diagnosis and a trailing unused
# column -- TODO confirm against data.csv).
feature_options = [dict(label=column, value=column) for column in data.columns[2:-1]]

# Layout of the app: heading and description, the feature selector, then one
# graph per figure produced by the callbacks.
app.layout = html.Div([
    html.H1("Breast Cancer Data Dashboard"),
    html.P("The goal is to build a model that can predict whether a tumor is malignant - M (cancerous) or benign - B (non-cancerous) based on various features extracted from medical images of the tumor."),

    # Dropdown to select the feature
    dcc.Dropdown(
        id='feature-dropdown',
        options=feature_options,
        value=data.columns[2],  # Default value for the dropdown
        # A cleared dropdown would send None to the callbacks, which index
        # the data frame by this value; keep a feature selected at all times.
        clearable=False,
        style={'width': '50%'}
    ),

    # Histogram of the selected feature, coloured by diagnosis
    dcc.Graph(id='feature-plot'),

    # Correlation matrix plot
    html.H2("Correlation Matrix"),
    dcc.Graph(id='correlation-matrix-plot'),

    # Histogram of the selected feature with mean and standard error markers
    # (the component id is historical; it is not tied to an 'echo' column)
    dcc.Graph(id='echo-histogram'),

    # Box plot of selected feature
    dcc.Graph(id='box-plot'),

    # Violin plot of selected feature with both diagnoses
    dcc.Graph(id='violin-plot'),
])

# Callback to update the chart based on the selected feature
@app.callback(
    [Output('feature-plot', 'figure'),
     Output('echo-histogram', 'figure'),
     Output('box-plot', 'figure'),
     Output('violin-plot', 'figure')],
    [Input('feature-dropdown', 'value')]
)
def update_plots(selected_feature):
    """Rebuild the four per-feature figures for the selected column.

    Args:
        selected_feature: Column name of ``data`` chosen in the dropdown.

    Returns:
        A 4-tuple of figures, in the order of the callback outputs:
        overlaid histogram by diagnosis, probability histogram with mean and
        standard-error markers, box plot by diagnosis, violin plot by
        diagnosis.

    Raises:
        dash.exceptions.PreventUpdate: If no feature is selected or the value
            is not a column of ``data``; the existing figures are left as-is.
    """
    from dash.exceptions import PreventUpdate

    # A cleared dropdown yields None; indexing the frame with it would raise.
    if selected_feature is None or selected_feature not in data.columns:
        raise PreventUpdate

    # Shared diagnosis styling so every figure uses the same order/colours.
    # NOTE(review): assumes the 'diagnosis' column holds 'M'/'B' labels, as
    # the page description and the violin plot do -- confirm against data.csv.
    diagnosis_order = {'diagnosis': ['M', 'B']}
    diagnosis_colors = {'M': 'red', 'B': 'blue'}

    fig1 = px.histogram(data, x=selected_feature, color='diagnosis', barmode='overlay', nbins=100,
                        category_orders=diagnosis_order, color_discrete_map=diagnosis_colors)
    fig1.update_layout(title=f"Distribution of {selected_feature} and Correlation with Diagnosis",
                       xaxis_title=selected_feature,
                       yaxis_title="Count",
                       showlegend=True)

    # Calculate mean, variance, and standard error of the selected feature
    feature_values = data[selected_feature]
    mean_val = feature_values.mean()
    variance_val = feature_values.var()
    std_err_val = feature_values.sem()

    # Create histogram plot for the selected feature
    fig2 = go.Figure()
    fig2.add_trace(go.Histogram(x=feature_values, nbinsx=50, histnorm='probability', name='Distribution'))
    # Vertical dashed line at the mean (fixed height of 0.15 on the y axis).
    fig2.add_trace(go.Scatter(x=[mean_val, mean_val], y=[0, 0.15], mode='lines', name='Mean',
                              line=dict(color='red', dash='dash')))
    # Shaded band spanning mean +/- one standard error.
    fig2.add_trace(go.Scatter(x=[mean_val - std_err_val, mean_val + std_err_val], y=[0.05, 0.05], mode='lines',
                              fill='tozeroy', fillcolor='rgba(255, 0, 0, 0.3)', name='Standard Error'))
    fig2.update_layout(title=f"{selected_feature} Distribution with Mean, Variance, and Standard Error",
                       xaxis_title=selected_feature,
                       yaxis_title="Probability",
                       showlegend=True,
                       # The title promises the variance, which has no trace of
                       # its own; report all three statistics as text.
                       annotations=[dict(
                           text=(f"mean = {mean_val:.4g}, variance = {variance_val:.4g}, "
                                 f"std. error = {std_err_val:.4g}"),
                           xref='paper', yref='paper', x=1, y=1,
                           xanchor='right', yanchor='bottom', showarrow=False)])

    # Create box plot for the selected feature
    fig3 = px.box(data, x='diagnosis', y=selected_feature, color='diagnosis', points='all',
                  category_orders=diagnosis_order, color_discrete_map=diagnosis_colors)
    fig3.update_layout(title=f"Box Plot of {selected_feature} by Diagnosis",
                       xaxis_title='Diagnosis',
                       yaxis_title=selected_feature,
                       showlegend=False)

    # Long-format frame (diagnosis, variable, value) for the violin plot
    violin_data = pd.melt(data, id_vars="diagnosis", value_vars=selected_feature)

    # Create violin plot for the selected feature with both diagnoses
    fig4 = px.violin(violin_data, x='diagnosis', y='value', color='diagnosis', box=True, points="all",
                     hover_data=violin_data.columns, category_orders=diagnosis_order,
                     color_discrete_map=diagnosis_colors)

    fig4.update_layout(title=f"Violin Plot of {selected_feature} by Diagnosis",
                       xaxis_title='Diagnosis',
                       yaxis_title=selected_feature,
                       showlegend=False)

    # Style the individual data points and keep the inner box visible. No
    # selector is used: violin traces have no 'mode' attribute, so filtering
    # on mode='markers' would match nothing and silently skip this styling.
    fig4.update_traces(marker=dict(size=4, opacity=0.6, line=dict(width=0.5)),
                       box_visible=True)

    return fig1, fig2, fig3, fig4

# Callback to update the correlation matrix plot
@app.callback(
    Output('correlation-matrix-plot', 'figure'),
    [Input('feature-dropdown', 'value')]
)
def update_correlation_matrix(selected_feature):
    """Build the correlation heatmap over the numeric columns of ``data``.

    Args:
        selected_feature: Current dropdown value. It is not used in the
            computation; the input only triggers the callback.

    Returns:
        A heatmap figure of the pairwise correlations, colour range fixed
        to [-1, 1].
    """
    # 'number' selects every numeric dtype; the builtin ``int`` can map to a
    # platform-dependent width and miss integer columns of another size.
    numeric_df = data.select_dtypes(include='number')

    # Drop columns with NaN values
    numeric_df = numeric_df.dropna(axis='columns')

    # Exclude the 'id' column and constant columns (a single unique value
    # has no defined correlation)
    kept_columns = [col for col in numeric_df if col != 'id' and numeric_df[col].nunique() > 1]
    numeric_df = numeric_df[kept_columns]

    # Calculate the correlation matrix
    corr = numeric_df.corr()

    fig = go.Figure(data=go.Heatmap(z=corr.values,
                                    x=corr.columns,
                                    y=corr.columns,
                                    colorscale='RdPu',
                                    zmin=-1,
                                    zmax=1))
    fig.update_layout(title='Correlation Matrix',
                      xaxis_title='Features',
                      yaxis_title='Features',
                      width=800,
                      height=800)
    return fig

if __name__ == '__main__':
    # Start the Dash server on port 8090 with debug mode switched on; only
    # runs when the file is executed as a script.
    app.run_server(debug=True, port=8090)