# **Milestone 3**

In [1]:
!pip install dash
!pip install plotly
!pip install flask-ngrok



## Data Loading and Preprocessing

This section loads the cleaned dataset and preprocesses it for dashboard integration.

Clustering features in the dataset include cluster assignments and PCA-transformed components.

Any missing values have been addressed in previous milestones.

For efficiency, the data has been sampled and scaled.


In [41]:
from dash import Dash, dcc, html, Input, Output
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage
import plotly.express as px
import plotly.figure_factory as ff

# Load dataset
# NOTE(review): this is an absolute, machine-specific path; point DATA_PATH at your
# local copy of the CSV when running the notebook elsewhere.
DATA_PATH = 'C:/Users/divya/Downloads/product+classification+and+clustering/pricerunner_aggregate.csv'
df = pd.read_csv(DATA_PATH)

# Data Preprocessing: fill missing values with the column mean for numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Select numeric features for clustering.
# This is done before the PCA columns are added below, so PCA1/PCA2 are not fed back in.
features = df.select_dtypes(include=[np.number])

# Normalize features; rows of `scaled_features` are in the same order as the rows of `df`
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply PCA to reduce dimensionality for visualization (2 components)
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_features)

# Store the PCA results alongside the original columns
df['PCA1'] = pca_result[:, 0]
df['PCA2'] = pca_result[:, 1]

# Sample a subset of the data to keep the dashboard responsive.
# The size is capped at the number of rows so that small datasets do not make
# `DataFrame.sample` raise.
sample_size = min(1000, len(df))  # Adjust the cap based on your dataset size
df_sampled = df.sample(n=sample_size, random_state=42).copy()

# `df_sampled` holds randomly chosen rows, so the matching rows of `scaled_features`
# must be looked up by position. Slicing `scaled_features[:sample_size]` would take
# the FIRST rows instead and attach cluster labels to the wrong products.
sampled_positions = df.index.get_indexer(df_sampled.index)

# Perform K-Means clustering using MiniBatchKMeans for efficiency
kmeans = MiniBatchKMeans(n_clusters=3, random_state=42)
df_sampled['KMeans_Cluster'] = kmeans.fit_predict(scaled_features[sampled_positions])


## Interactive Dashboard Setup

The interactive product clustering dashboard will be hosted by a Dash application that is created here.

The layout contains:

- A drop-down menu for choosing between the K-Means and Hierarchical clustering algorithms.

- A pair of visuals:
1. A cluster visualization scatter plot.
2. A dendrogram showing the hierarchical relationships.

### Code Components:

1. Dropdown Menu:

- Users are able to choose the clustering method.
- Hierarchical clustering and K-Means clustering are two options.

2. Graph Components:

- dcc.Graph is used to display the scatter plot and dendrogram.


In [42]:
# Instantiate the Dash application that hosts the dashboard
app = Dash(__name__)

# Page layout: a heading, the clustering-method selector, and the two graphs
# that the callback fills in.
app.layout = html.Div(
    children=[
        html.H1(children='Interactive Product Clusters Dashboard'),

        # Selector for the clustering method; K-Means is pre-selected
        dcc.Dropdown(
            id='cluster-dropdown',
            options=[
                {'label': 'K-Means Clustering', 'value': 'kmeans'},
                {'label': 'Hierarchical Clustering', 'value': 'hierarchical'},
            ],
            value='kmeans',
            style={'width': '50%'},
        ),

        # Target of the cluster scatter plot
        dcc.Graph(id='cluster-plot'),

        # Target of the hierarchical-clustering dendrogram
        dcc.Graph(id='dendrogram-plot'),
    ]
)



## Callback for Dynamic Plot Updates
The callback function uses the user's choices from the dropdown menu to dynamically change the visualizations.

 **Clustering with K-Means**:
  - Shows a scatter plot of PCA components with cluster assignments color-coded.

 **Hierarchical Clustering**:
  - Presents a dendrogram that illustrates the products' hierarchical relationships.

### Key Features:
1. Input: the clustering technique chosen from the drop-down menu.
2. Outputs:
- A scatter plot of the clusters for K-Means.
- A dendrogram for Hierarchical clustering.
3. Logic:
- If K-Means is chosen, the scatter plot is shown and the dendrogram stays blank.
- If Hierarchical is chosen, the dendrogram is shown and the scatter plot stays blank.


In [43]:
# Callback to update the cluster plot based on selected method
@app.callback(
    Output('cluster-plot', 'figure'),
    Output('dendrogram-plot', 'figure'),
    Input('cluster-dropdown', 'value')
)
def update_visualization(selected_cluster_method):
    """Build the two dashboard figures for the selected clustering method.

    Args:
        selected_cluster_method: Value of the 'cluster-dropdown' component:
            'kmeans', 'hierarchical', or None when the dropdown is cleared.

    Returns:
        A ``(cluster_figure, dendrogram_figure)`` tuple. The figure that does
        not belong to the selected method is an empty dict, which renders as a
        blank graph. Both are empty for a cleared or unrecognised selection.
    """
    # Start from blank figures so that a cleared dropdown (value None) returns
    # empty graphs instead of failing with UnboundLocalError at the return.
    fig = {}
    dendrogram_fig = {}

    # Plot for K-Means clustering
    if selected_cluster_method == 'kmeans':
        fig = px.scatter(df_sampled, x='PCA1', y='PCA2', color='KMeans_Cluster',
                         labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
                         title="K-Means Clusters")

    # Plot for Hierarchical clustering
    elif selected_cluster_method == 'hierarchical':
        # `df_sampled` is a random sample of `df`; pick the matching rows of
        # `scaled_features` by position so each leaf gets its own product title.
        sampled_positions = df.index.get_indexer(df_sampled.index)
        # create_dendrogram expects the observation matrix and runs the
        # distance + linkage computation itself, so Ward linkage is supplied
        # through `linkagefun` rather than by passing a precomputed linkage matrix.
        dendrogram_fig = ff.create_dendrogram(
            scaled_features[sampled_positions],
            labels=df_sampled['Product Title'].tolist(),
            linkagefun=lambda distances: linkage(distances, method='ward'),
        )
        dendrogram_fig.update_layout(title="Hierarchical Clustering Dendrogram")

    return fig, dendrogram_fig

# Run the app inline in the Jupyter notebook.
# `run_server(mode=...)` is the legacy JupyterDash call; a plain `Dash` app selects the
# notebook display mode through `jupyter_mode` on `app.run` (needs a Dash release with
# built-in Jupyter support).
app.run(jupyter_mode="inline", port=8060)

In [44]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio

In [45]:
# Create a Dash app.
# This rebinds `app`, so the layout and callback registered in the cells below attach
# to this new instance rather than to the app created earlier in the notebook.
app = dash.Dash(__name__)

# Export helper used by the dashboard callback
def save_plot_as_html(fig, filename):
    """Write a figure to an HTML file.

    Args:
        fig: Object exposing a ``write_html`` method (the callback passes
            Plotly figures).
        filename: Destination passed straight through to ``fig.write_html``.

    Returns:
        None.
    """
    fig.write_html(filename)

# Page layout: a heading, the clustering-method selector, and the two graphs
# that the callback fills in.
app.layout = html.Div(
    children=[
        html.H1(children='Interactive Product Clusters Dashboard'),

        # Selector for the clustering method; K-Means is pre-selected
        dcc.Dropdown(
            id='cluster-dropdown',
            options=[
                {'label': 'K-Means Clustering', 'value': 'kmeans'},
                {'label': 'Hierarchical Clustering', 'value': 'hierarchical'},
            ],
            value='kmeans',
            style={'width': '50%'},
        ),

        # Target of the cluster scatter plot
        dcc.Graph(id='cluster-plot'),

        # Target of the hierarchical-clustering dendrogram
        dcc.Graph(id='dendrogram-plot'),
    ]
)



In [46]:
# Callback to update the cluster plot based on selected method
@app.callback(
    Output('cluster-plot', 'figure'),
    Output('dendrogram-plot', 'figure'),
    Input('cluster-dropdown', 'value')
)
def update_visualization(selected_cluster_method):
    """Build the two dashboard figures and export the active one as HTML.

    Args:
        selected_cluster_method: Value of the 'cluster-dropdown' component:
            'kmeans', 'hierarchical', or None when the dropdown is cleared.

    Returns:
        A ``(cluster_figure, dendrogram_figure)`` tuple. The figure that does
        not belong to the selected method is an empty dict, which renders as a
        blank graph. Both are empty for a cleared or unrecognised selection.

    Side effects:
        Writes 'kmeans_clusters.html' or 'hierarchical_dendrogram.html'
        (relative paths) each time the corresponding method is selected.
    """
    # Start from blank figures so that a cleared dropdown (value None) returns
    # empty graphs instead of failing with UnboundLocalError at the return.
    fig = {}
    dendrogram_fig = {}

    # Plot for K-Means clustering
    if selected_cluster_method == 'kmeans':
        fig = px.scatter(df_sampled, x='PCA1', y='PCA2', color='KMeans_Cluster',
                         labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
                         title="K-Means Clusters")
        save_plot_as_html(fig, 'kmeans_clusters.html')  # Save K-Means plot as an HTML file

    # Plot for Hierarchical clustering
    elif selected_cluster_method == 'hierarchical':
        # `df_sampled` is a random sample of `df`; pick the matching rows of
        # `scaled_features` by position so each leaf gets its own product title.
        sampled_positions = df.index.get_indexer(df_sampled.index)
        # create_dendrogram expects the observation matrix and runs the
        # distance + linkage computation itself, so Ward linkage is supplied
        # through `linkagefun` rather than by passing a precomputed linkage matrix.
        dendrogram_fig = ff.create_dendrogram(
            scaled_features[sampled_positions],
            labels=df_sampled['Product Title'].tolist(),
            linkagefun=lambda distances: linkage(distances, method='ward'),
        )
        dendrogram_fig.update_layout(title="Hierarchical Clustering Dendrogram")
        save_plot_as_html(dendrogram_fig, 'hierarchical_dendrogram.html')  # Save Dendrogram plot as HTML file

    return fig, dendrogram_fig

# Run the app (without reloader) to start Dash app.
# `app.run` is the supported entry point; `run_server` is deprecated and no longer
# available in current Dash releases.
if __name__ == '__main__':
    app.run(debug=False, use_reloader=False)  # Don't use reloader for ngrok compatibility
