In [1]:
import os
import subprocess
from pathlib import Path

# Make the repository root the working directory so that the relative data
# paths and project-local imports used in later cells resolve regardless of
# where the notebook was launched from. The root is whatever
# `git rev-parse --show-toplevel` reports (the directory holding .git).
project_root = Path(
    subprocess.check_output(
        ['git', 'rev-parse', '--show-toplevel'],
        text=True,
    ).strip()
)
os.chdir(project_root)

In [6]:
import pandas as pd

from K_mean_cluster import run_clustering




In [None]:
#BA_US_knn_text = pd.read_csv('data/knnData/BA_US_knn_text.csv')

#run_clustering(BA_US_knn_text)

In [None]:
#RB_US_knn_text = pd.read_csv('data/knnData/RB_US_knn_text.csv')

#run_clustering(RB_US_knn_text)

In [None]:
#UMAP clustering
# Load the BA/US kNN text table (path is relative to the project root set in
# the first cell) and run it through the project's prep_data helper.
from utils.prepData import prep_data
BA_US_knn_text = pd.read_csv('data/knnData/BA_US_knn_text.csv')
# prep_data returns two values. The first is used by the next cell as a
# DataFrame containing a 'user_state' column plus the clustering features.
# NOTE(review): `states` is presumably the list of state names; it is not used
# in the cells visible here -- confirm against utils/prepData.py.
df_total_clustering, states = prep_data(BA_US_knn_text)


In [63]:
import umap
import pandas as pd
import plotly.express as px
from sklearn.cluster import DBSCAN
import geopandas as gpd

# Step 1: Drop the 'user_state' column as it's just an identifier and not used for UMAP
df_total_clustering_cleaned = df_total_clustering.drop(columns=['user_state'])

print(df_total_clustering_cleaned.shape)

# Step 2: Fill NaN values with 0 so the embedding is fitted on a complete matrix
df_cleaned = df_total_clustering_cleaned.fillna(0)

# Step 3: Apply UMAP to reduce dimensions to 3 (fixed random_state keeps the
# embedding repeatable between runs)
umap_3d = umap.UMAP(n_components=3, random_state=42)
embedding_3d = umap_3d.fit_transform(df_cleaned)

# Step 4: Perform DBSCAN clustering on the embedding. Samples that do not fall
# in any dense region are labelled -1 (noise) by DBSCAN.
dbscan = DBSCAN(eps=0.65, min_samples=3)  # Tune eps and min_samples for better clustering
dbscan_labels = dbscan.fit_predict(embedding_3d)

# Step 5: Add the cluster labels and the state identifier back to the dataframe.
# Labels are stored as strings so Plotly treats them as discrete categories and
# so they match the keys of cluster_color_map. This is done only after the
# embedding has been fitted, so neither column leaks into the feature matrix.
df_cleaned['Cluster'] = pd.Series(dbscan_labels, index=df_cleaned.index).astype(str)
df_cleaned['user_state'] = df_total_clustering['user_state']

# Define a consistent color map for clusters (shared with the map cell below).
# '-1' is included explicitly: without it DBSCAN noise points would be given an
# arbitrary color from Plotly's default sequence. Cluster ids beyond '5' still
# fall back to Plotly's defaults.
cluster_color_map = {
    '-1': "#95A5A6",  # Gray (DBSCAN noise / unclustered)
    '0': "#3498DB",   # Sky Blue
    '1': "#F1C40F",   # Gold
    '2': "#E74C3C",   # Tomato Red
    '3': "#2ECC71",   # Emerald Green
    '4': "#9B59B6",   # Amethyst Purple
    '5': "#1F618D"    # Deep Blue
}

# Step 6: Plot the 3D UMAP with Plotly, one marker per row of df_cleaned
fig = px.scatter_3d(
    df_cleaned,
    x=embedding_3d[:, 0],
    y=embedding_3d[:, 1],
    z=embedding_3d[:, 2],
    title="DBSCAN Clustering on UMAP",
    labels={'Cluster': 'Cluster Label'},
    hover_data={'user_state': True, 'Cluster': False},  # Only show user_state on hover, hide Cluster
    color='Cluster',
    color_discrete_map=cluster_color_map  # Apply custom colors
)

fig.show()



# The choropleth map is drawn in the next cell; it reuses cluster_color_map so colors stay consistent.


(50, 824)



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [64]:
import pandas as pd
import plotly.express as px


def draw_map(df_cleaned, color_map=None):
    """Draw a choropleth of US states colored by their cluster label.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        Must contain a 'user_state' column holding full state names and a
        'Cluster' column. The frame is not modified; a copy with the extra
        plotting columns is built internally.
    color_map : dict, optional
        Mapping from cluster label (as a string) to a color. When omitted,
        the notebook-level ``cluster_color_map`` is used.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    import warnings

    if color_map is None:
        # Fall back to the color map defined in the clustering cell.
        color_map = cluster_color_map

    # Step 1: Define a mapping for state abbreviations
    state_to_abbr = {
        'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
        'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
        'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
        'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
        'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
        'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
        'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
        'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
        'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
        'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
    }

    # Step 2: Build the plotting frame on a copy so the caller's DataFrame is
    # left untouched. Cluster is cast to string to match the keys of color_map.
    map_df = df_cleaned.assign(
        state_abbreviation=df_cleaned['user_state'].map(state_to_abbr),
        Cluster=df_cleaned['Cluster'].astype(str),
    )

    # Step 3: Names missing from state_to_abbr map to NaN and cannot be placed
    # on the map; report them instead of dropping them silently.
    unmapped_mask = map_df['state_abbreviation'].isna()
    if unmapped_mask.any():
        unmapped_names = map_df.loc[unmapped_mask, 'user_state'].astype(str).unique().tolist()
        warnings.warn(
            f"draw_map: no state abbreviation for {unmapped_names}; these rows are not drawn.",
            stacklevel=2,
        )
        map_df = map_df.loc[~unmapped_mask]

    # Step 4: Plot the choropleth map
    fig = px.choropleth(
        map_df,
        locations='state_abbreviation',
        color='Cluster',
        color_discrete_map=color_map,  # Apply the custom color map
        title="USA State Clusters Based on DBSCAN",
        locationmode="USA-states",
        hover_name='user_state',  # Hover shows the state name
        hover_data={'user_state': False, 'Cluster': False, 'state_abbreviation': False}  # Hide these fields on hover
    )

    fig.update_layout(
        width=900,  # Adjust width
        height=600,  # Adjust height
        title_font=dict(size=20),
        geo=dict(scope='usa', projection_type="albers usa")
    )

    fig.show()

# Call the function to draw the map
draw_map(df_cleaned)
