In [1]:
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as pltv
import gower
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import altair as alt
# Enable the "vegafusion" entry in Altair's data-transformer registry.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [2]:
# Read the pickled frame from the relative path below.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
df = pd.read_pickle("../data/processed.pkl")
# Labels of every column stored with pandas' 'category' dtype.
categorical_columns = df.select_dtypes(include="category").columns

In [5]:
import pandas as pd
import prince

# `data` is a second name for `df` (no copy is made), so changes made through
# either name are visible through the other.
data = df

# MCA is run on the category-dtype columns collected in `categorical_columns`.
mca_columns = categorical_columns

# Configure a 20-component MCA with a fixed random_state.
mca = prince.MCA(
    random_state=42,
    engine="sklearn",
    n_components=20,
    n_iter=3,
    check_input=True,
    copy=True,
)

# Fit on the categorical subset only.
mca.fit(data[mca_columns])

In [6]:
# Project every row onto the fitted MCA axes. Pass exactly the columns the model
# was fitted on: `df` may carry other columns (for example the 'Cluster' column
# that is added through the `data` alias further down), and those must not be
# fed into the transform.
row_coords = mca.row_coordinates(df[mca_columns])

In [7]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Number of clusters to fit (hard-coded; how it was chosen is not shown in this cell).
optimal_k = 37

# Build the mini-batch k-means estimator and fit it on the MCA row coordinates.
mbk_optimal = MiniBatchKMeans(
    random_state=42,
    batch_size=100,
    n_clusters=optimal_k,
).fit(row_coords)

# Fitted results: the centroid array and the per-row cluster labels.
centroids_optimal = mbk_optimal.cluster_centers_
labels_optimal = mbk_optimal.labels_

# Show both arrays.
print("Cluster Labels (Optimal):", labels_optimal)
print("Cluster Centroids (Optimal):", centroids_optimal)

Cluster Labels (Optimal): [11 11 11 ... 15 29 14]
Cluster Centroids (Optimal): [[ 2.49895926e-01 -3.40055024e-02 -1.54937403e-02 -6.89487333e-03
  -1.32629643e-02  4.71684103e-02  4.04059811e-03 -8.89039325e-02
  -1.84712633e-01 -1.37532741e-01 -8.68664826e-04  1.25499930e-01
  -6.76799585e-02  6.55209544e-02  4.83137283e-03  5.12863064e-02
   1.51806026e-02 -5.40773557e-03  7.86024343e-03  2.08232534e-02]
 [-3.32670555e-01 -1.48027950e-01  1.80820341e-01  3.82945292e-04
   2.88845228e-02  2.62283877e-02  2.26050552e-02  6.63352781e-02
   6.92266998e-01 -4.64570043e-01 -8.17221354e-01 -1.09300773e-01
   7.08977414e-02  4.70932960e-01  6.08796224e-02 -1.13575014e-01
   3.75410199e-01 -3.14866957e-01  1.56002981e-02  8.26668531e-02]
 [-4.99680889e-01  2.33406264e-01  2.76253968e-01 -1.24222346e-02
   1.24655456e-02 -1.37319708e-01 -2.26691648e-02 -3.15122350e-02
  -7.72209007e-04  1.73234887e-01 -2.45995987e-02  1.74279350e-02
  -2.04391420e-01 -2.43044722e-01 -1.53674855e-02  3.37569408

In [15]:
# Attach each row's cluster label. `data` was bound to `df` without copying
# earlier in the notebook, so this column is visible through `df` as well.
data['Cluster'] = labels_optimal

# Most frequent category of every MCA column over the whole dataset.
overall_modes = data[mca_columns].mode().iloc[0]

# Most frequent category of every MCA column within each cluster.
cluster_modes = {}
for cluster in range(optimal_k):
    cluster_data = data[data['Cluster'] == cluster]
    cluster_mode_table = cluster_data[mca_columns].mode()
    # A cluster id with no rows (or no non-missing values at all) produces an
    # empty mode table, and `.iloc[0]` on it would raise IndexError. Such
    # clusters are skipped and do not appear in the report below.
    if cluster_mode_table.empty:
        continue
    cluster_modes[cluster] = cluster_mode_table.iloc[0]

# For each cluster, keep the columns whose modal category differs from the
# dataset-wide modal category.
# NOTE(review): a column that is entirely missing inside a cluster presumably
# gets a NaN mode and would therefore compare as "different" -- confirm whether
# that is the intended reading.
top_columns = {
    cluster: [col for col in mca_columns if modes[col] != overall_modes[col]]
    for cluster, modes in cluster_modes.items()
}

# Report the differing columns per cluster.
for cluster, columns in top_columns.items():
    print(f"Cluster {cluster}: Top Contributing Columns: {columns}")

Cluster 0: Top Contributing Columns: ['State FIPS Code (_STATE)', 'Are you male or female? (CELLSEX1)', 'Sex of Respondent (SEXVAR)', 'Income Level (INCOME3)', 'Ever Had Sigmoidoscopy/Colonoscopy (HADSIGM4)', 'Adult flu shot/spray past 12 mos (FLUSHOT7)', 'Have you lost employment or had hours reduced? (SDHEMPLY)', 'During the past 12 months have you received food stamps (FOODSTMP)', 'How often did the food that you bought not last, and you didn\x92t have money to get more? (SDHFOOD1)', 'Were you not able to pay your bills? (SDHBILLS)', 'Were you not able to pay utility bills or threatened to lose service? (SDHUTILS)', 'Has a lack of reliable transportation kept you from appointments, meetings, work, or getting things needed (SDHTRNSP)']
Cluster 1: Top Contributing Columns: ['State FIPS Code (_STATE)', 'General Health (GENHLTH)', 'What is Primary Source of Health Insurance? (PRIMINSR)', 'Employment Status (EMPLOY1)', 'Have You Ever Had a Mammogram (HADMAM)', 'Have you ever had a cervic