In [None]:
## Import postgresql username and password ##
import os
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment
load_dotenv()

# PostgreSQL credentials; either value is None when the variable is not set.
# NOTE(review): "username" may collide with an OS-provided variable on some
# platforms (e.g. USERNAME on Windows) — confirm the .env value is the one used.
username = os.environ.get("username")
password = os.environ.get("password")

In [None]:
## Implement K-Means Clustering with elbow and silhouette score for choosing K ##
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import joblib

def choose_k(data):
    """Pick the number of K-Means clusters using the silhouette score.

    Fits a KMeans model for every k in 2..10, records the inertia and the
    silhouette score of each fit, and returns the k with the highest
    silhouette score.

    Side effects:
        * prints one summary line per candidate k;
        * saves an elbow (inertia) plot and a silhouette plot side by side
          to ``clustering_scores.png``.

    Args:
        data: feature matrix accepted by ``KMeans.fit_predict`` and
            ``silhouette_score``.

    Returns:
        The candidate k (int) with the highest silhouette score.
    """
    k_range = range(2, 11)
    inertias = []
    silhouette_scores = []

    # Start below any attainable silhouette value so the first k always wins.
    best_score = float("-inf")
    best_k = None

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(data)

        inertia = kmeans.inertia_
        score = silhouette_score(data, labels)

        inertias.append(inertia)
        silhouette_scores.append(score)

        print(f"k={k} | inertia={inertia:.2f} | silhouette={score:.4f}")

        if score > best_score:
            best_score = score
            best_k = k

    fig, (ax_inertia, ax_silhouette) = plt.subplots(1, 2, figsize=(10, 4))

    ax_inertia.plot(k_range, inertias, marker='o', color='red')
    ax_inertia.set_xlabel('k (Number of clusters)')
    ax_inertia.set_ylabel('Inertia')

    ax_silhouette.plot(k_range, silhouette_scores, marker='o', color='blue')
    ax_silhouette.set_xlabel('k (Number of clusters)')
    ax_silhouette.set_ylabel('Silhouette Score')

    # The original referenced `plt.tight_layout` without calling it, so the
    # layout was never applied before saving.
    fig.tight_layout()
    fig.savefig("clustering_scores.png", dpi=300)

    return best_k

# Data must be one-hot encoded!!
def k_means(data):
    """Train a K-Means model with an automatically chosen k and save it.

    The number of clusters is selected by ``choose_k(data)``. The fitted
    model is written to ``kmeans_model.pkl`` and the chosen k is printed.

    Args:
        data: feature matrix; the notebook expects it to be one-hot encoded.

    Returns:
        None.
    """
    best_k = choose_k(data)

    model = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    # fit() returns the estimator itself, not labels, so nothing is captured.
    model.fit(data)

    # Save the model
    joblib.dump(model, "kmeans_model.pkl")

    print(f"Best k: {best_k}")

In [None]:
## Load trained model from model file ##
import joblib


def predict_w_kmeans(data, model_path="agglom.pkl"):
    """Load a saved clustering model and return cluster labels for ``data``.

    Despite the function name, the default file is ``agglom.pkl``.

    If the loaded model exposes ``predict`` it is used, so the trained state
    is kept instead of being overwritten by a refit. Models without
    ``predict`` fall back to ``fit_predict``, which re-clusters ``data``
    from scratch.

    Args:
        data: feature matrix with the same columns the model was trained on.
        model_path: path of the joblib file to load.

    Returns:
        Array of cluster labels, one per row of ``data``.
    """
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    model = joblib.load(model_path)

    if hasattr(model, "predict"):
        return model.predict(data)

    # No predict() available: the only option is to refit on the given data.
    return model.fit_predict(data)


In [None]:
## Implement Agglomerative Clustering w/ one-hot-encoding ##
from sklearn.cluster import AgglomerativeClustering

def agglom_clustering(data, n_clusters=4, linkage='ward', model_path="agglom.pkl"):
    """Cluster ``data`` with agglomerative clustering and save the model.

    Args:
        data: feature matrix to cluster.
        n_clusters: number of clusters to form.
        linkage: linkage criterion passed to ``AgglomerativeClustering``.
        model_path: file the fitted model is dumped to with joblib.

    Returns:
        Array of cluster labels, one per row of ``data``.
    """
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
    labels = model.fit_predict(data)

    joblib.dump(model, model_path)

    return labels

In [None]:
## Load and one-hot encode the dataset from postgresql db ##
import psycopg2
import pandas as pd
from scipy.stats import zscore

# Connect to postgresql db
conn = psycopg2.connect(
    dbname="hackku25",
    user=username,
    password=password,
    host="127.0.0.1",
    port="5432"
)

# Load condition check-ins for conditions with at least 50 check-ins overall.
# The connection is closed as soon as the frame is loaded, even on failure.
query_all_users = "select * from checkins where trackable_type = 'Condition' and trackable_name in (select trackable_name from checkins group by trackable_name having count(*) >= 50);"
try:
    df = pd.read_sql(query_all_users, conn)
finally:
    conn.close()

# One-hot encode the (lower-cased) condition names, then sum per user so each
# row is one user and each column holds that user's count for a condition.
one_hot = pd.get_dummies(df['trackable_name'].str.lower())
df_one_hot = pd.concat([df['user_id'], one_hot], axis=1)
df_one_hot = df_one_hot.groupby("user_id", as_index=False).sum()
df_one_hot = df_one_hot.drop(columns=['user_id', 'ibs', 'irritable bowel syndrome'])

# Compute and remove outliers: a user is dropped when any column has |z| > 2
z_scores = zscore(df_one_hot)
abs_z_scores = abs(z_scores)
outliers = (abs_z_scores > 2).any(axis=1)
# Row positions equal index labels here because groupby(as_index=False)
# produced a fresh 0..n-1 index.
outlier_indices = [indx for indx, is_outlier in enumerate(outliers) if is_outlier]

df_one_hot = df_one_hot.drop(index=outlier_indices)

# Train the agglomerative clustering model (also saves it to disk)
agglom_clustering(df_one_hot)

In [None]:
## Visualize model ##
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


# Work on the feature columns only. Dropping a previous 'cluster' column makes
# this cell safe to re-run and keeps the label out of clustering and PCA.
features = df_one_hot.drop(columns=['cluster'], errors='ignore')

labels = predict_w_kmeans(features)
df_one_hot['cluster'] = labels

# Project the features onto their first three principal components
pca = PCA(n_components=3)
X_pca = pca.fit_transform(features)

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# Use the 3D axes' own scatter: plt.scatter(x, y, z, ...) treats the third
# positional argument as marker size, not as the z coordinate.
ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=labels, cmap='viridis', alpha=0.7)
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_zlabel('PC 3')
plt.tight_layout()
plt.show()

In [None]:
## Analyze clusters ##
# Per-cluster column totals, transposed so conditions are rows and clusters are
# columns. Values are sums of the per-user counts, not numbers of users.
df_one_hot.groupby(by='cluster').sum().transpose()

In [None]:
## Looks at conditions that dominate the cluster ##
# Total count of every condition per cluster (conditions as rows)
condition_sums = df_one_hot.groupby('cluster').sum().T
# The 30 conditions with the largest totals across all clusters
top_conditions = condition_sums.sum(axis=1).nlargest(30).index.tolist()

# Reuse the labels already stored in df_one_hot. Re-clustering here would feed
# the 'cluster' column itself into the model as a feature.
labels = df_one_hot['cluster'].to_numpy()

# assign() builds a new frame, so nothing is written into a slice of df_one_hot
df_top_conditions = df_one_hot[top_conditions].assign(cluster=labels)
df_top_conditions = df_top_conditions.groupby('cluster').sum().T

df_one_hot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Heatmap of the log-scaled top-condition totals per cluster; the +2 offset
# keeps the logarithm defined and positive for zero counts.
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(np.log(df_top_conditions + 2), cmap='Blues', ax=ax)
ax.set_title("Most Common Symptoms Across Disease Clusters")
plt.show()

In [None]:
## Assign each user_id their cluster number ##
## Load and one-hot encode the dataset from postgresql db ##
import psycopg2
import pandas as pd
from scipy.stats import zscore

# Connect to postgresql db
conn = psycopg2.connect(
    dbname="hackku25",
    user=username,
    password=password,
    host="127.0.0.1",
    port="5432"
)

# Condition check-ins for conditions with at least 50 check-ins overall
query_all_users = "select * from checkins where trackable_type = 'Condition' and trackable_name in (select trackable_name from checkins group by trackable_name having count(*) >= 50);"
# Treatment check-ins. The two columns are selected separately: wrapping them in
# parentheses returns one composite value that would have to be string-parsed,
# which breaks on names containing commas, parentheses or quotes.
query_treatments = "select user_id, trackable_name from checkins where trackable_type = 'Treatment' and trackable_name in (select trackable_name from checkins group by trackable_name having count(*) >= 50);"

# Run both queries, then release the connection even if one of them fails
try:
    df = pd.read_sql(query_all_users, conn)
    df_medications = pd.read_sql(query_treatments, conn)
finally:
    conn.close()

# One-hot encode the (lower-cased) condition names and sum per user
one_hot = pd.get_dummies(df['trackable_name'].str.lower())
df_one_hot = pd.concat([df['user_id'], one_hot], axis=1)
df_one_hot = df_one_hot.groupby("user_id", as_index=False).sum()
user_ids = df_one_hot['user_id']
df_one_hot = df_one_hot.drop(columns=['user_id', 'ibs', 'irritable bowel syndrome'])

# Compute and remove outliers: a user is dropped when any column has |z| > 2
z_scores = zscore(df_one_hot)
abs_z_scores = abs(z_scores)
outliers = (abs_z_scores > 2).any(axis=1)
# Row positions equal index labels here because groupby(as_index=False)
# produced a fresh 0..n-1 index.
outlier_indices = [indx for indx, is_outlier in enumerate(outliers) if is_outlier]

# Drop the same rows from the features and the ids so they stay aligned
df_one_hot = df_one_hot.drop(index=outlier_indices)
user_ids = user_ids.drop(index=outlier_indices)

# Get labels
labels = predict_w_kmeans(df_one_hot)

df_user_id_to_cluster = pd.DataFrame({"user_ids": user_ids.tolist(), "cluster": labels.tolist()})

# Attach a cluster to every treatment check-in whose user was clustered.
# user_ids are unique (one row per user), so the merge never duplicates rows.
medication_clusters = df_medications.merge(
    df_user_id_to_cluster, left_on="user_id", right_on="user_ids", how="inner"
)

# Count treatment check-ins per (treatment, cluster). Combinations that never
# occur stay NaN, so they show up as blank cells rather than log(0).
cluster_ids = sorted(df_user_id_to_cluster["cluster"].unique())
df_medications_summary = (
    medication_clusters
    .groupby(["trackable_name", "cluster"])
    .size()
    .unstack("cluster")
    .reindex(columns=cluster_ids)  # keep a column for every cluster, even if empty
    .rename(columns=lambda cluster_id: f"Cluster{cluster_id}")
    .rename_axis(index=None, columns=None)
)

# Display heatmaps for medications
plt.figure(figsize=(20, 30))
sns.heatmap(np.log(df_medications_summary), cmap="Blues")
plt.title("Treatment by Cluster")