# Determining cluster labelling

In [1]:
import os

import dask
import matplotlib.pyplot as plt
import pandas as pd
from dask.diagnostics import ProgressBar
from kmodes.kprototypes import KPrototypes

# Matplotlib 3.6 renamed its bundled seaborn styles with a "seaborn-v0_8-" prefix
# and later releases removed the old names, so use whichever name this
# installation provides.
plt.style.use(
    "seaborn-v0_8-colorblind"
    if "seaborn-v0_8-colorblind" in plt.style.available
    else "seaborn-colorblind"
)

%matplotlib inline

In [2]:
# Number of clusters requested from every k-prototypes fit in this notebook.
n_clusters = 4

# Location of the COPD dataset. The original absolute path remains the default,
# but it can be overridden with the COPD_DATA_PATH environment variable so the
# notebook also runs on machines where that volume is not mounted.
copd_path = os.environ.get("COPD_DATA_PATH", "/Volumes/thesis-data/copd.csv")

copd = pd.read_csv(copd_path, parse_dates=["admission_date", "discharge_date"])

# Column groups selected from the COPD dataset. These names are used verbatim to
# index the loaded frame, so they are reproduced exactly as given.
clinicals = [
    "n_episodes", "n_wards", "n_consultants", "true_los",
    "n_pr_attendances", "n_sn_attendances", "n_copd_admissions_last_year",
    "charlson_gross", "n_icds", "intervention",
    "day_of_week", "gender", "deprivation_decile",
]

# NOTE(review): "muscoloskeletal" presumably mirrors the dataset's own column
# header -- confirm against the CSV before correcting the spelling.
codes = [
    "infectious", "neoplasms", "blood", "endocrine", "mental", "nervous",
    "eye", "ear", "circulatory", "respiratory", "digestive", "skin",
    "muscoloskeletal", "genitourinary", "perinatal", "congenital",
    "abnormal_findings", "injury", "external_causes", "contact_factors",
    "special_use",
]

# NOTE(review): "sever_liver_disease" presumably mirrors the dataset's own column
# header -- confirm against the CSV before correcting the spelling.
conditions = [
    "ami", "cva", "chf", "ctd", "dementia", "diabetes", "liver_disease",
    "peptic_ulcer", "pvd", "pulmonary_disease", "cancer",
    "diabetic_complications", "paraplegia", "renal_disease",
    "metastatic_cancer", "sever_liver_disease", "hiv", "cdiff", "mrsa",
    "obese", "sepsis",
]

# Every column used for clustering, in group order.
cols = [*clinicals, *codes, *conditions]

In [3]:
def clean_data(data, missing_prop=0.25, max_stay=365):
    """Drop sparse columns, over-long stays and incomplete records.

    A column is removed when its number of missing values exceeds
    ``missing_prop`` times the number of rows. From what remains, only rows
    whose ``true_los`` is at most ``max_stay`` and that contain no missing
    values are kept. The input frame is left untouched; a new frame is
    returned.
    """
    null_counts = data.isnull().sum()
    too_sparse = null_counts[null_counts > missing_prop * len(data)].index

    trimmed = data.drop(columns=too_sparse)
    within_limit = trimmed["true_los"] <= max_stay

    return trimmed[within_limit].dropna()


def get_categorical(data):
    """Return the positions of the columns holding object or string data.

    The dtypes are walked in column order rather than through a dict keyed by
    column label, so a frame with repeated labels still yields the correct
    positions. Columns with a pandas string dtype are reported alongside
    plain ``object`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list of int
        Zero-based column positions, in ascending order.
    """
    is_object = pd.api.types.is_object_dtype
    is_string = pd.api.types.is_string_dtype

    return [
        position
        for position, dtype in enumerate(data.dtypes)
        if is_object(dtype) or is_string(dtype)
    ]

In [4]:
# Work on a copy of just the modelling columns. max_stay=10000 is far above
# clean_data's default of 365.
data = clean_data(copd.loc[:, cols].copy(), max_stay=10000)

# Column positions passed to KPrototypes.fit as the categorical features.
categorical = get_categorical(data)

In [5]:
@dask.delayed
def kprototypes_matching(state):
    """Fit k-prototypes once with the "matching" initialisation.

    The function is wrapped in ``dask.delayed``, so calling it only builds a
    task; the fit itself runs when that task is computed. It reads the
    module-level ``n_clusters``, ``data`` and ``categorical``.

    Parameters
    ----------
    state
        Seed handed to ``KPrototypes`` as ``random_state``.

    Returns
    -------
    tuple
        ``(cost_, labels_)`` taken from the fitted model.
    """
    # NOTE(review): confirm the installed kmodes build accepts init="matching".
    kp = KPrototypes(n_clusters, init="matching", n_init=1, random_state=state)

    # Fit on `data` itself: `categorical` holds column positions computed from
    # `data`, and `data[cols]` would raise a KeyError if clean_data had dropped
    # any of the columns named in `cols`.
    kp.fit(data, categorical=categorical)

    return kp.cost_, kp.labels_

In [None]:
# One lazy fit per seed 0-49; nothing is executed until dask.compute is called.
tasks = map(kprototypes_matching, range(50))

# Run the fits in four worker processes, reporting progress as they finish.
with ProgressBar():
    results = dask.compute(*tasks, scheduler="processes", num_workers=4)

[                                        ] | 2% Completed | 24.4s

In [None]:
# Split the (cost, labels) pairs returned by the runs into two parallel lists.
costs = [cost for cost, _ in results]
all_labels = [labels for _, labels in results]

In [None]:
# Baseline for comparison: one k-prototypes fit using the "cao" initialisation.
# A fixed random_state keeps any randomness in the fit reproducible on rerun,
# in line with the seeded matching runs.
kp = KPrototypes(n_clusters, init="cao", random_state=0)

# Fit on `data` itself: `categorical` holds column positions computed from
# `data`, and `data[cols]` would raise a KeyError if clean_data had dropped any
# of the columns named in `cols`.
kp.fit(data, categorical=categorical)

cao_cost, cao_labels = kp.cost_, kp.labels_

In [None]:
_, ax = plt.subplots(dpi=300)

# Every point sits at x = 0; only the vertical position (the cost) is meaningful.
ax.scatter([0] * len(costs), costs, label="matching")
ax.scatter([0], [cao_cost], alpha=0.5, label="cao")

# The costs are drawn along the y-axis, so that is the axis that gets the label.
# The x-axis is a dummy position, so its ticks are hidden.
ax.set_xticks([])
ax.set_ylabel("Final cost")
ax.legend()

In [None]:
# Show the Cao cost beside the lowest cost among the matching-initialised runs.
(cao_cost, min(costs))