In [None]:
'''
To perform clustering on continuous numerical and categorical dataframes and evaluate their Silhouette and Calinski-Harabasz scores
******************************
1. elbow_kmeans and elbow_kprototype make elbow plots without and with the medication and diagnosis columns, respectively
2. without_med_diag_cluster_kmeans() does clustering without the medication and diagnosis columns; it uses only numerical variables
3. with_med_diag_cluster_kmeans() does clustering after transforming the medication and diagnosis columns into one-hot encoding
4. with_med_diag_cluster_kprototype() does clustering using the medication and diagnosis columns as categorical strings and uses Gower distance to measure the Silhouette and Calinski-Harabasz scores
5. To use the script, set the FILE1_PATH and FILE2_PATH environment variables (e.g. in a .env file) and install the requirements
******************************
'''

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import random
from IPython.display import display
from IPython.display import Image
from sklearn.cluster import KMeans
from kmodes.kprototypes import KPrototypes
import matplotlib.pyplot as plt
from sklearn import metrics
from gower import gower_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import gower

import os
from dotenv import load_dotenv
load_dotenv()



class Clustering:
    """Cluster merged episode data and print Silhouette / Calinski-Harabasz scores.

    Two CSV files are inner-joined on ``episode_id``.  K-Means is run on the
    numeric-only frame (or on a caller-supplied one-hot encoded matrix) and
    K-Prototypes on the mixed numeric/categorical frame.

    The clustering methods store their labels in a ``cluster`` column.  That
    column is always excluded from the data that is fitted and scored, so the
    labels of one run never leak into the next run or into the metrics.
    """

    # Column the clustering methods write their labels into.
    CLUSTER_COLUMN = "cluster"
    # Categorical (string) columns: diagnosis and medication.
    CATEGORICAL_COLUMNS = ('actual_diag', 'actual_med')
    # Positional fallback used when the categorical columns cannot be found by
    # name (these are the indices the script originally hard-coded).
    LEGACY_CATEGORICAL_POSITIONS = (40, 41)
    # Columns removed from the merged frame before any clustering.
    DROPPED_COLUMNS = ('pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
                       'tillnextepisode', 'label', 'gender', 'age_group')

    def __init__(self, file1, file2):
        """Remember the two CSV paths; nothing is read until ``load_data()``."""
        self.file1 = file1
        self.file2 = file2
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None

    @classmethod
    def _feature_frame(cls, df):
        """Return a copy of *df* without the label column of earlier clustering runs."""
        return df.drop(columns=[cls.CLUSTER_COLUMN], errors="ignore")

    @classmethod
    def _categorical_positions(cls, df):
        """Return the positional indices of the categorical columns in *df*.

        Falls back to ``LEGACY_CATEGORICAL_POSITIONS`` when the named columns
        are not all present, which keeps the original positional behaviour.
        """
        columns = list(df.columns)
        if all(name in columns for name in cls.CATEGORICAL_COLUMNS):
            return [columns.index(name) for name in cls.CATEGORICAL_COLUMNS]
        return list(cls.LEGACY_CATEGORICAL_POSITIONS)

    @staticmethod
    def _plot_elbow(k_values, costs, ylabel, title, filename):
        """Draw one elbow curve on its own figure, save it to *filename* and show it."""
        # A dedicated figure keeps consecutive elbow plots from overlaying each
        # other on backends where plt.show() does not clear the current figure.
        fig, ax = plt.subplots()
        ax.plot(k_values, costs, marker='o')
        ax.set_xlabel('Number of clusters')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        fig.savefig(filename, dpi=600, bbox_inches='tight')
        plt.show()
        plt.close(fig)

    def load_data(self):
        """Read both CSV files, merge them and build the working frames.

        Returns:
            tuple: ``(merged_df, selected_column_merged_df,
            without_diag_medic_selected_column_merged_df)`` where
            ``merged_df`` is the inner join on ``episode_id`` minus
            ``DROPPED_COLUMNS``, ``selected_column_merged_df`` is a copy with
            the categorical columns cast to ``str``, and the last frame is a
            copy without the categorical columns.
        """
        original_df_1 = pd.read_csv(self.file1)
        original_df_2 = pd.read_csv(self.file2)

        merged = pd.merge(original_df_1, original_df_2, on='episode_id', how='inner')
        self.merged_df = merged.drop(columns=list(self.DROPPED_COLUMNS))

        # astype/drop return new frames, so the three attributes stay independent.
        self.selected_column_merged_df = self.merged_df.astype(
            {name: str for name in self.CATEGORICAL_COLUMNS}
        )
        numeric_only = self.merged_df.drop(columns=list(self.CATEGORICAL_COLUMNS))
        # Kept as a separate attribute for backward compatibility.
        self.without_selected_column_merged_df = numeric_only
        self.without_diag_medic_selected_column_merged_df = numeric_only.copy()

        return self.merged_df, self.selected_column_merged_df, self.without_diag_medic_selected_column_merged_df

    def elbow_kmeans(self):
        """Plot K-Means inertia for k = 1..10 on the numeric-only frame.

        The plot is saved as ``elbow_kmeans.jpg`` and shown.
        """
        features = self._feature_frame(self.without_diag_medic_selected_column_merged_df)
        k_values = range(1, 11)
        costs = [KMeans(n_clusters=k, random_state=0).fit(features).inertia_ for k in k_values]
        self._plot_elbow(
            k_values, costs,
            ylabel='Inertia (Cost)',
            title='Elbow method for K-Means without Diagnosis and Medication',
            filename="elbow_kmeans.jpg",
        )

    def elbow_kprototype(self):
        """Plot the K-Prototypes cost for k = 1..10 on the mixed frame.

        The plot is saved as ``elbow_kprototype.jpg`` and shown.
        """
        features = self._feature_frame(self.selected_column_merged_df)
        categorical = self._categorical_positions(features)
        k_values = range(1, 11)
        costs = []
        for k in k_values:
            # Seeded like the K-Means runs so the curve is reproducible.
            km = KPrototypes(n_clusters=k, init='random', random_state=0)
            km.fit(features, categorical=categorical)
            costs.append(km.cost_)
        self._plot_elbow(
            k_values, costs,
            ylabel='Cost',
            title='Elbow method for KPrototype with Diagnosis and Medication',
            filename="elbow_kprototype.jpg",
        )

    def without_med_diag_cluster_kmeans(self, k_range):
        """Run K-Means on the numeric-only frame for each k and print the scores.

        Args:
            k_range: iterable of cluster counts to evaluate.

        Side effects:
            Overwrites the ``cluster`` column of
            ``without_diag_medic_selected_column_merged_df`` with the labels
            of the most recent k.
        """
        print("******** Without Diagnosis & Medication ******** \n")
        frame = self.without_diag_medic_selected_column_merged_df
        # Snapshot of the features only: the label column written below must
        # not be part of the fit or of the scores.
        features = self._feature_frame(frame)
        for k in k_range:
            labels = KMeans(n_clusters=k, random_state=0).fit_predict(features)
            frame[self.CLUSTER_COLUMN] = labels
            print(frame[self.CLUSTER_COLUMN].value_counts())
            silhouette = metrics.silhouette_score(features, labels, metric="euclidean")
            calinski = metrics.calinski_harabasz_score(features, labels)
            print(f"Silhouette score for k={k}: {silhouette}")
            print(f"Calinski-Harabasz score for k={k}: {calinski}")

    def with_med_diag_cluster_kmeans(self, combined_data, k_range):
        """Run K-Means on a caller-supplied feature matrix and print the scores.

        Args:
            combined_data: feature matrix to cluster (the driver script passes
                scaled numeric columns plus one-hot encoded categoricals).
            k_range: iterable of cluster counts to evaluate.

        Side effects:
            Overwrites the ``cluster`` column of ``selected_column_merged_df``
            with the labels of the most recent k.
        """
        print("******** With Diagnosis & Medication in One Hot Encoded Form ******** \n")
        for k in k_range:
            labels = KMeans(n_clusters=k, init='k-means++', random_state=0).fit_predict(combined_data)

            self.selected_column_merged_df[self.CLUSTER_COLUMN] = labels
            print(self.selected_column_merged_df[self.CLUSTER_COLUMN].value_counts())

            silhouette = silhouette_score(combined_data, labels, metric="euclidean")
            calinski = calinski_harabasz_score(combined_data, labels)

            print(f"Silhouette score for k={k}: {silhouette}")
            print(f"Calinski-Harabasz score for k={k}: {calinski}")

    def with_med_diag_cluster_kprototype(self, original_selected_column_merged_df, k_range):
        """Run K-Prototypes on a mixed frame and score it with Gower distances.

        Args:
            original_selected_column_merged_df: frame with numeric columns and
                the categorical diagnosis/medication columns.
            k_range: iterable of cluster counts to evaluate.

        Side effects:
            Overwrites the ``cluster`` column of the passed frame with the
            labels of the most recent k.
        """
        print("******** With Diagnosis & Medication using gower distance ******** \n")
        frame = original_selected_column_merged_df
        # Features exclude any label column, so neither the fit nor the
        # distance matrix is influenced by a previous run.
        features = self._feature_frame(frame)
        categorical = self._categorical_positions(features)
        # The distance matrix depends only on the features, not on k.
        dist_matrix = gower.gower_matrix(features)

        for k in k_range:
            kproto = KPrototypes(n_clusters=k, init='Cao', random_state=0)
            labels = kproto.fit_predict(features, categorical=categorical)

            frame[self.CLUSTER_COLUMN] = labels
            print(frame[self.CLUSTER_COLUMN].value_counts())

            silhouette = silhouette_score(dist_matrix, labels, metric='precomputed')
            # NOTE(review): calinski_harabasz_score expects a feature matrix;
            # here every row of Gower distances is scored as a feature vector.
            # Kept as in the original so reported numbers stay comparable.
            calinski = calinski_harabasz_score(dist_matrix, labels)

            print(f"Silhouette score for k={k}: {silhouette}")
            print(f"Calinski-Harabasz score for k={k}: {calinski}")

            
# Access and load files (paths come from the environment / .env file)
file1_path = os.getenv("FILE1_PATH")
file2_path = os.getenv("FILE2_PATH")
if not file1_path or not file2_path:
    # Fail early with a clear message instead of an obscure read_csv error.
    raise ValueError("FILE1_PATH and FILE2_PATH must be set (e.g. in a .env file)")
clustering_obj = Clustering(file1_path, file2_path)
merged_df, selected_column_merged_df, without_diag_medic_selected_column_merged_df = clustering_obj.load_data()
# load_data() has already cast these columns to str, so no second cast is needed.
cat_cols = ['actual_diag', 'actual_med']

# Cluster counts evaluated by every scoring run below
cluster_counts = [2, 3, 4, 5, 6, 7, 8]

# Plot elbow with and without medication and diagnosis
clustering_obj.elbow_kmeans()
clustering_obj.elbow_kprototype()

# Call without medication and diagnosis
clustering_obj.without_med_diag_cluster_kmeans(k_range=cluster_counts)

# Unscaled snapshot for K-Prototypes, taken before the in-place scaling below
original_selected_column_merged_df = selected_column_merged_df.copy()

# Call with medication and diagnosis in One-hot encoded form
numeric_columns = ['MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
       'num_medications', 'Cat_LOE', 'Cat_TNE', 'Cat_CV', 'Therapy_ratio_x',
       'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
       'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
       'Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
       'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5',
       'closingcode_6', 'closingcode_9', 'aftercode_1', 'aftercode_2',
       'aftercode_3', 'aftercode_4', 'aftercode_5', 'LOE_Norm', 'LOE_long',
       'LOE_short', 'No_more_episode', 'TNE_less180', 'TNE_more180',
       'length_of_episode', 'Count_visit', 'Therapy_ratio_y']
scaler = StandardScaler()
selected_column_merged_df[numeric_columns] = scaler.fit_transform(selected_column_merged_df[numeric_columns])
# dtype=float keeps the encoded matrix purely numeric; without it newer pandas
# emits bool dummies and .values degrades to an object array.
df_encoded = pd.get_dummies(selected_column_merged_df, columns=cat_cols, dtype=float)
combined_data = df_encoded.values
clustering_obj.with_med_diag_cluster_kmeans(combined_data, k_range=cluster_counts)

# Call with medication and diagnosis as categorical strings (K-Prototypes, scored with Gower distance)
clustering_obj.with_med_diag_cluster_kprototype(original_selected_column_merged_df, k_range=cluster_counts)
