#### Task: Perform clustering on a dataframe with continuous numerical and categorical columns, and evaluate the clusters with the Silhouette and Calinski-Harabasz scores

Dataset Used: `/mnt/work/workbench/dipendrp/new-data/final_episodes.csv`

Subtasks:
1. First script uses `fullHot_episodes.csv` with `'MiddleChildhood', 'Preschooler', 'Teenager'` instead of `age`
2. Second script uses the dataset `final_episodes.csv` with `age` instead of `age_group`
<br>

Description of work done:
1. `load_data` loads the data, preprocesses it, and returns the selected columns that `elbow_kprototype` and `with_med_diag_cluster_kprototype` use for clustering
2. `elbow_kprototype` draws the elbow plot (cost against number of clusters) with medication and diagnosis treated as categorical columns
3. `with_med_diag_cluster_kprototype` clusters with the medication and diagnosis columns and measures the Silhouette and Calinski-Harabasz scores
4. To run the script without `dotenv`, replace `os.getenv("FILE_Full_Phecode_ATC_PATH")` with the path to your file.

In [None]:
%run -i /mnt/work/workbench/dipendrp/Paper2/Github/CAMHS_Readmission_Analytics/allpackages.py

class Clustering:
    """K-Prototypes clustering of an episode table with mixed column types.

    Typical use: construct with a CSV path, call ``load_data`` to build the
    preprocessed working frame, then ``elbow_kprototype`` and/or
    ``with_med_diag_cluster_kprototype``.

    Relies on ``pd``, ``plt``, ``LabelEncoder``, ``StandardScaler``,
    ``KPrototypes``, ``gower``, ``silhouette_score`` and
    ``calinski_harabasz_score`` already being available in the namespace.
    """

    # Columns that are standardised before clustering.
    NUMERIC_COLUMNS = (
        'Count_visit', 'num_diagnoses', 'num_medications', 'Length_of_Episode',
        'Inpatient_daynight_ratio', 'Intensity_per_calendar_month', 'Inpatient_ratio',
        'TreatmentPlanning_ratio', 'Therapy_ratio', 'Care_intensity',
        'Examination_ratio', 'Advisory_ratio',
    )

    # Columns handed to K-Prototypes as categorical features.
    CATEGORICAL_COLUMNS = (
        'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'gender_0',
        'tillnextepisode', 'diagnosis', 'actual_med_Full_ATC',
    )

    # Column order of the working frame: one-hot flags, numeric features,
    # then the readmission label and the two free-text categorical columns.
    SELECTED_COLUMNS = (
        CATEGORICAL_COLUMNS[:6] + NUMERIC_COLUMNS + CATEGORICAL_COLUMNS[6:]
    )

    # Name of the column in which cluster labels are written back.
    CLUSTER_COLUMN = "cluster"

    def __init__(self, file1):
        """Remember the CSV path; nothing is read until ``load_data`` is called.

        Args:
            file1: Path of the CSV file to cluster.
        """
        self.file1 = file1
        self.merged_df = None
        self.with_diag_medic_selected_column_merged_df = None

    @classmethod
    def _categorical_indices(cls, frame):
        """Return the positions of ``CATEGORICAL_COLUMNS`` inside ``frame``.

        Deriving the positions from the column names keeps them correct even
        if the column list changes, instead of relying on hard-coded indices.

        Raises:
            KeyError: If any categorical column is missing from ``frame``.
        """
        missing = [name for name in cls.CATEGORICAL_COLUMNS if name not in frame.columns]
        if missing:
            raise KeyError(f"Categorical columns missing from data: {missing}")
        return [frame.columns.get_loc(name) for name in cls.CATEGORICAL_COLUMNS]

    @classmethod
    def _feature_frame(cls, frame):
        """Return a copy of ``frame`` without a previously written label column.

        Cluster labels must never be used as an input feature or take part in
        the distance computation, so they are stripped before any fitting.
        """
        return frame.drop(columns=[cls.CLUSTER_COLUMN], errors="ignore")

    def _require_loaded(self):
        """Return the working frame, failing clearly if it was never built."""
        frame = self.with_diag_medic_selected_column_merged_df
        if frame is None:
            raise RuntimeError("No data loaded: call load_data() first.")
        return frame

    def load_data(self, max_rows=500):
        """Read the CSV, preprocess it and return the frame used for clustering.

        Steps: keep the first ``max_rows`` rows, rename
        ``var_no_dates_permonth``, bin and label-encode ``tillnextepisode``,
        fill selected missing values with zero, keep ``SELECTED_COLUMNS``,
        standardise ``NUMERIC_COLUMNS`` and cast the diagnosis and medication
        columns to strings.

        Args:
            max_rows: Number of leading rows to keep (default 500, as in the
                original analysis). ``None`` keeps every row.

        Returns:
            The preprocessed DataFrame, which is also stored on the instance
            as ``with_diag_medic_selected_column_merged_df``.
        """
        frame = pd.read_csv(self.file1)
        if max_rows is not None:
            frame = frame.head(max_rows)

        # Rename the column var_no_dates_permonth to Intensity_per_calendar_month
        frame = frame.rename(columns={'var_no_dates_permonth': 'Intensity_per_calendar_month'})

        # Bin 'tillnextepisode' into a 'not-re-admitted' bucket plus time
        # intervals, then replace the interval labels with integer codes.
        binned = pd.cut(
            frame['tillnextepisode'],
            bins=[float('-inf'), 0, 180, 365, 730, 1095, float('inf')],
            labels=['not-re-admitted',
                    're-admitted in 0-180 days',
                    're-admitted in 180-365 days',
                    're-admitted in 365-730 days',
                    're-admitted in 730-1095 days',
                    're-admitted in more than 1095 days'],
        )
        frame['tillnextepisode'] = LabelEncoder().fit_transform(binned)

        # Missing counts and ratios are filled with zero.
        frame = frame.fillna({
            'num_diagnoses': 0, 'num_medications': 0,
            'Inpatient_daynight_ratio': 0, 'Inpatient_ratio': 0,
            'TreatmentPlanning_ratio': 0, 'Therapy_ratio': 0,
            'Examination_ratio': 0, 'Advisory_ratio': 0,
        })

        # Explicit copy: the assignments below must modify an independent
        # frame, not a slice of `frame` (avoids pandas chained-assignment issues).
        selected = frame[list(self.SELECTED_COLUMNS)].copy()

        numeric_columns = list(self.NUMERIC_COLUMNS)
        selected[numeric_columns] = StandardScaler().fit_transform(selected[numeric_columns])

        text_columns = ['diagnosis', 'actual_med_Full_ATC']
        selected[text_columns] = selected[text_columns].astype(str)

        self.with_diag_medic_selected_column_merged_df = selected
        return selected

    def elbow_kprototype(self, k_range=range(1, 11)):
        """Plot K-Prototypes cost against the number of clusters.

        The plot is saved as ``elbow_kprototype.jpg`` and then shown.

        Args:
            k_range: Cluster counts to evaluate (default 1 to 10).

        Raises:
            RuntimeError: If ``load_data`` has not been called yet.
        """
        features = self._feature_frame(self._require_loaded())
        categorical = self._categorical_indices(features)

        cluster_counts = list(k_range)
        costs = []
        for k in cluster_counts:
            km = KPrototypes(n_clusters=k, init='random')
            km.fit(features, categorical=categorical)
            costs.append(km.cost_)

        plt.plot(cluster_counts, costs, marker='o')
        plt.xlabel('Number of clusters')
        plt.ylabel('Cost')
        plt.title('Elbow method for KPrototype with Diagnosis and Medication')
        plt.savefig("elbow_kprototype.jpg", dpi=600, bbox_inches='tight')
        plt.show()

    def with_med_diag_cluster_kprototype(self, original_selected_column_merged_df, k_range):
        """Cluster for each k and print cluster sizes and quality scores.

        For every ``k`` the labels are written to the ``"cluster"`` column of
        ``original_selected_column_merged_df`` (the caller's frame is modified
        in place; after the call it holds the labels of the last ``k``).
        Fitting and the Gower distance matrix use only the feature columns,
        never an existing ``"cluster"`` column.

        Args:
            original_selected_column_merged_df: Frame as returned by ``load_data``.
            k_range: Iterable of cluster counts to evaluate.
        """
        print("******** With Diagnosis & Medication using gower distance ******** \n")

        features = self._feature_frame(original_selected_column_merged_df)
        categorical = self._categorical_indices(features)

        # The features do not change between values of k, so the pairwise
        # Gower distances are computed once instead of once per iteration.
        dist_matrix = gower.gower_matrix(features)

        for k in k_range:
            kproto = KPrototypes(n_clusters=k, init='Cao')
            clusters = kproto.fit_predict(features, categorical=categorical)

            original_selected_column_merged_df[self.CLUSTER_COLUMN] = clusters
            print(original_selected_column_merged_df[self.CLUSTER_COLUMN].value_counts())

            silhouette = silhouette_score(dist_matrix, clusters, metric='precomputed')
            # NOTE(review): calinski_harabasz_score is documented as taking a
            # feature matrix; it receives the Gower distance matrix here, as in
            # the original analysis -- confirm this is the intended input.
            calinski = calinski_harabasz_score(dist_matrix, clusters)

            print(f"Silhouette score for k={k}: {silhouette}")
            print(f"Calinski-Harabasz score for k={k}: {calinski}")

            
# Build the clustering helper for the input CSV and preprocess the data
data_path = "/mnt/work/workbench/dipendrp/new-data/Full_ICD10_ATC.csv"
clustering_obj = Clustering(data_path)
with_diag_medic_selected_column_merged_df = clustering_obj.load_data()

# Draw the elbow plot (K-Prototypes cost against number of clusters)
clustering_obj.elbow_kprototype()

# Cluster with the diagnosis and medication columns for k = 2..8 and print
# the Silhouette and Calinski-Harabasz scores for each k
candidate_cluster_counts = list(range(2, 9))
clustering_obj.with_med_diag_cluster_kprototype(
    with_diag_medic_selected_column_merged_df,
    k_range=candidate_cluster_counts,
)