#### Task: Perform clustering on a dataframe of continuous numerical and categorical features and evaluate the clusters using the Silhouette and Calinski-Harabasz scores

Dataset Used: `/mnt/work/workbench/dipendrp/new-data/final_episodes.csv`

Subtasks:
1. The first script uses `Full_ICD10_ATC.csv` with `'MiddleChildhood', 'Preschooler', 'Teenager'` instead of `age`.
2. The second script uses the dataset `final_episodes.csv` with `age` instead of `age_group`.
<br>

Description of work done:
1. `load_data` loads the data, preprocesses it, and then returns the columns selected for clustering to `elbow_kprototypes` and `without_med_diag_cluster_kprototypes`.
2. `elbow_kprototypes` makes the elbow plot with and without medication and diagnosis, respectively.
3. `without_med_diag_cluster_kprototypes` performs clustering without the medication and diagnosis columns and measures the Silhouette and Calinski-Harabasz scores.
4. To run the script without `dotenv`, replace `os.getenv("FILE_Full_Phecode_ATC_PATH")` with the path to your file.


**1. Using `Full_ICD10_ATC.csv` with `'MiddleChildhood', 'Preschooler', 'Teenager'` instead of `age`**

In [None]:
%run -i /mnt/work/workbench/dipendrp/Paper2/Github/CAMHS_Readmission_Analytics/allpackages.py
class Clustering:
    """K-Prototypes clustering of episode features without diagnosis/medication codes.

    Workflow: ``load_data()`` reads and preprocesses the CSV and stores the
    feature frame on the instance; ``elbow_kprototypes()`` plots the
    K-Prototypes cost for k = 1..10; ``without_med_diag_cluster_kprototypes()``
    clusters for each requested k and prints the Silhouette and
    Calinski-Harabasz scores.

    ``pd``, ``LabelEncoder``, ``KPrototypes``, ``metrics`` and ``plt`` are
    expected to already be in scope (they are not imported in this cell).
    """

    # Columns treated as numerical by K-Prototypes, in clustering order.
    NUMERIC_COLUMNS = (
        'Count_visit', 'num_diagnoses', 'num_medications', 'Length_of_Episode',
        'Inpatient_daynight_ratio', 'Intensity_per_calendar_month', 'Inpatient_ratio',
        'TreatmentPlanning_ratio', 'Therapy_ratio', 'Care_intensity',
        'Examination_ratio', 'Advisory_ratio',
    )
    # Columns passed to K-Prototypes as categorical (positions 12-18 of the
    # feature frame built by ``load_data``).
    CATEGORICAL_COLUMNS = (
        'MiddleChildhood', 'Preschooler', 'Teenager',
        'gender_0', 'F', 'M', 'tillnextepisode',
    )
    # Name of the column in which the latest cluster assignment is stored.
    CLUSTER_COLUMN = "cluster"

    def __init__(self, file1):
        """Remember the CSV path; no data is read until ``load_data()``."""
        self.file1 = file1
        self.merged_df = None
        self.with_diag_medic_selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None

    def load_data(self, n_rows=600):
        """Read the CSV, preprocess it and return the clustering feature frame.

        Args:
            n_rows: Number of leading rows to keep. Defaults to 600, the
                value that was previously hard-coded; pass ``None`` to use
                every row of the file.

        Returns:
            The feature DataFrame (numeric columns followed by categorical
            columns), which is also stored on the instance.
        """
        original_df_1 = pd.read_csv(self.file1)
        if n_rows is not None:
            original_df_1 = original_df_1.head(n_rows)

        # Rename the column var_no_dates_permonth with the Intensity_per_calendar_month
        original_df_1 = original_df_1.rename(
            columns={'var_no_dates_permonth': 'Intensity_per_calendar_month'}
        )

        # Bin 'tillnextepisode' into 'not-re-admitted' plus five time
        # intervals, then label-encode the bins as integer codes.
        readmission_bins = pd.cut(
            original_df_1['tillnextepisode'],
            bins=[float('-inf'), 0, 180, 365, 730, 1095, float('inf')],
            labels=['not-re-admitted',
                    're-admitted in 0-180 days',
                    're-admitted in 180-365 days',
                    're-admitted in 365-730 days',
                    're-admitted in 730-1095 days',
                    're-admitted in more than 1095 days'],
        )
        original_df_1['tillnextepisode'] = LabelEncoder().fit_transform(readmission_bins)

        # Missing counts/ratios are treated as zero.
        original_df_1 = original_df_1.fillna({
            'num_diagnoses': 0, 'num_medications': 0,
            'Inpatient_daynight_ratio': 0, 'Inpatient_ratio': 0,
            'TreatmentPlanning_ratio': 0, 'Therapy_ratio': 0,
            'Examination_ratio': 0, 'Advisory_ratio': 0,
        })

        # Select columns used to cluster. ``.copy()`` makes the stored frame
        # independent of the source frame, so adding the cluster column later
        # does not trigger pandas' SettingWithCopyWarning.
        selected = list(self.NUMERIC_COLUMNS) + list(self.CATEGORICAL_COLUMNS)
        self.without_diag_medic_selected_column_merged_df = original_df_1[selected].copy()
        return self.without_diag_medic_selected_column_merged_df

    def _feature_frame(self):
        """Return the stored frame without the cluster-label column."""
        frame = self.without_diag_medic_selected_column_merged_df
        if frame is None:
            raise RuntimeError("No data loaded; call load_data() first.")
        return frame.drop(columns=[self.CLUSTER_COLUMN], errors="ignore")

    @classmethod
    def _categorical_positions(cls, features):
        """Return the integer positions of the categorical columns in *features*."""
        column_order = list(features.columns)
        return [column_order.index(name) for name in cls.CATEGORICAL_COLUMNS]

    # Function to plot the elbow curve for k-prototypes
    def elbow_kprototypes(self):
        """Fit K-Prototypes for k = 1..10, then plot and save the cost curve.

        The figure is written to ``elbow_kprototypes.jpg`` and then shown.
        """
        features = self._feature_frame()
        categorical = self._categorical_positions(features)
        k_range = range(1, 11)
        costs = []

        for k in k_range:
            kprototypes = KPrototypes(n_clusters=k, random_state=0)
            kprototypes.fit(features, categorical=categorical)
            costs.append(kprototypes.cost_)

        # Plot the elbow curve
        plt.plot(k_range, costs, marker='o')
        plt.xlabel('Number of clusters')
        plt.ylabel('Cost')
        plt.title('Elbow method for K-Prototypes without Diagnosis and Medication')
        plt.savefig("elbow_kprototypes.jpg", dpi=600, bbox_inches='tight')
        plt.show()

    # Function to cluster the data using k-prototypes
    def without_med_diag_cluster_kprototypes(self, k_range):
        """Cluster for every k in *k_range* and print cluster sizes and scores.

        Args:
            k_range: Iterable of cluster counts to evaluate.

        The latest assignment is stored in the ``"cluster"`` column of the
        instance's frame. That column is always excluded from the matrix
        used for fitting and scoring: otherwise the labels from one k would
        become an input feature for the next k, and the Silhouette /
        Calinski-Harabasz scores would be computed on the labels themselves.
        """
        print("******** Without Diagnosis & Medication ******** \n")
        for k in k_range:
            # Rebuild the label-free matrix each round so a previous round's
            # "cluster" column can never leak into this fit.
            features = self._feature_frame()
            categorical = self._categorical_positions(features)

            kprototypes = KPrototypes(n_clusters=k, random_state=0)
            clusters = kprototypes.fit_predict(features, categorical=categorical)
            self.without_diag_medic_selected_column_merged_df[self.CLUSTER_COLUMN] = clusters
            print(self.without_diag_medic_selected_column_merged_df[self.CLUSTER_COLUMN].value_counts())
            # Compute the silhouette score for the clusters
            silhouette = metrics.silhouette_score(features, clusters, metric="euclidean")
            # Compute the Calinski-Harabasz score for the clusters
            calinski = metrics.calinski_harabasz_score(features, clusters)
            # Print the scores
            print(f"Silhouette score for k={k}: {silhouette}")
            print(f"Calinski-Harabasz score for k={k}: {calinski}")

            
            
# Build the clustering helper for the ICD10/ATC dataset and load its features.
clustering_obj = Clustering(
    "/mnt/work/workbench/dipendrp/new-data/Full_ICD10_ATC.csv"
)
without_diag_medic_selected_column_merged_df = clustering_obj.load_data()

# Elbow plot of the K-Prototypes cost over the loaded features.
clustering_obj.elbow_kprototypes()

# Cluster and score for k = 2..8 (diagnosis/medication columns excluded).
clustering_obj.without_med_diag_cluster_kprototypes(k_range=list(range(2, 9)))