#### Task: Plot patient trajectories (actual and relative) from a CSV file, using `episode_start_date` and `episode_end_date` together with other episode information
##### Subtasks:
1. View the actual trajectory and the relative trajectory (days since first case) of patients, with the 'gender', 'age_group', 'Count_visit' and 'actual_diag' information of each episode, for each cluster label.

Description of work done:

1. To be written

- Other Important Info: 
    2. Diagnosis and medication column names are `'actual_diag_ICD10', 'actual_med_Full_ATC'` in the dataset
- Dataset Used: `/mnt/work/workbench/dipendrp/new-data/Full_ICD10_ATC.csv`

**Subtask 1. View the actual trajectory and the relative trajectory (days since first case) of patients, with the 'gender', 'age_group', 'Count_visit' and 'actual_diag' information of each episode, for each cluster label.**

***SubSub Task 1.1: Generate a dataset that contains the cluster label (`'cluster'`) together with the following columns (`'pasient','episode_start_date','episode_end_date','gender','age','var_no_dates_permonth','Count_visit','diagnosis','actual_med_Full_ATC'`)***

In [None]:
%run -i /mnt/work/workbench/dipendrp/Paper2/Github/CAMHS_Readmission_Analytics/allpackages.py

class Clustering:
    """Prepare episode features from a CSV file and cluster them with k-prototypes.

    Typical use: construct with the CSV path, call ``load_data()`` to build the
    feature frame, then pass that frame to
    ``with_med_diag_cluster_kprototype()`` to assign cluster labels and write
    them to ``Full_ICD10_ATC_with_cluster.csv``.
    """

    def __init__(self, file1):
        """Remember the CSV path; no file access happens here.

        Args:
            file1: Path of the CSV file that ``load_data()`` reads.
        """
        self.file1 = file1
        # Untransformed copy of the loaded rows; later receives the "cluster" column.
        self.merged_df = None
        # Scaled / encoded feature frame built (and returned) by load_data().
        self.with_diag_medic_selected_column_merged_df = None

    def load_data(self):
        """Read the CSV and build the feature frame used for clustering.

        Only the first 500 rows of the file are used. The untransformed rows
        are kept in ``self.merged_df``; the returned frame holds the selected
        feature columns with the numeric ones standardised and the
        diagnosis / medication columns cast to ``str``.

        Returns:
            The feature DataFrame (also stored on
            ``self.with_diag_medic_selected_column_merged_df``).
        """
        # Only the first 500 rows are analysed.
        original_df_1 = pd.read_csv(self.file1).head(500)
        # Independent copy: a "cluster" column is assigned to it later, and assigning
        # into a head()/slice result triggers pandas chained-assignment warnings.
        self.merged_df = original_df_1.copy()
        # Rename the column var_no_dates_permonth to Intensity_per_calendar_month
        original_df_1 = original_df_1.rename(columns={'var_no_dates_permonth': 'Intensity_per_calendar_month'})

        # Bin 'tillnextepisode' into 'not-re-admitted' / re-admission intervals,
        # then replace each bin label by an integer code.
        le = LabelEncoder()
        original_df_1['tillnextepisode'] = le.fit_transform(pd.cut(original_df_1['tillnextepisode'],
                                                          bins=[float('-inf'), 0, 180, 365, 730, 1095, float('inf')],
                                                          labels=['not-re-admitted',
                                                                  're-admitted in 0-180 days',
                                                                  're-admitted in 180-365 days',
                                                                  're-admitted in 365-730 days',
                                                                  're-admitted in 730-1095 days',
                                                                  're-admitted in more than 1095 days']))
        # Missing counts / ratios are treated as zero
        original_df_1.fillna({'num_diagnoses': 0, 'num_medications': 0, 'Inpatient_daynight_ratio': 0, 'Inpatient_ratio': 0, 'TreatmentPlanning_ratio': 0,
                             'Therapy_ratio': 0, 'Examination_ratio': 0, 'Advisory_ratio': 0}, inplace=True)

        # NOTE: the column ORDER matters - with_med_diag_cluster_kprototype() addresses
        # the categorical columns by position (0-5 and 18-20).
        # .copy() makes this an independent frame so the assignments below do not
        # write into a slice of original_df_1.
        self.with_diag_medic_selected_column_merged_df = original_df_1[['MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M',
        'gender_0', 'Count_visit', 'num_diagnoses', 'num_medications', 'Length_of_Episode',
        'Inpatient_daynight_ratio', 'Intensity_per_calendar_month', 'Inpatient_ratio',
        'TreatmentPlanning_ratio', 'Therapy_ratio', 'Care_intensity',
        'Examination_ratio', 'Advisory_ratio', 'tillnextepisode', 'diagnosis', 'actual_med_Full_ATC']].copy()

        numeric_columns = ['Count_visit', 'num_diagnoses', 'num_medications', 'Length_of_Episode',
                           'Inpatient_daynight_ratio', 'Intensity_per_calendar_month', 'Inpatient_ratio',
                           'TreatmentPlanning_ratio', 'Therapy_ratio', 'Care_intensity',
                           'Examination_ratio', 'Advisory_ratio']

        # Standardise the numeric features
        scaler = StandardScaler()
        self.with_diag_medic_selected_column_merged_df[numeric_columns] = scaler.fit_transform(self.with_diag_medic_selected_column_merged_df[numeric_columns])
        # Diagnosis / medication are handled as categorical strings
        cat_cols = ['diagnosis', 'actual_med_Full_ATC']
        self.with_diag_medic_selected_column_merged_df[cat_cols] = self.with_diag_medic_selected_column_merged_df[cat_cols].astype(str)
        return self.with_diag_medic_selected_column_merged_df

    def with_med_diag_cluster_kprototype(self, original_selected_column_merged_df):
        """Cluster the feature frame into 7 groups and export the labelled episodes.

        Adds a ``"cluster"`` column to ``original_selected_column_merged_df``
        (in place) and to ``self.merged_df``, writes selected columns of
        ``self.merged_df`` to ``Full_ICD10_ATC_with_cluster.csv`` and prints
        the cluster sizes.

        Args:
            original_selected_column_merged_df: Feature frame as returned by
                ``load_data()``; its column order must match the positional
                ``categorical`` indices used below.

        Raises:
            RuntimeError: If ``load_data()`` has not been called yet.
        """
        if self.merged_df is None:
            # Fail before the expensive fit instead of after it.
            raise RuntimeError("load_data() must be called before clustering")

        # NOTE(review): the banner mentions Gower distance, but this method only calls
        # KPrototypes with its default settings - confirm the wording is intended.
        print("******** With Diagnosis & Medication using gower distance ******** \n")
        kproto = KPrototypes(n_clusters=7, init='Cao')
        # Positions 0-5 are the age-group / gender indicator columns and 18-20 are
        # tillnextepisode, diagnosis and actual_med_Full_ATC in load_data()'s column order.
        clusters = kproto.fit_predict(original_selected_column_merged_df, categorical=[0, 1, 2, 3, 4, 5, 18, 19, 20])

        original_selected_column_merged_df["cluster"] = clusters
        self.merged_df["cluster"] = clusters
        self.merged_df[['pasient', 'episode_start_date', 'episode_end_date', 'gender', 'age', 'var_no_dates_permonth', 'Count_visit', 'diagnosis', 'actual_med_Full_ATC', 'cluster']].to_csv('Full_ICD10_ATC_with_cluster.csv', index=False)
        print(original_selected_column_merged_df["cluster"].value_counts())
        print(self.merged_df["cluster"].value_counts())

            
# Create the clustering helper for the full ICD-10 / ATC episode file and build its feature frame
clustering_obj = Clustering("/mnt/work/workbench/dipendrp/new-data/Full_ICD10_ATC.csv")
with_diag_medic_selected_column_merged_df = clustering_obj.load_data()

# Cluster the feature frame (diagnosis and medication columns included); this call also
# writes Full_ICD10_ATC_with_cluster.csv and prints the cluster sizes
clustering_obj.with_med_diag_cluster_kprototype(with_diag_medic_selected_column_merged_df)

***SubSub Task 1.2: Plot the patient trajectories for the first episodes (top 20 rows) of each cluster, based on the cluster label (`'cluster'`) and the following columns (`'pasient','episode_start_date','episode_end_date','gender','age','var_no_dates_permonth','Count_visit','diagnosis','actual_med_Full_ATC'`)***

In [None]:
%run -i /mnt/work/workbench/dipendrp/Paper2/Github/CAMHS_Readmission_Analytics/allpackages.py
# Loads the clustered episode file and splits it into one DataFrame per cluster label
def load_data(path="Full_ICD10_ATC_with_cluster.csv", n_clusters=7):
    """Read the clustered episodes and return one DataFrame per cluster.

    Args:
        path: CSV file to read. Defaults to the file written by the
            clustering step.
        n_clusters: Number of cluster labels to split on; labels
            ``0 .. n_clusters - 1`` are used.

    Returns:
        dict mapping ``'original_df<label>'`` to an independent DataFrame of
        that cluster's episodes, sorted by patient and episode start date.
        A label with no rows maps to an empty DataFrame.
    """
    columns = ['pasient', 'episode_start_date', 'episode_end_date', 'gender', 'age',
               'var_no_dates_permonth', 'Count_visit', 'diagnosis',
               'actual_med_Full_ATC', 'cluster']
    # .copy() so the date conversions below assign into an independent frame
    original_df = pd.read_csv(path)[columns].copy()

    original_df['episode_start_date'] = pd.to_datetime(original_df['episode_start_date'])
    # Unparseable end dates become NaT instead of raising
    original_df['episode_end_date'] = pd.to_datetime(original_df['episode_end_date'], errors='coerce')
    original_df = original_df.sort_values(by=['pasient', 'episode_start_date'])
    # Episodes without a start date cannot be placed on a timeline
    original_df = original_df.dropna(subset=['episode_start_date'])

    return {
        f'original_df{cluster_label}': original_df[original_df['cluster'] == cluster_label].copy()
        for cluster_label in range(n_clusters)
    }

# Draws every episode on a calendar-date axis, one horizontal row per patient
def patient_timeline_plot_yearly(original_df,ax,cmap,unique_patients):
    """Plot each patient's episodes as thick bars against their actual dates.

    Args:
        original_df: Episode rows; the columns read are 'pasient',
            'episode_start_date', 'episode_end_date', 'gender', 'age',
            'var_no_dates_permonth', 'diagnosis' and 'actual_med_Full_ATC'.
        ax: Axes-like object the bars and annotations are drawn on.
        cmap: Callable mapping a row index to a colour.
        unique_patients: Patient ids to draw; position in this sequence is
            the y coordinate of the patient's row.
    """
    for row_pos, pid in enumerate(unique_patients):
        episodes = original_df.loc[original_df['pasient'] == pid]
        for _, episode in episodes.iterrows():
            start = episode['episode_start_date']
            end = episode['episode_end_date']
            # Intensity is shown with two decimals
            intensity = round(episode['var_no_dates_permonth'], 2)
            top_label = f"{episode['age']} : {episode['gender']}:{intensity} "
            bottom_label = f"{episode['actual_med_Full_ATC']} :{episode['diagnosis']}"

            ax.plot([start, end], [row_pos, row_pos], linewidth=15, color=cmap(row_pos))

            # Both labels are anchored two days after the episode start:
            # demographics above the bar, medication / diagnosis below it.
            anchor = (start + pd.DateOffset(days=2), row_pos)
            for text, y_offset in ((top_label, 11), (bottom_label, -16)):
                ax.annotate(
                    text,
                    anchor,
                    xytext=(-8, y_offset),
                    textcoords='offset points',
                    ha='left',
                    fontsize=8,
                    bbox={'boxstyle': 'round,pad=0.1', 'edgecolor': 'none', 'facecolor': 'white'},
                )

    row_count = len(unique_patients)
    ax.set_yticks(range(row_count))
    ax.set_yticklabels(unique_patients)
    ax.grid(True, linewidth=0.1)
    ax.set_xlabel('Actual Date')
    ax.set_ylabel('Patient ID')
    ax.xaxis_date()

# Plots each patient's episodes on a relative axis: days since that patient's first episode
def patient_timeline_plot_relative(original_df,ax,cmap,unique_patients):
    """Plot each patient's episodes as thick bars in days since their first episode.

    The baseline for a patient is the earliest 'episode_start_date' among that
    patient's rows in ``original_df``. Patients from ``unique_patients`` that
    have no rows in ``original_df`` keep an empty row instead of raising.

    Args:
        original_df: Episode rows; the columns read are 'pasient',
            'episode_start_date', 'episode_end_date', 'gender', 'age',
            'var_no_dates_permonth', 'diagnosis' and 'actual_med_Full_ATC'.
        ax: Axes-like object the bars and annotations are drawn on.
        cmap: Callable mapping a row index to a colour.
        unique_patients: Patient ids to draw; position in this sequence is
            the y coordinate of the patient's row.
    """
    for i, patient_id in enumerate(unique_patients):
        patient_data = original_df[original_df['pasient'] == patient_id]
        if patient_data.empty:
            # Nothing to draw for this patient
            continue
        # Earliest start, so the result does not depend on the frame being pre-sorted
        first_start = patient_data['episode_start_date'].min()
        for _, case in patient_data.iterrows():
            case_start = case['episode_start_date']
            case_end = case['episode_end_date']
            case_gender = case['gender']
            case_age = case['age']
            # Intensity is shown with two decimals
            case_Intensity_per_calendar_month = round(case['var_no_dates_permonth'], 2)
            case_diag = case['diagnosis']
            case_medication = case['actual_med_Full_ATC']
            label_demographics = f"{case_age} : {case_gender}:{case_Intensity_per_calendar_month} "
            label_diagnosis = f"{case_medication} :{case_diag}"
            relative_start = (case_start - first_start).days
            relative_end = (case_end - first_start).days
            ax.plot([relative_start, relative_end], [i, i], linewidth=15, color=cmap(i))
            # Demographics above the bar, medication / diagnosis below it
            ax.annotate(label_demographics, (relative_start + 2, i), xytext=(-8, 11), textcoords='offset points', ha='left', fontsize=8,
                        bbox=dict(boxstyle='round,pad=0.3', edgecolor='none', facecolor='white'))
            ax.annotate(label_diagnosis, (relative_start + 2, i), xytext=(-8, -16), textcoords='offset points', ha='left', fontsize=8,
                        bbox=dict(boxstyle='round,pad=0.3', edgecolor='none', facecolor='white'))

    ax.set_yticks(range(len(unique_patients)))
    ax.set_yticklabels(unique_patients)
    ax.grid(True, linewidth=0.1)
    ax.set_xlabel('Days since first case')
    ax.set_ylabel('Patient ID')
    # Day offsets are whole numbers, so keep the tick labels integral
    ax.xaxis.set_major_locator(plt.MaxNLocator(integer=True))
    
# Plot the actual and relative trajectories of the first 20 episode rows of every cluster
if __name__ == "__main__":
    original_df = load_data()

    # First 20 rows of each cluster. The individual names are kept so that any
    # later notebook cell referring to them keeps working.
    (original_df_0, original_df_1, original_df_2, original_df_3,
     original_df_4, original_df_5, original_df_6) = (
        original_df[f'original_df{cluster_label}'].head(20) for cluster_label in range(7)
    )

    cmap = plt.get_cmap('tab20')
    cluster_frames = (original_df_0, original_df_1, original_df_2, original_df_3,
                      original_df_4, original_df_5, original_df_6)

    # One figure per cluster: actual dates on the left, days since first case on the right.
    # Each figure is built from its OWN cluster's rows (the "Cluster 0" figure
    # previously reused the cluster 1 data).
    for cluster_label, cluster_df in enumerate(cluster_frames):
        unique_patients = cluster_df['pasient'].unique()
        fig, axes = plt.subplots(1, 2, figsize=(30, 14))
        fig.suptitle(f"********************** Cluster {cluster_label} **********************")
        patient_timeline_plot_yearly(cluster_df, axes[0], cmap, unique_patients)
        patient_timeline_plot_relative(cluster_df, axes[1], cmap, unique_patients)
        # Apply the layout to every figure, not only the last one created
        fig.tight_layout()

    plt.show()
    