#### Task: Classification and Prediction of readmission with medication and diagnosis columns, based on Phenocode and ATC codes of 4 digit length
##### Subtasks:
1. Classify and Predict for 180 days
2. Classify and Predict for 1 year
3. Classify and Predict for 2 years
4. Classify and Predict for 3 years

Description of work done:

1. In this script, the first class `SplitMedicationDiagnosisInUniquePieces` splits the diagnosis and medication columns into unique pieces (one indicator column per code).
2. It then saves the new data frame, with the unique diagnosis and medication columns, as a new `CSV` file named `Split_Full_Phecode_ATC4.csv`.
3. The second class `ClassifyReadmissionWithMedicationDiagnosis` then takes the new `Split_Full_Phecode_ATC4.csv` dataset and performs classification and prediction.
4. To run the script without `dotenv`, replace `os.getenv("FILE_Full_Phecode_ATC_PATH")` with the path to your file.

- Other Important Info: 
    1. Dataset has NaN values. NaN has been replaced with 0
    2. Diagnosis and medication column names
- Dataset Used: `/mnt/work/workbench/dipendrp/new-data/Full_Phecode_ATC4.csv`

##### 1. Classify and Predict for 180 days

**`TNE_BO_180` values (0 and 1) are used to classify readmission, where 1 means readmission. For prediction, only the readmitted patients' data is used, and a linear regression model is trained on it to predict the number of days until readmission.**
- Dataset Used: `/mnt/work/workbench/dipendrp/new-data/Full_Phecode_ATC4.csv`
- Independent variable: `'age', 'remaining_time_countdown','var_no_dates_permonth', 'gender','closingcode', 'aftercode', 'Length_of_Episode', 
                        'Count_visit','Therapy_ratio', 'Examination_ratio', 'Advisory_ratio','TreatmentPlanning_ratio', 'Outpatient_ratio',
                        'Inpatient_daynight_ratio', 'Care_intensity', 'num_diagnoses', 'num_medications', 
                        and unique 'actual_diag_Phe', 'actual_med_ATC4' `
- Dependent Variable: For classification it is `TNE_BO_180` and for prediction it is `TNE_NO_180`

In [None]:
import re
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import random
from IPython.display import display
from IPython.display import Image
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

import os
from dotenv import load_dotenv
load_dotenv()


######### Split Medication and Diagnosis In Unique Pieces ############
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################

class SplitMedicationDiagnosisInUniquePieces:
    """Expand the diagnosis and medication code lists into indicator columns.

    The input CSV is expected to hold, per row, a stringified list of codes
    in each of ``CODE_LIST_COLUMNS`` (e.g. ``"['300', '313']"``).  Each list
    is split into its individual codes, the row is repeated once per code,
    and one 0/1 indicator column per distinct code is appended.  The result
    is written to ``OUTPUT_PATH`` in the current working directory.
    """

    # Columns kept from the raw CSV; the last two hold the code lists.
    SOURCE_COLUMNS = [
        'episode_id', 'num_diagnoses', 'num_medications', 'pasient', 'age',
        'remaining_time_countdown', 'var_no_dates_permonth', 'gender',
        'episode_order', 'islast', 'closingcode', 'aftercode',
        'episode_start_date', 'episode_end_date', 'tillnextepisode',
        'Length_of_Episode', 'Cat_LOE', 'TNE_BO_180', 'TNE_NO_180',
        'TNE_BO_365', 'TNE_NO_365', 'TNE_BO_730', 'TNE_NO_730', 'TNE_BO_1095',
        'TNE_NO_1095', 'Cat_LOE_desc', 'Count_visit', 'Cat_CV', 'Therapy_ratio',
        'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
        'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
        'Inpatient_daynight_ratio', 'Care_intensity', 'age_group',
        'closingcode_0', 'closingcode_1', 'closingcode_2', 'closingcode_3',
        'closingcode_4', 'closingcode_5', 'closingcode_6', 'closingcode_9',
        'aftercode_1', 'aftercode_2', 'aftercode_3', 'aftercode_4',
        'aftercode_5', 'gender_0', 'F', 'M', 'MiddleChildhood', 'Preschooler',
        'Teenager', 'actual_diag_Phe', 'actual_med_ATC4',
    ]

    # Exact names of the diagnosis and medication columns to split.
    CODE_LIST_COLUMNS = ['actual_diag_Phe', 'actual_med_ATC4']

    # File the expanded data frame is written to.
    OUTPUT_PATH = 'Split_Full_Phecode_ATC4.csv'

    def __init__(self, file1):
        """Remember the path (or buffer) of the raw CSV; nothing is read yet."""
        self.file1 = file1
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None

    def load_data_to_split(self):
        """Read ``self.file1`` and keep only ``SOURCE_COLUMNS``.

        Returns:
            A deep copy of the selected columns, also stored on
            ``self.merged_df``.

        Raises:
            KeyError: if any of ``SOURCE_COLUMNS`` is missing from the CSV.
        """
        raw = pd.read_csv(self.file1)
        self.merged_df = raw[self.SOURCE_COLUMNS].copy(deep=True)
        return self.merged_df

    def split_diagnosis_medication_in_unique_pieces(self):
        """Explode the code-list columns and write the one-hot encoded CSV.

        For every column in ``CODE_LIST_COLUMNS`` the list punctuation
        (``[``, ``]``, ``'`` and spaces) is stripped, the string is split on
        commas, the row is repeated once per code, and an indicator column
        per code is appended in place of the original column.  Rows whose
        list is missing stay as a single row with all indicators at 0.
        """
        # Load through ``self`` so the method works for any instance, not
        # only for one bound to a particular module-level variable name.
        frame = self.load_data_to_split()

        for col in self.CODE_LIST_COLUMNS:
            # ``astype(object)`` keeps the ``.str`` accessor usable even when
            # the whole column is missing; NaN simply propagates through.
            # One explicit regex pass replaces four chained replaces whose
            # default ``regex`` behaviour differs between pandas versions.
            codes = frame[col].astype(object).str.replace(
                r"[\[\]' ]", "", regex=True
            )
            frame[col] = codes.str.split(',')

            # A fresh 0..n-1 index keeps the concat below purely positional
            # instead of relying on two identical duplicated indexes.
            frame = frame.explode(col, ignore_index=True)

            indicators = frame[col].str.get_dummies(sep=',')
            frame = pd.concat([frame, indicators], axis=1)
            frame = frame.drop(col, axis=1)

        frame.to_csv(self.OUTPUT_PATH, index=False)
        
    
class ClassifyReadmissionWithMedicationDiagnosis:
    """Classify 180-day readmission and regress the days until readmission.

    Works on the one-hot encoded CSV written by
    ``SplitMedicationDiagnosisInUniquePieces``.  ``TNE_BO_180`` is the
    classification target and ``TNE_NO_180`` the regression target; the
    regression only uses rows where ``TNE_NO_180`` is present.
    """

    CLASSIFICATION_TARGET = 'TNE_BO_180'
    REGRESSION_TARGET = 'TNE_NO_180'

    # Episode-level features (counts, ratios and one-hot demographic codes).
    BASE_FEATURE_COLUMNS = [
        'num_diagnoses', 'num_medications', 'remaining_time_countdown',
        'var_no_dates_permonth', 'Length_of_Episode', 'Count_visit',
        'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio',
        'Inpatient_daynight_ratio', 'Care_intensity',
        'closingcode_0', 'closingcode_1', 'closingcode_2', 'closingcode_3',
        'closingcode_4', 'closingcode_5', 'closingcode_6', 'closingcode_9',
        'aftercode_1', 'aftercode_2', 'aftercode_3', 'aftercode_4',
        'aftercode_5', 'gender_0', 'F', 'M', 'MiddleChildhood', 'Preschooler',
        'Teenager',
    ]

    # Indicator columns for the individual diagnosis and medication codes.
    # NOTE(review): 'NO6B' (letter O) sits next to 'N06B' (digit zero) --
    # presumably a spelling variant present in the data; confirm.
    CODE_FEATURE_COLUMNS = [
        '010', '0703', '0704', '079', '0792', '1000', '1008',
        '1009', '1011', '1013', '1015', '136', '1499', '1551', '1701', '1702',
        '18911', '1911', '19111', '1994', '201', '2022', '20411', '20421',
        '2441', '2442', '2444', '2445', '24521', '2501', '25011', '2502',
        '2531', '25311', '2532', '2553', '2564', '259', '260', '2602', '2612',
        '2614', '262', '2701', '2713', '27614', '2781', '2791', '2801', '28731',
        '288', '28811', '2903', '2911', '2914', '2922', '295', '2951', '2952',
        '2953', '296', '2961', '29622', '2972', '300', '3001', '3003', '3004',
        '3008', '3009', '301', '3011', '3012', '302', '303', '3031', '3033',
        '3034', '304', '3052', '306', '3069', '312', '3123', '313', '3131',
        '3132', '3133', '315', '3151', '3152', '3153', '316', '317', '323',
        '324', '327', '3274', '32741', '3275', '3276', '3311', '339', '340',
        '3401', '341', '343', '344', '345', '3451', '34511', '34512', '3453',
        '347', '3484', '357', '3591', '3592', '3621', '365', '3671', '3672',
        '3678', '3679', '368', '3681', '3684', '3695', '3711', '3781', '3782',
        '3811', '389', '3891', '3892', '3894', '3895', '42711', '4332', '4338',
        '4589', '472', '4741', '4742', '476', '4801', '495', '496', '499',
        '506', '5091', '510', '53011', '53014', '5302', '5305', '535', '5551',
        '5552', '55521', '558', '5611', '563', '564', '5641', '565', '57181',
        '5772', '591', '5921', '5965', '5994', '6041', '62611', '6264', '637',
        '6491', '656', '6562', '6563', '6564', '657', '658', '661', '691',
        '6941', '69542', '6964', '7061', '71011', '7142', '7169', '7282',
        '7321', '733', '736', '7373', '739', '745', '7471', '74711', '74712',
        '74713', '7472', '748', '749', '7491', '7492', '75011', '75013',
        '75021', '75112', '7512', '75121', '75211', '7522', '7531', '7551',
        '7554', '7556', '75561', '756', '7561', '758', '7581', '759', '760',
        '770', '773', '7981', '8003', '802', '8032', '8033', '805', '817',
        '818', '819', '830', '835', '871', '907', '915', '930', '939', '946',
        '947', '949', '969', '979', '981', 'A02B', 'A03F', 'A06A', 'A06B',
        'A07A', 'A10B', 'A11E', 'A12A', 'A12B', 'B03A', 'B03B', 'C02A', 'C07A',
        'C09A', 'D01A', 'D06A', 'D06B', 'D07A', 'D07B', 'D07X', 'D09A', 'D10A',
        'G03A', 'G03C', 'G03F', 'G03H', 'H01B', 'H02A', 'J01C', 'J01F', 'J05A',
        'M01A', 'N01B', 'N02A', 'N02B', 'N02C', 'N03A', 'N05A', 'N05B', 'N05C',
        'N06A', 'N06B', 'N07B', 'NO6B', 'R01A', 'R03A', 'R03B', 'R05D', 'R06A',
        'S01A', 'S01G',
    ]

    # Every independent variable, in the order fed to the models.
    FEATURE_COLUMNS = BASE_FEATURE_COLUMNS + CODE_FEATURE_COLUMNS

    # Independent columns whose missing values are replaced with 0.
    NAN_TO_ZERO_COLUMNS = [
        'num_diagnoses', 'num_medications', 'Outpatient_ratio',
        'Inpatient_daynight_ratio', 'Therapy_ratio', 'Examination_ratio',
        'Advisory_ratio', 'TreatmentPlanning_ratio',
    ]

    # Per-class weights used to counter the imbalanced classification target.
    CLASS_WEIGHTS = {0: 10, 1: 270}

    def __init__(self, file2):
        """Remember the path (or buffer) of the encoded CSV; nothing is read yet."""
        self.file2 = file2
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None
        self.dependent_variable_TNE_BO_180 = None
        self.dependent_variable_TNE_NO_180 = None
        self.independent_variable = None

    def load_data(self):
        """Read the CSV, zero-fill selected NaNs and slice features/target.

        Returns:
            ``(merged_df, dependent_variable_TNE_BO_180, independent_variable)``
            where the last two are column subsets of ``merged_df``; the
            target is returned as a single-column DataFrame.

        Raises:
            KeyError: if any expected column is missing from the CSV.
        """
        raw = pd.read_csv(self.file2)
        selected = (
            self.BASE_FEATURE_COLUMNS
            + [self.REGRESSION_TARGET, self.CLASSIFICATION_TARGET]
            + self.CODE_FEATURE_COLUMNS
        )
        # Own copy + plain assignment: chained ``fillna(inplace=True)`` on a
        # column of a slice is not guaranteed to reach the frame (and never
        # does under pandas Copy-on-Write).
        self.merged_df = raw[selected].copy()
        self.merged_df[self.NAN_TO_ZERO_COLUMNS] = (
            self.merged_df[self.NAN_TO_ZERO_COLUMNS].fillna(0)
        )

        self.dependent_variable_TNE_BO_180 = self.merged_df[[self.CLASSIFICATION_TARGET]]
        self.independent_variable = self.merged_df[self.FEATURE_COLUMNS]

        return self.merged_df, self.dependent_variable_TNE_BO_180, self.independent_variable

    def _ensure_loaded(self):
        """Load the data on first use so the training methods work standalone."""
        if self.merged_df is None or self.independent_variable is None:
            self.load_data()

    # To train and view the result of the binary or multiclass classifier
    def train_classifier_with_medication_diagnosis(self):
        """Fit a class-weighted logistic regression and print its evaluation.

        Uses a random 70/30 train/test split.  Metrics are averaged with
        ``'binary'`` when the training target has exactly two classes and
        with ``'weighted'`` otherwise.
        """
        self._ensure_loaded()
        variable_name = self.CLASSIFICATION_TARGET
        # 1-D target: avoids scikit-learn's column-vector warning and makes
        # ``nunique()`` return a plain integer.
        target = self.dependent_variable_TNE_BO_180[variable_name]

        logistic_prediction_model = LogisticRegression(class_weight=self.CLASS_WEIGHTS)
        X_train, X_test, y_train, y_test = train_test_split(
            self.independent_variable, target, train_size=0.7
        )
        logistic_prediction_model.fit(X_train, y_train)
        y_pred = logistic_prediction_model.predict(X_test)

        # Checks if the output category for classification is binary or multiclass
        is_binary = y_train.nunique() == 2
        averaging = 'binary' if is_binary else 'weighted'

        print(f'\n****** Evaluation result {variable_name} as dependent variable ******')
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=averaging)
        recall = recall_score(y_test, y_pred, average=averaging)
        f1 = f1_score(y_test, y_pred, average=averaging)

        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1score: {f1}")
        print("Confused matrix:")
        # Fixing the labels to the fitted classes keeps the matrix square
        # even if the test split happens to miss a class.
        conf_matrix = confusion_matrix(
            y_test, y_pred, labels=logistic_prediction_model.classes_
        )
        print(conf_matrix)
        if is_binary:
            # Only a 2x2 matrix can be unpacked into the four cell counts.
            tn, fp, fn, tp = conf_matrix.ravel()
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        print(classification_report(y_test, y_pred))

    def prediction_with_medication_diagnosis(self):
        """Fit a linear regression on readmitted rows and print R2 and MSE.

        Only rows with a value in ``TNE_NO_180`` are used.  The data is split
        70/30 at random, and the 30% part is split again so that 67% of it
        is the test set the metrics are computed on.
        """
        self._ensure_loaded()
        variable_name = self.REGRESSION_TARGET

        # Work on local subsets: overwriting ``self.merged_df`` and
        # ``self.independent_variable`` with the filtered rows would leave
        # them out of step with the classification target.
        readmitted = self.merged_df.dropna(subset=[variable_name])
        self.dependent_variable_TNE_NO_180 = readmitted[[variable_name]]
        features = readmitted[self.FEATURE_COLUMNS]

        linear_prediction_model = LinearRegression()
        X_train, X_temp, y_train, y_temp = train_test_split(
            features, self.dependent_variable_TNE_NO_180, train_size=0.7
        )
        # The remaining third of the hold-out part is set aside and not used.
        X_test, _X_eval, y_test, _y_eval = train_test_split(X_temp, y_temp, test_size=0.33)
        linear_prediction_model.fit(X_train, y_train)

        print(f'\n****** Prediction Model Evaluation result  {variable_name} as dependent variable ******')
        y_pred = linear_prediction_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        print('R-squared:', r2)
        print('Mean squared error:', mse)
                

# Step 1: one-hot encode the raw CSV whose path is read from the
# FILE_Full_Phecode_ATC_PATH environment variable; this writes
# 'Split_Full_Phecode_ATC4.csv' to the current working directory.
# NOTE: the splitter method looks this object up by its global name, so the
# variable must keep exactly this name.
SplitMedicationDiagnosisInUniquePieces_Obj = SplitMedicationDiagnosisInUniquePieces(os.getenv("FILE_Full_Phecode_ATC_PATH"))
SplitMedicationDiagnosisInUniquePieces_Obj.split_diagnosis_medication_in_unique_pieces()

# Step 2: load the encoded CSV, then run the 180-day classifier followed by
# the days-to-readmission regression. load_data() must run first because it
# sets the attributes the two training methods read.
ClassifyReadmissionWithMedicationDiagnosis_Obj = ClassifyReadmissionWithMedicationDiagnosis('Split_Full_Phecode_ATC4.csv')
merged_df, dependent_variable_TNE_BO_180, independent_variable  = ClassifyReadmissionWithMedicationDiagnosis_Obj.load_data()
ClassifyReadmissionWithMedicationDiagnosis_Obj.train_classifier_with_medication_diagnosis()
ClassifyReadmissionWithMedicationDiagnosis_Obj.prediction_with_medication_diagnosis()

##### 2. Classify and Predict for 1 year

**`TNE_BO_365` values (0 and 1) are used to classify readmission, where 1 means readmission. For prediction, only the readmitted patients' data is used, and a linear regression model is trained on it to predict the number of days until readmission.**
- Dataset Used: `/mnt/work/workbench/dipendrp/new-data/Full_Phecode_ATC4.csv`
- Independent variable: `'age', 'remaining_time_countdown','var_no_dates_permonth', 'gender','closingcode', 'aftercode', 'Length_of_Episode', 
                        'Count_visit','Therapy_ratio', 'Examination_ratio', 'Advisory_ratio','TreatmentPlanning_ratio', 'Outpatient_ratio',
                        'Inpatient_daynight_ratio', 'Care_intensity', 'num_diagnoses', 'num_medications', 
                        and unique 'actual_diag_Phe', 'actual_med_ATC4' `
- Dependent Variable: For classification it is `TNE_BO_365` and for prediction it is `TNE_NO_365`

In [None]:
import re
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import random
from IPython.display import display
from IPython.display import Image
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

import os
from dotenv import load_dotenv
load_dotenv()


######### Split Medication and Diagnosis In Unique Pieces ############
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################

class SplitMedicationDiagnosisInUniquePieces:
    """Expand the diagnosis and medication code lists into indicator columns.

    The input CSV is expected to hold, per row, a stringified list of codes
    in each of ``CODE_LIST_COLUMNS`` (e.g. ``"['300', '313']"``).  Each list
    is split into its individual codes, the row is repeated once per code,
    and one 0/1 indicator column per distinct code is appended.  The result
    is written to ``OUTPUT_PATH`` in the current working directory.
    """

    # Columns kept from the raw CSV; the last two hold the code lists.
    SOURCE_COLUMNS = [
        'episode_id', 'num_diagnoses', 'num_medications', 'pasient', 'age',
        'remaining_time_countdown', 'var_no_dates_permonth', 'gender',
        'episode_order', 'islast', 'closingcode', 'aftercode',
        'episode_start_date', 'episode_end_date', 'tillnextepisode',
        'Length_of_Episode', 'Cat_LOE', 'TNE_BO_180', 'TNE_NO_180',
        'TNE_BO_365', 'TNE_NO_365', 'TNE_BO_730', 'TNE_NO_730', 'TNE_BO_1095',
        'TNE_NO_1095', 'Cat_LOE_desc', 'Count_visit', 'Cat_CV', 'Therapy_ratio',
        'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
        'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
        'Inpatient_daynight_ratio', 'Care_intensity', 'age_group',
        'closingcode_0', 'closingcode_1', 'closingcode_2', 'closingcode_3',
        'closingcode_4', 'closingcode_5', 'closingcode_6', 'closingcode_9',
        'aftercode_1', 'aftercode_2', 'aftercode_3', 'aftercode_4',
        'aftercode_5', 'gender_0', 'F', 'M', 'MiddleChildhood', 'Preschooler',
        'Teenager', 'actual_diag_Phe', 'actual_med_ATC4',
    ]

    # Exact names of the diagnosis and medication columns to split.
    CODE_LIST_COLUMNS = ['actual_diag_Phe', 'actual_med_ATC4']

    # File the expanded data frame is written to.
    OUTPUT_PATH = 'Split_Full_Phecode_ATC4.csv'

    def __init__(self, file1):
        """Remember the path (or buffer) of the raw CSV; nothing is read yet."""
        self.file1 = file1
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None

    def load_data_to_split(self):
        """Read ``self.file1`` and keep only ``SOURCE_COLUMNS``.

        Returns:
            A deep copy of the selected columns, also stored on
            ``self.merged_df``.

        Raises:
            KeyError: if any of ``SOURCE_COLUMNS`` is missing from the CSV.
        """
        raw = pd.read_csv(self.file1)
        self.merged_df = raw[self.SOURCE_COLUMNS].copy(deep=True)
        return self.merged_df

    def split_diagnosis_medication_in_unique_pieces(self):
        """Explode the code-list columns and write the one-hot encoded CSV.

        For every column in ``CODE_LIST_COLUMNS`` the list punctuation
        (``[``, ``]``, ``'`` and spaces) is stripped, the string is split on
        commas, the row is repeated once per code, and an indicator column
        per code is appended in place of the original column.  Rows whose
        list is missing stay as a single row with all indicators at 0.
        """
        # Load through ``self`` so the method works for any instance, not
        # only for one bound to a particular module-level variable name.
        frame = self.load_data_to_split()

        for col in self.CODE_LIST_COLUMNS:
            # ``astype(object)`` keeps the ``.str`` accessor usable even when
            # the whole column is missing; NaN simply propagates through.
            # One explicit regex pass replaces four chained replaces whose
            # default ``regex`` behaviour differs between pandas versions.
            codes = frame[col].astype(object).str.replace(
                r"[\[\]' ]", "", regex=True
            )
            frame[col] = codes.str.split(',')

            # A fresh 0..n-1 index keeps the concat below purely positional
            # instead of relying on two identical duplicated indexes.
            frame = frame.explode(col, ignore_index=True)

            indicators = frame[col].str.get_dummies(sep=',')
            frame = pd.concat([frame, indicators], axis=1)
            frame = frame.drop(col, axis=1)

        frame.to_csv(self.OUTPUT_PATH, index=False)
        
    
class ClassifyReadmissionWithMedicationDiagnosis:
    """Classify readmission within 365 days and predict the days to readmission.

    ``load_data`` reads the split CSV and prepares the feature matrix,
    ``train_classifier_with_medication_diagnosis`` fits and evaluates a
    class-weighted logistic regression on ``TNE_BO_365``, and
    ``prediction_with_medication_diagnosis`` fits and evaluates a linear
    regression on ``TNE_NO_365`` using only rows where that value is present.
    """

    # Target columns for the 365-day horizon.
    CLASSIFICATION_TARGET = 'TNE_BO_365'
    REGRESSION_TARGET = 'TNE_NO_365'

    # Weight per output class, used to counter the imbalanced target.
    CLASS_WEIGHT = {0: 10, 1: 270}

    # Episode-level features.
    BASE_FEATURE_COLUMNS = [
        'num_diagnoses', 'num_medications', 'remaining_time_countdown',
        'var_no_dates_permonth', 'Length_of_Episode', 'Count_visit',
        'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio',
        'Inpatient_daynight_ratio', 'Care_intensity',
        'closingcode_0', 'closingcode_1', 'closingcode_2', 'closingcode_3',
        'closingcode_4', 'closingcode_5', 'closingcode_6', 'closingcode_9',
        'aftercode_1', 'aftercode_2', 'aftercode_3', 'aftercode_4',
        'aftercode_5', 'gender_0', 'F', 'M', 'MiddleChildhood', 'Preschooler',
        'Teenager',
    ]

    # Indicator columns for the individual diagnosis and medication codes.
    CODE_COLUMNS = [
        '010', '0703', '0704', '079', '0792', '1000', '1008',
        '1009', '1011', '1013', '1015', '136', '1499', '1551', '1701', '1702',
        '18911', '1911', '19111', '1994', '201', '2022', '20411', '20421',
        '2441', '2442', '2444', '2445', '24521', '2501', '25011', '2502',
        '2531', '25311', '2532', '2553', '2564', '259', '260', '2602', '2612',
        '2614', '262', '2701', '2713', '27614', '2781', '2791', '2801', '28731',
        '288', '28811', '2903', '2911', '2914', '2922', '295', '2951', '2952',
        '2953', '296', '2961', '29622', '2972', '300', '3001', '3003', '3004',
        '3008', '3009', '301', '3011', '3012', '302', '303', '3031', '3033',
        '3034', '304', '3052', '306', '3069', '312', '3123', '313', '3131',
        '3132', '3133', '315', '3151', '3152', '3153', '316', '317', '323',
        '324', '327', '3274', '32741', '3275', '3276', '3311', '339', '340',
        '3401', '341', '343', '344', '345', '3451', '34511', '34512', '3453',
        '347', '3484', '357', '3591', '3592', '3621', '365', '3671', '3672',
        '3678', '3679', '368', '3681', '3684', '3695', '3711', '3781', '3782',
        '3811', '389', '3891', '3892', '3894', '3895', '42711', '4332', '4338',
        '4589', '472', '4741', '4742', '476', '4801', '495', '496', '499',
        '506', '5091', '510', '53011', '53014', '5302', '5305', '535', '5551',
        '5552', '55521', '558', '5611', '563', '564', '5641', '565', '57181',
        '5772', '591', '5921', '5965', '5994', '6041', '62611', '6264', '637',
        '6491', '656', '6562', '6563', '6564', '657', '658', '661', '691',
        '6941', '69542', '6964', '7061', '71011', '7142', '7169', '7282',
        '7321', '733', '736', '7373', '739', '745', '7471', '74711', '74712',
        '74713', '7472', '748', '749', '7491', '7492', '75011', '75013',
        '75021', '75112', '7512', '75121', '75211', '7522', '7531', '7551',
        '7554', '7556', '75561', '756', '7561', '758', '7581', '759', '760',
        '770', '773', '7981', '8003', '802', '8032', '8033', '805', '817',
        '818', '819', '830', '835', '871', '907', '915', '930', '939', '946',
        '947', '949', '969', '979', '981', 'A02B', 'A03F', 'A06A', 'A06B',
        'A07A', 'A10B', 'A11E', 'A12A', 'A12B', 'B03A', 'B03B', 'C02A', 'C07A',
        'C09A', 'D01A', 'D06A', 'D06B', 'D07A', 'D07B', 'D07X', 'D09A', 'D10A',
        'G03A', 'G03C', 'G03F', 'G03H', 'H01B', 'H02A', 'J01C', 'J01F', 'J05A',
        'M01A', 'N01B', 'N02A', 'N02B', 'N02C', 'N03A', 'N05A', 'N05B', 'N05C',
        'N06A', 'N06B', 'N07B', 'NO6B', 'R01A', 'R03A', 'R03B', 'R05D', 'R06A',
        'S01A', 'S01G',
    ]

    # Every independent variable, in the order used for model fitting.
    FEATURE_COLUMNS = BASE_FEATURE_COLUMNS + CODE_COLUMNS

    # Independent columns whose missing values are replaced with 0.
    NAN_TO_ZERO_COLUMNS = [
        'num_diagnoses', 'num_medications', 'Outpatient_ratio',
        'Inpatient_daynight_ratio', 'Therapy_ratio', 'Examination_ratio',
        'Advisory_ratio', 'TreatmentPlanning_ratio',
    ]

    def __init__(self, file2):
        """Store the path of the split CSV; no data is read until ``load_data``."""
        self.file2 = file2
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None

    def load_data(self):
        """Read the split CSV and build the model inputs.

        Keeps only the feature and target columns, replaces NaN with 0 in
        ``NAN_TO_ZERO_COLUMNS`` and stores the results on the instance.

        Returns:
            tuple: ``(merged_df, dependent_variable_TNE_BO_365,
            independent_variable)`` -- the selected frame, a one-column frame
            with the classification target, and the feature frame.

        Raises:
            KeyError: if the CSV lacks any of the selected columns.
        """
        original_df_1 = pd.read_csv(self.file2)
        selected = (self.BASE_FEATURE_COLUMNS
                    + [self.REGRESSION_TARGET, self.CLASSIFICATION_TARGET]
                    + self.CODE_COLUMNS)
        # Work on an explicit copy so the assignment below is not chained.
        self.merged_df = original_df_1[selected].copy()

        # fill nan values in independent column
        # Assigning the filled columns back (instead of fillna(inplace=True) on a
        # selected column) also takes effect when pandas Copy-on-Write is enabled.
        self.merged_df[self.NAN_TO_ZERO_COLUMNS] = (
            self.merged_df[self.NAN_TO_ZERO_COLUMNS].fillna(0)
        )

        # define dependent variables
        self.dependent_variable_TNE_BO_365 = self.merged_df[[self.CLASSIFICATION_TARGET]]
        self.independent_variable = self.merged_df[self.FEATURE_COLUMNS]

        return self.merged_df, self.dependent_variable_TNE_BO_365, self.independent_variable

    # To train and view the result of binary and multiclass classifier
    def train_classifier_with_medication_diagnosis(self, random_state=None):
        """Fit a class-weighted logistic regression on ``TNE_BO_365`` and print metrics.

        Requires ``load_data`` to have been called. The data is split 70/30 into
        train and test sets. Accuracy, precision, recall, F1, the confusion
        matrix and the classification report are printed for the test set;
        binary averaging is used when the training target has exactly two
        classes, weighted averaging otherwise.

        Args:
            random_state: forwarded to ``train_test_split``; pass an int for a
                reproducible split. ``None`` keeps the original random split.
        """
        variable_name = self.CLASSIFICATION_TARGET
        # Use a 1-D target: a one-column frame triggers column-vector warnings
        # and makes nunique() return a Series instead of a number.
        target = self.dependent_variable_TNE_BO_365[variable_name]

        logistic_prediction_model = LogisticRegression(class_weight=self.CLASS_WEIGHT)
        X_train, X_test, y_train, y_test = train_test_split(
            self.independent_variable, target, train_size=0.7,
            random_state=random_state)
        logistic_prediction_model.fit(X_train, y_train)
        y_pred = logistic_prediction_model.predict(X_test)

        # Checks if the output category for classification is binary or multiclass
        is_binary = y_train.nunique() == 2
        average = 'binary' if is_binary else 'weighted'
        print(f'\n****** Evaluation result {variable_name} as dependent variable ******')

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=average)
        recall = recall_score(y_test, y_pred, average=average)
        f1 = f1_score(y_test, y_pred, average=average)

        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1score: {f1}")
        print("Confused matrix:")
        # Fix the label set to the classes seen in training so the matrix keeps
        # its full shape even when a class is absent from the test split.
        conf_matrix = confusion_matrix(
            y_test, y_pred, labels=logistic_prediction_model.classes_)
        print(conf_matrix)
        if is_binary:
            tn, fp, fn, tp = conf_matrix.ravel()
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        print(classification_report(y_test, y_pred))

    def prediction_with_medication_diagnosis(self, random_state=None):
        """Fit a linear regression on ``TNE_NO_365`` and print R-squared and MSE.

        Requires ``load_data`` to have been called. Only rows with a value in
        ``TNE_NO_365`` are used. The rows are split 70/30; 67% of the held-out
        30% is used as the test set and the remaining third is set aside unused,
        as in the original evaluation protocol.

        The filtered rows are kept local, so ``self.merged_df`` and
        ``self.independent_variable`` still describe the full data set afterwards
        and the classifier can be (re)trained in any order.
        ``self.dependent_variable_TNE_NO_365`` is set to the filtered target.

        Args:
            random_state: forwarded to both ``train_test_split`` calls; pass an
                int for reproducible splits.
        """
        variable_name = self.REGRESSION_TARGET
        # Select all corresponding columns with values in TNE_NO_365
        rows_with_target = self.merged_df.dropna(subset=[variable_name])

        # define dependent variables
        self.dependent_variable_TNE_NO_365 = rows_with_target[[variable_name]]
        # define independent variables
        features = rows_with_target[self.FEATURE_COLUMNS]
        target = rows_with_target[variable_name]

        linear_prediction_model = LinearRegression()
        X_train, X_temp, y_train, y_temp = train_test_split(
            features, target, train_size=0.7, random_state=random_state)
        X_test, _, y_test, _ = train_test_split(
            X_temp, y_temp, test_size=0.33, random_state=random_state)
        linear_prediction_model.fit(X_train, y_train)

        print(f'\n****** Prediction Model Evaluation result  {variable_name} as dependent variable ******')
        y_pred = linear_prediction_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        print('R-squared:', r2)
        print('Mean squared error:', mse)
                

# Step 1: read the source CSV whose path is taken from the
# FILE_Full_Phecode_ATC_PATH environment variable (os.getenv returns None when
# it is unset) and write the one-hot encoded 'Split_Full_Phecode_ATC4.csv'.
SplitMedicationDiagnosisInUniquePieces_Obj = SplitMedicationDiagnosisInUniquePieces(os.getenv("FILE_Full_Phecode_ATC_PATH"))
SplitMedicationDiagnosisInUniquePieces_Obj.split_diagnosis_medication_in_unique_pieces()

# Step 2: load the split file, then run the 365-day classifier followed by the
# 365-day regression; both print their evaluation results.
ClassifyReadmissionWithMedicationDiagnosis_Obj = ClassifyReadmissionWithMedicationDiagnosis('Split_Full_Phecode_ATC4.csv')
merged_df, dependent_variable_TNE_BO_365, independent_variable  = ClassifyReadmissionWithMedicationDiagnosis_Obj.load_data()
ClassifyReadmissionWithMedicationDiagnosis_Obj.train_classifier_with_medication_diagnosis()
ClassifyReadmissionWithMedicationDiagnosis_Obj.prediction_with_medication_diagnosis()

##### 3. Classify and Predict for 2 year

**`TNE_BO_730` values (0 and 1) are used to classify readmission, where 1 means readmission. Further, for prediction, only the data of readmitted patients is taken, and a linear regression model is trained on it to predict the number of days until readmission.**
- Dataset Used: `/mnt/work/workbench/dipendrp/new-data/Full_Phecode_ATC4.csv`
- Independent variable: `'age', 'remaining_time_countdown','var_no_dates_permonth', 'gender','closingcode', 'aftercode', 'Length_of_Episode', 
                        'Count_visit','Therapy_ratio', 'Examination_ratio', 'Advisory_ratio','TreatmentPlanning_ratio', 'Outpatient_ratio',
                        'Inpatient_daynight_ratio', 'Care_intensity', 'num_diagnoses', 'num_medications', 
                        and unique 'actual_diag_Phe', 'actual_med_ATC4'`
- Dependent Variable: For classification it is `TNE_BO_730`, and for prediction it is `TNE_NO_730`

In [None]:
import re
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import random
from IPython.display import display
from IPython.display import Image
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

import os
from dotenv import load_dotenv
load_dotenv()


######### Split Medication and Diagnosis In Unique Pieces ############
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################

class SplitMedicationDiagnosisInUniquePieces:
    """Turn the diagnosis and medication code lists of the source CSV into
    one indicator column per code and save the result as a new CSV."""

    # Columns kept from the source CSV.
    SOURCE_COLUMNS = [
        'episode_id', 'num_diagnoses', 'num_medications', 'pasient', 'age',
        'remaining_time_countdown', 'var_no_dates_permonth', 'gender',
        'episode_order', 'islast', 'closingcode', 'aftercode',
        'episode_start_date', 'episode_end_date', 'tillnextepisode',
        'Length_of_Episode', 'Cat_LOE', 'TNE_BO_180', 'TNE_NO_180',
        'TNE_BO_365', 'TNE_NO_365', 'TNE_BO_730', 'TNE_NO_730', 'TNE_BO_1095',
        'TNE_NO_1095', 'Cat_LOE_desc', 'Count_visit', 'Cat_CV', 'Therapy_ratio',
        'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
        'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
        'Inpatient_daynight_ratio', 'Care_intensity', 'age_group',
        'closingcode_0', 'closingcode_1', 'closingcode_2', 'closingcode_3',
        'closingcode_4', 'closingcode_5', 'closingcode_6', 'closingcode_9',
        'aftercode_1', 'aftercode_2', 'aftercode_3', 'aftercode_4',
        'aftercode_5', 'gender_0', 'F', 'M', 'MiddleChildhood', 'Preschooler',
        'Teenager', 'actual_diag_Phe', 'actual_med_ATC4',
    ]

    # Exact names of the diagnosis and medication columns to split.
    CODE_LIST_COLUMNS = ['actual_diag_Phe', 'actual_med_ATC4']

    # File written by split_diagnosis_medication_in_unique_pieces.
    OUTPUT_FILE = 'Split_Full_Phecode_ATC4.csv'

    def __init__(self, file1):
        """Store the path of the source CSV; nothing is read yet."""
        self.file1 = file1
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None

    def load_data_to_split(self):
        """Read the source CSV and keep ``SOURCE_COLUMNS``.

        Returns:
            pandas.DataFrame: a deep copy of the selected columns, also stored
            as ``self.merged_df``.

        Raises:
            KeyError: if the CSV lacks any of ``SOURCE_COLUMNS``.
        """
        Full_Phecode_ATC4 = pd.read_csv(self.file1)
        self.merged_df = Full_Phecode_ATC4[self.SOURCE_COLUMNS].copy(deep=True)
        return self.merged_df

    def split_diagnosis_medication_in_unique_pieces(self):
        """One-hot encode the diagnosis and medication codes and save the result.

        For each column in ``CODE_LIST_COLUMNS`` the list punctuation (``[``,
        ``]``, ``'`` and spaces) is stripped from the stored string, the string
        is split on commas, the frame is exploded to one row per code, one 0/1
        indicator column per distinct code is appended and the original column
        is dropped. The result is written, without the index, to
        ``OUTPUT_FILE`` in the current working directory. Nothing is returned.

        NOTE(review): because both columns are exploded, a row holding ``k``
        diagnosis codes and ``m`` medication codes becomes ``k * m`` output rows,
        each flagging exactly one diagnosis and one medication. This is the
        behaviour of the original code and is preserved here -- confirm it is
        intended, since the duplicated rows feed the train/test split downstream.
        """
        # Load through this instance; the original reached for a module-level
        # object of a fixed name, which broke for any other instance.
        newmerged_df = self.load_data_to_split()

        for col in self.CODE_LIST_COLUMNS:
            # Missing values become an empty string, which yields no indicator.
            codes = newmerged_df[col].fillna('')
            for char in ("[", "]", "'", " "):
                # regex=False: "[" must be taken literally on every pandas version.
                codes = codes.str.replace(char, "", regex=False)
            newmerged_df[col] = codes.str.split(',')

            # One row per code; a fresh index keeps the concat below purely
            # positional instead of relying on matching duplicate labels.
            newmerged_df = newmerged_df.explode(col).reset_index(drop=True)

            dummies = newmerged_df[col].str.get_dummies(sep=',')
            newmerged_df = pd.concat([newmerged_df, dummies], axis=1)
            newmerged_df = newmerged_df.drop(col, axis=1)

        newmerged_df.to_csv(self.OUTPUT_FILE, index=False)
        
    
class ClassifyReadmissionWithMedicationDiagnosis:
    def __init__(self, file2):
        self.file2 = file2
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None
    
    def load_data(self):
        original_df_1 = pd.read_csv(self.file2)
        self.merged_df = original_df_1[['num_diagnoses', 'num_medications', 'remaining_time_countdown', 'var_no_dates_permonth','Length_of_Episode', 'Count_visit', 'Therapy_ratio','Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio','Outpatient_ratio','Inpatient_daynight_ratio', 'Care_intensity', 'closingcode_0', 'closingcode_1', 'closingcode_2', 'closingcode_3','closingcode_4', 'closingcode_5', 'closingcode_6', 'closingcode_9',
        'aftercode_1', 'aftercode_2', 'aftercode_3', 'aftercode_4','aftercode_5', 'gender_0', 'F', 'M', 'MiddleChildhood', 'Preschooler','Teenager', 'TNE_NO_730','TNE_BO_730',
        '010', '0703', '0704', '079', '0792', '1000', '1008',
       '1009', '1011', '1013', '1015', '136', '1499', '1551', '1701', '1702',
       '18911', '1911', '19111', '1994', '201', '2022', '20411', '20421',
       '2441', '2442', '2444', '2445', '24521', '2501', '25011', '2502',
       '2531', '25311', '2532', '2553', '2564', '259', '260', '2602', '2612',
       '2614', '262', '2701', '2713', '27614', '2781', '2791', '2801', '28731',
       '288', '28811', '2903', '2911', '2914', '2922', '295', '2951', '2952',
       '2953', '296', '2961', '29622', '2972', '300', '3001', '3003', '3004',
       '3008', '3009', '301', '3011', '3012', '302', '303', '3031', '3033',
       '3034', '304', '3052', '306', '3069', '312', '3123', '313', '3131',
       '3132', '3133', '315', '3151', '3152', '3153', '316', '317', '323',
       '324', '327', '3274', '32741', '3275', '3276', '3311', '339', '340',
       '3401', '341', '343', '344', '345', '3451', '34511', '34512', '3453',
       '347', '3484', '357', '3591', '3592', '3621', '365', '3671', '3672',
       '3678', '3679', '368', '3681', '3684', '3695', '3711', '3781', '3782',
       '3811', '389', '3891', '3892', '3894', '3895', '42711', '4332', '4338',
       '4589', '472', '4741', '4742', '476', '4801', '495', '496', '499',
       '506', '5091', '510', '53011', '53014', '5302', '5305', '535', '5551',
       '5552', '55521', '558', '5611', '563', '564', '5641', '565', '57181',
       '5772', '591', '5921', '5965', '5994', '6041', '62611', '6264', '637',
       '6491', '656', '6562', '6563', '6564', '657', '658', '661', '691',
       '6941', '69542', '6964', '7061', '71011', '7142', '7169', '7282',
       '7321', '733', '736', '7373', '739', '745', '7471', '74711', '74712',
       '74713', '7472', '748', '749', '7491', '7492', '75011', '75013',
       '75021', '75112', '7512', '75121', '75211', '7522', '7531', '7551',
       '7554', '7556', '75561', '756', '7561', '758', '7581', '759', '760',
       '770', '773', '7981', '8003', '802', '8032', '8033', '805', '817',
       '818', '819', '830', '835', '871', '907', '915', '930', '939', '946',
       '947', '949', '969', '979', '981', 'A02B', 'A03F', 'A06A', 'A06B',
       'A07A', 'A10B', 'A11E', 'A12A', 'A12B', 'B03A', 'B03B', 'C02A', 'C07A',
       'C09A', 'D01A', 'D06A', 'D06B', 'D07A', 'D07B', 'D07X', 'D09A', 'D10A',
       'G03A', 'G03C', 'G03F', 'G03H', 'H01B', 'H02A', 'J01C', 'J01F', 'J05A',
       'M01A', 'N01B', 'N02A', 'N02B', 'N02C', 'N03A', 'N05A', 'N05B', 'N05C',
       'N06A', 'N06B', 'N07B', 'NO6B', 'R01A', 'R03A', 'R03B', 'R05D', 'R06A',
       'S01A', 'S01G']]
        
        
       
        # fill nan values in independent column
        self.merged_df['num_diagnoses'].fillna(0, inplace=True)
        self.merged_df['num_medications'].fillna(0, inplace=True)
        self.merged_df['Outpatient_ratio'].fillna(0, inplace=True)
        self.merged_df['Inpatient_daynight_ratio'].fillna(0, inplace=True)  
        self.merged_df['Therapy_ratio'].fillna(0, inplace=True)
        self.merged_df['Examination_ratio'].fillna(0, inplace=True)
        self.merged_df['Advisory_ratio'].fillna(0, inplace=True)
        self.merged_df['TreatmentPlanning_ratio'].fillna(0, inplace=True)
        
        
        # define dependent variables
        self.dependent_variable_TNE_BO_730 = self.merged_df[['TNE_BO_730']]
        
        self.independent_variable = self.merged_df[['num_diagnoses', 'num_medications', 'remaining_time_countdown', 'var_no_dates_permonth','Length_of_Episode', 'Count_visit', 'Therapy_ratio','Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio','Outpatient_ratio','Inpatient_daynight_ratio', 'Care_intensity', 'closingcode_0', 'closingcode_1', 'closingcode_2', 'closingcode_3','closingcode_4', 'closingcode_5', 'closingcode_6', 'closingcode_9',
        'aftercode_1', 'aftercode_2', 'aftercode_3', 'aftercode_4','aftercode_5', 'gender_0', 'F', 'M', 'MiddleChildhood', 'Preschooler','Teenager',
        '010', '0703', '0704', '079', '0792', '1000', '1008',
       '1009', '1011', '1013', '1015', '136', '1499', '1551', '1701', '1702',
       '18911', '1911', '19111', '1994', '201', '2022', '20411', '20421',
       '2441', '2442', '2444', '2445', '24521', '2501', '25011', '2502',
       '2531', '25311', '2532', '2553', '2564', '259', '260', '2602', '2612',
       '2614', '262', '2701', '2713', '27614', '2781', '2791', '2801', '28731',
       '288', '28811', '2903', '2911', '2914', '2922', '295', '2951', '2952',
       '2953', '296', '2961', '29622', '2972', '300', '3001', '3003', '3004',
       '3008', '3009', '301', '3011', '3012', '302', '303', '3031', '3033',
       '3034', '304', '3052', '306', '3069', '312', '3123', '313', '3131',
       '3132', '3133', '315', '3151', '3152', '3153', '316', '317', '323',
       '324', '327', '3274', '32741', '3275', '3276', '3311', '339', '340',
       '3401', '341', '343', '344', '345', '3451', '34511', '34512', '3453',
       '347', '3484', '357', '3591', '3592', '3621', '365', '3671', '3672',
       '3678', '3679', '368', '3681', '3684', '3695', '3711', '3781', '3782',
       '3811', '389', '3891', '3892', '3894', '3895', '42711', '4332', '4338',
       '4589', '472', '4741', '4742', '476', '4801', '495', '496', '499',
       '506', '5091', '510', '53011', '53014', '5302', '5305', '535', '5551',
       '5552', '55521', '558', '5611', '563', '564', '5641', '565', '57181',
       '5772', '591', '5921', '5965', '5994', '6041', '62611', '6264', '637',
       '6491', '656', '6562', '6563', '6564', '657', '658', '661', '691',
       '6941', '69542', '6964', '7061', '71011', '7142', '7169', '7282',
       '7321', '733', '736', '7373', '739', '745', '7471', '74711', '74712',
       '74713', '7472', '748', '749', '7491', '7492', '75011', '75013',
       '75021', '75112', '7512', '75121', '75211', '7522', '7531', '7551',
       '7554', '7556', '75561', '756', '7561', '758', '7581', '759', '760',
       '770', '773', '7981', '8003', '802', '8032', '8033', '805', '817',
       '818', '819', '830', '835', '871', '907', '915', '930', '939', '946',
       '947', '949', '969', '979', '981', 'A02B', 'A03F', 'A06A', 'A06B',
       'A07A', 'A10B', 'A11E', 'A12A', 'A12B', 'B03A', 'B03B', 'C02A', 'C07A',
       'C09A', 'D01A', 'D06A', 'D06B', 'D07A', 'D07B', 'D07X', 'D09A', 'D10A',
       'G03A', 'G03C', 'G03F', 'G03H', 'H01B', 'H02A', 'J01C', 'J01F', 'J05A',
       'M01A', 'N01B', 'N02A', 'N02B', 'N02C', 'N03A', 'N05A', 'N05B', 'N05C',
       'N06A', 'N06B', 'N07B', 'NO6B', 'R01A', 'R03A', 'R03B', 'R05D', 'R06A',
       'S01A', 'S01G']]
        
        return self.merged_df, self.dependent_variable_TNE_BO_730, self.independent_variable 
    
    # To train and view the result of the binary or multiclass classifier
    def train_classifier_with_medication_diagnosis(self, random_state=None):
        """Fit a class-weighted logistic regression on ``TNE_BO_730`` and print metrics.

        Requires ``load_data`` to have been called. The data is split 70/30 into
        train and test sets. Accuracy, precision, recall, F1, the confusion
        matrix and the classification report are printed for the test set;
        binary averaging is used when the training target has exactly two
        classes, weighted averaging otherwise.

        Args:
            random_state: forwarded to ``train_test_split``; pass an int for a
                reproducible split. ``None`` keeps the original random split.
        """
        variable_name = 'TNE_BO_730'
        # to define the weight of output class, to resolve imbalanced output/dataset
        class_weight = {0: 10, 1: 270}
        # Use a 1-D target: a one-column frame triggers column-vector warnings
        # and makes nunique() return a Series instead of a number.
        target = self.dependent_variable_TNE_BO_730[variable_name]

        logistic_prediction_model = LogisticRegression(class_weight=class_weight)
        X_train, X_test, y_train, y_test = train_test_split(
            self.independent_variable, target, train_size=0.7,
            random_state=random_state)
        logistic_prediction_model.fit(X_train, y_train)
        y_pred = logistic_prediction_model.predict(X_test)

        # Checks if the output category for classification is binary or multiclass
        is_binary = y_train.nunique() == 2
        average = 'binary' if is_binary else 'weighted'
        print(f'\n****** Evaluation result {variable_name} as dependent variable ******')

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=average)
        recall = recall_score(y_test, y_pred, average=average)
        f1 = f1_score(y_test, y_pred, average=average)

        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1score: {f1}")
        print("Confused matrix:")
        # Fix the label set to the classes seen in training so the matrix keeps
        # its full shape even when a class is absent from the test split.
        conf_matrix = confusion_matrix(
            y_test, y_pred, labels=logistic_prediction_model.classes_)
        print(conf_matrix)
        if is_binary:
            tn, fp, fn, tp = conf_matrix.ravel()
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        print(classification_report(y_test, y_pred))
                
    def prediction_with_medication_diagnosis(self):
        """Fit a linear regression on ``TNE_NO_730`` and print R-squared and MSE.

        Rows whose ``TNE_NO_730`` is NaN are dropped first, so only rows with
        a recorded value take part in the regression.

        NOTE: this method narrows ``self.merged_df`` and overwrites
        ``self.independent_variable`` with the filtered rows. Any step that
        needs the full data set has to run before this method is called.
        """
        target_name = 'TNE_NO_730'
        feature_columns = [
            'num_diagnoses', 'num_medications', 'remaining_time_countdown', 'var_no_dates_permonth',
            'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
            'TreatmentPlanning_ratio', 'Outpatient_ratio', 'Inpatient_daynight_ratio', 'Care_intensity',
            'closingcode_0', 'closingcode_1', 'closingcode_2', 'closingcode_3',
            'closingcode_4', 'closingcode_5', 'closingcode_6', 'closingcode_9',
            'aftercode_1', 'aftercode_2', 'aftercode_3', 'aftercode_4', 'aftercode_5',
            'gender_0', 'F', 'M', 'MiddleChildhood', 'Preschooler', 'Teenager',
            # One-hot diagnosis code columns followed by medication code columns.
            '010', '0703', '0704', '079', '0792', '1000', '1008',
            '1009', '1011', '1013', '1015', '136', '1499', '1551', '1701', '1702',
            '18911', '1911', '19111', '1994', '201', '2022', '20411', '20421',
            '2441', '2442', '2444', '2445', '24521', '2501', '25011', '2502',
            '2531', '25311', '2532', '2553', '2564', '259', '260', '2602', '2612',
            '2614', '262', '2701', '2713', '27614', '2781', '2791', '2801', '28731',
            '288', '28811', '2903', '2911', '2914', '2922', '295', '2951', '2952',
            '2953', '296', '2961', '29622', '2972', '300', '3001', '3003', '3004',
            '3008', '3009', '301', '3011', '3012', '302', '303', '3031', '3033',
            '3034', '304', '3052', '306', '3069', '312', '3123', '313', '3131',
            '3132', '3133', '315', '3151', '3152', '3153', '316', '317', '323',
            '324', '327', '3274', '32741', '3275', '3276', '3311', '339', '340',
            '3401', '341', '343', '344', '345', '3451', '34511', '34512', '3453',
            '347', '3484', '357', '3591', '3592', '3621', '365', '3671', '3672',
            '3678', '3679', '368', '3681', '3684', '3695', '3711', '3781', '3782',
            '3811', '389', '3891', '3892', '3894', '3895', '42711', '4332', '4338',
            '4589', '472', '4741', '4742', '476', '4801', '495', '496', '499',
            '506', '5091', '510', '53011', '53014', '5302', '5305', '535', '5551',
            '5552', '55521', '558', '5611', '563', '564', '5641', '565', '57181',
            '5772', '591', '5921', '5965', '5994', '6041', '62611', '6264', '637',
            '6491', '656', '6562', '6563', '6564', '657', '658', '661', '691',
            '6941', '69542', '6964', '7061', '71011', '7142', '7169', '7282',
            '7321', '733', '736', '7373', '739', '745', '7471', '74711', '74712',
            '74713', '7472', '748', '749', '7491', '7492', '75011', '75013',
            '75021', '75112', '7512', '75121', '75211', '7522', '7531', '7551',
            '7554', '7556', '75561', '756', '7561', '758', '7581', '759', '760',
            '770', '773', '7981', '8003', '802', '8032', '8033', '805', '817',
            '818', '819', '830', '835', '871', '907', '915', '930', '939', '946',
            '947', '949', '969', '979', '981', 'A02B', 'A03F', 'A06A', 'A06B',
            'A07A', 'A10B', 'A11E', 'A12A', 'A12B', 'B03A', 'B03B', 'C02A', 'C07A',
            'C09A', 'D01A', 'D06A', 'D06B', 'D07A', 'D07B', 'D07X', 'D09A', 'D10A',
            'G03A', 'G03C', 'G03F', 'G03H', 'H01B', 'H02A', 'J01C', 'J01F', 'J05A',
            'M01A', 'N01B', 'N02A', 'N02B', 'N02C', 'N03A', 'N05A', 'N05B', 'N05C',
            'N06A', 'N06B', 'N07B', 'NO6B', 'R01A', 'R03A', 'R03B', 'R05D', 'R06A',
            'S01A', 'S01G',
        ]

        # Keep only the rows that have a value in the regression target.
        self.merged_df = self.merged_df.dropna(subset=[target_name])

        # Dependent and independent variables for the regression.
        self.dependent_variable_TNE_NO_730 = self.merged_df[[target_name]]
        self.independent_variable = self.merged_df[feature_columns]

        linear_prediction_model = LinearRegression()
        X_train, X_temp, y_train, y_temp = train_test_split(
            self.independent_variable, self.dependent_variable_TNE_NO_730, train_size=0.7)
        # The 30% remainder is split again; only the test part is scored here,
        # the evaluation part is held out and not used by this method.
        X_test, _X_eval, y_test, _y_eval = train_test_split(X_temp, y_temp, test_size=0.33)
        linear_prediction_model.fit(X_train, y_train)

        print(f'\n****** Prediction Model Evaluation result  {target_name} as dependent variable ******')
        # Predict once; the prediction is reused for both metrics.
        y_pred = linear_prediction_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        print('R-squared:', r2)
        print('Mean squared error:', mse)
                

# Step 1: build the splitter from the CSV path stored in the
# FILE_Full_Phecode_ATC_PATH environment variable and run the split.
# NOTE(review): os.getenv returns None when the variable is not set - confirm
# the .env file defines it before running this cell.
SplitMedicationDiagnosisInUniquePieces_Obj = SplitMedicationDiagnosisInUniquePieces(os.getenv("FILE_Full_Phecode_ATC_PATH"))
SplitMedicationDiagnosisInUniquePieces_Obj.split_diagnosis_medication_in_unique_pieces()

# Step 2: load 'Split_Full_Phecode_ATC4.csv' (presumably the file written by
# the split step above - verify), train the TNE_BO_730 classifier, then run
# the TNE_NO_730 regression.
ClassifyReadmissionWithMedicationDiagnosis_Obj = ClassifyReadmissionWithMedicationDiagnosis('Split_Full_Phecode_ATC4.csv')
merged_df, dependent_variable_TNE_BO_730, independent_variable  = ClassifyReadmissionWithMedicationDiagnosis_Obj.load_data()
ClassifyReadmissionWithMedicationDiagnosis_Obj.train_classifier_with_medication_diagnosis()
ClassifyReadmissionWithMedicationDiagnosis_Obj.prediction_with_medication_diagnosis()

##### 4. Classify and Predict for 3 year

**`TNE_BO_1095` values (0 and 1) are used to classify readmission, where 1 means readmission. For prediction, only the data of readmitted patients is used, and a linear regression model is trained on it to predict the number of days until readmission.**
- Dataset Used: `/mnt/work/workbench/dipendrp/new-data/Full_Phecode_ATC4.csv`
- Independent variable: `'age', 'remaining_time_countdown','var_no_dates_permonth', 'gender','closingcode', 'aftercode', 'Length_of_Episode', 
                        'Count_visit','Therapy_ratio', 'Examination_ratio', 'Advisory_ratio','TreatmentPlanning_ratio', 'Outpatient_ratio',
                        'Inpatient_daynight_ratio', 'Care_intensity', 'num_diagnoses', 'num_medications', 
                        and unique 'actual_diag_Phe', 'actual_med_ATC4'`
- Dependent Variable: For classification it is `TNE_BO_1095`, and for prediction it is `TNE_NO_1095`

In [None]:
import re
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import random
from IPython.display import display
from IPython.display import Image
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

import os
from dotenv import load_dotenv
load_dotenv()


######### Split Medication and Diagnosis In Unique Pieces ############
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################

class SplitMedicationDiagnosisInUniquePieces:
    """Turn the diagnosis and medication code lists into one-hot columns.

    The input CSV stores the codes of each row as a stringified list such as
    ``"['300', '3001']"``. Each code list is cleaned, exploded to one row per
    code and replaced by indicator columns named after the codes. The result
    is written to ``OUTPUT_FILE`` in the current working directory.
    """

    # Columns kept from the input CSV; the last two hold the code lists.
    SELECTED_COLUMNS = [
        'episode_id', 'num_diagnoses', 'num_medications', 'pasient', 'age',
        'remaining_time_countdown', 'var_no_dates_permonth', 'gender',
        'episode_order', 'islast', 'closingcode', 'aftercode',
        'episode_start_date', 'episode_end_date', 'tillnextepisode',
        'Length_of_Episode', 'Cat_LOE', 'TNE_BO_180', 'TNE_NO_180',
        'TNE_BO_365', 'TNE_NO_365', 'TNE_BO_730', 'TNE_NO_730', 'TNE_BO_1095',
        'TNE_NO_1095', 'Cat_LOE_desc', 'Count_visit', 'Cat_CV', 'Therapy_ratio',
        'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
        'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
        'Inpatient_daynight_ratio', 'Care_intensity', 'age_group',
        'closingcode_0', 'closingcode_1', 'closingcode_2', 'closingcode_3',
        'closingcode_4', 'closingcode_5', 'closingcode_6', 'closingcode_9',
        'aftercode_1', 'aftercode_2', 'aftercode_3', 'aftercode_4',
        'aftercode_5', 'gender_0', 'F', 'M', 'MiddleChildhood', 'Preschooler',
        'Teenager', 'actual_diag_Phe', 'actual_med_ATC4',
    ]
    # Exact names of the diagnosis and medication columns to split.
    CODE_LIST_COLUMNS = ['actual_diag_Phe', 'actual_med_ATC4']
    # Name of the CSV written by split_diagnosis_medication_in_unique_pieces.
    OUTPUT_FILE = 'Split_Full_Phecode_ATC4.csv'
    # Characters stripped from a stringified list: brackets, quotes, spaces.
    _LIST_PUNCTUATION = r"[\[\]' ]"

    def __init__(self, file1):
        """Remember the path of the input CSV; no file is read yet."""
        self.file1 = file1
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None

    def load_data_to_split(self):
        """Read ``self.file1`` and keep only ``SELECTED_COLUMNS``.

        Returns:
            A deep copy of the selected columns, also stored in
            ``self.merged_df``.

        Raises:
            KeyError: if the CSV lacks one of ``SELECTED_COLUMNS``.
        """
        full_frame = pd.read_csv(self.file1)
        self.merged_df = full_frame[self.SELECTED_COLUMNS].copy(deep=True)
        return self.merged_df

    def split_diagnosis_medication_in_unique_pieces(self):
        """One-hot encode both code-list columns and write ``OUTPUT_FILE``.

        A row with k codes in a column becomes k rows, one per code, each
        with a single indicator set. Rows without codes keep one row with all
        indicators of that column at zero.
        """
        # Load through this instance, not through a module-level object, so
        # the method works for any instance.
        frame = self.load_data_to_split()
        for col in self.CODE_LIST_COLUMNS:
            # Missing lists become empty strings so the .str accessor also
            # works when every value of the column is missing.
            frame[col] = (
                frame[col]
                .fillna('')
                .astype(str)
                # regex is given explicitly: the default changed between
                # pandas versions.
                .str.replace(self._LIST_PUNCTUATION, '', regex=True)
                .str.split(',')
            )
            frame = frame.explode(col)

            dummies = frame[col].str.get_dummies(sep=',')
            # Indicator columns are appended after the existing columns.
            frame = pd.concat([frame.drop(col, axis=1), dummies], axis=1)

        frame.to_csv(self.OUTPUT_FILE, index=False)
        
    
class ClassifyReadmissionWithMedicationDiagnosis:
    """Classify ``TNE_BO_1095`` and regress ``TNE_NO_1095`` on the split data set.

    Intended call order: ``load_data`` ->
    ``train_classifier_with_medication_diagnosis`` ->
    ``prediction_with_medication_diagnosis``. The last step narrows
    ``self.merged_df`` to rows with a regression target, so the classifier
    must be trained before it.
    """

    # Target of the logistic regression (classification).
    CLASSIFICATION_TARGET = 'TNE_BO_1095'
    # Target of the linear regression (prediction).
    REGRESSION_TARGET = 'TNE_NO_1095'
    # Weights of the output classes, used to counter the imbalanced data set.
    CLASS_WEIGHTS = {0: 10, 1: 270}
    # Independent columns whose NaN values are replaced with 0 on load.
    NAN_AS_ZERO_COLUMNS = [
        'num_diagnoses', 'num_medications', 'Outpatient_ratio', 'Inpatient_daynight_ratio',
        'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
    ]
    # Episode-level features.
    BASE_FEATURE_COLUMNS = [
        'num_diagnoses', 'num_medications', 'remaining_time_countdown', 'var_no_dates_permonth',
        'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio', 'Inpatient_daynight_ratio', 'Care_intensity',
        'closingcode_0', 'closingcode_1', 'closingcode_2', 'closingcode_3',
        'closingcode_4', 'closingcode_5', 'closingcode_6', 'closingcode_9',
        'aftercode_1', 'aftercode_2', 'aftercode_3', 'aftercode_4', 'aftercode_5',
        'gender_0', 'F', 'M', 'MiddleChildhood', 'Preschooler', 'Teenager',
    ]
    # One-hot diagnosis code columns followed by medication code columns.
    CODE_FEATURE_COLUMNS = [
        '010', '0703', '0704', '079', '0792', '1000', '1008',
        '1009', '1011', '1013', '1015', '136', '1499', '1551', '1701', '1702',
        '18911', '1911', '19111', '1994', '201', '2022', '20411', '20421',
        '2441', '2442', '2444', '2445', '24521', '2501', '25011', '2502',
        '2531', '25311', '2532', '2553', '2564', '259', '260', '2602', '2612',
        '2614', '262', '2701', '2713', '27614', '2781', '2791', '2801', '28731',
        '288', '28811', '2903', '2911', '2914', '2922', '295', '2951', '2952',
        '2953', '296', '2961', '29622', '2972', '300', '3001', '3003', '3004',
        '3008', '3009', '301', '3011', '3012', '302', '303', '3031', '3033',
        '3034', '304', '3052', '306', '3069', '312', '3123', '313', '3131',
        '3132', '3133', '315', '3151', '3152', '3153', '316', '317', '323',
        '324', '327', '3274', '32741', '3275', '3276', '3311', '339', '340',
        '3401', '341', '343', '344', '345', '3451', '34511', '34512', '3453',
        '347', '3484', '357', '3591', '3592', '3621', '365', '3671', '3672',
        '3678', '3679', '368', '3681', '3684', '3695', '3711', '3781', '3782',
        '3811', '389', '3891', '3892', '3894', '3895', '42711', '4332', '4338',
        '4589', '472', '4741', '4742', '476', '4801', '495', '496', '499',
        '506', '5091', '510', '53011', '53014', '5302', '5305', '535', '5551',
        '5552', '55521', '558', '5611', '563', '564', '5641', '565', '57181',
        '5772', '591', '5921', '5965', '5994', '6041', '62611', '6264', '637',
        '6491', '656', '6562', '6563', '6564', '657', '658', '661', '691',
        '6941', '69542', '6964', '7061', '71011', '7142', '7169', '7282',
        '7321', '733', '736', '7373', '739', '745', '7471', '74711', '74712',
        '74713', '7472', '748', '749', '7491', '7492', '75011', '75013',
        '75021', '75112', '7512', '75121', '75211', '7522', '7531', '7551',
        '7554', '7556', '75561', '756', '7561', '758', '7581', '759', '760',
        '770', '773', '7981', '8003', '802', '8032', '8033', '805', '817',
        '818', '819', '830', '835', '871', '907', '915', '930', '939', '946',
        '947', '949', '969', '979', '981', 'A02B', 'A03F', 'A06A', 'A06B',
        'A07A', 'A10B', 'A11E', 'A12A', 'A12B', 'B03A', 'B03B', 'C02A', 'C07A',
        'C09A', 'D01A', 'D06A', 'D06B', 'D07A', 'D07B', 'D07X', 'D09A', 'D10A',
        'G03A', 'G03C', 'G03F', 'G03H', 'H01B', 'H02A', 'J01C', 'J01F', 'J05A',
        'M01A', 'N01B', 'N02A', 'N02B', 'N02C', 'N03A', 'N05A', 'N05B', 'N05C',
        'N06A', 'N06B', 'N07B', 'NO6B', 'R01A', 'R03A', 'R03B', 'R05D', 'R06A',
        'S01A', 'S01G',
    ]
    # Every independent variable, in model input order.
    FEATURE_COLUMNS = BASE_FEATURE_COLUMNS + CODE_FEATURE_COLUMNS

    def __init__(self, file2):
        """Remember the path of the split CSV; no file is read yet."""
        self.file2 = file2
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None

    def load_data(self):
        """Read ``self.file2`` and prepare the classification inputs.

        Returns:
            A tuple ``(merged_df, dependent_variable, independent_variable)``:
            the selected columns (features plus both targets), the
            one-column ``TNE_BO_1095`` frame, and the feature frame.

        Raises:
            KeyError: if the CSV lacks one of the selected columns.
        """
        raw = pd.read_csv(self.file2)
        selected = (
            self.BASE_FEATURE_COLUMNS
            + [self.REGRESSION_TARGET, self.CLASSIFICATION_TARGET]
            + self.CODE_FEATURE_COLUMNS
        )
        self.merged_df = raw[selected].copy()

        # Fill NaN values in the independent columns. Assigning the result
        # back avoids chained in-place fillna, which does not modify the
        # frame when pandas Copy-on-Write is active.
        self.merged_df[self.NAN_AS_ZERO_COLUMNS] = (
            self.merged_df[self.NAN_AS_ZERO_COLUMNS].fillna(0)
        )

        # Define dependent and independent variables.
        self.dependent_variable_TNE_BO_1095 = self.merged_df[[self.CLASSIFICATION_TARGET]]
        self.independent_variable = self.merged_df[self.FEATURE_COLUMNS]

        return self.merged_df, self.dependent_variable_TNE_BO_1095, self.independent_variable

    def train_classifier_with_medication_diagnosis(self):
        """Train a weighted logistic regression on ``TNE_BO_1095`` and print its scores.

        Uses a 70/30 train/test split. Precision, recall and F1 use
        ``average='binary'`` when the training labels hold exactly two
        classes and ``average='weighted'`` otherwise.
        """
        variable_name = self.CLASSIFICATION_TARGET
        # 1-D labels: scikit-learn expects a vector, not a one-column frame.
        labels = self.dependent_variable_TNE_BO_1095[variable_name]

        logistic_prediction_model = LogisticRegression(class_weight=self.CLASS_WEIGHTS)
        X_train, X_test, y_train, y_test = train_test_split(
            self.independent_variable, labels, train_size=0.7)
        logistic_prediction_model.fit(X_train, y_train)
        y_pred = logistic_prediction_model.predict(X_test)

        # Check whether the output category is binary or multiclass.
        is_binary = y_train.nunique() == 2
        average = 'binary' if is_binary else 'weighted'

        print(f'\n****** Evaluation result {variable_name} as dependent variable ******')
        print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
        print(f"Precision: {precision_score(y_test, y_pred, average=average)}")
        print(f"Recall: {recall_score(y_test, y_pred, average=average)}")
        print(f"F1score: {f1_score(y_test, y_pred, average=average)}")
        print("Confused matrix:")
        # Passing the trained classes keeps the matrix shape fixed even when
        # the test split happens to miss a class.
        conf_matrix = confusion_matrix(
            y_test, y_pred, labels=logistic_prediction_model.classes_)
        print(conf_matrix)
        if is_binary:
            tn, fp, fn, tp = conf_matrix.ravel()
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        print(classification_report(y_test, y_pred))

    def prediction_with_medication_diagnosis(self):
        """Fit a linear regression on ``TNE_NO_1095`` and print R-squared and MSE.

        Rows whose ``TNE_NO_1095`` is NaN are dropped first, so only rows with
        a recorded value take part in the regression.

        NOTE: this method narrows ``self.merged_df`` and overwrites
        ``self.independent_variable`` with the filtered rows, so it has to
        run after ``train_classifier_with_medication_diagnosis``.
        """
        variable_name = self.REGRESSION_TARGET
        # Keep only the rows that have a value in the regression target.
        self.merged_df = self.merged_df.dropna(subset=[variable_name])

        self.dependent_variable_TNE_NO_1095 = self.merged_df[[variable_name]]
        self.independent_variable = self.merged_df[self.FEATURE_COLUMNS]

        linear_prediction_model = LinearRegression()
        X_train, X_temp, y_train, y_temp = train_test_split(
            self.independent_variable, self.dependent_variable_TNE_NO_1095, train_size=0.7)
        # The 30% remainder is split again; only the test part is scored here,
        # the evaluation part is held out and not used by this method.
        X_test, _X_eval, y_test, _y_eval = train_test_split(X_temp, y_temp, test_size=0.33)
        linear_prediction_model.fit(X_train, y_train)

        print(f'\n****** Prediction Model Evaluation result  {variable_name} as dependent variable ******')
        # Predict once; the prediction is reused for both metrics.
        y_pred = linear_prediction_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        print('R-squared:', r2)
        print('Mean squared error:', mse)
                

# Step 1: build the splitter from the CSV path stored in the
# FILE_Full_Phecode_ATC_PATH environment variable and write the one-hot
# encoded data set to 'Split_Full_Phecode_ATC4.csv'.
# NOTE(review): os.getenv returns None when the variable is not set - confirm
# the .env file defines it before running this cell.
SplitMedicationDiagnosisInUniquePieces_Obj = SplitMedicationDiagnosisInUniquePieces(os.getenv("FILE_Full_Phecode_ATC_PATH"))
SplitMedicationDiagnosisInUniquePieces_Obj.split_diagnosis_medication_in_unique_pieces()

# Step 2: load the split file, train the TNE_BO_1095 classifier, then run the
# TNE_NO_1095 regression. The regression drops rows from the loaded frame, so
# it is called last.
ClassifyReadmissionWithMedicationDiagnosis_Obj = ClassifyReadmissionWithMedicationDiagnosis('Split_Full_Phecode_ATC4.csv')
merged_df, dependent_variable_TNE_BO_1095, independent_variable  = ClassifyReadmissionWithMedicationDiagnosis_Obj.load_data()
ClassifyReadmissionWithMedicationDiagnosis_Obj.train_classifier_with_medication_diagnosis()
ClassifyReadmissionWithMedicationDiagnosis_Obj.prediction_with_medication_diagnosis()