#### Main Task: Classification and Prediction of readmission without medication and diagnosis columns.
##### Sub tasks
1. Classify and Predict for 180 days
2. Classify and Predict for 1 year
3. Classify and Predict for 2 years
4. Classify and Predict for 3 years

- **Dataset Used:** `/mnt/work/workbench/dipendrp/new-data/final_episodes.csv`
- To run the script without `dotenv`, replace `os.getenv("FILE_Full_Phecode_ATC_PATH")` with the path to your file.

##### 1. Classify and Predict for 180 days

**`TNE_BO_180` values (0 and 1) are used to classify readmission, where 1 means readmission. For prediction, only the data of readmitted patients is used, and a linear regression model is trained on it to predict the number of days until readmission.**
- Independent variable: `'age', 'remaining_time_countdown','var_no_dates_permonth', 'gender','closingcode', 'aftercode', 'Length_of_Episode', 
                                                   'Count_visit','Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
                                                   'TreatmentPlanning_ratio', 'Outpatient_ratio','Inpatient_daynight_ratio', 'Care_intensity', 'num_diagnoses', 'num_medications'`
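- Dependent Variable: For classification it is `TNE_BO_180` and for prediction it is `TNE_NO_180`
- Other Important Info: The dataset has NaN values. NaN in the ratio and count feature columns (`Therapy_ratio`, `Examination_ratio`, `Advisory_ratio`, `TreatmentPlanning_ratio`, `Outpatient_ratio`, `Inpatient_daynight_ratio`, `num_diagnoses`, `num_medications`) has been replaced with 0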

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import random
from IPython.display import display
from IPython.display import Image

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt

import os
from dotenv import load_dotenv
load_dotenv()

################## Without Medication and Diagnosis ##################
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################

class ClassifyReadmissionWithoutMedicationDiagnosis:
    """Readmission classification and days-to-readmission regression (180 days).

    Workflow: ``load_data()`` reads the CSV and prepares the feature/target
    frames, ``train_classifier_without_medication_diagnosis()`` fits a
    class-weighted logistic regression on ``TNE_BO_180``, and
    ``prediction_without_medication_diagnosis()`` fits a linear regression on
    ``TNE_NO_180`` using only the rows where that column is present.
    """

    # Columns kept from the raw CSV.
    SELECTED_COLUMNS = (
        'age', 'remaining_time_countdown',
        'var_no_dates_permonth', 'gender', 'episode_order',
        'closingcode', 'aftercode', 'tillnextepisode', 'Length_of_Episode', 'Cat_LOE', 'TNE_BO_180',
        'TNE_NO_180', 'TNE_BO_365', 'TNE_NO_365', 'TNE_BO_730', 'TNE_NO_730',
        'TNE_BO_1095', 'TNE_NO_1095', 'Cat_LOE_desc', 'Count_visit', 'Cat_CV',
        'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio', 'Inpatient_ratio',
        'Inpatient_day_ratio', 'Inpatient_daynight_ratio', 'Care_intensity',
        'num_diagnoses', 'num_medications',
    )

    # Independent variables used by both models.
    FEATURE_COLUMNS = (
        'age', 'remaining_time_countdown', 'var_no_dates_permonth', 'gender', 'closingcode', 'aftercode', 'Length_of_Episode',
        'Count_visit', 'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio', 'Inpatient_daynight_ratio', 'Care_intensity', 'num_diagnoses', 'num_medications',
    )

    # Feature columns whose missing values are replaced with 0.
    ZERO_FILLED_COLUMNS = (
        'num_diagnoses', 'num_medications', 'Outpatient_ratio', 'Inpatient_daynight_ratio',
        'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
    )

    CLASS_TARGET = 'TNE_BO_180'   # binary readmission flag (classification target)
    DAYS_TARGET = 'TNE_NO_180'    # days until readmission (regression target)

    # Per-class weights passed to LogisticRegression to counter class imbalance.
    CLASS_WEIGHTS = {0: 10, 1: 270}

    def __init__(self, file1):
        """Store the CSV location; nothing is read until ``load_data()``.

        Args:
            file1: Path (or file-like object) accepted by ``pandas.read_csv``.
        """
        self.file1 = file1
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None

    def load_data(self):
        """Read the CSV, keep the selected columns and split features/target.

        Returns:
            Tuple ``(merged_df, dependent_variable_TNE_BO_180,
            independent_variable)``: the selected frame, the single-column
            classification target frame and the feature frame.
        """
        raw_frame = pd.read_csv(self.file1)

        # Take an explicit copy so the fill below writes to a frame we own.
        self.merged_df = raw_frame[list(self.SELECTED_COLUMNS)].copy()

        # Assign the filled columns back instead of the chained
        # ``df[col].fillna(0, inplace=True)``, which does not update the frame
        # when pandas Copy-on-Write is active.
        fill_columns = list(self.ZERO_FILLED_COLUMNS)
        self.merged_df[fill_columns] = self.merged_df[fill_columns].fillna(0)

        self.dependent_variable_TNE_BO_180 = self.merged_df[[self.CLASS_TARGET]]
        self.independent_variable = self.merged_df[list(self.FEATURE_COLUMNS)]

        return self.merged_df, self.dependent_variable_TNE_BO_180, self.independent_variable

    def train_classifier_without_medication_diagnosis(self, random_state=None):
        """Fit the weighted logistic regression and print its evaluation.

        Requires ``load_data()`` to have been called first.

        Args:
            random_state: Optional seed forwarded to ``train_test_split``.
                ``None`` (the default) keeps the split unseeded.
        """
        variable_name = self.CLASS_TARGET
        # Use a 1-D target so scikit-learn does not have to reshape a column vector.
        target = self.dependent_variable_TNE_BO_180[variable_name]

        logistic_prediction_model = LogisticRegression(class_weight=self.CLASS_WEIGHTS)
        X_train, X_test, y_train, y_test = train_test_split(
            self.independent_variable, target, train_size=0.7, random_state=random_state)
        logistic_prediction_model.fit(X_train, y_train)
        y_pred = logistic_prediction_model.predict(X_test)

        # Binary targets use 'binary' averaging, anything else 'weighted'.
        is_binary = y_train.nunique() == 2
        averaging = 'binary' if is_binary else 'weighted'
        print(f'\n****** Evaluation result {variable_name} as dependent variable ******')

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=averaging)
        recall = recall_score(y_test, y_pred, average=averaging)
        f1 = f1_score(y_test, y_pred, average=averaging)

        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1score: {f1}")
        print("Confused matrix:")
        if is_binary:
            # Pin the labels so the matrix is always 2x2 and can be unpacked,
            # even when the test split or the predictions contain one class.
            conf_matrix = confusion_matrix(
                y_test, y_pred, labels=logistic_prediction_model.classes_)
            tn, fp, fn, tp = conf_matrix.ravel()
            print(conf_matrix)
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        else:
            conf_matrix = confusion_matrix(y_test, y_pred)
            print(conf_matrix)
        print(classification_report(y_test, y_pred))

    def prediction_without_medication_diagnosis(self, random_state=None):
        """Fit a linear regression on days-to-readmission and print R2 / MSE.

        Only rows with a value in ``TNE_NO_180`` are used. The filtered data
        is kept local, so ``self.merged_df`` and ``self.independent_variable``
        stay aligned with the classification target and the classifier can
        still be trained afterwards.

        Args:
            random_state: Optional seed forwarded to ``train_test_split``.
                ``None`` (the default) keeps the split unseeded.
        """
        variable_name = self.DAYS_TARGET
        rows_with_target = self.merged_df.dropna(subset=[variable_name])

        self.dependent_variable_TNE_NO_180 = rows_with_target[[variable_name]]
        features = rows_with_target[list(self.FEATURE_COLUMNS)]

        linear_prediction_model = LinearRegression()
        X_train, X_test, y_train, y_test = train_test_split(
            features, rows_with_target[variable_name], train_size=0.7, random_state=random_state)
        linear_prediction_model.fit(X_train, y_train)

        print(f'\n****** Prediction Model Evaluation result  {variable_name} as dependent variable ******')
        y_pred = linear_prediction_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        print('R-squared:', r2)
        print('Mean squared error:', mse)

# Class object and accessing dataframes
# The CSV path is read from the FILE_Full_Phecode_ATC_PATH environment variable
# (loaded from .env by load_dotenv above). Fail early with a clear message when
# it is not set, instead of passing None on to pandas.read_csv.
data_file_path = os.getenv("FILE_Full_Phecode_ATC_PATH")
if not data_file_path:
    raise ValueError(
        "FILE_Full_Phecode_ATC_PATH is not set; define it in your environment/.env "
        "or replace the os.getenv(...) call with the path of your CSV file."
    )
ClassifyReadmissionWithoutMedicationDiagnosis_Obj = ClassifyReadmissionWithoutMedicationDiagnosis(data_file_path)
merged_df, dependent_variable_TNE_BO_180, independent_variable  = ClassifyReadmissionWithoutMedicationDiagnosis_Obj.load_data()

# Function call: classification first, then the days-to-readmission regression
ClassifyReadmissionWithoutMedicationDiagnosis_Obj.train_classifier_without_medication_diagnosis()
ClassifyReadmissionWithoutMedicationDiagnosis_Obj.prediction_without_medication_diagnosis()

##### 2. Classify and Predict for 1 year

**`TNE_BO_365` values (0 and 1) are used to classify readmission, where 1 means readmission. For prediction, only the data of readmitted patients is used, and a linear regression model is trained on it to predict the number of days until readmission.**
- Independent variable: `'age', 'remaining_time_countdown','var_no_dates_permonth', 'gender','closingcode', 'aftercode', 'Length_of_Episode', 
                                                   'Count_visit','Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
                                                   'TreatmentPlanning_ratio', 'Outpatient_ratio','Inpatient_daynight_ratio', 'Care_intensity', 'num_diagnoses', 'num_medications'`
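- Dependent Variable: For classification it is `TNE_BO_365` and for prediction it is `TNE_NO_365`
- Other Important Info: The dataset has NaN values. NaN in the ratio and count feature columns (`Therapy_ratio`, `Examination_ratio`, `Advisory_ratio`, `TreatmentPlanning_ratio`, `Outpatient_ratio`, `Inpatient_daynight_ratio`, `num_diagnoses`, `num_medications`) has been replaced with 0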

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import random
from IPython.display import display
from IPython.display import Image

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt

import os
from dotenv import load_dotenv
load_dotenv()

################## Without Medication and Diagnosis ##################
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################

class ClassifyReadmissionWithoutMedicationDiagnosis:
    """Readmission classification and days-to-readmission regression (365 days).

    Workflow: ``load_data()`` reads the CSV and prepares the feature/target
    frames, ``train_classifier_without_medication_diagnosis()`` fits a
    class-weighted logistic regression on ``TNE_BO_365``, and
    ``prediction_without_medication_diagnosis()`` fits a linear regression on
    ``TNE_NO_365`` using only the rows where that column is present.
    """

    # Columns kept from the raw CSV.
    SELECTED_COLUMNS = (
        'age', 'remaining_time_countdown',
        'var_no_dates_permonth', 'gender', 'episode_order',
        'closingcode', 'aftercode', 'tillnextepisode', 'Length_of_Episode', 'Cat_LOE', 'TNE_BO_180',
        'TNE_NO_180', 'TNE_BO_365', 'TNE_NO_365', 'TNE_BO_730', 'TNE_NO_730',
        'TNE_BO_1095', 'TNE_NO_1095', 'Cat_LOE_desc', 'Count_visit', 'Cat_CV',
        'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio', 'Inpatient_ratio',
        'Inpatient_day_ratio', 'Inpatient_daynight_ratio', 'Care_intensity',
        'num_diagnoses', 'num_medications',
    )

    # Independent variables used by both models.
    FEATURE_COLUMNS = (
        'age', 'remaining_time_countdown', 'var_no_dates_permonth', 'gender', 'closingcode', 'aftercode', 'Length_of_Episode',
        'Count_visit', 'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio', 'Inpatient_daynight_ratio', 'Care_intensity', 'num_diagnoses', 'num_medications',
    )

    # Feature columns whose missing values are replaced with 0.
    ZERO_FILLED_COLUMNS = (
        'num_diagnoses', 'num_medications', 'Outpatient_ratio', 'Inpatient_daynight_ratio',
        'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
    )

    CLASS_TARGET = 'TNE_BO_365'   # binary readmission flag (classification target)
    DAYS_TARGET = 'TNE_NO_365'    # days until readmission (regression target)

    # Per-class weights passed to LogisticRegression to counter class imbalance.
    CLASS_WEIGHTS = {0: 10, 1: 270}

    def __init__(self, file1):
        """Store the CSV location; nothing is read until ``load_data()``.

        Args:
            file1: Path (or file-like object) accepted by ``pandas.read_csv``.
        """
        self.file1 = file1
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None

    def load_data(self):
        """Read the CSV, keep the selected columns and split features/target.

        Returns:
            Tuple ``(merged_df, dependent_variable_TNE_BO_365,
            independent_variable)``: the selected frame, the single-column
            classification target frame and the feature frame.
        """
        raw_frame = pd.read_csv(self.file1)

        # Take an explicit copy so the fill below writes to a frame we own.
        self.merged_df = raw_frame[list(self.SELECTED_COLUMNS)].copy()

        # Assign the filled columns back instead of the chained
        # ``df[col].fillna(0, inplace=True)``, which does not update the frame
        # when pandas Copy-on-Write is active.
        fill_columns = list(self.ZERO_FILLED_COLUMNS)
        self.merged_df[fill_columns] = self.merged_df[fill_columns].fillna(0)

        self.dependent_variable_TNE_BO_365 = self.merged_df[[self.CLASS_TARGET]]
        self.independent_variable = self.merged_df[list(self.FEATURE_COLUMNS)]

        return self.merged_df, self.dependent_variable_TNE_BO_365, self.independent_variable

    def train_classifier_without_medication_diagnosis(self, random_state=None):
        """Fit the weighted logistic regression and print its evaluation.

        Requires ``load_data()`` to have been called first.

        Args:
            random_state: Optional seed forwarded to ``train_test_split``.
                ``None`` (the default) keeps the split unseeded.
        """
        variable_name = self.CLASS_TARGET
        # Use a 1-D target so scikit-learn does not have to reshape a column vector.
        target = self.dependent_variable_TNE_BO_365[variable_name]

        logistic_prediction_model = LogisticRegression(class_weight=self.CLASS_WEIGHTS)
        X_train, X_test, y_train, y_test = train_test_split(
            self.independent_variable, target, train_size=0.7, random_state=random_state)
        logistic_prediction_model.fit(X_train, y_train)
        y_pred = logistic_prediction_model.predict(X_test)

        # Binary targets use 'binary' averaging, anything else 'weighted'.
        is_binary = y_train.nunique() == 2
        averaging = 'binary' if is_binary else 'weighted'
        print(f'\n****** Evaluation result {variable_name} as dependent variable ******')

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=averaging)
        recall = recall_score(y_test, y_pred, average=averaging)
        f1 = f1_score(y_test, y_pred, average=averaging)

        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1score: {f1}")
        print("Confused matrix:")
        if is_binary:
            # Pin the labels so the matrix is always 2x2 and can be unpacked,
            # even when the test split or the predictions contain one class.
            conf_matrix = confusion_matrix(
                y_test, y_pred, labels=logistic_prediction_model.classes_)
            tn, fp, fn, tp = conf_matrix.ravel()
            print(conf_matrix)
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        else:
            conf_matrix = confusion_matrix(y_test, y_pred)
            print(conf_matrix)
        print(classification_report(y_test, y_pred))

    def prediction_without_medication_diagnosis(self, random_state=None):
        """Fit a linear regression on days-to-readmission and print R2 / MSE.

        Only rows with a value in ``TNE_NO_365`` are used. The filtered data
        is kept local, so ``self.merged_df`` and ``self.independent_variable``
        stay aligned with the classification target and the classifier can
        still be trained afterwards.

        Args:
            random_state: Optional seed forwarded to ``train_test_split``.
                ``None`` (the default) keeps the split unseeded.
        """
        variable_name = self.DAYS_TARGET
        rows_with_target = self.merged_df.dropna(subset=[variable_name])

        self.dependent_variable_TNE_NO_365 = rows_with_target[[variable_name]]
        features = rows_with_target[list(self.FEATURE_COLUMNS)]

        linear_prediction_model = LinearRegression()
        X_train, X_test, y_train, y_test = train_test_split(
            features, rows_with_target[variable_name], train_size=0.7, random_state=random_state)
        linear_prediction_model.fit(X_train, y_train)

        print(f'\n****** Prediction Model Evaluation result  {variable_name} as dependent variable ******')
        y_pred = linear_prediction_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        print('R-squared:', r2)
        print('Mean squared error:', mse)

# Class object and accessing dataframes
# The CSV path is read from the FILE_Full_Phecode_ATC_PATH environment variable
# (loaded from .env by load_dotenv above). Fail early with a clear message when
# it is not set, instead of passing None on to pandas.read_csv.
data_file_path = os.getenv("FILE_Full_Phecode_ATC_PATH")
if not data_file_path:
    raise ValueError(
        "FILE_Full_Phecode_ATC_PATH is not set; define it in your environment/.env "
        "or replace the os.getenv(...) call with the path of your CSV file."
    )
ClassifyReadmissionWithoutMedicationDiagnosis_Obj = ClassifyReadmissionWithoutMedicationDiagnosis(data_file_path)
merged_df, dependent_variable_TNE_BO_365, independent_variable  = ClassifyReadmissionWithoutMedicationDiagnosis_Obj.load_data()

# Function call: classification first, then the days-to-readmission regression
ClassifyReadmissionWithoutMedicationDiagnosis_Obj.train_classifier_without_medication_diagnosis()
ClassifyReadmissionWithoutMedicationDiagnosis_Obj.prediction_without_medication_diagnosis()

##### 3. Classify and Predict for 2 years

**`TNE_BO_730` values (0 and 1) are used to classify readmission, where 1 means readmission. For prediction, only the data of readmitted patients is used, and a linear regression model is trained on it to predict the number of days until readmission.**
- Independent variable: `'age', 'remaining_time_countdown','var_no_dates_permonth', 'gender','closingcode', 'aftercode', 'Length_of_Episode', 
                                                   'Count_visit','Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
                                                   'TreatmentPlanning_ratio', 'Outpatient_ratio','Inpatient_daynight_ratio', 'Care_intensity', 'num_diagnoses', 'num_medications'`
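- Dependent Variable: For classification it is `TNE_BO_730` and for prediction it is `TNE_NO_730`
- Other Important Info: The dataset has NaN values. NaN in the ratio and count feature columns (`Therapy_ratio`, `Examination_ratio`, `Advisory_ratio`, `TreatmentPlanning_ratio`, `Outpatient_ratio`, `Inpatient_daynight_ratio`, `num_diagnoses`, `num_medications`) has been replaced with 0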

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import random
from IPython.display import display
from IPython.display import Image

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt

import os
from dotenv import load_dotenv
load_dotenv()

################## Without Medication and Diagnosis ##################
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################

class ClassifyReadmissionWithoutMedicationDiagnosis:
    """Readmission classification and days-to-readmission regression (730 days).

    Workflow: ``load_data()`` reads the CSV and prepares the feature/target
    frames, ``train_classifier_without_medication_diagnosis()`` fits a
    class-weighted logistic regression on ``TNE_BO_730``, and
    ``prediction_without_medication_diagnosis()`` fits a linear regression on
    ``TNE_NO_730`` using only the rows where that column is present.
    """

    # Columns kept from the raw CSV.
    SELECTED_COLUMNS = (
        'age', 'remaining_time_countdown',
        'var_no_dates_permonth', 'gender', 'episode_order',
        'closingcode', 'aftercode', 'tillnextepisode', 'Length_of_Episode', 'Cat_LOE', 'TNE_BO_180',
        'TNE_NO_180', 'TNE_BO_365', 'TNE_NO_365', 'TNE_BO_730', 'TNE_NO_730',
        'TNE_BO_1095', 'TNE_NO_1095', 'Cat_LOE_desc', 'Count_visit', 'Cat_CV',
        'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio', 'Inpatient_ratio',
        'Inpatient_day_ratio', 'Inpatient_daynight_ratio', 'Care_intensity',
        'num_diagnoses', 'num_medications',
    )

    # Independent variables used by both models.
    FEATURE_COLUMNS = (
        'age', 'remaining_time_countdown', 'var_no_dates_permonth', 'gender', 'closingcode', 'aftercode', 'Length_of_Episode',
        'Count_visit', 'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio', 'Inpatient_daynight_ratio', 'Care_intensity', 'num_diagnoses', 'num_medications',
    )

    # Feature columns whose missing values are replaced with 0.
    ZERO_FILLED_COLUMNS = (
        'num_diagnoses', 'num_medications', 'Outpatient_ratio', 'Inpatient_daynight_ratio',
        'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
    )

    CLASS_TARGET = 'TNE_BO_730'   # binary readmission flag (classification target)
    DAYS_TARGET = 'TNE_NO_730'    # days until readmission (regression target)

    # Per-class weights passed to LogisticRegression to counter class imbalance.
    CLASS_WEIGHTS = {0: 10, 1: 270}

    def __init__(self, file1):
        """Store the CSV location; nothing is read until ``load_data()``.

        Args:
            file1: Path (or file-like object) accepted by ``pandas.read_csv``.
        """
        self.file1 = file1
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None

    def load_data(self):
        """Read the CSV, keep the selected columns and split features/target.

        Returns:
            Tuple ``(merged_df, dependent_variable_TNE_BO_730,
            independent_variable)``: the selected frame, the single-column
            classification target frame and the feature frame.
        """
        raw_frame = pd.read_csv(self.file1)

        # Take an explicit copy so the fill below writes to a frame we own.
        self.merged_df = raw_frame[list(self.SELECTED_COLUMNS)].copy()

        # Assign the filled columns back instead of the chained
        # ``df[col].fillna(0, inplace=True)``, which does not update the frame
        # when pandas Copy-on-Write is active.
        fill_columns = list(self.ZERO_FILLED_COLUMNS)
        self.merged_df[fill_columns] = self.merged_df[fill_columns].fillna(0)

        self.dependent_variable_TNE_BO_730 = self.merged_df[[self.CLASS_TARGET]]
        self.independent_variable = self.merged_df[list(self.FEATURE_COLUMNS)]

        return self.merged_df, self.dependent_variable_TNE_BO_730, self.independent_variable

    def train_classifier_without_medication_diagnosis(self, random_state=None):
        """Fit the weighted logistic regression and print its evaluation.

        Requires ``load_data()`` to have been called first.

        Args:
            random_state: Optional seed forwarded to ``train_test_split``.
                ``None`` (the default) keeps the split unseeded.
        """
        variable_name = self.CLASS_TARGET
        # Use a 1-D target so scikit-learn does not have to reshape a column vector.
        target = self.dependent_variable_TNE_BO_730[variable_name]

        logistic_prediction_model = LogisticRegression(class_weight=self.CLASS_WEIGHTS)
        X_train, X_test, y_train, y_test = train_test_split(
            self.independent_variable, target, train_size=0.7, random_state=random_state)
        logistic_prediction_model.fit(X_train, y_train)
        y_pred = logistic_prediction_model.predict(X_test)

        # Binary targets use 'binary' averaging, anything else 'weighted'.
        is_binary = y_train.nunique() == 2
        averaging = 'binary' if is_binary else 'weighted'
        print(f'\n****** Evaluation result {variable_name} as dependent variable ******')

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=averaging)
        recall = recall_score(y_test, y_pred, average=averaging)
        f1 = f1_score(y_test, y_pred, average=averaging)

        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1score: {f1}")
        print("Confused matrix:")
        if is_binary:
            # Pin the labels so the matrix is always 2x2 and can be unpacked,
            # even when the test split or the predictions contain one class.
            conf_matrix = confusion_matrix(
                y_test, y_pred, labels=logistic_prediction_model.classes_)
            tn, fp, fn, tp = conf_matrix.ravel()
            print(conf_matrix)
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        else:
            conf_matrix = confusion_matrix(y_test, y_pred)
            print(conf_matrix)
        print(classification_report(y_test, y_pred))

    def prediction_without_medication_diagnosis(self, random_state=None):
        """Fit a linear regression on days-to-readmission and print R2 / MSE.

        Only rows with a value in ``TNE_NO_730`` are used. The filtered data
        is kept local, so ``self.merged_df`` and ``self.independent_variable``
        stay aligned with the classification target and the classifier can
        still be trained afterwards.

        Args:
            random_state: Optional seed forwarded to ``train_test_split``.
                ``None`` (the default) keeps the split unseeded.
        """
        variable_name = self.DAYS_TARGET
        rows_with_target = self.merged_df.dropna(subset=[variable_name])

        self.dependent_variable_TNE_NO_730 = rows_with_target[[variable_name]]
        features = rows_with_target[list(self.FEATURE_COLUMNS)]

        linear_prediction_model = LinearRegression()
        X_train, X_test, y_train, y_test = train_test_split(
            features, rows_with_target[variable_name], train_size=0.7, random_state=random_state)
        linear_prediction_model.fit(X_train, y_train)

        print(f'\n****** Prediction Model Evaluation result  {variable_name} as dependent variable ******')
        y_pred = linear_prediction_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        print('R-squared:', r2)
        print('Mean squared error:', mse)

# Class object and accessing dataframes
# The CSV path is read from the FILE_Full_Phecode_ATC_PATH environment variable
# (loaded from .env by load_dotenv above). Fail early with a clear message when
# it is not set, instead of passing None on to pandas.read_csv.
data_file_path = os.getenv("FILE_Full_Phecode_ATC_PATH")
if not data_file_path:
    raise ValueError(
        "FILE_Full_Phecode_ATC_PATH is not set; define it in your environment/.env "
        "or replace the os.getenv(...) call with the path of your CSV file."
    )
ClassifyReadmissionWithoutMedicationDiagnosis_Obj = ClassifyReadmissionWithoutMedicationDiagnosis(data_file_path)
merged_df, dependent_variable_TNE_BO_730, independent_variable  = ClassifyReadmissionWithoutMedicationDiagnosis_Obj.load_data()

# Function call: classification first, then the days-to-readmission regression
ClassifyReadmissionWithoutMedicationDiagnosis_Obj.train_classifier_without_medication_diagnosis()
ClassifyReadmissionWithoutMedicationDiagnosis_Obj.prediction_without_medication_diagnosis()

##### 4. Classify and Predict for 3 years

**`TNE_BO_1095` values (0 and 1) are used to classify readmission, where 1 means readmission. For prediction, only the data of readmitted patients is used, and a linear regression model is trained on it to predict the number of days until readmission.**
- Independent variable: `'age', 'remaining_time_countdown','var_no_dates_permonth', 'gender','closingcode', 'aftercode', 'Length_of_Episode', 
                                                   'Count_visit','Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
                                                   'TreatmentPlanning_ratio', 'Outpatient_ratio','Inpatient_daynight_ratio', 'Care_intensity', 'num_diagnoses', 'num_medications'`
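- Dependent Variable: For classification it is `TNE_BO_1095` and for prediction it is `TNE_NO_1095`
- Other Important Info: The dataset has NaN values. NaN in the ratio and count feature columns (`Therapy_ratio`, `Examination_ratio`, `Advisory_ratio`, `TreatmentPlanning_ratio`, `Outpatient_ratio`, `Inpatient_daynight_ratio`, `num_diagnoses`, `num_medications`) has been replaced with 0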

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import random
from IPython.display import display
from IPython.display import Image

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt

import os
from dotenv import load_dotenv
load_dotenv()

################## Without Medication and Diagnosis ##################
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################

class ClassifyReadmissionWithoutMedicationDiagnosis:
    """Classify 3-year readmission and predict the days until readmission.

    Workflow:
        1. ``load_data()`` reads the CSV, keeps a fixed set of columns and
           zero-fills NaN in selected feature columns.
        2. ``train_classifier_without_medication_diagnosis()`` fits a
           class-weighted logistic regression on ``TNE_BO_1095``.
        3. ``prediction_without_medication_diagnosis()`` fits a linear
           regression on ``TNE_NO_1095`` using only rows where it is present.

    Both training methods print their evaluation metrics and return ``None``.
    """

    # Columns kept from the raw file.
    _SELECTED_COLUMNS = [
        'age', 'remaining_time_countdown',
        'var_no_dates_permonth', 'gender', 'episode_order',
        'closingcode', 'aftercode', 'tillnextepisode', 'Length_of_Episode', 'Cat_LOE', 'TNE_BO_180',
        'TNE_NO_180', 'TNE_BO_365', 'TNE_NO_365', 'TNE_BO_730', 'TNE_NO_730',
        'TNE_BO_1095', 'TNE_NO_1095', 'Cat_LOE_desc', 'Count_visit', 'Cat_CV',
        'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio', 'Inpatient_ratio',
        'Inpatient_day_ratio', 'Inpatient_daynight_ratio', 'Care_intensity',
        'num_diagnoses', 'num_medications',
    ]

    # Independent variables shared by the classifier and the regressor.
    _FEATURE_COLUMNS = [
        'age', 'remaining_time_countdown', 'var_no_dates_permonth', 'gender',
        'closingcode', 'aftercode', 'Length_of_Episode',
        'Count_visit', 'Therapy_ratio', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio', 'Inpatient_daynight_ratio',
        'Care_intensity', 'num_diagnoses', 'num_medications',
    ]

    # Only these columns get NaN replaced by 0; NaN elsewhere is left as-is.
    # NOTE(review): the remaining feature columns are presumably complete in
    # the source data - confirm, since they are passed to the models unfilled.
    _ZERO_FILLED_COLUMNS = [
        'num_diagnoses', 'num_medications', 'Outpatient_ratio',
        'Inpatient_daynight_ratio', 'Therapy_ratio', 'Examination_ratio',
        'Advisory_ratio', 'TreatmentPlanning_ratio',
    ]

    _CLASSIFICATION_TARGET = 'TNE_BO_1095'
    _REGRESSION_TARGET = 'TNE_NO_1095'

    # Weights of the output classes, to compensate for the imbalanced dataset.
    _CLASS_WEIGHTS = {0: 10, 1: 270}

    def __init__(self, file1):
        """Store the data source; nothing is read until ``load_data()`` runs.

        Args:
            file1: Path (or file-like object) handed to ``pd.read_csv``.
        """
        self.file1 = file1
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None
        # Populated by load_data() / prediction_without_medication_diagnosis().
        self.dependent_variable_TNE_BO_1095 = None
        self.dependent_variable_TNE_NO_1095 = None
        self.independent_variable = None

    def _ensure_loaded(self):
        """Load the data on first use so the training methods work standalone."""
        if self.merged_df is None:
            self.load_data()

    # to read dataframe, select useful columns and separate dependent and independent columns
    def load_data(self):
        """Read the CSV and prepare the classification inputs.

        Returns:
            tuple: ``(merged_df, dependent_variable_TNE_BO_1095,
            independent_variable)`` - the selected-column frame, the
            single-column ``TNE_BO_1095`` frame, and the feature frame.

        Raises:
            KeyError: If an expected column is missing from the file.
        """
        raw_df = pd.read_csv(self.file1)

        # Explicit copy: the frame is modified below, and it must be our own
        # object rather than a selection of raw_df.
        self.merged_df = raw_df[self._SELECTED_COLUMNS].copy()

        # Assign the filled columns back instead of calling
        # fillna(inplace=True) on a column selection: that chained form does
        # not update the frame when pandas Copy-on-Write is active.
        fill_cols = self._ZERO_FILLED_COLUMNS
        self.merged_df[fill_cols] = self.merged_df[fill_cols].fillna(0)

        # define dependent variables
        self.dependent_variable_TNE_BO_1095 = self.merged_df[[self._CLASSIFICATION_TARGET]]

        # define independent variables
        self.independent_variable = self.merged_df[self._FEATURE_COLUMNS]

        # return selected dataframe, independent and dependent variables
        return self.merged_df, self.dependent_variable_TNE_BO_1095, self.independent_variable

    # To train and view the result of the binary or multiclass classifier
    def train_classifier_without_medication_diagnosis(self, random_state=None):
        """Fit a class-weighted logistic regression and print its metrics.

        Args:
            random_state: Optional seed forwarded to ``train_test_split`` for
                a reproducible 70/30 split. ``None`` (the default) keeps the
                split random on every call.
        """
        self._ensure_loaded()
        variable_name = self._CLASSIFICATION_TARGET

        # Use the 1-D column: scikit-learn expects y of shape (n_samples,),
        # and Series.nunique() returns a plain integer.
        target = self.dependent_variable_TNE_BO_1095[variable_name]

        logistic_prediction_model = LogisticRegression(class_weight=self._CLASS_WEIGHTS)
        X_train, X_test, y_train, y_test = train_test_split(
            self.independent_variable, target, train_size=0.7, random_state=random_state
        )
        logistic_prediction_model.fit(X_train, y_train)
        y_pred = logistic_prediction_model.predict(X_test)

        # Checks whether the output is binary or multiclass
        is_binary = y_train.nunique() == 2
        averaging = 'binary' if is_binary else 'weighted'
        print(f'\n****** Evaluation result {variable_name} as dependent variable ******')

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=averaging)
        recall = recall_score(y_test, y_pred, average=averaging)
        f1 = f1_score(y_test, y_pred, average=averaging)

        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1score: {f1}")
        print("Confused matrix:")
        # Pin the labels to the classes seen in training so the matrix keeps
        # its full shape even if a class is absent from the test split and
        # the predictions (otherwise the 4-way unpack below would fail).
        conf_matrix = confusion_matrix(
            y_test, y_pred, labels=logistic_prediction_model.classes_
        )
        print(conf_matrix)
        if is_binary:
            tn, fp, fn, tp = conf_matrix.ravel()
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        print(classification_report(y_test, y_pred))

    # To predict the number of days for readmission which are numerical values
    def prediction_without_medication_diagnosis(self, random_state=None):
        """Fit a linear regression on ``TNE_NO_1095`` and print R^2 and MSE.

        Only rows with a non-NaN ``TNE_NO_1095`` are used. The filtered rows
        are kept in local variables: ``self.merged_df`` and
        ``self.independent_variable`` stay aligned with the classification
        target, so the classifier can still be trained after this call.

        Args:
            random_state: Optional seed forwarded to ``train_test_split`` for
                a reproducible 70/30 split. ``None`` (the default) keeps the
                split random on every call.
        """
        self._ensure_loaded()
        variable_name = self._REGRESSION_TARGET

        # Select all rows that have a value in TNE_NO_1095
        readmitted_df = self.merged_df.dropna(subset=[variable_name])

        # define dependent variables
        self.dependent_variable_TNE_NO_1095 = readmitted_df[[variable_name]]
        # define independent variables (local, see docstring)
        features = readmitted_df[self._FEATURE_COLUMNS]
        target = readmitted_df[variable_name]

        linear_prediction_model = LinearRegression()
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, train_size=0.7, random_state=random_state
        )
        linear_prediction_model.fit(X_train, y_train)
        print(f'\n****** Prediction Model Evaluation result  {variable_name} as dependent variable ******')
        y_pred = linear_prediction_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        print('R-squared:', r2)
        print('Mean squared error:', mse)

# Class object and accessing dataframes
# Resolve the dataset path first: os.getenv returns None when the variable is
# unset, which would otherwise only fail later, inside load_data(), with a
# less helpful error from the CSV reader.
_episodes_csv_path = os.getenv("FILE_Full_Phecode_ATC_PATH")
if _episodes_csv_path is None:
    raise ValueError(
        'Environment variable "FILE_Full_Phecode_ATC_PATH" is not set; '
        "define it (e.g. in a .env file) or pass the CSV path directly."
    )
ClassifyReadmissionWithoutMedicationDiagnosis_Obj = ClassifyReadmissionWithoutMedicationDiagnosis(_episodes_csv_path)
merged_df, dependent_variable_TNE_BO_1095, independent_variable  = ClassifyReadmissionWithoutMedicationDiagnosis_Obj.load_data()

# Function call
# Run the classification first, then the regression on days until readmission.
ClassifyReadmissionWithoutMedicationDiagnosis_Obj.train_classifier_without_medication_diagnosis()
ClassifyReadmissionWithoutMedicationDiagnosis_Obj.prediction_without_medication_diagnosis()