### Logistic Regression

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import random
from IPython.display import display
from IPython.display import Image
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

import os
from dotenv import load_dotenv
load_dotenv()


################## Without Medication and Diagnosis ##################
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################
'''
This script trains classifiers that predict patient readmission without the diagnosis and medication text columns.
    *******************************************************************************************
    # First iteration: every target listed below except tillnextepisode is modelled
dependent_variable = [tillnextepisode,Cat_TNE,No_more_episode,TNE_more180,TNE_less180]
independent_variable = ['MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
       'num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x',
       'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
       'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
       'Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
       'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5',
       'closingcode_6', 'closingcode_9', 'aftercode_1', 'aftercode_2',
       'aftercode_3', 'aftercode_4', 'aftercode_5', 'LOE_Norm', 'LOE_long',
       'LOE_short','length_of_episode', 'Count_visit']
    *******************************************************************************************
'''

class ClassifyReadmissionWithoutMedicationDiagnosis:
    """Logistic-regression readmission classifiers built on the numeric columns only.

    Two CSV files are inner-joined on ``episode_id``; each column named in
    ``TARGET_COLUMNS`` is then modelled separately from ``FEATURE_COLUMNS``.
    """

    # Targets fitted by the training method.  ``tillnextepisode`` is sliced out
    # by load_data() as well but is not modelled here.
    TARGET_COLUMNS = ('Cat_TNE', 'No_more_episode', 'TNE_more180', 'TNE_less180')

    # Predictor columns, in the order they are handed to the model.
    FEATURE_COLUMNS = (
        'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
        'num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x',
        'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
        'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
        'Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
        'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5',
        'closingcode_6', 'closingcode_9', 'aftercode_1', 'aftercode_2',
        'aftercode_3', 'aftercode_4', 'aftercode_5', 'LOE_Norm', 'LOE_long',
        'LOE_short', 'length_of_episode', 'Count_visit',
    )

    def __init__(self, file1, file2):
        """Store the two CSV paths; nothing is read until load_data() runs."""
        self.file1 = file1
        self.file2 = file2
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None
        # Filled in by load_data(); None lets the trainer detect "not loaded yet".
        self.independent_variable = None

    def load_data(self):
        """Read both CSV files, join them on ``episode_id`` and slice out the columns.

        Returns:
            tuple: ``(merged_df, tillnextepisode, Cat_TNE, No_more_episode,
            TNE_more180, TNE_less180, independent_variable)``.  Every element
            is a DataFrame; each target is a single-column frame.

        Raises:
            KeyError: if a required column is missing from the merged frame.
        """
        first_frame = pd.read_csv(self.file1)
        second_frame = pd.read_csv(self.file2)
        # pd.merge already returns a new frame, so no extra deep copy is needed.
        self.merged_df = pd.merge(first_frame, second_frame, on='episode_id', how='inner')

        self.dependent_variable_tillnextepisode = self.merged_df[['tillnextepisode']]
        self.dependent_variable_Cat_TNE = self.merged_df[['Cat_TNE']]
        self.dependent_variable_No_more_episode = self.merged_df[['No_more_episode']]
        self.dependent_variable_TNE_more180 = self.merged_df[['TNE_more180']]
        self.dependent_variable_TNE_less180 = self.merged_df[['TNE_less180']]
        self.independent_variable = self.merged_df[list(self.FEATURE_COLUMNS)]

        return (
            self.merged_df,
            self.dependent_variable_tillnextepisode,
            self.dependent_variable_Cat_TNE,
            self.dependent_variable_No_more_episode,
            self.dependent_variable_TNE_more180,
            self.dependent_variable_TNE_less180,
            self.independent_variable,
        )

    def train_classifier_without_medication_diagnosis(self, random_state=None):
        """Fit and evaluate one logistic-regression model per target column.

        Every target gets its own 70 % / 30 % split.  The 30 % part is split
        again and only its first 67 % serves as the test set; the remainder is
        held out and not used here.  Results are printed, nothing is returned.

        Args:
            random_state: optional seed forwarded to both ``train_test_split``
                calls so that a run can be reproduced.  ``None`` (the default)
                keeps the previous unseeded behaviour.
        """
        if self.independent_variable is None:
            # Allow calling the trainer directly without a prior load_data().
            self.load_data()

        for variable_name in self.TARGET_COLUMNS:
            target_frame = getattr(self, f'dependent_variable_{variable_name}')
            # A 1-D target avoids scikit-learn's column-vector conversion and
            # makes nunique() return a plain int instead of a Series.
            target = target_frame.iloc[:, 0]

            X_train, X_temp, y_train, y_temp = train_test_split(
                self.independent_variable, target,
                train_size=0.7, random_state=random_state)
            X_test, _, y_test, _ = train_test_split(
                X_temp, y_temp, test_size=0.33, random_state=random_state)

            model = LogisticRegression()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            print(f'\n****** Evaluation result Without Medication Diagnosis (only numeric column) as independent variable and  {variable_name} as dependent variable ******')
            # Binary vs. multiclass is decided from the classes seen in training.
            self._print_evaluation(y_test, y_pred, model.classes_,
                                   is_binary=y_train.nunique() == 2)

    @staticmethod
    def _print_evaluation(y_true, y_pred, classes, is_binary):
        """Print accuracy, precision, recall, F1 and the confusion matrix."""
        averaging = 'binary' if is_binary else 'weighted'
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average=averaging)
        recall = recall_score(y_true, y_pred, average=averaging)
        f1 = f1_score(y_true, y_pred, average=averaging)

        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1score: {f1}")
        print("Confusion matrix:")

        if is_binary:
            # Fixing the labels guarantees a 2x2 matrix even when the test
            # split happens to contain a single class, so the unpack is safe.
            conf_matrix = confusion_matrix(y_true, y_pred, labels=classes)
            tn, fp, fn, tp = conf_matrix.ravel()
            print(conf_matrix)
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        else:
            # Multiclass: report the full matrix only.
            print(confusion_matrix(y_true, y_pred))
                

# Build the classifier from the two CSV paths configured in the environment.
ClassifyReadmissionWithoutMedicationDiagnosis_Obj = ClassifyReadmissionWithoutMedicationDiagnosis(
    os.environ.get("FILE1_PATH"),
    os.environ.get("FILE2_PATH"),
)

# load_data() returns a 7-tuple; every element stays available at module level.
(
    merged_df,
    dependent_variable_tillnextepisode,
    dependent_variable_Cat_TNE,
    dependent_variable_No_more_episode,
    dependent_variable_TNE_more180,
    dependent_variable_TNE_less180,
    independent_variable,
) = ClassifyReadmissionWithoutMedicationDiagnosis_Obj.load_data()

# Banner describing the run; a plain string suffices as it has no placeholders.
print(''' 
************************************************************************************************************************************************
************************************************************************************************************************************************
Independent variable Used: 
    'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M',
    'num_diagnoses','num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x','Examination_ratio', 
    'Advisory_ratio', 'TreatmentPlanning_ratio','Outpatient_ratio', 'Inpatient_ratio', 
    'Inpatient_day_ratio','Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
    'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5','closingcode_6', 
    'closingcode_9', 'aftercode_1', 'aftercode_2','aftercode_3', 'aftercode_4', 'aftercode_5',
    'LOE_Norm', 'LOE_long','LOE_short','length_of_episode', 'Count_visit'
    
Algorithm Used: 
    Logistic Regression 
************************************************************************************************************************************************
************************************************************************************************************************************************
''')

# Fit and report one model per target column.
ClassifyReadmissionWithoutMedicationDiagnosis_Obj.train_classifier_without_medication_diagnosis()







#################### With Medication and Diagnosis ###################
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################
'''
This script trains classifiers that predict patient readmission with the diagnosis and medication columns, which are encoded with a count vectorizer.
    *******************************************************************************************
# Second iteration: every target listed below except tillnextepisode is modelled
dependent_variable = [tillnextepisode,Cat_TNE,No_more_episode,TNE_more180,TNE_less180]
independent_variable = ['MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
       'num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x',
       'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
       'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
       'Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
       'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5',
       'closingcode_6', 'closingcode_9', 'aftercode_1', 'aftercode_2',
       'aftercode_3', 'aftercode_4', 'aftercode_5', 'LOE_Norm', 'LOE_long',
       'LOE_short','length_of_episode', 'Count_visit', 'actual_diag', 'actual_med']
    *******************************************************************************************
'''


class ClassifyReadmissionWithMedicationDiagnosis:
    """Logistic-regression readmission classifiers using numeric and text columns.

    Two CSV files are inner-joined on ``episode_id``.  The diagnosis and
    medication text columns are turned into token counts and appended to the
    numeric columns before one model per ``TARGET_COLUMNS`` entry is fitted.
    """

    # Targets fitted by the training method.  ``tillnextepisode`` is sliced out
    # by load_data() as well but is not modelled here.
    TARGET_COLUMNS = ('Cat_TNE', 'No_more_episode', 'TNE_more180', 'TNE_less180')

    # Numeric predictor columns, in the order they are handed to the model.
    NUMERIC_COLUMNS = (
        'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
        'num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x',
        'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
        'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
        'Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
        'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5',
        'closingcode_6', 'closingcode_9', 'aftercode_1', 'aftercode_2',
        'aftercode_3', 'aftercode_4', 'aftercode_5', 'LOE_Norm', 'LOE_long',
        'LOE_short', 'length_of_episode', 'Count_visit',
    )

    # Free-text columns that are vectorised with CountVectorizer.
    TEXT_COLUMNS = ('actual_diag', 'actual_med')

    def __init__(self, file1, file2):
        """Store the two CSV paths; nothing is read until load_data() runs."""
        self.file1 = file1
        self.file2 = file2
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None
        # Filled in by load_data(); None lets the trainer detect "not loaded yet".
        self.independent_variable = None

    def load_data(self):
        """Read both CSV files, join them on ``episode_id`` and slice out the columns.

        Returns:
            tuple: ``(merged_df, tillnextepisode, Cat_TNE, No_more_episode,
            TNE_more180, TNE_less180, independent_variable)``.  Every element
            is a DataFrame; each target is a single-column frame.

        Raises:
            KeyError: if a required column is missing from the merged frame.
        """
        first_frame = pd.read_csv(self.file1)
        second_frame = pd.read_csv(self.file2)
        # pd.merge already returns a new frame, so no extra deep copy is needed.
        self.merged_df = pd.merge(first_frame, second_frame, on='episode_id', how='inner')

        self.dependent_variable_tillnextepisode = self.merged_df[['tillnextepisode']]
        self.dependent_variable_Cat_TNE = self.merged_df[['Cat_TNE']]
        self.dependent_variable_No_more_episode = self.merged_df[['No_more_episode']]
        self.dependent_variable_TNE_more180 = self.merged_df[['TNE_more180']]
        self.dependent_variable_TNE_less180 = self.merged_df[['TNE_less180']]
        self.independent_variable = self.merged_df[
            list(self.NUMERIC_COLUMNS + self.TEXT_COLUMNS)]

        return (
            self.merged_df,
            self.dependent_variable_tillnextepisode,
            self.dependent_variable_Cat_TNE,
            self.dependent_variable_No_more_episode,
            self.dependent_variable_TNE_more180,
            self.dependent_variable_TNE_less180,
            self.independent_variable,
        )

    def train_classifier_with_medication_diagnosis(self, random_state=None):
        """Fit and evaluate one logistic-regression model per target column.

        Every target gets its own 70 % / 30 % split.  The 30 % part is split
        again and only its first 67 % serves as the test set; the remainder is
        held out and not used here.  Results are printed, nothing is returned.

        Args:
            random_state: optional seed forwarded to both ``train_test_split``
                calls so that a run can be reproduced.  ``None`` (the default)
                keeps the previous unseeded behaviour.
        """
        if self.independent_variable is None:
            # Allow calling the trainer directly without a prior load_data().
            self.load_data()

        text_columns = list(self.TEXT_COLUMNS)
        # NaN cells would make ' '.join raise a TypeError, so blank them first.
        documents = (
            self.independent_variable[text_columns]
            .fillna('')
            .astype(str)
            .apply(' '.join, axis=1)
        )
        # The token-count matrix does not depend on the target, so it is built
        # once instead of once per target.
        # NOTE(review): the vocabulary is learned from every row, including
        # rows that later land in the test split.
        text_counts = CountVectorizer().fit_transform(documents).toarray()
        # Select the numeric part by name rather than by a hard-coded position.
        numeric_part = self.independent_variable.drop(columns=text_columns)
        X = np.concatenate([numeric_part, text_counts], axis=1)

        for variable_name in self.TARGET_COLUMNS:
            target_frame = getattr(self, f'dependent_variable_{variable_name}')
            # A 1-D target avoids scikit-learn's column-vector conversion and
            # makes nunique() return a plain int instead of a Series.
            target = target_frame.iloc[:, 0]

            X_train, X_temp, y_train, y_temp = train_test_split(
                X, target, train_size=0.7, random_state=random_state)
            X_test, _, y_test, _ = train_test_split(
                X_temp, y_temp, test_size=0.33, random_state=random_state)

            model_log_count = LogisticRegression()
            model_log_count.fit(X_train, y_train)
            y_pred = model_log_count.predict(X_test)

            print(f'\n****** Evaluation result With Medication Diagnosis as independent variable and  {variable_name} as dependent variable ******')
            # Binary vs. multiclass is decided from the classes seen in training.
            self._print_evaluation(y_test, y_pred, model_log_count.classes_,
                                   is_binary=y_train.nunique() == 2)

    @staticmethod
    def _print_evaluation(y_true, y_pred, classes, is_binary):
        """Print accuracy, precision, recall, F1 and the confusion matrix."""
        averaging = 'binary' if is_binary else 'weighted'
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average=averaging)
        recall = recall_score(y_true, y_pred, average=averaging)
        f1 = f1_score(y_true, y_pred, average=averaging)

        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        # The F1 label is spelled differently in the two report types; both
        # spellings are kept so existing output stays recognisable.
        print(f"F1score: {f1}" if is_binary else f"F1 Score: {f1}")
        print("Confusion matrix:")

        if is_binary:
            # Fixing the labels guarantees a 2x2 matrix even when the test
            # split happens to contain a single class, so the unpack is safe.
            conf_matrix = confusion_matrix(y_true, y_pred, labels=classes)
            tn, fp, fn, tp = conf_matrix.ravel()
            print(conf_matrix)
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        else:
            # Multiclass: report the full matrix only.
            print(confusion_matrix(y_true, y_pred))

# Build the classifier from the two CSV paths configured in the environment.
ClassifyReadmissionWithMedicationDiagnosis_Obj = ClassifyReadmissionWithMedicationDiagnosis(
    os.environ.get("FILE1_PATH"),
    os.environ.get("FILE2_PATH"),
)

# load_data() returns a 7-tuple; every element stays available at module level.
(
    merged_df,
    dependent_variable_tillnextepisode,
    dependent_variable_Cat_TNE,
    dependent_variable_No_more_episode,
    dependent_variable_TNE_more180,
    dependent_variable_TNE_less180,
    independent_variable,
) = ClassifyReadmissionWithMedicationDiagnosis_Obj.load_data()

# Banner describing the run; a plain string suffices as it has no placeholders.
print(''' 
************************************************************************************************************************************************
************************************************************************************************************************************************
Independent variable Used: 
    'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
    'num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x',
    'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
    'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
    'Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
    'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5',
    'closingcode_6', 'closingcode_9', 'aftercode_1', 'aftercode_2',
    'aftercode_3', 'aftercode_4', 'aftercode_5', 'LOE_Norm', 'LOE_long',
    'LOE_short','length_of_episode', 'Count_visit', 'actual_diag', 'actual_med'
    
Algorithm Used: 
    Logistic Regression 
************************************************************************************************************************************************
************************************************************************************************************************************************
''')

# Fit and report one model per target column.
ClassifyReadmissionWithMedicationDiagnosis_Obj.train_classifier_with_medication_diagnosis()





########## With Medication and Diagnosis And Less Features ###########
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################
'''This script trains classifiers that predict patient readmission from a reduced feature set, encoding the medication and diagnosis columns with a count vectorizer.
    *******************************************************************************************
# Third iteration: reduced feature set
dependent_variable = [tillnextepisode,Cat_TNE,No_more_episode,TNE_more180,TNE_less180]
independent_variable = ['MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
       'num_medications', 'Therapy_ratio_x','Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
       'Outpatient_ratio', 'Inpatient_day_ratio','Inpatient_daynight_ratio', 'actual_diag', 'actual_med']
    *******************************************************************************************
'''

class ClassifyReadmissionWithMedicationDiagnosis_WithLessFeatures:
    """Logistic-regression readmission classifiers on a reduced feature set.

    Two CSV files are inner-joined on ``episode_id``.  The diagnosis and
    medication text columns are turned into token counts and appended to the
    numeric columns before one model per ``TARGET_COLUMNS`` entry is fitted.
    """

    # Targets fitted by the training method.  ``tillnextepisode`` is sliced out
    # by load_data() as well but is not modelled here.
    TARGET_COLUMNS = ('Cat_TNE', 'No_more_episode', 'TNE_more180', 'TNE_less180')

    # Numeric predictor columns, in the order they are handed to the model.
    NUMERIC_COLUMNS = (
        'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
        'num_medications', 'Therapy_ratio_x', 'Examination_ratio',
        'Advisory_ratio', 'TreatmentPlanning_ratio', 'Outpatient_ratio',
        'Inpatient_day_ratio', 'Inpatient_daynight_ratio',
    )

    # Free-text columns that are vectorised with CountVectorizer.
    TEXT_COLUMNS = ('actual_diag', 'actual_med')

    def __init__(self, file1, file2):
        """Store the two CSV paths; nothing is read until load_data() runs."""
        self.file1 = file1
        self.file2 = file2
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None
        # Filled in by load_data(); None lets the trainer detect "not loaded yet".
        self.independent_variable = None

    def load_data(self):
        """Read both CSV files, join them on ``episode_id`` and slice out the columns.

        Returns:
            tuple: ``(merged_df, tillnextepisode, Cat_TNE, No_more_episode,
            TNE_more180, TNE_less180, independent_variable)``.  Every element
            is a DataFrame; each target is a single-column frame.

        Raises:
            KeyError: if a required column is missing from the merged frame.
        """
        first_frame = pd.read_csv(self.file1)
        second_frame = pd.read_csv(self.file2)
        # pd.merge already returns a new frame, so no extra deep copy is needed.
        self.merged_df = pd.merge(first_frame, second_frame, on='episode_id', how='inner')

        self.dependent_variable_tillnextepisode = self.merged_df[['tillnextepisode']]
        self.dependent_variable_Cat_TNE = self.merged_df[['Cat_TNE']]
        self.dependent_variable_No_more_episode = self.merged_df[['No_more_episode']]
        self.dependent_variable_TNE_more180 = self.merged_df[['TNE_more180']]
        self.dependent_variable_TNE_less180 = self.merged_df[['TNE_less180']]
        self.independent_variable = self.merged_df[
            list(self.NUMERIC_COLUMNS + self.TEXT_COLUMNS)]

        return (
            self.merged_df,
            self.dependent_variable_tillnextepisode,
            self.dependent_variable_Cat_TNE,
            self.dependent_variable_No_more_episode,
            self.dependent_variable_TNE_more180,
            self.dependent_variable_TNE_less180,
            self.independent_variable,
        )

    def train_classifier_with_medication_diagnosis(self, random_state=None):
        """Fit and evaluate one logistic-regression model per target column.

        Every target gets its own 70 % / 30 % split.  The 30 % part is split
        again and only its first 67 % serves as the test set; the remainder is
        held out and not used here.  Results are printed, nothing is returned.

        Args:
            random_state: optional seed forwarded to both ``train_test_split``
                calls so that a run can be reproduced.  ``None`` (the default)
                keeps the previous unseeded behaviour.
        """
        if self.independent_variable is None:
            # Allow calling the trainer directly without a prior load_data().
            self.load_data()

        text_columns = list(self.TEXT_COLUMNS)
        # NaN cells would make ' '.join raise a TypeError, so blank them first.
        documents = (
            self.independent_variable[text_columns]
            .fillna('')
            .astype(str)
            .apply(' '.join, axis=1)
        )
        # The token-count matrix does not depend on the target, so it is built
        # once instead of once per target.
        # NOTE(review): the vocabulary is learned from every row, including
        # rows that later land in the test split.
        text_counts = CountVectorizer().fit_transform(documents).toarray()
        # Select the numeric part by name rather than by a hard-coded position.
        numeric_part = self.independent_variable.drop(columns=text_columns)
        X = np.concatenate([numeric_part, text_counts], axis=1)

        for variable_name in self.TARGET_COLUMNS:
            target_frame = getattr(self, f'dependent_variable_{variable_name}')
            # A 1-D target avoids scikit-learn's column-vector conversion and
            # makes nunique() return a plain int instead of a Series.
            target = target_frame.iloc[:, 0]

            X_train, X_temp, y_train, y_temp = train_test_split(
                X, target, train_size=0.7, random_state=random_state)
            X_test, _, y_test, _ = train_test_split(
                X_temp, y_temp, test_size=0.33, random_state=random_state)

            model_log_count = LogisticRegression()
            model_log_count.fit(X_train, y_train)
            y_pred = model_log_count.predict(X_test)

            print(f'\n****** Evaluation result With Medication Diagnosis and Less Number of Features as independent variable and  {variable_name} as dependent variable ******')
            # Binary vs. multiclass is decided from the classes seen in training.
            self._print_evaluation(y_test, y_pred, model_log_count.classes_,
                                   is_binary=y_train.nunique() == 2)

    @staticmethod
    def _print_evaluation(y_true, y_pred, classes, is_binary):
        """Print accuracy, precision, recall, F1 and the confusion matrix."""
        averaging = 'binary' if is_binary else 'weighted'
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average=averaging)
        recall = recall_score(y_true, y_pred, average=averaging)
        f1 = f1_score(y_true, y_pred, average=averaging)

        # The multiclass report rounds accuracy to four decimals while the
        # binary report prints it in full; both formats are kept unchanged.
        print(f"Accuracy: {accuracy}" if is_binary else f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1score: {f1}")
        print("Confusion matrix:")

        if is_binary:
            # Fixing the labels guarantees a 2x2 matrix even when the test
            # split happens to contain a single class, so the unpack is safe.
            conf_matrix = confusion_matrix(y_true, y_pred, labels=classes)
            tn, fp, fn, tp = conf_matrix.ravel()
            print(conf_matrix)
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        else:
            # Multiclass: report the full matrix only.
            print(confusion_matrix(y_true, y_pred))

# Build the classifier from the two CSV paths configured in the environment.
ClassifyReadmissionWithMedicationDiagnosis_WithLessFeatures_Obj = ClassifyReadmissionWithMedicationDiagnosis_WithLessFeatures(
    os.environ.get("FILE1_PATH"),
    os.environ.get("FILE2_PATH"),
)

# load_data() returns a 7-tuple; every element stays available at module level.
(
    merged_df,
    dependent_variable_tillnextepisode,
    dependent_variable_Cat_TNE,
    dependent_variable_No_more_episode,
    dependent_variable_TNE_more180,
    dependent_variable_TNE_less180,
    independent_variable,
) = ClassifyReadmissionWithMedicationDiagnosis_WithLessFeatures_Obj.load_data()

# Banner describing the run; a plain string suffices as it has no placeholders.
print(''' 
************************************************************************************************************************************************
************************************************************************************************************************************************
Independent variable Used: 
    'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
    'num_medications', 'Therapy_ratio_x','Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
    'Outpatient_ratio', 'Inpatient_day_ratio','Inpatient_daynight_ratio', 'actual_diag', 'actual_med'
    
Algorithm Used: 
    Logistic Regression 
************************************************************************************************************************************************
************************************************************************************************************************************************
''')

# Fit and report one model per target column.
ClassifyReadmissionWithMedicationDiagnosis_WithLessFeatures_Obj.train_classifier_with_medication_diagnosis()

### Support Vector Machine

In [None]:
################## Without Medication and Diagnosis ##################
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################
'''
This script trains classifiers that predict patient readmission without the diagnosis and medication text columns.
    *******************************************************************************************
    # First iteration: every target listed below except tillnextepisode is modelled
dependent_variable = [tillnextepisode,Cat_TNE,No_more_episode,TNE_more180,TNE_less180]
independent_variable = ['MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
       'num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x',
       'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
       'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
       'Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
       'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5',
       'closingcode_6', 'closingcode_9', 'aftercode_1', 'aftercode_2',
       'aftercode_3', 'aftercode_4', 'aftercode_5', 'LOE_Norm', 'LOE_long',
       'LOE_short','length_of_episode', 'Count_visit']
    *******************************************************************************************
'''

class ClassifyReadmissionWithoutMedicationDiagnosis:
    """Support-vector readmission classifiers built on the numeric columns only.

    Two CSV files are inner-joined on ``episode_id``; each column named in
    ``TARGET_COLUMNS`` is then modelled separately from ``FEATURE_COLUMNS``.
    """

    # Targets fitted by the training method.  ``tillnextepisode`` is sliced out
    # by load_data() as well but is not modelled here.
    TARGET_COLUMNS = ('Cat_TNE', 'No_more_episode', 'TNE_more180', 'TNE_less180')

    # Predictor columns, in the order they are handed to the model.
    FEATURE_COLUMNS = (
        'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
        'num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x',
        'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
        'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
        'Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
        'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5',
        'closingcode_6', 'closingcode_9', 'aftercode_1', 'aftercode_2',
        'aftercode_3', 'aftercode_4', 'aftercode_5', 'LOE_Norm', 'LOE_long',
        'LOE_short', 'length_of_episode', 'Count_visit',
    )

    def __init__(self, file1, file2):
        """Store the two CSV paths; nothing is read until load_data() runs."""
        self.file1 = file1
        self.file2 = file2
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None
        # Filled in by load_data(); None lets the trainer detect "not loaded yet".
        self.independent_variable = None

    def load_data(self):
        """Read both CSV files, join them on ``episode_id`` and slice out the columns.

        Returns:
            tuple: ``(merged_df, tillnextepisode, Cat_TNE, No_more_episode,
            TNE_more180, TNE_less180, independent_variable)``.  Every element
            is a DataFrame; each target is a single-column frame.

        Raises:
            KeyError: if a required column is missing from the merged frame.
        """
        first_frame = pd.read_csv(self.file1)
        second_frame = pd.read_csv(self.file2)
        # pd.merge already returns a new frame, so no extra deep copy is needed.
        self.merged_df = pd.merge(first_frame, second_frame, on='episode_id', how='inner')

        self.dependent_variable_tillnextepisode = self.merged_df[['tillnextepisode']]
        self.dependent_variable_Cat_TNE = self.merged_df[['Cat_TNE']]
        self.dependent_variable_No_more_episode = self.merged_df[['No_more_episode']]
        self.dependent_variable_TNE_more180 = self.merged_df[['TNE_more180']]
        self.dependent_variable_TNE_less180 = self.merged_df[['TNE_less180']]
        self.independent_variable = self.merged_df[list(self.FEATURE_COLUMNS)]

        return (
            self.merged_df,
            self.dependent_variable_tillnextepisode,
            self.dependent_variable_Cat_TNE,
            self.dependent_variable_No_more_episode,
            self.dependent_variable_TNE_more180,
            self.dependent_variable_TNE_less180,
            self.independent_variable,
        )

    def train_classifier_without_medication_diagnosis(self, random_state=None):
        """Fit and evaluate one support-vector classifier per target column.

        Every target gets its own 70 % / 30 % split.  The 30 % part is split
        again and only its first 67 % serves as the test set; the remainder is
        held out and not used here.  Results are printed, nothing is returned.

        Args:
            random_state: optional seed forwarded to both ``train_test_split``
                calls so that a run can be reproduced.  ``None`` (the default)
                keeps the previous unseeded behaviour.
        """
        if self.independent_variable is None:
            # Allow calling the trainer directly without a prior load_data().
            self.load_data()

        for variable_name in self.TARGET_COLUMNS:
            target_frame = getattr(self, f'dependent_variable_{variable_name}')
            # A 1-D target avoids scikit-learn's column-vector conversion and
            # makes nunique() return a plain int instead of a Series.
            target = target_frame.iloc[:, 0]

            X_train, X_temp, y_train, y_temp = train_test_split(
                self.independent_variable, target,
                train_size=0.7, random_state=random_state)
            X_test, _, y_test, _ = train_test_split(
                X_temp, y_temp, test_size=0.33, random_state=random_state)

            svm_model = SVC()
            svm_model.fit(X_train, y_train)
            y_pred = svm_model.predict(X_test)

            print(f'\n****** Evaluation result Without Medication Diagnosis (only numeric column) as independent variable and  {variable_name} as dependent variable ******')
            # Binary vs. multiclass is decided from the classes seen in training.
            self._print_evaluation(y_test, y_pred, svm_model.classes_,
                                   is_binary=y_train.nunique() == 2)

    @staticmethod
    def _print_evaluation(y_true, y_pred, classes, is_binary):
        """Print accuracy, precision, recall, F1 and the confusion matrix."""
        averaging = 'binary' if is_binary else 'weighted'
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average=averaging)
        recall = recall_score(y_true, y_pred, average=averaging)
        f1 = f1_score(y_true, y_pred, average=averaging)

        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1score: {f1}")
        print("Confusion matrix:")

        if is_binary:
            # Fixing the labels guarantees a 2x2 matrix even when the test
            # split happens to contain a single class, so the unpack is safe.
            conf_matrix = confusion_matrix(y_true, y_pred, labels=classes)
            tn, fp, fn, tp = conf_matrix.ravel()
            print(conf_matrix)
            print(f"True Negative (TN): {tn}")
            print(f"False Positive (FP): {fp}")
            print(f"False Negative (FN): {fn}")
            print(f"True Positive (TP): {tp}")
        else:
            # Multiclass: report the full matrix only.
            print(confusion_matrix(y_true, y_pred))
                

# Build the classifier from the two CSV paths configured in the environment.
ClassifyReadmissionWithoutMedicationDiagnosis_Obj = ClassifyReadmissionWithoutMedicationDiagnosis(
    os.environ.get("FILE1_PATH"),
    os.environ.get("FILE2_PATH"),
)

# load_data() returns a 7-tuple; every element stays available at module level.
(
    merged_df,
    dependent_variable_tillnextepisode,
    dependent_variable_Cat_TNE,
    dependent_variable_No_more_episode,
    dependent_variable_TNE_more180,
    dependent_variable_TNE_less180,
    independent_variable,
) = ClassifyReadmissionWithoutMedicationDiagnosis_Obj.load_data()

# Banner describing the run; a plain string suffices as it has no placeholders.
print(''' 
************************************************************************************************************************************************
************************************************************************************************************************************************
Independent variable Used: 
    'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M',
    'num_diagnoses','num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x','Examination_ratio', 
    'Advisory_ratio', 'TreatmentPlanning_ratio','Outpatient_ratio', 'Inpatient_ratio', 
    'Inpatient_day_ratio','Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
    'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5','closingcode_6', 
    'closingcode_9', 'aftercode_1', 'aftercode_2','aftercode_3', 'aftercode_4', 'aftercode_5',
    'LOE_Norm', 'LOE_long','LOE_short','length_of_episode', 'Count_visit'
    
Algorithm Used: 
    Support Vector Machine
************************************************************************************************************************************************
************************************************************************************************************************************************
''')

# Fit and report one model per target column.
ClassifyReadmissionWithoutMedicationDiagnosis_Obj.train_classifier_without_medication_diagnosis()







#################### With Medication and Diagnosis ###################
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################
'''
This script trains classifiers that predict patient readmission with the diagnosis and medication columns, which are encoded with a count vectorizer.
    *******************************************************************************************
# Second iteration: every target listed below except tillnextepisode is modelled
dependent_variable = [tillnextepisode,Cat_TNE,No_more_episode,TNE_more180,TNE_less180]
independent_variable = ['MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
       'num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x',
       'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
       'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
       'Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
       'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5',
       'closingcode_6', 'closingcode_9', 'aftercode_1', 'aftercode_2',
       'aftercode_3', 'aftercode_4', 'aftercode_5', 'LOE_Norm', 'LOE_long',
       'LOE_short','length_of_episode', 'Count_visit', 'actual_diag', 'actual_med']
    *******************************************************************************************
'''


class ClassifyReadmissionWithMedicationDiagnosis:
    """Train SVM readmission classifiers on numeric features plus diagnosis/medication text.

    Two CSV files are inner-joined on ``episode_id``.  The free-text columns
    ``actual_diag`` and ``actual_med`` are converted to token counts with
    ``CountVectorizer`` and appended to the remaining feature columns.  One
    ``SVC`` is then fitted and evaluated per classification target.
    """

    # Free-text columns that are vectorized rather than used directly.
    TEXT_COLUMNS = ['actual_diag', 'actual_med']

    # Feature columns handed to the model unchanged, in feature-matrix order.
    NUMERIC_COLUMNS = [
        'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
        'num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x',
        'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
        'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
        'Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
        'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5',
        'closingcode_6', 'closingcode_9', 'aftercode_1', 'aftercode_2',
        'aftercode_3', 'aftercode_4', 'aftercode_5', 'LOE_Norm', 'LOE_long',
        'LOE_short', 'length_of_episode', 'Count_visit',
    ]

    def __init__(self, file1, file2):
        """Remember the two CSV locations; nothing is read until ``load_data()``.

        Args:
            file1: Location of the first CSV, as accepted by ``pd.read_csv``.
            file2: Location of the second CSV, as accepted by ``pd.read_csv``.
        """
        self.file1 = file1
        self.file2 = file2
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None
        # Filled in by load_data(). Initialised here so that training before
        # loading fails with a clear message instead of an AttributeError.
        self.dependent_variable_tillnextepisode = None
        self.dependent_variable_Cat_TNE = None
        self.dependent_variable_No_more_episode = None
        self.dependent_variable_TNE_more180 = None
        self.dependent_variable_TNE_less180 = None
        self.independent_variable = None

    def load_data(self):
        """Read both CSVs, merge them on ``episode_id`` and slice out targets and features.

        Returns:
            tuple: ``(merged_df, tillnextepisode, Cat_TNE, No_more_episode,
            TNE_more180, TNE_less180, independent_variable)``.  Every element
            is a ``DataFrame``; each target is a single-column frame.

        Raises:
            ValueError: If either CSV location is ``None`` (for example an
                unset environment variable).
            KeyError: If a required column is missing from the merged data.
        """
        if self.file1 is None or self.file2 is None:
            raise ValueError(
                'Both CSV locations must be provided; got '
                f'file1={self.file1!r}, file2={self.file2!r}.'
            )

        original_df_1 = pd.read_csv(self.file1)
        original_df_2 = pd.read_csv(self.file2)

        # pd.merge already returns a new frame, so no extra deep copy is needed.
        self.merged_df = pd.merge(original_df_1, original_df_2, on='episode_id', how='inner')

        self.dependent_variable_tillnextepisode = self.merged_df[['tillnextepisode']]
        self.dependent_variable_Cat_TNE = self.merged_df[['Cat_TNE']]
        self.dependent_variable_No_more_episode = self.merged_df[['No_more_episode']]
        self.dependent_variable_TNE_more180 = self.merged_df[['TNE_more180']]
        self.dependent_variable_TNE_less180 = self.merged_df[['TNE_less180']]
        self.independent_variable = self.merged_df[self.NUMERIC_COLUMNS + self.TEXT_COLUMNS]

        return (
            self.merged_df,
            self.dependent_variable_tillnextepisode,
            self.dependent_variable_Cat_TNE,
            self.dependent_variable_No_more_episode,
            self.dependent_variable_TNE_more180,
            self.dependent_variable_TNE_less180,
            self.independent_variable,
        )

    def _build_feature_matrix(self):
        """Return the dense matrix of numeric columns followed by token counts."""
        # Empty CSV cells are read as NaN; treat them as empty text so that
        # joining the two columns cannot fail on a float.
        text = self.independent_variable[self.TEXT_COLUMNS].fillna('').astype(str)
        documents = text['actual_diag'].str.cat(text['actual_med'], sep=' ')

        # NOTE(review): the vocabulary is fitted on every row (train and test
        # alike), exactly as before; fit on the training split only if strict
        # train/test isolation is required.
        token_counts = CountVectorizer().fit_transform(documents).toarray()

        # Drop the text columns by name instead of relying on a positional slice.
        numeric = self.independent_variable.drop(columns=self.TEXT_COLUMNS).to_numpy()
        return np.concatenate([numeric, token_counts], axis=1)

    def train_classifier_with_medication_diagnosis(self, random_state=None):
        """Fit and evaluate one ``SVC`` per classification target, printing the metrics.

        The targets are ``Cat_TNE``, ``No_more_episode``, ``TNE_more180`` and
        ``TNE_less180``.  For each one, 70% of the rows are used for training
        and the metrics are computed on 67% of the remaining rows.

        Args:
            random_state: Optional seed forwarded to both ``train_test_split``
                calls.  The default ``None`` keeps the splits random per run.

        Raises:
            RuntimeError: If ``load_data()`` has not been called yet.
        """
        if self.independent_variable is None:
            raise RuntimeError('load_data() must be called before training.')

        # The feature matrix does not depend on the target, so build it once.
        X = self._build_feature_matrix()

        # Keep each display name next to its data so the two cannot drift apart.
        targets = [
            ('Cat_TNE', self.dependent_variable_Cat_TNE),
            ('No_more_episode', self.dependent_variable_No_more_episode),
            ('TNE_more180', self.dependent_variable_TNE_more180),
            ('TNE_less180', self.dependent_variable_TNE_less180),
        ]

        for variable_name, dependent_variable in targets:
            # Estimators and metrics expect a 1-D label vector, not a
            # single-column frame.
            y = dependent_variable.to_numpy().ravel()

            X_train, X_temp, y_train, y_temp = train_test_split(
                X, y, train_size=0.7, random_state=random_state
            )
            # Only the first part of the held-out rows is scored; the
            # remaining 33% is set aside and not used here.
            X_test, _, y_test, _ = train_test_split(
                X_temp, y_temp, test_size=0.33, random_state=random_state
            )

            classifier = SVC()
            classifier.fit(X_train, y_train)

            y_pred = classifier.predict(X_test)

            # The classes seen during training decide binary vs. multiclass reporting.
            class_labels = np.unique(y_train)
            print(f'\n****** Evaluation result With Medication Diagnosis as independent variable and  {variable_name} as dependent variable ******')

            # For binary classification
            if len(class_labels) == 2:
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='binary')
                recall = recall_score(y_test, y_pred, average='binary')
                f1 = f1_score(y_test, y_pred, average='binary')

                print(f"Accuracy: {accuracy}")
                print(f"Precision: {precision}")
                print(f"Recall: {recall}")
                print(f"F1score: {f1}")
                print("Confused matrix:")
                # Passing the labels keeps the matrix 2x2 even when the test
                # split happens to contain a single class, so the unpacking
                # below cannot fail.
                conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
                tn, fp, fn, tp = conf_matrix.ravel()
                print(conf_matrix)
                print(f"True Negative (TN): {tn}")
                print(f"False Positive (FP): {fp}")
                print(f"False Negative (FN): {fn}")
                print(f"True Positive (TP): {tp}")

            # For multiclass classification
            else:
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='weighted')
                recall = recall_score(y_test, y_pred, average='weighted')
                f1 = f1_score(y_test, y_pred, average='weighted')

                print(f"Accuracy: {accuracy}")
                print(f"Precision: {precision}")
                print(f"Recall: {recall}")
                print(f"F1 Score: {f1}")
                conf_matrix = confusion_matrix(y_test, y_pred)
                print("Confused matrix:")
                print(conf_matrix)

# Build the "with medication and diagnosis" experiment from the CSV locations
# stored in the FILE1_PATH / FILE2_PATH environment variables.
ClassifyReadmissionWithMedicationDiagnosis_Obj = ClassifyReadmissionWithMedicationDiagnosis(
    os.getenv("FILE1_PATH"),
    os.getenv("FILE2_PATH"),
)

# load_data() hands back the merged frame, the five target frames and the
# feature frame, in that order.
(
    merged_df,
    dependent_variable_tillnextepisode,
    dependent_variable_Cat_TNE,
    dependent_variable_No_more_episode,
    dependent_variable_TNE_more180,
    dependent_variable_TNE_less180,
    independent_variable,
) = ClassifyReadmissionWithMedicationDiagnosis_Obj.load_data()

# Banner describing the run; a plain string because it has no placeholders.
print(''' 
************************************************************************************************************************************************
************************************************************************************************************************************************
Independent variable Used: 
    'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
    'num_medications', 'Cat_LOE', 'Cat_CV', 'Therapy_ratio_x',
    'Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
    'Outpatient_ratio', 'Inpatient_ratio', 'Inpatient_day_ratio',
    'Inpatient_daynight_ratio', 'closingcode_0', 'closingcode_1',
    'closingcode_2', 'closingcode_3', 'closingcode_4', 'closingcode_5',
    'closingcode_6', 'closingcode_9', 'aftercode_1', 'aftercode_2',
    'aftercode_3', 'aftercode_4', 'aftercode_5', 'LOE_Norm', 'LOE_long',
    'LOE_short','length_of_episode', 'Count_visit', 'actual_diag', 'actual_med'
    
Algorithm Used: 
    Support Vector Machine
************************************************************************************************************************************************
************************************************************************************************************************************************
''')

# Train and report on every classification target.
ClassifyReadmissionWithMedicationDiagnosis_Obj.train_classifier_with_medication_diagnosis()





########## With Medication and Diagnosis And Less Features ###########
#                                                                    #
#                                                                    #
#                                                                    #
######################################################################
'''This section trains a machine learning model to predict patient readmission with a reduced feature set, applying a count vectorizer to the medication and diagnosis columns.
    *******************************************************************************************
#Third iteration
dependent_variable = [tillnextepisode,Cat_TNE,No_more_episode,TNE_more180,TNE_less180]
independent_variable = ['MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
       'num_medications', 'Therapy_ratio_x','Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
       'Outpatient_ratio', 'Inpatient_day_ratio','Inpatient_daynight_ratio', 'actual_diag', 'actual_med']
    *******************************************************************************************
'''

class ClassifyReadmissionWithMedicationDiagnosis_WithLessFeatures:
    """Train SVM readmission classifiers on a reduced feature set plus diagnosis/medication text.

    Two CSV files are inner-joined on ``episode_id``.  The free-text columns
    ``actual_diag`` and ``actual_med`` are converted to token counts with
    ``CountVectorizer`` and appended to the remaining feature columns.  One
    ``SVC`` is then fitted and evaluated per classification target.
    """

    # Free-text columns that are vectorized rather than used directly.
    TEXT_COLUMNS = ['actual_diag', 'actual_med']

    # Feature columns handed to the model unchanged, in feature-matrix order.
    NUMERIC_COLUMNS = [
        'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
        'num_medications', 'Therapy_ratio_x', 'Examination_ratio', 'Advisory_ratio',
        'TreatmentPlanning_ratio', 'Outpatient_ratio', 'Inpatient_day_ratio',
        'Inpatient_daynight_ratio',
    ]

    def __init__(self, file1, file2):
        """Remember the two CSV locations; nothing is read until ``load_data()``.

        Args:
            file1: Location of the first CSV, as accepted by ``pd.read_csv``.
            file2: Location of the second CSV, as accepted by ``pd.read_csv``.
        """
        self.file1 = file1
        self.file2 = file2
        self.merged_df = None
        self.selected_column_merged_df = None
        self.without_diag_medic_selected_column_merged_df = None
        # Filled in by load_data(). Initialised here so that training before
        # loading fails with a clear message instead of an AttributeError.
        self.dependent_variable_tillnextepisode = None
        self.dependent_variable_Cat_TNE = None
        self.dependent_variable_No_more_episode = None
        self.dependent_variable_TNE_more180 = None
        self.dependent_variable_TNE_less180 = None
        self.independent_variable = None

    def load_data(self):
        """Read both CSVs, merge them on ``episode_id`` and slice out targets and features.

        Returns:
            tuple: ``(merged_df, tillnextepisode, Cat_TNE, No_more_episode,
            TNE_more180, TNE_less180, independent_variable)``.  Every element
            is a ``DataFrame``; each target is a single-column frame.

        Raises:
            ValueError: If either CSV location is ``None`` (for example an
                unset environment variable).
            KeyError: If a required column is missing from the merged data.
        """
        if self.file1 is None or self.file2 is None:
            raise ValueError(
                'Both CSV locations must be provided; got '
                f'file1={self.file1!r}, file2={self.file2!r}.'
            )

        original_df_1 = pd.read_csv(self.file1)
        original_df_2 = pd.read_csv(self.file2)

        # pd.merge already returns a new frame, so no extra deep copy is needed.
        self.merged_df = pd.merge(original_df_1, original_df_2, on='episode_id', how='inner')

        self.dependent_variable_tillnextepisode = self.merged_df[['tillnextepisode']]
        self.dependent_variable_Cat_TNE = self.merged_df[['Cat_TNE']]
        self.dependent_variable_No_more_episode = self.merged_df[['No_more_episode']]
        self.dependent_variable_TNE_more180 = self.merged_df[['TNE_more180']]
        self.dependent_variable_TNE_less180 = self.merged_df[['TNE_less180']]
        self.independent_variable = self.merged_df[self.NUMERIC_COLUMNS + self.TEXT_COLUMNS]

        return (
            self.merged_df,
            self.dependent_variable_tillnextepisode,
            self.dependent_variable_Cat_TNE,
            self.dependent_variable_No_more_episode,
            self.dependent_variable_TNE_more180,
            self.dependent_variable_TNE_less180,
            self.independent_variable,
        )

    def _build_feature_matrix(self):
        """Return the dense matrix of numeric columns followed by token counts."""
        # Empty CSV cells are read as NaN; treat them as empty text so that
        # joining the two columns cannot fail on a float.
        text = self.independent_variable[self.TEXT_COLUMNS].fillna('').astype(str)
        documents = text['actual_diag'].str.cat(text['actual_med'], sep=' ')

        # NOTE(review): the vocabulary is fitted on every row (train and test
        # alike), exactly as before; fit on the training split only if strict
        # train/test isolation is required.
        token_counts = CountVectorizer().fit_transform(documents).toarray()

        # Drop the text columns by name instead of relying on a positional slice.
        numeric = self.independent_variable.drop(columns=self.TEXT_COLUMNS).to_numpy()
        return np.concatenate([numeric, token_counts], axis=1)

    def train_classifier_with_medication_diagnosis(self, random_state=None):
        """Fit and evaluate one ``SVC`` per classification target, printing the metrics.

        The targets are ``Cat_TNE``, ``No_more_episode``, ``TNE_more180`` and
        ``TNE_less180``.  For each one, 70% of the rows are used for training
        and the metrics are computed on 67% of the remaining rows.

        Args:
            random_state: Optional seed forwarded to both ``train_test_split``
                calls.  The default ``None`` keeps the splits random per run.

        Raises:
            RuntimeError: If ``load_data()`` has not been called yet.
        """
        if self.independent_variable is None:
            raise RuntimeError('load_data() must be called before training.')

        # The feature matrix does not depend on the target, so build it once.
        X = self._build_feature_matrix()

        # Keep each display name next to its data so the two cannot drift apart.
        targets = [
            ('Cat_TNE', self.dependent_variable_Cat_TNE),
            ('No_more_episode', self.dependent_variable_No_more_episode),
            ('TNE_more180', self.dependent_variable_TNE_more180),
            ('TNE_less180', self.dependent_variable_TNE_less180),
        ]

        for variable_name, dependent_variable in targets:
            # Estimators and metrics expect a 1-D label vector, not a
            # single-column frame.
            y = dependent_variable.to_numpy().ravel()

            X_train, X_temp, y_train, y_temp = train_test_split(
                X, y, train_size=0.7, random_state=random_state
            )
            # Only the first part of the held-out rows is scored; the
            # remaining 33% is set aside and not used here.
            X_test, _, y_test, _ = train_test_split(
                X_temp, y_temp, test_size=0.33, random_state=random_state
            )

            classifier = SVC()
            classifier.fit(X_train, y_train)

            y_pred = classifier.predict(X_test)

            # The classes seen during training decide binary vs. multiclass reporting.
            class_labels = np.unique(y_train)
            print(f'\n****** Evaluation result With Medication Diagnosis and Less Number of Features as independent variable and  {variable_name} as dependent variable ******')

            # For binary classification
            if len(class_labels) == 2:
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='binary')
                recall = recall_score(y_test, y_pred, average='binary')
                f1 = f1_score(y_test, y_pred, average='binary')

                print(f"Accuracy: {accuracy}")
                print(f"Precision: {precision}")
                print(f"Recall: {recall}")
                print(f"F1score: {f1}")
                print("Confused matrix:")
                # Passing the labels keeps the matrix 2x2 even when the test
                # split happens to contain a single class, so the unpacking
                # below cannot fail.
                conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
                tn, fp, fn, tp = conf_matrix.ravel()
                print(conf_matrix)
                print(f"True Negative (TN): {tn}")
                print(f"False Positive (FP): {fp}")
                print(f"False Negative (FN): {fn}")
                print(f"True Positive (TP): {tp}")

            # For multiclass classification
            else:
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='weighted')
                recall = recall_score(y_test, y_pred, average='weighted')
                f1 = f1_score(y_test, y_pred, average='weighted')

                print(f"Accuracy: {accuracy:.4f}")
                print(f"Precision: {precision}")
                print(f"Recall: {recall}")
                print(f"F1score: {f1}")
                conf_matrix = confusion_matrix(y_test, y_pred)
                print("Confused matrix:")
                print(conf_matrix)

# Build the reduced-feature experiment from the CSV locations stored in the
# FILE1_PATH / FILE2_PATH environment variables.
ClassifyReadmissionWithMedicationDiagnosis_WithLessFeatures_Obj = ClassifyReadmissionWithMedicationDiagnosis_WithLessFeatures(
    os.getenv("FILE1_PATH"),
    os.getenv("FILE2_PATH"),
)

# load_data() hands back the merged frame, the five target frames and the
# feature frame, in that order.
(
    merged_df,
    dependent_variable_tillnextepisode,
    dependent_variable_Cat_TNE,
    dependent_variable_No_more_episode,
    dependent_variable_TNE_more180,
    dependent_variable_TNE_less180,
    independent_variable,
) = ClassifyReadmissionWithMedicationDiagnosis_WithLessFeatures_Obj.load_data()

# Banner describing the run; a plain string because it has no placeholders.
print(''' 
************************************************************************************************************************************************
************************************************************************************************************************************************
Independent variable Used: 
    'MiddleChildhood', 'Preschooler', 'Teenager', 'F', 'M', 'num_diagnoses',
    'num_medications', 'Therapy_ratio_x','Examination_ratio', 'Advisory_ratio', 'TreatmentPlanning_ratio',
    'Outpatient_ratio', 'Inpatient_day_ratio','Inpatient_daynight_ratio', 'actual_diag', 'actual_med'
    
Algorithm Used: 
    Support Vector Machine
************************************************************************************************************************************************
************************************************************************************************************************************************
''')

# Train and report on every classification target.
ClassifyReadmissionWithMedicationDiagnosis_WithLessFeatures_Obj.train_classifier_with_medication_diagnosis()