In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Reading the Data

In [2]:
triage = pd.read_csv('triage.csv')
diagnosis = pd.read_csv('diagnosis.csv')

print(triage.shape, 'triage')
display(triage.head())

print(diagnosis.shape, 'diagnosis')
display(diagnosis.head())


(425087, 11) triage


Unnamed: 0,subject_id,stay_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint
0,10000032,32952584,97.8,87.0,14.0,97.0,71.0,43.0,7,2.0,Hypotension
1,10000032,33258284,98.4,70.0,16.0,97.0,106.0,63.0,0,3.0,"Abd pain, Abdominal distention"
2,10000032,35968195,99.4,105.0,18.0,96.0,106.0,57.0,10,3.0,"n/v/d, Abd pain"
3,10000032,38112554,98.9,88.0,18.0,97.0,116.0,88.0,10,3.0,Abdominal distention
4,10000032,39399961,98.7,77.0,16.0,98.0,96.0,50.0,13,2.0,"Abdominal distention, Abd pain, LETHAGIC"


(899050, 6) diagnosis


Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title
0,10000032,32952584,1,4589,9,HYPOTENSION NOS
1,10000032,32952584,2,07070,9,UNSPECIFIED VIRAL HEPATITIS C WITHOUT HEPATIC ...
2,10000032,32952584,3,V08,9,ASYMPTOMATIC HIV INFECTION
3,10000032,33258284,1,5728,9,"OTH SEQUELA, CHR LIV DIS"
4,10000032,33258284,2,78959,9,OTHER ASCITES


# Dealing with Triage Table

In [3]:
triage.drop(columns=['pain', 'acuity', 'chiefcomplaint'], inplace=True)

In [4]:
display(triage.head())

Unnamed: 0,subject_id,stay_id,temperature,heartrate,resprate,o2sat,sbp,dbp
0,10000032,32952584,97.8,87.0,14.0,97.0,71.0,43.0
1,10000032,33258284,98.4,70.0,16.0,97.0,106.0,63.0
2,10000032,35968195,99.4,105.0,18.0,96.0,106.0,57.0
3,10000032,38112554,98.9,88.0,18.0,97.0,116.0,88.0
4,10000032,39399961,98.7,77.0,16.0,98.0,96.0,50.0


# Min-Max Scaling on Triage

In [5]:
from sklearn.preprocessing import MinMaxScaler
import pickle

# Define columns to be normalized
columns_to_normalize = ['temperature', 'heartrate', 'resprate', 'sbp', 'o2sat', 'dbp']

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit scaler to the data and transform the columns
data_normalized = triage.copy()
data_normalized[columns_to_normalize] = scaler.fit_transform(triage[columns_to_normalize])

# Save the scaling parameters to a file (for future use)
scaling_parameters = {
    'min': scaler.data_min_.tolist(),
    'max': scaler.data_max_.tolist(),
    'scale': scaler.scale_.tolist(),
    'columns': columns_to_normalize
}

# Save the scaler to a file (for future use)
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Save the normalized data to a new file
data_normalized.to_csv('normalized_data.csv', index=False)


In [6]:
triage2 = pd.read_csv('normalized_data.csv')
triage2.head()

Unnamed: 0,subject_id,stay_id,temperature,heartrate,resprate,o2sat,sbp,dbp
0,10000032,32952584,0.099097,0.07009,0.007692,0.010405,0.000463,6.5e-05
1,10000032,33258284,0.099706,0.056235,0.008791,0.010405,0.000695,9.5e-05
2,10000032,35968195,0.10072,0.08476,0.00989,0.010298,0.000695,8.6e-05
3,10000032,38112554,0.100213,0.070905,0.00989,0.010405,0.000761,0.000133
4,10000032,39399961,0.10001,0.06194,0.008791,0.010513,0.000629,7.6e-05


# Treating nulls

In [7]:
triage2.isnull().sum()

subject_id         0
stay_id            0
temperature    23415
heartrate      17090
resprate       20353
o2sat          20596
sbp            18291
dbp            19091
dtype: int64

In [8]:
triage2.dropna(inplace = True)

In [9]:
triage2.isnull().sum()

subject_id     0
stay_id        0
temperature    0
heartrate      0
resprate       0
o2sat          0
sbp            0
dbp            0
dtype: int64

In [10]:
print('Total:', len(triage2))

Total: 392620


# Dealing with Diagnosis Table

In [11]:
diagnosis.drop(columns=['seq_num', 'icd_version','icd_title'], inplace=True)

In [12]:
display(diagnosis.head())

Unnamed: 0,subject_id,stay_id,icd_code
0,10000032,32952584,4589
1,10000032,32952584,07070
2,10000032,32952584,V08
3,10000032,33258284,5728
4,10000032,33258284,78959


# mapping of ICD codes

In [13]:
#Filter out E and V codes from ICD9 codes since processing will be done on the numeric first 3 values
diagnosis['recode'] = diagnosis['icd_code']
diagnosis['recode'] = diagnosis['recode'][~diagnosis['recode'].str.contains("[a-zA-Z]").fillna(False)]
diagnosis['recode'].fillna(value='999', inplace=True)

# Take in consideration just the first 3 integers of the ICD9 code
diagnosis['recode'] = diagnosis['recode'].str.slice(start=0, stop=3, step=1)
diagnosis['recode'] = diagnosis['recode'].astype(int)

In [14]:
# ICD Main Category ranges
icd_ranges = [(1, 140), (140, 240), (240, 280), (280, 290), (290, 320), (320,390),
               (390, 460), (460, 520), (520, 580), (580, 630), (630, 680),(680, 710),
               (710, 740), (740, 760), (760, 780), (780, 800), (800, 1000),(1000, 2000)]

# Associated category names
diag_dict = {0: 'infectious', 1: 'neoplasms', 2: 'endocrine', 3: 'blood',
             4: 'mental', 5: 'nervous', 6: 'circulatory', 7: 'respiratory',
             8: 'digestive', 9: 'genitourinary', 10: 'pregnancy', 11: 'skin',
             12: 'muscular', 13: 'congenital', 14: 'prenatal', 15: 'misc',
             16: 'injury', 17: 'misc'}

# Re-code in terms of integer
for num, category_range in enumerate(icd_ranges):
    diagnosis['recode'] = np.where(diagnosis['recode'].between(category_range[0],category_range[1]), num, diagnosis['recode'])
    
# Convert integer to category name using diag_dict
diagnosis['super_category'] = diagnosis['recode'].replace(diag_dict)

In [15]:
diagnosis.head()

Unnamed: 0,subject_id,stay_id,icd_code,recode,super_category
0,10000032,32952584,4589,6,circulatory
1,10000032,32952584,07070,0,infectious
2,10000032,32952584,V08,16,injury
3,10000032,33258284,5728,8,digestive
4,10000032,33258284,78959,15,misc


In [16]:
diagnosis[diagnosis['stay_id']==30000039]

Unnamed: 0,subject_id,stay_id,icd_code,recode,super_category
298745,13340997,30000039,8054,16,injury
298746,13340997,30000039,8052,16,injury
298747,13340997,30000039,E8889,16,injury
298748,13340997,30000039,80709,16,injury


In [17]:
stay_list = diagnosis.groupby('stay_id')['super_category'].apply(list).reset_index()

In [18]:
diagnosis_item = pd.get_dummies(stay_list['super_category'].explode()).groupby(stay_list['stay_id']).sum()
diagnosis_item.reset_index(inplace=True)

In [19]:
print(diagnosis_item)

         stay_id  blood  circulatory  congenital  digestive  endocrine  \
0       30000012      0            0           0          1          0   
1       30000017      0            1           0          0          0   
2       30000038      0            0           0          0          0   
3       30000039      0            0           0          0          0   
4       30000055      0            1           0          0          0   
...          ...    ...          ...         ...        ...        ...   
423984  39999939      0            1           0          0          1   
423985  39999953      0            0           0          0          0   
423986  39999961      0            0           0          0          0   
423987  39999964      0            0           0          0          0   
423988  39999965      0            0           0          0          0   

        genitourinary  infectious  injury  mental  misc  muscular  neoplasms  \
0                   1          

# Merging Triage with Diagnosis 

In [20]:
merged_df = pd.merge(triage2, diagnosis_item, on=['stay_id'], how='left')

In [21]:
display(merged_df.head(5))

Unnamed: 0,subject_id,stay_id,temperature,heartrate,resprate,o2sat,sbp,dbp,blood,circulatory,...,injury,mental,misc,muscular,neoplasms,nervous,pregnancy,prenatal,respiratory,skin
0,10000032,32952584,0.099097,0.07009,0.007692,0.010405,0.000463,6.5e-05,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10000032,33258284,0.099706,0.056235,0.008791,0.010405,0.000695,9.5e-05,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000032,35968195,0.10072,0.08476,0.00989,0.010298,0.000695,8.6e-05,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10000032,38112554,0.100213,0.070905,0.00989,0.010405,0.000761,0.000133,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10000032,39399961,0.10001,0.06194,0.008791,0.010513,0.000629,7.6e-05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [22]:
selected_columns = ['circulatory', 'congenital', 'digestive', 'endocrine', 'genitourinary', 'infectious', 'injury', 'mental', 'misc', 'muscular', 'neoplasms', 'nervous', 'pregnancy', 'prenatal', 'respiratory', 'skin']

# Checking unique values in each column
for column in selected_columns:
    unique_value_counts = merged_df[column].value_counts()
    print(f"Unique values and their counts in column '{column}':\n{unique_value_counts}")

Unique values and their counts in column 'circulatory':
0.0    354265
1.0     32206
2.0      4659
3.0       570
4.0        51
Name: circulatory, dtype: int64
Unique values and their counts in column 'congenital':
0.0    391598
1.0       152
2.0         1
Name: congenital, dtype: int64
Unique values and their counts in column 'digestive':
0.0    374974
1.0     15564
2.0      1168
3.0        42
4.0         3
Name: digestive, dtype: int64
Unique values and their counts in column 'endocrine':
0.0    365848
1.0     21542
2.0      4052
3.0       293
4.0        15
5.0         1
Name: endocrine, dtype: int64
Unique values and their counts in column 'genitourinary':
0.0    375068
1.0     15135
2.0      1422
3.0       121
4.0         5
Name: genitourinary, dtype: int64
Unique values and their counts in column 'infectious':
0.0    386762
1.0      4879
2.0       109
3.0         1
Name: infectious, dtype: int64
Unique values and their counts in column 'injury':
0.0    130975
1.0    105376
2.0     8

In [23]:
print('Total:', len(merged_df))

Total: 392620


# Base model

In [92]:
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle

# Initialize an empty dataframe to store results
results_df = pd.DataFrame(columns=['Iteration', 'Accuracy', 'Classification_Report', 'Positive_Probability'])

# Initialize an empty dictionary to store models
models_dict = {}

# Specify the columns you want to iterate over
target_columns = ['blood', 'circulatory', 'congenital', 'digestive', 'endocrine', 'genitourinary', 'infectious', 'injury', 'mental', 'misc', 'muscular', 'neoplasms', 'nervous', 'pregnancy', 'prenatal', 'respiratory', 'skin']  # Adjust these with your column names

for target_column in target_columns:
    start_time = datetime.now()
    
#     # Balance the dataset based on the current target column
    merged_df[target_column] = np.where(merged_df[target_column] >= 1, 1, 0)
    
    df_class_1 = merged_df[merged_df[target_column] == 1]
    df_class_0 = merged_df[merged_df[target_column] == 0]
    count_class_0 = len(df_class_0)
    
    if len(df_class_1) > 0:  # Check if df_class_1 has at least one row
        df_class_1_over = df_class_1.sample(count_class_0, replace=True)
        merged_data_over = pd.concat([df_class_0, df_class_1_over], axis=0)
        
        X = merged_data_over[['stay_id', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']]
        Y = merged_data_over[target_column]
        
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, stratify=Y, random_state=2)
        
        rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_classifier.fit(X_train, Y_train)
        Y_pred = rf_classifier.predict(X_test)
        accuracy = accuracy_score(Y_test, Y_pred)

        class_report = classification_report(Y_test, Y_pred)
        
        # Append results to the dataframe
        results_df = results_df.append({'Iteration': target_column, 'Accuracy': accuracy, 'Classification_Report': class_report}, ignore_index=True)
        
        end_time = datetime.now()
        print(f'Model training time: {end_time-start_time}')
        print(f'Iteration {target_column}: Accuracy {accuracy:.4f}')
        print(class_report)
#         # Save the trained model
#         with open(f'model_{target_column}.pkl', 'wb') as file:
#             pickle.dump(rf_classifier, file)
#         print('Model saved.')
        
    else:
        print(f'No samples found for class {target_column}. Skipping...')
        
#Display the results dataframe
print("Results DataFrame:")
print(results_df)

# Display the models dictionary
print("Models Dictionary:")
print(models_dict)


Model training time: 0:03:43.888127
Iteration blood: Accuracy 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    116313
           1       1.00      1.00      1.00    116313

    accuracy                           1.00    232626
   macro avg       1.00      1.00      1.00    232626
weighted avg       1.00      1.00      1.00    232626

Model training time: 0:02:42.989105
Iteration circulatory: Accuracy 0.9922
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    106280
           1       0.99      1.00      0.99    106279

    accuracy                           0.99    212559
   macro avg       0.99      0.99      0.99    212559
weighted avg       0.99      0.99      0.99    212559

Model training time: 0:02:08.097250
Iteration congenital: Accuracy 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117480
           1       

In [25]:
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle

# Initialize an empty dataframe to store results
results_df = pd.DataFrame(columns=['Iteration', 'Accuracy', 'Classification_Report', 'Positive_Probability'])

# Initialize an empty dictionary to store models
models_dict = {}

# Specify the columns you want to iterate over
target_columns = ['blood', 'circulatory', 'congenital', 'digestive', 'endocrine', 'genitourinary', 'infectious', 'injury', 'mental', 'misc', 'muscular', 'neoplasms', 'nervous', 'pregnancy', 'prenatal', 'respiratory', 'skin']  # Adjust these with your column names

for target_column in target_columns:
    start_time = datetime.now()
    
#     # Balance the dataset based on the current target column
    merged_df[target_column] = np.where(merged_df[target_column] >= 1, 1, 0)
    
    df_class_1 = merged_df[merged_df[target_column] == 1]
    df_class_0 = merged_df[merged_df[target_column] == 0]
    count_class_0 = len(df_class_0)
    
    if len(df_class_1) > 0:  # Check if df_class_1 has at least one row
        df_class_1_over = df_class_1.sample(count_class_0, replace=True)
        merged_data_over = pd.concat([df_class_0, df_class_1_over], axis=0)
        
        X = merged_data_over[['stay_id', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']]
        Y = merged_data_over[target_column]
        
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, stratify=Y, random_state=2)
        
        rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_classifier.fit(X_train, Y_train)
        Y_pred = rf_classifier.predict(X_test)
        accuracy = accuracy_score(Y_test, Y_pred)

        class_report = classification_report(Y_test, Y_pred)
        
        # Append results to the dataframe
        results_df = results_df.append({'Iteration': target_column, 'Accuracy': accuracy, 'Classification_Report': class_report}, ignore_index=True)
        
        end_time = datetime.now()
        print(f'Model training time: {end_time-start_time}')
        print(f'Iteration {target_column}: Accuracy {accuracy:.4f}')
        print(class_report)
        
        
        # Save the trained model
        with open(f'model_{target_column}.pkl', 'wb') as file:
            pickle.dump(rf_classifier, file)
        print('Model saved.')

Model training time: 0:01:21.653483
Iteration blood: Accuracy 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    116574
           1       1.00      1.00      1.00    116574

    accuracy                           1.00    233148
   macro avg       1.00      1.00      1.00    233148
weighted avg       1.00      1.00      1.00    233148

Model saved.
Model training time: 0:01:47.188391
Iteration circulatory: Accuracy 0.9920
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    106541
           1       0.99      1.00      0.99    106540

    accuracy                           0.99    213081
   macro avg       0.99      0.99      0.99    213081
weighted avg       0.99      0.99      0.99    213081

Model saved.
Model training time: 0:01:22.902788
Iteration congenital: Accuracy 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    