# Length of Hospitalization (ML 2023 Project)

## Necessary Libraries

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from transformers import AutoTokenizer, AutoModel
from multiprocessing import Pool

import lightgbm as lgb

from torch.utils.data import DataLoader
import torch
from torch.optim import Adam
from tqdm import tqdm

import random
import numpy as np

import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string
import numpy as np
from torch.utils.data import Dataset
from torch import nn
import spacy
nlp = spacy.load('en_core_web_sm')

import seaborn as sns
sns.set_theme(style="whitegrid")

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataset

In [None]:
# Both CSVs live in the same Drive folder; keeping the folder in one constant makes it easy to repoint.
DATA_DIR = '/content/drive/MyDrive/UIC Y3/Machine Learning/Project/RESOURCES/DATA'

# Each table's free-text column is renamed to a common `text` name so that the same
# preprocessing code can be applied to either frame.
hos_NUMTEXT = pd.read_csv(f'{DATA_DIR}/HOSP_DATA.csv').rename(columns={'symptoms': 'text'})
hos_TEXT = pd.read_csv(f'{DATA_DIR}/HOSP_TEXT.csv').rename(columns={'text_new': 'text'})

In [None]:
# Work on a 50,000-row random subset of the text-only table
# (assign `data = hos_TEXT` instead to use every row).
# A fixed random_state keeps the subset, and everything derived from it, identical across runs.
data = hos_TEXT.sample(50000, random_state=42)
print(data['target'].value_counts())
data.head()

0    39399
1    10601
Name: target, dtype: int64


Unnamed: 0,subject_id,hadm_id,text,target
116659,16745416,28122033,"ELECTIVE, adult, Normal range, Encounter for a...",0
45163,16392878,22261889,"EU OBSERVATION, adult, Overweight (Pre-obese),...",0
335350,17352349,29383969,"URGENT, adult, Overweight (Pre-obese), Outcome...",0
88801,10109613,21509857,"URGENT, Older adults, Overweight (Pre-obese), ...",1
141949,15589702,22905737,"OBSERVATION ADMIT, Older adults, Normal range,...",1


In [None]:
# 50,000-row random subset of the numeric+text table.
# A fixed random_state keeps the subset, and everything derived from it, identical across runs.
data_num = hos_NUMTEXT.sample(50000, random_state=42)
print(data_num['target'].value_counts())
data_num.head()

0    39562
1    10438
Name: target, dtype: int64


Unnamed: 0,subject_id,hadm_id,admission_type,age,BMI,text,target
45719,12951637,28837538,DIRECT OBSERVATION,50,29.8,"Other and unspecified hyperlipidemia, Accident...",0
191977,11030576,21648828,EW EMER.,87,23.7,"Personal history of tobacco use, Hyposmolality...",1
83994,15318463,22330393,EW EMER.,52,21.3,"Human immunodeficiency virus [HIV] disease, Ca...",0
179409,16874326,21440674,URGENT,29,24.45,"First-degree perineal laceration, delivered, w...",0
200473,10716693,21808276,SURGICAL SAME DAY ADMISSION,64,34.16875,"Personal history of tobacco use, Hyposmolality...",0


## TEXT Preparation

In [None]:
import transformers
from transformers import AutoTokenizer, AutoModel
# Tokenizer of the Bio_ClinicalBERT checkpoint; text_tokenization() reads this
# module-level `tokenizer` name directly when encoding text.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [None]:
import string
from collections import Counter
# two-step preprocessing
def remove_punctuation(text):
    """
    Return `text` with every character found in `string.punctuation` deleted.
    """
    # A translation table that maps each punctuation character to None deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def lowercase_text(text):
    """
    Return a lowercased copy of the string `text`.
    """
    lowered = text.lower()
    return lowered

def counter_word(text):
  """
  Count whitespace-separated words over every entry of `text.values`.

  Returns a `Counter` mapping each word to its number of occurrences.
  """
  return Counter(word for entry in text.values for word in entry.split())

def text_tokenization(df, padding_length):
  """
  Clean the `text` column of `df` and encode every row with the module-level `tokenizer`.

  The `text` column of `df` is overwritten in place with its punctuation-free,
  lowercased form, so the caller's frame holds the cleaned text afterwards.

  Args:
    df: DataFrame with a string `text` column.
    padding_length: number of token ids kept per row; shorter rows are padded and
      longer rows are truncated to exactly this length.

  Returns:
    A tuple `(vocab_num, input_ids, attention_masks)`:
      vocab_num: number of distinct whitespace-separated words in the cleaned text.
        The ids in `input_ids` come from the tokenizer, so this count is NOT a
        bound on them.
      input_ids: array with one row of token ids per row of `df`.
      attention_masks: array with one row of mask values per row of `df`.
  """
  df['text'] = df['text'].apply(remove_punctuation).apply(lowercase_text)
  vocab_num = len(counter_word(df['text']))

  # tokenization and convert the texts to vectors
  input_ids = []
  attention_masks = []

  # Encode the rows of the frame that was passed in. Iterating a global frame here
  # would return encodings that do not line up with `df`'s rows or labels.
  for msg in df['text']:
    bert_inp = tokenizer.encode_plus(msg,
                                     add_special_tokens=False,
                                     max_length=padding_length,
                                     padding='max_length',
                                     truncation=True,  # keep every row at padding_length
                                     return_attention_mask=True)
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

  return vocab_num, np.asarray(input_ids), np.asarray(attention_masks)

In [None]:
# for text-only preprocessing
# 650 is the padding_length argument: the number of token ids requested per row.
t_vocab, t_input_ids, t_att = text_tokenization(data, 650)
# check input shape
t_input_ids.shape

(50000, 650)

In [None]:
# for text & nums preprocessing
# The masks get their own name (tn_att) so the text-only masks held in t_att are not overwritten.
tn_vocab, tn_input_ids, tn_att = text_tokenization(data_num, 650)
# check input shape
tn_input_ids.shape

(50000, 650)

### TEXT ONLY Dataset Loading

In [None]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
t_y = np.array(data["target"])

# The target is imbalanced (about 79% class 0 / 21% class 1 in the sampled frame), so
# stratify on it to keep the same class ratio in the train and test parts.
t_X_train, t_X_test, t_y_train, t_y_test = train_test_split(
    t_input_ids, t_y, test_size=0.1, random_state=42, stratify=t_y)

In [None]:
# Sanity check: shape of the text-only training matrix.
t_X_train.shape

(45000, 650)

### NUMERICAL + TEXT Dataset Loading

In [None]:
# inserting numerical values
age_arr = data_num['age'].values
bmi_arr = data_num['BMI'].values

# Each np.insert call adds one column at index 0, so after both calls the column
# layout is [BMI, age, <token ids ...>].
# NOTE(review): tn_input_ids is built from tokenizer ids, so it is presumably an integer
# array; np.insert would then cast the fractional BMI values to integers — confirm.
# NOTE(review): age and BMI end up in the same matrix as the token ids and are later fed
# to the Embedding layer like any other id — confirm this is intended.
TN_input_ids = np.insert(tn_input_ids, 0, age_arr, axis=1)
TN_input_ids = np.insert(TN_input_ids, 0, bmi_arr, axis=1)

In [None]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
tn_y = np.array(data_num["target"])

# The target is imbalanced (about 79% class 0 / 21% class 1 in the sampled frame), so
# stratify on it to keep the same class ratio in the train and test parts.
tn_X_train, tn_X_test, tn_y_train, tn_y_test = train_test_split(
    TN_input_ids, tn_y, test_size=0.1, random_state=42, stratify=tn_y)

In [None]:
# Sanity check: shape of the numeric+text training matrix (two columns wider than the token matrix).
tn_X_train.shape

(45000, 652)

## LSTM

In [None]:
# Check GPU
import tensorflow as tf

device_name = tf.test.gpu_device_name()
# Continue only when TensorFlow reports the first GPU; any other answer stops the notebook here.
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense , Dropout, Bidirectional
from keras.optimizers import Adam, Adagrad

# The rows fed to the model hold ids produced by `tokenizer`, so the embedding table needs
# one row per tokenizer id. The counts returned by text_tokenization (t_vocab / tn_vocab)
# are whitespace-word counts and are not a bound on those ids; sizing the table with them
# leaves ids outside the table.
vocab_size = len(tokenizer)

# Take the sequence length from the training matrix instead of hard-coding it
# (652 for the numeric+text input; use t_X_train.shape[1] for the text-only input).
max_seq_length = tn_X_train.shape[1]
embedding_dim = 300

# Define the model architecture
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim,
                    input_length = max_seq_length))

# Simple LSTM: two stacked layers, the first returning the full sequence for the second.
model.add(LSTM(units=128, dropout=0.1, return_sequences=True))
model.add(LSTM(units=64, dropout=0.1))
# Single sigmoid unit: one score in (0, 1) per row for the binary target.
model.add(Dense(units=1, activation='sigmoid'))

# # Bidirectional LSTM
# model.add(Bidirectional(LSTM(64, return_sequences=True)))
# model.add(Bidirectional(LSTM(64)))
# model.add(Dense(units=1, activation='sigmoid'))

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 652, 300)          1536000   
                                                                 
 lstm_2 (LSTM)               (None, 652, 128)          219648    
                                                                 
 lstm_3 (LSTM)               (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,805,121
Trainable params: 1,805,121
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train the model on the numeric+text split; the last 20% of the training rows are held out for validation.
# Text-only variant: pass x=t_X_train, y=t_y_train instead.
model.fit(
    x=tn_X_train,
    y=tn_y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1d5172c7c0>

## Predictions

In [None]:
# Score the held-out numeric+text rows; the model's final layer is a single sigmoid unit.
predictions = model.predict(tn_X_test)



In [None]:
# A row is labelled 1 when its predicted score is strictly above the 0.45 cut-off, otherwise 0.
y_pred = (predictions > 0.45).astype(int)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the thresholded predictions against the held-out labels.
print(classification_report(tn_y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      1.00      0.88      3958
           1       0.00      0.00      0.00      1042

    accuracy                           0.79      5000
   macro avg       0.40      0.50      0.44      5000
weighted avg       0.63      0.79      0.70      5000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
