<a href="https://colab.research.google.com/github/flozgom/Machine_Learning_Healthcare/blob/main/MITx_ICU_Mortality_Skeleton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import auth

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler


## Loading Data


In [None]:
# Authenticate the Colab user; presumably required before the gsutil
# downloads from the gs://mlhc-mimic bucket in the following cells.
auth.authenticate_user()

#### Lab Data

In [None]:
# Copy adult_icu.gz from the gs://mlhc-mimic bucket into the working directory.
!gsutil cp gs://mlhc-mimic/adult_icu.gz ./


In [None]:
# Load the downloaded file into the frame used as the lab data for Questions 2 and 4.
lab_df = pd.read_csv('adult_icu.gz')

#### Note Data

In [None]:
# Copy adult_notes.gz from the gs://mlhc-mimic bucket into the working directory.
!gsutil cp gs://mlhc-mimic/adult_notes.gz ./


In [None]:
# Load the downloaded file into the frame used as the note data for Question 3.
note_df = pd.read_csv('adult_notes.gz')

## Question 2

Predicting hospital mortality from lab values



In [None]:
## Explore the dataset lab_df: report how many distinct subjects it holds,
## then display the frame itself.
# dropna=False counts a missing id as its own value, matching len(unique()).
n_unique_subjects = lab_df['subject_id'].nunique(dropna=False)
print('Number of unique subjects %s' % n_unique_subjects)
lab_df

In [None]:
# Summary statistics of the mort_icu column, inspected here before the
# feature-dropping cell below removes it from lab_df.
lab_df['mort_icu'].describe()

In [None]:
##Dropping features: identifier columns, the mortality outcomes other than
##mort_hosp, and the adult_icu / admType_NEWBORN columns.
# Reassigning instead of inplace=True, together with errors="ignore", keeps the
# cell re-runnable: executing it a second time no longer raises KeyError for
# columns that were already removed.
lab_df = lab_df.drop(
    columns=["subject_id", "hadm_id", "icustay_id", "mort_icu", "mort_oneyr",
             "adult_icu", "admType_NEWBORN"],
    errors="ignore",
)

In [None]:
##TODO: Split the dataset into train/val/test. Note that we have already provided
##the columns "train", "test", "valid" for you which splits the dataset into
##training set, validation set and testing set.
##Once you're done, remove the columns train, valid and test from the dataset.
#for 4:
#lab_df = lab_df[lab_df.age >40]

# All three indicator columns are removed from every split so that the three
# frames share exactly the same feature columns (the scaler and the model
# require matching columns). drop() returns a new frame, so lab_df itself is
# left untouched and no slice is mutated in place.
split_columns = ['train', 'test', 'valid']

training_df = lab_df[lab_df.train == 1].drop(columns=split_columns)
print('Number of training samples %s' % len(training_df))
#test data
testing_df = lab_df[lab_df.test == 1].drop(columns=split_columns)
print('Number of testing samples %s' % len(testing_df))
#val data
validation_df = lab_df[lab_df.valid == 1].drop(columns=split_columns)
print('Number of validation samples %s' % len(validation_df))

In [None]:
##TODO: Normalize the data in train/val/test. Be sure to fit StandardScaler to the training dataset only!
# The scaler is fit on the training features (everything except the mort_hosp
# label); that single fitted scaler is then applied to all three splits.
scaler = StandardScaler()
training_scaler = scaler.fit(training_df.drop(columns=['mort_hosp']))
training_std_df, testing_std_df, validation_std_df = (
    training_scaler.transform(split_df.drop(columns=['mort_hosp']))
    for split_df in (training_df, testing_df, validation_df)
)

In [None]:
# Display the training split (label column still included) as a sanity check.
training_df

In [None]:
# Dimensions of the standardized training features.
training_std_df.shape

In [None]:
##TODO: Problem 2.5, 2.6 - Train a Logistic Regression model (with solver = 'liblinear') to predict mortality given the remaining features available.
## Every (C, penalty) combination is fit on the training split and scored by
## accuracy on the validation split.

C = [0.1, 0.25, 1]
penalty = ['l1', 'l2']
for c, pen in [(c_value, pen_value) for c_value in C for pen_value in penalty]:
    model = LogisticRegression(penalty=pen, C=c, solver='liblinear', max_iter=2000)
    model.fit(training_std_df, training_df['mort_hosp'])
    y_pred = model.predict(validation_std_df)
    validation_accuracy = accuracy_score(validation_df['mort_hosp'], y_pred)
    print('C %s, Penalty %s, Accuracy_score %s' % (c, pen, validation_accuracy))

In [None]:
# Refit the chosen configuration (L2 penalty, C=1) and evaluate it on the test split.
model = LogisticRegression(penalty='l2',C=1,solver='liblinear',max_iter=2000).fit(training_std_df,training_df['mort_hosp'])
y_pred = model.predict(testing_std_df)
# ROC AUC measures how well the model ranks cases, so it must be computed from
# continuous scores; feeding it hard 0/1 predictions collapses the curve to a
# single operating point. Column 1 of predict_proba is the second entry of
# model.classes_ (assumed to be the positive label 1 - TODO confirm mort_hosp is 0/1).
y_score = model.predict_proba(testing_std_df)[:, 1]
print('Accuracy_score %s' %(accuracy_score(testing_df['mort_hosp'],y_pred)))
print('AUC %s' %(roc_auc_score(testing_df['mort_hosp'],y_score)))

In [None]:
# Display the validation split as a sanity check.
validation_df

In [None]:
##TODO: Problem 2.7 - Which of the following features are among the top 5 most 
##positive features, based on the coefficients of the logistic regression model?
# List the column names of the test split; the following cells index into
# these names to label the model coefficients.
testing_df.columns

In [None]:
# Pair every coefficient with its positional feature index and sort ascending:
# the most negative coefficients come first, the most positive last.
# `importance` is derived here because the cell that originally assigns it
# appears further down the notebook, which made this cell fail with NameError
# on a fresh top-to-bottom run.
importance = model.coef_[0]
list1 = list(enumerate(importance))
list2 = sorted(list1, key=lambda x:x[1])
list2

In [None]:
# Name of the feature behind coefficient index 14. The feature-name index is
# built inline because `indexes_names` is only assigned in a later cell, so
# referencing it here raised NameError on a fresh top-to-bottom run.
testing_df.drop(['mort_hosp'],axis=1).columns[14]

In [None]:
# Feature names (every test-split column except the mort_hosp label), used to
# translate coefficient positions into column names.
indexes_names = testing_df.drop(columns=['mort_hosp']).columns

In [None]:
from matplotlib import pyplot

# Coefficients of the fitted model, one per input feature (first row of coef_).
importance = model.coef_[0]

# Print one line per feature: positional index and coefficient value.
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))

# Bar chart of the coefficients, positioned by feature index.
feature_positions = list(range(len(importance)))
pyplot.bar(feature_positions, importance)
pyplot.show()

In [None]:
# Raw coefficient array and intercept of the currently fitted model.
print(model.coef_, model.intercept_)

In [None]:
# Boolean mask marking the coefficients that still exceed 0.1 after rounding
# to two decimals. numpy is imported with the other imports in the first cell
# rather than in the middle of the notebook.
np.round(model.coef_,decimals=2)>0.1

In [None]:
##TODO: Problem 2.8 - Which of the following features are among the top 5 most 
##negative features, based on the coefficients of the logistic regression model?

## Question 3

Predicting hospital mortality from clinical notes


In [None]:
# Preview the first rows of the note data.
note_df.head()

In [None]:
# Remove identifier columns and the mortality outcomes other than mort_hosp.
# Reassigning instead of inplace=True, together with errors="ignore", keeps the
# cell re-runnable: a second execution no longer raises KeyError for columns
# that were already removed.
note_df = note_df.drop(
    columns=["subject_id", "hadm_id", "icustay_id", "mort_icu", "mort_oneyr"],
    errors="ignore",
)

In [None]:
##TODO: Split the dataset into train/val/test
# Rows are selected by the train / test / valid indicator columns. All three
# indicators are then removed from every split so the frames share the same
# columns. drop() returns a new frame, so note_df is left untouched and no
# slice is mutated in place.
split_columns = ['train', 'test', 'valid']

training_df = note_df[note_df.train == 1].drop(columns=split_columns)
print('Number of training samples %s' % len(training_df))
#test data
testing_df = note_df[note_df.test == 1].drop(columns=split_columns)
print('Number of testing samples %s' % len(testing_df))
#val data
validation_df = note_df[note_df.valid == 1].drop(columns=split_columns)
print('Number of validation samples %s' % len(validation_df))

In [None]:
# Display the text column that the vectorizer below is fit on.
training_df['chartext']

In [None]:
##TODO: Fit a CountVectorizer with max_features = 5000 to the training dataset and generate features for train/val/test. 
# The vocabulary is learned from the training notes only.
vectorizer = CountVectorizer(max_features = 5000)
# NOTE(review): scikit-learn's fit() is documented to return the estimator
# itself, so X is presumably the fitted vectorizer rather than a feature
# matrix - confirm before using X as data.
X = vectorizer.fit(training_df['chartext'])

In [None]:
# Vectorized training notes. Despite the y_ prefix this holds the model
# inputs (it is passed as the first argument to fit() below), not the labels.
y_train = vectorizer.transform(training_df['chartext'])


In [None]:
# Vectorized validation notes (model inputs, not labels), produced with the
# vectorizer that was fit on the training notes.
y_val = vectorizer.transform(validation_df['chartext'])

In [None]:
# Vectorized test notes (model inputs, not labels), produced with the
# vectorizer that was fit on the training notes.
y_test = vectorizer.transform(testing_df['chartext'])

In [None]:
# Preview the word-count features. `X` holds the return value of
# vectorizer.fit(), i.e. the vectorizer itself, which has no toarray() method;
# the transformed training matrix is y_train. Only the first five rows are
# densified so the preview stays cheap even with thousands of columns.
print(y_train[:5].toarray())

In [None]:
##TODO: Problem 3.1, 3.2 Train a Logistic Regression model (with solver = 'liblinear') to predict mortality given the remaining features available.
## Every (C, penalty) combination is fit on the vectorized training notes and
## scored by accuracy on the vectorized validation notes.

C = [0.1, 0.25, 1]
penalty = ['l1', 'l2']
for c, pen in [(c_value, pen_value) for c_value in C for pen_value in penalty]:
    model = LogisticRegression(penalty=pen, C=c, solver='liblinear', max_iter=2000)
    model.fit(y_train, training_df['mort_hosp'])
    y_pred = model.predict(y_val)
    validation_accuracy = accuracy_score(validation_df['mort_hosp'], y_pred)
    print('C %s, Penalty %s, Accuracy_score %s' % (c, pen, validation_accuracy))

In [None]:
# Refit the chosen configuration (L1 penalty, C=0.1) on the vectorized
# training notes and evaluate it on the vectorized test notes.
model = LogisticRegression(penalty='l1',C=0.1,solver='liblinear',max_iter=2000).fit(y_train,training_df['mort_hosp'])
y_pred = model.predict(y_test)
# ROC AUC measures how well the model ranks cases, so it must be computed from
# continuous scores; feeding it hard 0/1 predictions collapses the curve to a
# single operating point. Column 1 of predict_proba is the second entry of
# model.classes_ (assumed to be the positive label 1 - TODO confirm mort_hosp is 0/1).
y_score = model.predict_proba(y_test)[:, 1]
print('Accuracy_score %s' %(accuracy_score(testing_df['mort_hosp'],y_pred)))
print('AUC %s' %(roc_auc_score(testing_df['mort_hosp'],y_score)))

In [None]:
##TODO: Problem 3.3 Which of the following features are among the top 5 most 
##predictive positive words, based on the coefficients of the logistic regression model?

In [None]:
# Coefficients of the currently fitted model (first row of coef_), one per
# input feature.
importance = model.coef_[0]

In [None]:
# Pair every coefficient with its vocabulary index and sort ascending, so the
# largest (most positive) coefficients are at the end of the list.
list1 = list(enumerate(importance))
list2 = sorted(list1, key=lambda x:x[1])
# The slice runs through the end of the list: stopping at -1 would return only
# four entries and leave out the single largest coefficient.
list2[-5:]

In [None]:
# Word behind vocabulary index 4790. Current scikit-learn releases provide
# get_feature_names_out() and no longer have get_feature_names(); the fallback
# keeps the cell working on older installations.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
feature_names[4790]

In [None]:
##TODO: Problem 3.4 Which of the following features are among the top 5 most 
##predictive negative words, based on the coefficients of the logistic regression model?

## Question 4

Analysis of data and results

In [None]:
# Display the lab frame again before the Question 4 analysis.
lab_df

In [None]:
##TODO: Problem 4.1 - people / mortality rate in different ethnic categorizations
## One sub-frame per ethnicity indicator column: the rows where that flag equals 1.
asian = lab_df.loc[lab_df['eth_asian'] == 1]
black = lab_df.loc[lab_df['eth_black'] == 1]
hispanic = lab_df.loc[lab_df['eth_hispanic'] == 1]
other = lab_df.loc[lab_df['eth_other'] == 1]
white = lab_df.loc[lab_df['eth_white'] == 1]

In [None]:
# Group size, then the in-hospital mortality rate as a fraction:
# deaths / patients, i.e. the mean of the mort_hosp column (assumed to be a
# 0/1 indicator - TODO confirm). Dividing patients by deaths gives the inverse
# of the rate and fails with a division by zero when a group has no deaths.
print(len(asian))
print('Mortality Rate %s' %(asian.mort_hosp.mean()))

In [None]:
# Group size, then the in-hospital mortality rate as a fraction:
# deaths / patients, i.e. the mean of the mort_hosp column (assumed to be a
# 0/1 indicator - TODO confirm). Dividing patients by deaths gives the inverse
# of the rate and fails with a division by zero when a group has no deaths.
print(len(black))
print('Mortality Rate %s' %(black.mort_hosp.mean()))

In [None]:
# Group size, then the in-hospital mortality rate as a fraction:
# deaths / patients, i.e. the mean of the mort_hosp column (assumed to be a
# 0/1 indicator - TODO confirm). Dividing patients by deaths gives the inverse
# of the rate and fails with a division by zero when a group has no deaths.
print(len(hispanic))
print('Mortality Rate %s' %(hispanic.mort_hosp.mean()))

In [None]:
# Group size, then the in-hospital mortality rate as a fraction:
# deaths / patients, i.e. the mean of the mort_hosp column (assumed to be a
# 0/1 indicator - TODO confirm). Dividing patients by deaths gives the inverse
# of the rate and fails with a division by zero when a group has no deaths.
print(len(other))
print('Mortality Rate %s' %(other.mort_hosp.mean()))

In [None]:
# Group size, then the in-hospital mortality rate as a fraction:
# deaths / patients, i.e. the mean of the mort_hosp column (assumed to be a
# 0/1 indicator - TODO confirm). Dividing patients by deaths gives the inverse
# of the rate and fails with a division by zero when a group has no deaths.
print(len(white))
print('Mortality Rate %s' %(white.mort_hosp.mean()))

In [None]:
##TODO: Problem 4.2 - plot histogram for ages

In [None]:
# Histogram of the age column with the default binning.
lab_df.age.hist()

In [None]:
# Histogram of the age column using explicit decade edges 20, 30, ..., 90.
bins_list = list(range(20, 91, 10))
lab_df.age.hist(bins=bins_list)

# New Section

In [None]:
##TODO: Problem 4.3 - plot histogram for mortality rates
for age in [20, 30, 40, 50, 60, 70, 80 , 90]:
  a = lab_df[(lab_df.age >= age) & (lab_df.age <=age+9)]
  print('mortality %s : %s' %(age,len(a.subject_id)/sum(a.mort_hosp)/100))
  print('mortality unique %s : %s' %(age,len(a.subject_id.unique())/sum(a.mort_hosp)/100))
  len(a.subject_id.unique())

In [None]:
##TODO: Problem 4.4 - retrain a model using C=1, penalty = l2 and evaluate AUC
##and accuracy on the test set with age less than 40 and on the test set with
##age greater than or equal to 40.

In [None]:
# Total of the mort_oneyr column. The feature-dropping cell earlier in the
# notebook removes mort_oneyr from lab_df, so lab_df.mort_oneyr raises
# AttributeError on a top-to-bottom run; read just that column back from the
# file lab_df was loaded from.
sum(pd.read_csv('adult_icu.gz', usecols=['mort_oneyr'])['mort_oneyr'])