# NSTI Modeling preset 

In [181]:
import numpy as np
import pandas as pd # data import and maniplation 
import os 
import seaborn as sns # data visualization 
import matplotlib.pyplot as plt # data visualization 
import re
from datetime import datetime, timedelta

from sklearn import datasets, linear_model # regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split # data split 

# Input data

Input: 4 filess 

Output: 4 dataframes that are stripped 

In [182]:
# if I want to change between google collab or spyder 
if False:
    from google.colab import files
    files.upload()    
nsti_med = pd.read_csv("NSTI_meds_for_KL.csv", na_values = ["NA"])
nsti_debride = pd.read_csv("NSTI_debride_for_KL.csv", na_values = ["NA"])
nsti = pd.read_csv("NSTI_for_KL.csv", na_values = ["NA"])
nsti_wbc = pd.read_csv("NSTI_WBC_updated_for_KL.csv")

#issues with extra white spaces 
nsti_med.columns = nsti_med.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
nsti_debride.columns = nsti_debride.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
nsti.columns = nsti.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
nsti_wbc.columns = nsti_wbc.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

#issues with extra white spaces 
nsti_medt = nsti_med.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
nsti_debridet = nsti_debride.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
nstit = nsti.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
nsti_wbct = nsti_wbc.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

## Shape of the datasets

In [183]:
print(nsti_medt.shape)
print(nsti_debridet.shape)
print(nstit.shape)
print(nsti_wbct.shape)

(35410, 6)
(502, 80)
(432, 78)
(9730, 7)


# NSTI clean up / merge 

In [184]:
#removes the suffix in the nsti dataset for easier merging 
nstit['patient'] = pd.to_numeric(nstit['patient'].str.split("*", n = 1, expand = True)[0])

# [NSTI med]Data manipulation


## [NSTI med] cleanup 

Removes NA's + other clean up

In [186]:
# removes bad entries 
nsti_medt = nsti_medt[(nsti_medt["med_start_date"] != "NOT") & 
                      (nsti_medt["med_start_date"] != "")  & 
                      (nsti_medt["med_start_date"] != "UNK") & 
                      (nsti_medt["med_start_date"] != "NA")]

# removes antibiotics that aren't needed
nsti_medt = nsti_medt[~  nsti_medt['meds'].isin(["DOBU", "ENOX", "EPINEP", "NOPRESS", "STERIOD", "VASO", "WARF", 
                                                 "ANTIBIOT", "DOP", "HEP", "IVIG", "NOREPI", "PCC", "SOFA"])]

#removes stuff from time date 
nsti_medt["med_start_time"] = nsti_medt["med_start_time"].apply(lambda x: x.replace("NOT", "0:00")).apply(lambda x: x.replace("UNK", "0:00")).apply(lambda x: x.replace("NA", "0:00"))

## [NSTI med][NSTI wbc]  Data Manipulation

Combining date and time and converting it to time date format

Also converting the time of wbc for validation

In [197]:
# combines into a full date and formats the string type
nsti_medt['total_time'] = nsti_medt['med_start_date'] + " " + nsti_medt['med_start_time']
nsti_medt['total_time'] = pd.to_datetime(nsti_medt['total_time'].apply(lambda x: x.strip(" ")), format = '%m/%d/%Y %H:%M')

#remove the parent columns 
nsti_medt = nsti_medt[nsti_medt.columns[~nsti_medt.columns.str.contains('med_start')]]

# Formating the times to time data
nsti_wbct['observationdttm'] = pd.to_datetime(nsti_wbct['observationdttm'].apply(lambda x: x.strip(" ")), format = '%m/%d/%y %H:%M')
nsti_wbct['specimenreceiveddttm'] = pd.to_datetime(nsti_wbct['specimenreceiveddttm'].apply(lambda x: x.strip(" ")), format = '%m/%d/%y %H:%M')

## [NSTI_medt][NSTI_wbct] Date validation
ASK: Match the last WBC date to the last date in the medication


Notes: Outputed a csv towards comparing the different latest times in the medication and wbc files 


In [225]:
# Takes medt and wbct df, abstracts identifer and time, takes only the most recent date of each patient 
testmedt = nsti_medt.sort_values(by = ['study_id','total_time'], ascending = [True, False]).drop_duplicates(subset = ['study_id'], keep = 'first')
testwbct = nsti_wbct.sort_values(by = ['patient', 'observationdttm'], ascending = [True, False]).drop_duplicates(subset = ['patient'], keep = 'first')

In [224]:
# Merges the two df's above on each respective identifier
testmedt[['study_id','total_time']].merge(testwbct[['patient','observationdttm']], left_on = 'study_id', right_on = 'patient')
#testmedt[['study_id','total_time']].merge(testwbct[['patient','observationdttm']], left_on = 'study_id', right_on = 'patient').to_csv('med_vs_wbc.csv')

Unnamed: 0,study_id,admit_date,meds,med_location,total_time
13791,1,3/18/2015,MERO,,2015-04-22 09:33:00
13757,1,3/18/2015,MERO,,2015-04-22 00:46:00
13801,1,3/18/2015,MERO,,2015-04-21 16:40:00


## Creating a proxy dataset for just total time andy study id

In [226]:
# makes it so there isn't any duplicate 
nsti_med_max = nsti_medt[['study_id', 'total_time']]
nsti_med_max = nsti_med_max.drop_duplicates(subset = ['study_id', 'total_time'], keep = 'first')
idx = nsti_med_max.groupby(['study_id'])['total_time'].transform(max) == nsti_med_max['total_time']
       
nsti_med_max = nsti_med_max[idx]

# [nstit][nsti_debridet] Merger of nsti demo and debride data

In [185]:
# Merges the debride and nsti df to one 
nsti_full = nstit.merge(nsti_debridet, left_on= "patient", right_on = "study_id")

## Removing featuresfrom nsti_full

Removes Cost, deb, and rec data from the overall dataset while keeping it as a seperate dataset with identifiers 

- debisolate
- recisolate 

In [188]:
#Remove cost data
listcost = list(i for i in list(nsti_full.columns) if not (re.search(r'(costs)',i)))
nsti_full = nsti_full[listcost]

# removing the deb data 
listdeb = list(i for i in list(nsti_full.columns) if (re.search(r'^(deb)',i)))
debisolate = nsti_full[['patient']+listdeb]
nsti_full.drop(listdeb, axis = 1, inplace = True)

# removin the rec data
listrec = list(i for i in list(nsti_full.columns) if (re.search(r'^(rec)',i)))
recisolate = nsti_full[['patient']+listrec]
nsti_full.drop(listrec, axis = 1, inplace = True)

## Formatting the date in [nsti_full] and pulling the target variable

In [228]:
# Removing nulls
nsti_full = nsti_full[nsti_full['minutes_from'] != "NA"]

#Formating string dates to timedate type
nsti_full['admit.x'] = pd.to_datetime(nsti_full['admit.x'], format = '%m/%d/%Y')
nsti_full['discharge'] = pd.to_datetime(nsti_full['discharge'], format = '%m/%d/%Y')
nsti_full['minutes_from'] = pd.to_numeric(nsti_full['minutes_from']).apply(lambda x: timedelta(minutes = x))

# Making a feature that combines the minutes from {admit to last debridement} 
# and the date of the admission to get rough time of the actual time date of debridement
nsti_full['admittodeb'] = nsti_full['admit.x'] + nsti_full['minutes_from']

## TASK: Find the difference in time between the last debridement and last medication 

- Status: Completed 

- Notes: 

The main issue is that there are negative values when it should be mostly positive.

It was noted that for some of the patients, incomplete data as well as wrong indication towards data usage.

Manual picking is needed 

- Updates: Need to work off of the new dataset

In [229]:
# making sure that the dataset only has positive variables 
nsti_complete = nsti_full.merge(nsti_med_max, left_on = 'patient', right_on= 'study_id' )

# Creating a new feature in the difference in hours 
nsti_complete['debtomed'] = (nsti_complete['total_time'] - nsti_complete['admittodeb']) / np.timedelta64(1, 'h') 
nsti_complete = nsti_complete.loc[nsti_complete['debtomed'] >= 0.0]
nsti_complete = nsti_complete[nsti_complete.columns[~nsti_complete.columns.str.contains('minutes')]]

# Cleaning up the dataset by eliminating some of the extra features 
nsti_complete.drop(['discharge','sex','admit.x','location.code.1', 'location.code.2','icd.10.code..1', 'icd.10..1.description','icd.10.code..2','icd10..2.descrition','admit.y',
                    'insure','insure_type','admit_date','dc_date','transfer_y','dc.dispo','admittodeb', 'total_time','study_id_y','study_id_x','transfer_x'], axis = 1, inplace = True)

# Model creation 

Under construction need some more work and understand of the data 

# Feature encoding 

Need to figure out what exactly to encode some of the features 

In [None]:
nsti_complete_filter = nsti_complete[nsti_complete['debtomed'] <5000]

binary_change = {"outcome": {"A": 1, "D": 0}}

nsti_complete_filter.replace(binary_change, inplace=True)
nsti_complete_filter.head()

#https://stackoverflow.com/questions/29034928/pandas-convert-a-column-of-list-to-dummies
nsti_complete_filter = pd.get_dummies(nsti_complete_filter, columns =['mechanism.of.infection', 'race'])

dummy_code = pd.get_dummies(nsti_complete_filter['comorbid.codes'].str.split(';').apply(pd.Series).stack()).sum(level=0)
dummy_region = pd.get_dummies(nsti_complete_filter['region'].str.split('; ').apply(pd.Series).stack()).sum(level=0)
df_dummyfull = pd.concat([dummy_code,dummy_region],axis=1,join = 'inner')
nsti_new = pd.concat([nsti_complete_filter, dummy_region], axis = 1)
nsti_new.drop(['region','patient','comorbid.codes','co.morbids','icu.los','icu.hours','hosp.los', 'vent.days'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  regex=regex)


Converting to all floats 

In [None]:
# see how many na are there
fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(nsti_new.dropna().isnull(), cbar=False, cmap="YlGnBu_r")
fig.patch.set_facecolor('xkcd:mint green')
plt.show()

#nsti_new.dropna(axis=1,how='all')

In [None]:
corr = nsti_new.corr()

In [None]:
fig, ax = plt.subplots(figsize=(20,14))
sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns,
        cmap = "PiYG")


Doesn't seem to be that much correlation for our target 

Feature selection

In [None]:
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

import seaborn as sns
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

nsti_new = nsti_new.dropna()
y = nsti_new['debtomed']
y = y.astype('int')
nsti_new.drop(['debtomed','unnamed:_0'], axis = 1, inplace = True)
X = nsti_new

# Univariate selection
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(10,'Score'))  #print 10 best features

#tasks: Need to convert other strings to numerical or numeraid

                          Specs       Score
0                           age  961.014047
54                        race_  368.000000
48  mechanism.of.infection_BURN  368.000000
6                         cauti  368.000000
18                          vap  368.000000
20                     bleeding  368.000000
15                           pe  322.000000
13                unplanned.ett  312.166667
16                 unplanned.or  304.500000
10                          cpr  291.250000


In [None]:
model = LinearRegression().fit(X[['age','race_','mechanism.of.infection_BURN', 'cauti','vap','bleeding','pe','unplanned.ett','unplanned.or','cpr']], y)

r_sq = model.score(X[['age','race_','mechanism.of.infection_BURN', 'cauti','vap','bleeding','pe','unplanned.ett','unplanned.or','cpr']], y)
print('coefficient of determination:', r_sq)


coefficient of determination: 0.04367364652493122


Seeing as a linear regression doesn't really help with this