In [1]:
# load the data

import pandas as pd
import numpy as np

df = pd.read_csv('8k_diabetes.csv')

In [2]:
# Drop weight, payer_code, medical_specialty because they have a large number of missing values

df = df.drop(['weight','payer_code','medical_specialty'], axis = 1)

In [3]:
#acetohexamide,examide,citoglipton,Troglitazone, glimepiride.pioglitazone, metformin.rosiglitazone ,metformin.pioglitazone 

#In the following columns, there are 2 or 3 classes, however one of them has the majority of values

#Tolbutamide, glipizide.metformin 
#- The great majority of the data is NO, 7998. Only 2 are Steady
df = df.drop(['tolbutamide', 'glipizide.metformin'], axis = 1)
#Tolazamide
#- The great majority of the data is NO, 7999. Only 1 is Steady
df = df.drop(['tolazamide'], axis = 1)
#Miglitol
#- The great majority is NO 7997, Steady 2 and Down 1
df = df.drop(['miglitol'], axis = 1)

#Acarbose
#- The majority is NO 7976, Steady 23 and Up 1
df = df.drop(['acarbose'], axis = 1)

#Chlorpropamide
#- No 7990, Steady 9, Up 1
df = df.drop(['chlorpropamide'], axis = 1)

#Nateglinide
#- No 7962, Steady 36, Down 1, Up 1
df = df.drop(['nateglinide'], axis = 1)

#Repaglinide
#- No 7888, Steady 96, Up 11, Down 5
df = df.drop(['repaglinide'], axis = 1)

#acetohexamide has only one value for all rows
df = df.drop(['acetohexamide'], axis =1)

In [4]:
### Explain why I use Not Mapped instead of ?

df = df.replace('?', 'Not Mapped')

In [5]:
#df['admission_type_id'].isnull().sum()

# I decided to replace the null values in the following columns because the data on these rows could be important

df['admission_type_id'].fillna("Not Mapped", inplace = True)
df['discharge_disposition_id'].fillna("Not Mapped", inplace = True)
df['admission_source_id'].fillna("Not Mapped", inplace = True)

# In the case of the diagnostic description, I prefer to leave it as "" but not null. 
df['diag_2_desc'].fillna("", inplace = True)
df['diag_3_desc'].fillna("", inplace = True)


In [20]:
df['age'] = df['age'].replace({"[70-80)":75,
                         "[60-70)":65,
                         "[50-60)":55,
                         "[80-90)":85,
                         "[40-50)":45,
                         "[30-40)":35,
                         "[90-100)":95,
                         "[20-30)":25,
                         "[10-20)":15,
                         "[0-10)":5})

In [21]:
import nltk
from time import time
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

# clean_text is a function to remove tokens like white spaces in the text

def clean_text(text):
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text


In [22]:
df['Processed_diag_1_desc'] = df.diag_1_desc.apply(lambda x: clean_text(x))

df['Processed_diag_2_desc'] = df.diag_2_desc.apply(lambda x: clean_text(x))

df['Processed_diag_3_desc'] = df.diag_3_desc.apply(lambda x: clean_text(x))

In [54]:

# example of using the ColumnTransformer for the Abalone dataset
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

import category_encoders as ce
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression


# split into inputs and outputs

X = df.drop(['readmitted'],axis =1)
y = df['readmitted'].astype(int)

print(X.shape, y.shape)

# determine categorical and numerical features

numerical_columns = X.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']).columns

categorical_columns = df.drop(['time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_outpatient','number_emergency','number_inpatient','number_diagnoses','diag_1_desc','diag_2_desc','diag_3_desc','Processed_diag_1_desc','Processed_diag_2_desc','Processed_diag_3_desc','readmitted'],axis=1).columns
#X.select_dtypes(include=['object', 'bool']).columns

text_columns = ['Processed_diag_1_desc','Processed_diag_2_desc','Processed_diag_3_desc']

# define the data preparation for the columns

cat_encoder = ce.CatBoostEncoder()

tfidf_params = dict(sublinear_tf= True, 
                       #min_df = 5, 
                       norm= 'l2', 
                       #ngram_range= (1,2), 
                       stop_words ='english')

transformer = [
    ('cat_encoder', cat_encoder, categorical_columns), 
    #('num_scaler', MinMaxScaler(), numerical_columns)
    ('num_scaler', MinMaxScaler(), numerical_columns),
    #('tf_idf',TfidfVectorizer(**tfidf_params), text_columns)
    ]

col_transform = ColumnTransformer(transformers=transformer)

#text_model = LogisticRegression(solver="saga", penalty="elasticnet", l1_ratio=0.5)

pipeline = Pipeline(steps=[('prep',col_transform)])


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)



pipeline.fit(X_train, y_train)


(8000, 41) (8000,)


Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('cat_encoder',
                                                  CatBoostEncoder(),
                                                  Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2',
       'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'glimepiride',
       'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone',
       'troglitazone', 'examide', 'c...
       'glyburide.metformin', 'glimepiride.pioglitazone',
       'metformin.rosiglitazone', 'metformin.pioglitazone', 'change',
       'diabetesMed'],
      dtype='object')),
                                                 ('num_scaler', MinMaxScaler(),
                                                  Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'numbe

In [35]:
text_model_transformer

[('text_model',
  LogisticRegression(l1_ratio=0.5, penalty='elasticnet', solver='saga'),
  ['Processed_diag_1_desc', 'Processed_diag_2_desc', 'Processed_diag_3_desc'])]