### Perform Exploratory Data Analysis (EDA) and discuss the data and what you observe prior to beginning modeling and how impact how to proceed ###

In [21]:
# load the data

import pandas as pd
import numpy as np

df = pd.read_csv('8k_diabetes.csv')

df.shape

(8000, 51)

In [None]:
# preview the data
df.head(10).T

In [None]:
# Checking the counts and Data types of each column 
df.info()

On the previous step, I discovered that these columns contain null values.

admission_type_id
discharge_disposition_id 
admission_source_id
diag_2_desc
diag_3_desc

The null values from these columns can be replaced with '?', which is the simbol for 'Missing Values' in the data set.

In [None]:
# Checking null values in all the columns 
# Here I want to see the number of nulls.
df.isnull().sum()

In [None]:
# This loop will print the unique classes of each column 

print("All classes by column")
for column in df.columns:
    print(column)
    print(df[column].drop_duplicates())

The age column can be considered categorical data since the values represent a range of age. 
Maybe the values could be replaced with an age in the middle, for example 50-60 = 55.

In [None]:
# This loop will print the counts of each class in every column

print("All value counts by column")
for column in df.columns:
    print(column,"\n")
    
    print(df[column].value_counts(),"\n")
    

In the dataset '?' represents mission values.
This code will show the count of '?' in all the columns.

In [None]:
for col in df.columns:
    if df[col].dtype == object:
         print(col,df[col][df[col] == '?'].count())

### Pre-processed categorical data for use in the model and justified pre-processing method. Note this may be different for each algorithm you try ###

Drop the columns with a large count of missing values and where the majority of categories are only one value

In [22]:
# Drop weight, payer_code, medical_specialty because they have a large number of missing values

df = df.drop(['weight','payer_code','medical_specialty'], axis = 1)

In [23]:
#acetohexamide,examide,citoglipton,Troglitazone, glimepiride.pioglitazone, metformin.rosiglitazone ,metformin.pioglitazone 

#In the following columns, there are 2 or 3 classes, however one of them has the majority of values

#Tolbutamide, glipizide.metformin 
#- The great majority of the data is NO, 7998. Only 2 are Steady
df = df.drop(['tolbutamide', 'glipizide.metformin'], axis = 1)
#Tolazamide
#- The great majority of the data is NO, 7999. Only 1 is Steady
df = df.drop(['tolazamide'], axis = 1)
#Miglitol
#- The great majority is NO 7997, Steady 2 and Down 1
df = df.drop(['miglitol'], axis = 1)

#Acarbose
#- The majority is NO 7976, Steady 23 and Up 1
df = df.drop(['acarbose'], axis = 1)

#Chlorpropamide
#- No 7990, Steady 9, Up 1
df = df.drop(['chlorpropamide'], axis = 1)

#Nateglinide
#- No 7962, Steady 36, Down 1, Up 1
df = df.drop(['nateglinide'], axis = 1)

#Repaglinide
#- No 7888, Steady 96, Up 11, Down 5
df = df.drop(['repaglinide'], axis = 1)

#acetohexamide has only one value for all rows
df = df.drop(['acetohexamide'], axis =1)

In [24]:
# Drop all rows where discharge_disposition_id = expired, because it means that the patient died.

df = df.drop(df[df.discharge_disposition_id =='Expired'].index)

In [None]:
df.shape

In [None]:
df.info()

In [25]:
#df['admission_type_id'].isnull().sum()

# I decided to replace the null values in the following columns because the data on these rows could be important

df['admission_type_id'].fillna("?", inplace = True)
df['discharge_disposition_id'].fillna("?", inplace = True)
df['admission_source_id'].fillna("?", inplace = True)

# In the case of the diagnostic description, I prefer to leave it as "" but not null. 
df['diag_2_desc'].fillna("", inplace = True)
df['diag_3_desc'].fillna("", inplace = True)


In [None]:
#encode columns with categorical data

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

cat_data=cat_data = df.drop(['time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_outpatient','number_emergency','number_inpatient','number_diagnoses','diag_1_desc','diag_2_desc','diag_3_desc'], axis = 1 )

for i in cat_data:
    cat_data[i] = le.fit_transform(cat_data[i])

### Pre-processed numerical data appropriately including handling missing data and justified methods used. Note this may be different for each algorithm you try.

In [None]:
# This code will return all the numeric columns

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

df.select_dtypes(include=numerics)[:10].T

Apparently, there are no missing values for the previously listed columns. However, there are some columns like age that could be transformed to numeric.

In [26]:
df['age'] = df['age'].replace({"[70-80)":75,
                         "[60-70)":65,
                         "[50-60)":55,
                         "[80-90)":85,
                         "[40-50)":45,
                         "[30-40)":35,
                         "[90-100)":95,
                         "[20-30)":25,
                         "[10-20)":15,
                         "[0-10)":5})

In [None]:
df['age'].value_counts()

In [27]:
df["readmitted"] = df["readmitted"].astype(int)

### Implement a model to make predictions using text data using tf-idf

Before creating the model, the text data should be processed. I decided to use Lemmatization and Stop Words elimination.
Lemmatization: group words and use the lemma (base form), removing the endings of each word.
Stop Words: Remove words that don't have a significative meaning. (sometimes words used as connectors like "the", "a", etc.

In [72]:
import nltk
from time import time
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.svm import SVC
from sklearn import metrics

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline

from sklearn import model_selection, naive_bayes, svm

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support

from sklearn.metrics import roc_auc_score

#nltk.download('stopwords')

#nltk.download('wordnet')


In [29]:
stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

# clean_text is a function to remove tokens like white spaces in the text

def clean_text(text):
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text



In [37]:
## Clean the stop words from the text
df['Processed_diag_1_desc'] = df.diag_1_desc.apply(lambda x: clean_text(x))

df['Processed_diag_2_desc'] = df.diag_2_desc.apply(lambda x: clean_text(x))

df['Processed_diag_3_desc'] = df.diag_3_desc.apply(lambda x: clean_text(x))

# Create a new column joining the 3 text columns

df['diag_desc'] = df['diag_1_desc'] + '. ' + df['diag_2_desc'] + '. ' + df['diag_3_desc']

df['Processed_diag_desc'] = df.diag_desc.apply(lambda x: clean_text(x))

#### Applying TfidfVectorizer


In [116]:
tfidf_vect=TfidfVectorizer()

diag_1_tfidf=tfidf_vect.fit_transform(df['Processed_diag_1_desc'].values.tolist())
diag_2_tfidf=tfidf_vect.fit_transform(df['Processed_diag_2_desc'].values.tolist())
diag_3_tfidf=tfidf_vect.fit_transform(df['Processed_diag_3_desc'].values.tolist())

diag_desc_tfidf  = tfidf_vect.fit_transform(df['Processed_diag_desc'].values.tolist())


In [143]:
X = diag_desc_tfidf

y = df['readmitted']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


### These models will only use the text data (diag_1_desc, diag_2_desc, diag_3_desc combined)

#### Using Naive Bayers

In [118]:
# This is a model using Naive Bayes Classifier using the combined diag_descript tfidf weights

naive_bayes_classifier = MultinomialNB()

naive_bayes_classifier.fit(X_train, y_train)

nb_y_pred = naive_bayes_classifier.predict(X_test)

In [119]:
print("Accuracy Score:",accuracy_score(y_pred, nb_y_pred))
print('Precision:',precision_score(y_test, nb_y_pred))
print('Recall:',recall_score(y_test, nb_y_pred))
print('F1-Score:',f1_score(y_test, nb_y_pred))


Accuracy Score: 0.780891719745223
Precision: 0.46
Recall: 0.3021346469622332
F1-Score: 0.36471754212091184


In [120]:
# evaluate performance with AUC

roc_auc_score(y_test, naive_bayes_classifier.predict_proba(X_test)[:, 1])

# This model is not good enough. Notice that this is only using the text portion of the data.

0.5614200109696899

#### Using SVM

In [121]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', probability=True)

SVM.fit(X_train, y_train)

y_pred_SVM = SVM.predict(X_test)

In [122]:
print("Accuracy Score:",accuracy_score(y_test,y_pred_SVM))
print('Precision:',precision_score(y_test, y_pred_SVM))
print('Recall:',recall_score(y_test, y_pred_SVM))
print('F1-Score:',f1_score(y_test, y_pred_SVM))

Accuracy Score: 0.5987261146496815
Precision: 0.45023696682464454
Recall: 0.15599343185550082
F1-Score: 0.23170731707317072


In [123]:
roc_auc_score(y_test, SVM.predict_proba(X_test)[:, 1])

0.5645878933582116

In [140]:
pipeline = Pipeline([
('vect', CountVectorizer(stop_words='english',lowercase=True)),
("tfidf1", TfidfTransformer(use_idf=True,smooth_idf=True)),
('clf', MultinomialNB(alpha=1)) #Laplace smoothing
 ])

train,test=train_test_split(df,test_size=.3,random_state=42, shuffle=True)
pipeline.fit(train['diag_desc'],train['readmitted'])

predictions=pipeline.predict(test['diag_desc'])
print(test['readmitted'],predictions)

score = f1_score(test['readmitted'],predictions,pos_label='positive',average='micro')
print("Score of Naive Bayes is :" , score)

6798    1
2205    1
3484    1
1941    0
1483    0
       ..
4724    0
4123    0
3637    0
3759    0
617     0
Name: readmitted, Length: 2354, dtype: int32 [0 1 0 ... 1 0 0]
Score of Naive Bayes is : 0.5909090909090909




In [141]:
print("Accuracy Score:",accuracy_score(test['readmitted'],predictions))
print('Precision:',precision_score(test['readmitted'],predictions))
print('Recall:',recall_score(test['readmitted'],predictions))
print('F1-Score:',f1_score(test['readmitted'],predictions))

Accuracy Score: 0.5909090909090909
Precision: 0.4940374787052811
Recall: 0.303347280334728
F1-Score: 0.37589112119248214


In [145]:
from sklearn.ensemble import RandomForestClassifier

In [146]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 

y_pred = classifier.predict(X_test)

In [147]:
print("Accuracy Score:",accuracy_score(y_test,y_pred))
print('Precision:',precision_score(y_test,y_pred))
print('Recall:',recall_score(y_test,y_pred))
print('F1-Score:',f1_score(y_test,y_pred))

Accuracy Score: 0.5624203821656051
Precision: 0.40930232558139534
Recall: 0.2889983579638752
F1-Score: 0.3387872954764196


In [149]:
roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])

0.5358001466042659

In [152]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

preprocessor = ColumnTransformer(
     transformers=[
         ('text', TfidfVectorizer(), 'raw_text_ft'), #TfidfVectorizer accepts column name only between quotes
         ('category', LabelEncoder(), ['categorical_ft']),
     ],
)
pipe = Pipeline(
     steps=[
         ('preprocessor', preprocessor),
         ('classifier', LogisticRegression()),
     ],
)

In [153]:
pipe.fit(train['diag_desc'],train['readmitted'])

IndexError: tuple index out of range