In [54]:
# import required packages

import pandas as pd
import numpy as np

import nltk
from time import time
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import category_encoders as ce
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC

from sklearn import svm

from sklearn.linear_model import ElasticNet

from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score



In [55]:
# Load the raw 8k training file. The path is relative to the notebook's working directory.
# The name `df` is rebound by the cleaning cells that follow.

df = pd.read_csv('8k_diabetes.csv')

In [56]:
def dropAndReplaceData(df):
    """Clean a raw diabetes frame: drop uninformative columns, label missing values, bin ages.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from the CSV. Every column named below must be
        present; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The frame passed in is left untouched.
    """
    # Dropped because they have a large number of missing values.
    mostly_missing = ['weight', 'payer_code', 'medical_specialty']

    # Medication columns in which one class holds nearly every row
    # (counts are the author's notes for the 8k training file).
    near_constant = [
        'tolbutamide',          # No 7998, Steady 2
        'glipizide.metformin',  # (noted together with tolbutamide)
        'tolazamide',           # No 7999, Steady 1
        'miglitol',             # No 7997, Steady 2, Down 1
        'acarbose',             # No 7976, Steady 23, Up 1
        'chlorpropamide',       # No 7990, Steady 9, Up 1
        'nateglinide',          # No 7962, Steady 36, Down 1, Up 1
        'repaglinide',          # No 7888, Steady 96, Up 11, Down 5
        'acetohexamide',        # a single value for all rows
    ]
    # NOTE(review): the original notes also list examide, citoglipton, troglitazone,
    # glimepiride.pioglitazone, metformin.rosiglitazone and metformin.pioglitazone
    # as dominated by one class, but they are kept -- confirm that is intended.
    df = df.drop(columns=mostly_missing + near_constant)

    # '?' is the file's placeholder for an unknown value.
    df = df.replace('?', 'Missing')

    # Nulls are replaced rather than dropped because the rest of the row may still
    # be informative. Diagnosis descriptions become "" so the text pipeline always
    # receives a string. Assigning the filled column back (instead of calling
    # fillna(inplace=True) on a column selection) also works when pandas
    # Copy-on-Write is enabled, where the in-place form silently does nothing.
    fill_values = {
        'admission_type_id': "Not Mapped",
        'discharge_disposition_id': "Not Mapped",
        'admission_source_id': "Not Mapped",
        'diag_1_desc': "",
        'diag_2_desc': "",
        'diag_3_desc': "",
    }
    for column, value in fill_values.items():
        df[column] = df[column].fillna(value)

    # Each ten-year age bracket is replaced by its midpoint.
    age_midpoints = {"[0-10)": 5,
                     "[10-20)": 15,
                     "[20-30)": 25,
                     "[30-40)": 35,
                     "[40-50)": 45,
                     "[50-60)": 55,
                     "[60-70)": 65,
                     "[70-80)": 75,
                     "[80-90)": 85,
                     "[90-100)": 95}
    # An explicit lookup leaves unknown labels untouched and does not depend on
    # the implicit downcasting that `replace` has deprecated.
    df['age'] = df['age'].map(lambda label: age_midpoints.get(label, label))
    return df


In [57]:
# Clean the training frame. `df` is rebound to the cleaned copy, so re-running this
# cell without reloading the CSV raises KeyError (the dropped columns are already gone).
df = dropAndReplaceData(df)

In [58]:
# clean_text normalises one free-text diagnosis description: punctuation is
# stripped, the text is lower-cased, tokens are lemmatized and English stop
# words are removed.

def _text_tools():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not hasattr(_text_tools, "cached"):
        _text_tools.cached = (frozenset(stopwords.words("english")),
                              WordNetLemmatizer())
    return _text_tools.cached


def clean_text(text):
    """Return ``text`` lower-cased, lemmatized and without English stop words.

    Parameters
    ----------
    text : str
        Raw description. Must be a string; nulls have to be filled beforehand.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" when nothing survives).
    """
    # The stop-word list and lemmatizer are built once and reused: this function
    # runs once per row per text column, and rebuilding them each time is wasted work.
    stop_words, lemmatizer = _text_tools()

    # Every run of non-alphanumeric characters collapses to one space.
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text).lower()

    # split() without an argument discards the empty tokens that split(" ")
    # produced at the ends of the string, so the result has no stray spaces.
    # Each token is lemmatized as a noun first and then as a verb.
    tokens = (lemmatizer.lemmatize(lemmatizer.lemmatize(token), "v")
              for token in text.split())

    return " ".join(token for token in tokens if token not in stop_words)

def lematization(df):
    """Apply ``clean_text`` to the three diagnosis-description columns.

    The columns are overwritten on the frame that was passed in, and that same
    frame is returned so the call can be used in an assignment.
    """
    for column in ('diag_1_desc', 'diag_2_desc', 'diag_3_desc'):
        df[column] = df[column].apply(clean_text)

    return df



In [59]:
# Normalise the three diagnosis description columns of the training frame.
# lematization overwrites them on `df` itself and returns the same frame.
df = lematization(df)


In [60]:
# define the data preparation for the columns

def getCatNumPipeline(categorical_columns, numerical_columns):
    """Build the unfitted preprocessing pipeline for the non-text features.

    Parameters
    ----------
    categorical_columns : sequence of str
        Columns passed through ``ce.CatBoostEncoder``.
    numerical_columns : sequence of str
        Columns passed through ``StandardScaler``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A one-step pipeline (``'prep'``) wrapping a ``ColumnTransformer``.
        The encoded categorical columns come first in its output, followed by
        the scaled numerical ones; ``joinData`` labels the output in exactly
        that order, so keep the two transformers in this sequence. Text columns
        are not handled here; they go through the separate TF-IDF pipeline.
    """
    col_transform = ColumnTransformer(transformers=[
        ('cat_encoder', ce.CatBoostEncoder(), categorical_columns),
        ('num_scaler', StandardScaler(), numerical_columns),
    ])

    return Pipeline(steps=[('prep', col_transform)])


In [61]:
# split into inputs and outputs

X = df.drop(['readmitted'],axis =1)
y = df['readmitted'].astype(int)

# determine categorical and numerical features

# Numerical features are selected by dtype.
numerical_columns = X.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']).columns

# Categorical features are every column of df except the listed count columns,
# the three text columns and the target.
# NOTE(review): 'age' is not in this exclusion list, and dropAndReplaceData maps it
# to numbers, so it presumably ends up in both column lists -- confirm this is intended.
categorical_columns = df.drop(['time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_outpatient','number_emergency','number_inpatient','number_diagnoses','diag_1_desc','diag_2_desc','diag_3_desc','readmitted'],axis=1).columns
#X.select_dtypes(include=['object', 'bool']).columns

# Text columns are modelled separately (see getTextPipeline), not by `pipeline`.
text_columns = ['diag_1_desc','diag_2_desc','diag_3_desc']


pipeline = getCatNumPipeline(categorical_columns, numerical_columns)


# First-stage split: 80% of the rows fit the preprocessing and the text models.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size=0.2)


# The encoder and scaler are fitted on the training rows only.
pipeline.fit(X_train, y_train)

X_train_cat_num = pipeline.transform(X_train)

X_test_cat_num = pipeline.transform(X_test)

TF-IDF Pipeline

In [62]:

def getTextPipeline(random_state=0):
    """Build an unfitted TF-IDF + elastic-net logistic-regression text classifier.

    Parameters
    ----------
    random_state : int or None, default 0
        Seed handed to ``LogisticRegression``. The ``saga`` solver shuffles the
        data, so without a seed the fitted model differs between runs; 0 matches
        the seed used by the other estimators in this notebook.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``'features'`` (TfidfVectorizer) and ``'clf'`` (LogisticRegression).
    """
    tfidf_params = dict(sublinear_tf=True,
                        norm='l2',
                        stop_words='english')

    # The vectorizer and the classifier are chained so that one fit/predict call
    # takes raw text in and gives class probabilities out.
    return Pipeline(steps=[
        ('features', TfidfVectorizer(**tfidf_params)),
        ('clf', LogisticRegression(solver="saga",
                                   penalty="elasticnet",
                                   l1_ratio=0.5,
                                   random_state=random_state)),
    ])


In [63]:
# One shared text pipeline. getTextMetamodel refits this same object for each text
# column in turn, so after that call `cls` holds only the model fitted last.
cls = getTextPipeline()

def getTextMetamodel(X_train,y_train,X_test):
    """Fit the text model on each diagnosis column and score ``X_test`` with it.

    For diag_1_desc, diag_2_desc and diag_3_desc in that order, the module-level
    pipeline ``cls`` is fitted on the column's training text and immediately used
    to predict the positive-class probability for the same column of ``X_test``.

    Returns a 3-tuple of probability arrays, one per column, in column order.
    Because ``cls`` is reused, it is left fitted on diag_3_desc afterwards.
    """
    probabilities = []

    for column in ('diag_1_desc', 'diag_2_desc', 'diag_3_desc'):
        # Refit the shared pipeline on this column, then score before the next refit.
        cls.fit(X_train[column], y_train)
        probabilities.append(cls.predict_proba(X_test[column])[:, 1])

    return tuple(probabilities)


In [64]:
def joinData(cat_num, text_pred1, text_pred2, text_pred3,
             feature_columns=None, text_feature_columns=None):
    """Combine the preprocessed features with the three text-model predictions.

    Parameters
    ----------
    cat_num : 2-D array-like
        Output of the categorical/numerical preprocessing pipeline.
    text_pred1, text_pred2, text_pred3 : 1-D array-like
        Text-model predictions, one value per row of ``cat_num``.
    feature_columns : sequence of str, optional
        Names for the columns of ``cat_num``. Defaults to the notebook globals
        ``categorical_columns`` followed by ``numerical_columns``.
    text_feature_columns : sequence of str, optional
        Names for the three prediction columns. Defaults to the notebook
        global ``text_columns``.

    Returns
    -------
    pandas.DataFrame
        ``cat_num`` and the predictions side by side, on a fresh 0..n-1 index.
    """
    if feature_columns is None:
        feature_columns = np.append(categorical_columns, numerical_columns)
    if text_feature_columns is None:
        text_feature_columns = text_columns

    df_cat_num = pd.DataFrame(data=cat_num, columns=feature_columns)

    # Use the arguments that were passed in. Previously the body read the globals
    # text_pred_model1..3 and silently ignored its own parameters.
    df_text_pred = pd.DataFrame(np.array([text_pred1, text_pred2, text_pred3]).T,
                                columns=text_feature_columns)

    return df_cat_num.join(df_text_pred)

In [65]:
# Text models are fitted on the first-stage training rows and score the held-out rows.
text_pred_model1,text_pred_model2,text_pred_model3 = getTextMetamodel(X_train,y_train,X_test)

# Second-stage features: preprocessed held-out rows plus their three text predictions.
X = joinData(X_test_cat_num,text_pred_model1,text_pred_model2,text_pred_model3)

y = y_test

# The final models are trained and evaluated on the held-out rows only, split 80/20 again.
# This rebinds X, y, X_train, X_test, y_train and y_test: from here on the
# diag_*_desc columns of X_train / X_test hold predictions, not text.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size=0.2)


In [43]:
# Final model: gradient boosting fitted on the second-stage training rows.
clf = GradientBoostingClassifier(n_estimators=400, learning_rate=1.0,max_depth=9, random_state=0)

clf.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=1.0, max_depth=9, n_estimators=400,
                           random_state=0)

In [44]:
# Score of the fitted model on the second-stage hold-out rows.
clf.score(X_test, y_test)

0.6375

In [45]:
# ROC AUC on the same hold-out rows, computed from column 1 of predict_proba.
pred_prob = clf.predict_proba(X_test)

auc_score1 = roc_auc_score(y_test, pred_prob[:,1])

auc_score1

0.5926066270893857

In [69]:
# 10-fold cross-validated ROC AUC over all second-stage rows (X, y).
# NOTE(review): n_estimators is 40 here but 400 for `clf` above -- confirm which is intended.
clfGBC = GradientBoostingClassifier(n_estimators=40, learning_rate=1.0,max_depth=9, random_state=0)
    
scores = cross_val_score(clfGBC, X, y, cv=10,scoring='roc_auc')
scores

array([0.59719405, 0.56355646, 0.58181204, 0.57133198, 0.49560514,
       0.59533469, 0.65984226, 0.61369357, 0.55042792, 0.54740728])

In [82]:
# Same 10-fold ROC AUC evaluation for a random forest; `scores` is overwritten.
clfRFC = RandomForestClassifier(n_estimators=1000, random_state=0)
    
scores = cross_val_score(clfRFC, X, y, cv=10,scoring='roc_auc')
scores

array([0.637762  , 0.5872211 , 0.67900609, 0.62711291, 0.59001014,
       0.62770453, 0.72252056, 0.61511999, 0.64994127, 0.5839906 ])

In [83]:
# Fit the forest on the second-stage training rows and report hold-out ROC AUC,
# the same way auc_score1 was computed for gradient boosting.
clfRFC.fit(X_train, y_train)

pred_prob = clfRFC.predict_proba(X_test)

auc_score2 = roc_auc_score(y_test, pred_prob[:,1])

auc_score2

0.6155109258557535

In [48]:
# Same 10-fold ROC AUC evaluation for a linear model trained with SGD.
# NOTE(review): unlike the other estimators, no random_state is passed here, so
# these scores may not reproduce exactly between runs -- consider seeding.
clfSDGC = SGDClassifier()
    
scores = cross_val_score(clfSDGC, X, y, cv=10,scoring='roc_auc')
scores

array([0.61883029, 0.56659905, 0.65179175, 0.59516565, 0.62964841,
       0.65111562, 0.64524249, 0.62426582, 0.65145159, 0.52878   ])

### Test with 2k File

In [49]:
# Load the 2k scoring file (path relative to the notebook's working directory).
df_scoring = pd.read_csv('2k_diabetes_scoring.csv')

In [50]:
# List the scoring file's columns; the output below contains no 'readmitted' column.
df_scoring.columns

Index(['race', 'gender', 'age', 'weight', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'payer_code', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide.metformin', 'glipizide.metformin',
       'glimepiride.pioglitazone', 'metformin.rosiglitazone',
       'metformin.pioglitazone', 'change', 'diabetesMed', 'diag_1_desc',
       'diag_2_desc', 'diag_3_desc'],
      dtype='object')

In [51]:
# Preprocess the scoring data exactly like the training data.
df_scoring = dropAndReplaceData(df_scoring)

df_scoring = lematization(df_scoring)

X = df_scoring

# Encode categorical data and standardize numerical data with the pipeline that
# was fitted on the first-stage training rows.
X_cat_num = pipeline.transform(X)

# `cls` is a single pipeline that getTextMetamodel refits per column, so at this
# point it only holds the diag_3_desc model. Scoring diag_1_desc and diag_2_desc
# with it would feed the final model features from the wrong vocabulary. X_train
# now holds second-stage features rather than text, so the first-stage training
# split is rebuilt from the cleaned `df` (same seed and size as the original
# split) and `cls` is refitted on each column before scoring that column.
text_train, _, text_y_train, _ = train_test_split(
    df.drop(['readmitted'], axis=1),
    df['readmitted'].astype(int),
    random_state=0,
    test_size=0.2,
)

scoring_text_preds = []
for column in text_columns:
    cls.fit(text_train[column], text_y_train)
    scoring_text_preds.append(cls.predict_proba(df_scoring[column])[:, 1])

text_pred_model1, text_pred_model2, text_pred_model3 = scoring_text_preds

X = joinData(X_cat_num,text_pred_model1,text_pred_model2,text_pred_model3)



In [None]:
# Class predictions for the scoring rows from the fitted gradient-boosting model.
# The commented line below is the probability alternative (column 1 of predict_proba).
#pred = clf.predict_proba(X)[:,1]
pred = clf.predict(X)

In [None]:
# Wrap the predictions in a one-column frame so they can be joined by row position.
readmitted=pd.DataFrame(data=pred,columns = ['readmitted'])
#df_scoring.join(readmitted).to_csv('freire_diego_pred2.csv')

# Display the cleaned scoring frame next to its predictions (nothing is written to disk here).
df_scoring.join(readmitted)

### 8k train 2k test

In [None]:
# Second experiment: fit on the whole 8k file, predict the 2k scoring file.
# Both files are reloaded so this section starts from raw data.
df_train = pd.read_csv('8k_diabetes.csv')

df_test = pd.read_csv('2k_diabetes_scoring.csv')


# Same cleaning and text normalisation for both frames.
df_train = dropAndReplaceData(df_train)

df_test = dropAndReplaceData(df_test)

df_train = lematization(df_train)

df_test = lematization(df_test)

# NOTE(review): the dtype selection runs on df_train, which still contains 'readmitted'.
# Presumably that column is boolean and therefore not selected -- confirm; if it were
# numeric, the pipeline fit on X_train (which lacks it) would fail.
numerical_columns = df_train.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']).columns

categorical_columns = df_train.drop(['time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_outpatient','number_emergency','number_inpatient','number_diagnoses','diag_1_desc','diag_2_desc','diag_3_desc','readmitted'],axis=1).columns
#X.select_dtypes(include=['object', 'bool']).columns

text_columns = ['diag_1_desc','diag_2_desc','diag_3_desc']


X_train = df_train.drop(["readmitted"],axis=1)

y_train = df_train["readmitted"].astype(int)

X_test = df_test


# Refit the categorical/numerical preprocessing on all training rows.
pipeline = getCatNumPipeline(categorical_columns,numerical_columns)

pipeline.fit(X_train, y_train)

X_train_cat_num = pipeline.transform(X_train)

X_test_cat_num = pipeline.transform(X_test)


# text metamodel

#text_pred_model1,text_pred_model2,text_pred_model3 = getTextMetamodel(X_train,y_train,X_train)

cls = getTextPipeline()

# Fit cls on the diag_1_desc training text
cls.fit(X_train['diag_1_desc'], y_train)

# In-sample prediction: the same rows the model was just fitted on
text_pred_model1 = cls.predict_proba(X_train['diag_1_desc'])[:,1]
#text_pred_model1 = cls.predict(X_test['Processed_diag_1_desc'])

# Refit the same cls object on diag_2_desc (the diag_1_desc fit is discarded)
cls.fit(X_train['diag_2_desc'], y_train)

# In-sample prediction for diag_2_desc
text_pred_model2 = cls.predict_proba(X_train['diag_2_desc'])[:,1]
#text_pred_model2 = cls.predict(X_test['Processed_diag_2_desc'])

# Refit on diag_3_desc; cls stays fitted on this column after the cell ends
cls.fit(X_train['diag_3_desc'], y_train)

# In-sample prediction for diag_3_desc
text_pred_model3 = cls.predict_proba(X_train['diag_3_desc'])[:,1]


# X_train is rebound to the joined features; its diag_*_desc columns now hold
# predictions rather than text.
X_train = joinData(X_train_cat_num,text_pred_model1,text_pred_model2,text_pred_model3)



In [None]:
# Rebind `clf` to a fresh gradient-boosting model fitted on the joined training
# features (X_train) built from the full 8k file.
clf = GradientBoostingClassifier(n_estimators=400, learning_rate=1.0,max_depth=9, random_state=0)

clf.fit(X_train, y_train)


In [None]:
# `cls` is one shared pipeline that the training cell refitted per column, so it
# currently holds only the diag_3_desc model. Scoring diag_1_desc and diag_2_desc
# with it would produce features that do not match the ones `clf` was trained on.
# Refit it on each column's training text before predicting that column.
# df_train / df_test are used because they still hold the cleaned text, whereas
# X_train (and X_test, once this cell has run) hold the joined features.
test_text_preds = []
for column in text_columns:
    cls.fit(df_train[column], y_train)
    test_text_preds.append(cls.predict_proba(df_test[column])[:, 1])

text_pred_model1, text_pred_model2, text_pred_model3 = test_text_preds

X_test = joinData(X_test_cat_num,text_pred_model1,text_pred_model2,text_pred_model3)




In [None]:
# Class predictions for the scoring rows from the model fitted on the full 8k file.
pred = clf.predict(X_test)


In [None]:
# Wrap the predictions in a one-column frame so they can be joined by row position.
readmitted = pd.DataFrame(data=pred, columns=['readmitted'])

# Join onto df_test, the cleaned scoring frame prepared in this section. The
# earlier `df_scoring` comes from a different section and would tie this export
# to cells that may not have been run.
df_test.join(readmitted).to_csv('freire_diego_pred2.csv')

In [None]:
# Display this section's cleaned scoring frame (df_test) next to its predictions,
# rather than `df_scoring`, which belongs to the earlier section.
df_test.join(readmitted)