**IMPORTING DATA**

In [48]:
# Importing Numpy, Pandas, and Dataset.

import numpy as np
import pandas as pd
df = pd.read_csv('diabetic_readmission_data.csv')
pd.options.display.max_columns = None

In [49]:
# Examine data types and descriptive stats.
# Observations:
# - 50 total columns
# - Some columns seem irrelevant to analysis of project (e.g. payer_code, number_diagnoses) 
# - 37 object dtypes, 13 int dtypes

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
encounter_id                101766 non-null int64
patient_nbr                 101766 non-null int64
race                        101766 non-null object
gender                      101766 non-null object
age                         101766 non-null object
weight                      101766 non-null object
admission_type_id           101766 non-null int64
discharge_disposition_id    101766 non-null int64
admission_source_id         101766 non-null int64
time_in_hospital            101766 non-null int64
payer_code                  101766 non-null object
medical_specialty           101766 non-null object
num_lab_procedures          101766 non-null int64
num_procedures              101766 non-null int64
num_medications             101766 non-null int64
number_outpatient           101766 non-null int64
number_emergency            101766 non-null int64
number_inpatient            10176

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [50]:
# Observations:
# - Weight column is missing 98% of data
# - Payer code column is missing 52% of data
# - Medical specialty column is missing 53% of data

df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [51]:
# Original dataframe includes 101,766 rows & 50 columns.

df.shape

(101766, 50)

**DATA CLEANING**

In [52]:
# Dropping any potential duplicate rows that might skew any data analysis.
# Multiple encounters for same patient will skew data.

df = df.drop_duplicates(subset='patient_nbr', keep='first')

In [53]:
# Confirming changes made after duplicate patient encounter removal.

df.shape

(71518, 50)

In [54]:
# Removing columns that have incomplete data or unuseful data points.

df = df.drop(['weight', 'payer_code', 'medical_specialty', 'diag_2', 'diag_3'], axis=1)

In [55]:
# Confirming changes made after column removal.

df.shape

(71518, 45)

In [56]:
# Removing patients who died during hospital admission or discharged to hospice care
# since these patients would not be readmitted again.

df = df.loc[~df.discharge_disposition_id.isin([11,13,14,19,20,21])]


In [57]:
# Removing patients with missing Race values since we are using this as a data point for analysis.

df = df.loc[~df.race.isin(['?'])]

In [58]:
# Confirming changes made after missing Race value removal.

df.shape

(68055, 45)

In [59]:
# Removing patients with missing Gender values or unknown values
# since we are using this as a data point for analysis.

df = df.loc[~df.gender.isin(['Unknown/Invalid'])]

In [60]:
# Confirming changes made after missing Gender value removal.

df.shape

(68054, 45)

**FEATURE ENGINEERING**

In [61]:
# Creating new column to feature Number of Medication changes made upon each encounter.

keys = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide',
        'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 
        'glyburide-metformin', 'tolazamide', 'metformin-pioglitazone','metformin-rosiglitazone', 
        'glimepiride-pioglitazone', 'glipizide-metformin', 'troglitazone', 'tolbutamide', 
        'acetohexamide', 'examide', 'citoglipton', ]

for x in keys:
    colname = str(x) + 'temp'
    df[colname] = df[x].apply(lambda y: 0 if (y == 'No' or y == 'Steady') else 1)
    
df['medchange'] = 0

for x in keys:
    colname = str(x) + 'temp'
    df['medchange'] += df[colname]
    del df[colname]

In [62]:
# Check dataframe for new 'medchange' column.

df.medchange.value_counts()

0    51255
1    15819
2      904
3       73
4        3
Name: medchange, dtype: int64

In [63]:
# # Engineer New features for number of Medications used

df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,medchange
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,41,0,1,0,0,0,250.83,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,0
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,59,0,18,0,0,0,276.0,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,1
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,2,0,1,648.0,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,0
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,44,1,16,0,0,0,8.0,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,1
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,51,0,8,0,0,0,197.0,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,0


In [64]:
df.time_in_hospital.corr(df.number_diagnoses)

0.2341334217327469

In [45]:
keys = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide',
        'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 
        'glyburide-metformin', 'tolazamide', 'metformin-pioglitazone','metformin-rosiglitazone', 
        'glimepiride-pioglitazone', 'glipizide-metformin', 'troglitazone', 'tolbutamide', 
        'acetohexamide', 'examide', 'citoglipton', ]

df['medtotal'] = 0

for x in keys:
    if df[x] != 'No' :
        df['medtotal'] = df['medtotal'] + 1
    

In [46]:
df.medtotal.value_counts()

23    68054
Name: medtotal, dtype: int64

In [41]:
list = [1,2,3,4,5]
list[1]

2

In [18]:
# Count of medication used per patient
df['nummed']= 0

key = [list of medication, a, b, c]

for col in key:
    if df[col] != 'No' :
        df['nummed'] += 1
    

SyntaxError: invalid syntax (<ipython-input-18-bed73c072d30>, line 4)