# Healthcare Final Project

#### Edwin Ramirez, Darshil Desai, Rashi Saxena

In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.feature_selection import f_classif, chi2, SelectKBest
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, make_scorer,roc_auc_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
from scipy import stats
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
import re


In [2]:
# Reading in the data file
# NOTE: the path is relative, so 'projectTrain.csv' must be in the notebook's working directory.
data = pd.read_csv('projectTrain.csv')
# Preview the first rows to sanity-check the load
data.head()

Unnamed: 0,pdc,num_ip_post,total_los_post,num_op_post,num_er_post,num_ndc_post,num_gpi6_post,adjust_total_30d_post,generic_rate_post,post_ip_flag,...,generic_cost,brand_cost,ratio_G_total_cost,numofgen_post,numofbrand_post,generic_cost_post,brand_cost_post,ratio_G_total_cost_post,pdc_80_flag,drug_class
0,0.333333,0,0,4,0,15,5,14.466667,0.101382,0,...,30.621563,2984.927229,0.010155,2,13,196.359216,3001.501507,0.061403,0,*ANTIDIABETICS*
1,0.866667,0,0,5,0,16,4,18.0,0.888889,0,...,720.788173,0.0,1.0,14,2,671.755173,735.661568,0.477297,1,*ANTIDIABETICS*
2,0.938889,0,0,7,0,11,4,8.6,1.0,0,...,179.538533,0.0,1.0,11,0,171.446621,0.0,1.0,1,*ANTICOAGULANTS*
3,0.077778,1,12,40,0,5,5,2.033333,1.0,1,...,105.816329,0.0,1.0,5,0,335.826436,0.0,1.0,0,*ANTICOAGULANTS*
4,0.444444,0,0,12,1,15,9,14.833333,0.52809,0,...,630.173638,3265.59595,0.161758,8,7,75.988845,2054.400835,0.035669,0,*ANTIDIABETICS*


In [3]:
# Report how many rows (records) and columns (features) were loaded
print('This dataset has {} records and {} features'.format(*data.shape))

This dataset has 2000 records and 94 features


## Determining the Categorical Features

In [145]:
# Collect every column name as a NumPy array so it can be boolean-masked later
possible_cat_features = np.array(data.columns.tolist())

# Number of distinct values found in each column, in column order
feat_unique_vals = np.array(
    [np.unique(data[column_name]).size for column_name in possible_cat_features]
)

Variables that contain more than 2 unique values are candidates for one hot encoding. We only want to encode categorical variables with more than 2 unique values because binary variables are already encoded; encoding them again would misrepresent the number of categorical variables we have. To keep the candidate list small enough that each variable can be checked individually against the data dictionary, we also cap the number of unique values: only variables with more than 2 and fewer than 20 unique values are kept.

In [183]:
# Keep columns with more than 2 but fewer than 20 distinct values; np.unique also sorts the names
cardinality_mask = (feat_unique_vals > 2) & (feat_unique_vals < 20)
possible_features = np.unique(possible_cat_features[cardinality_mask])

In [184]:
print("These are the following variables that may require one hot encoding: ")
# enumerate supplies the running position, so no manual counter has to be maintained
for count, val in enumerate(possible_features):
    print((count, val))

These are the following variables that may require one hot encoding: 
(0, 'age_cat')
(1, 'age_grpN')
(2, 'idx_paytypN')
(3, 'idx_prodtypeN')
(4, 'num_er')
(5, 'num_er_post')
(6, 'num_ip')
(7, 'num_ip_post')
(8, 'pdc_cat')
(9, 'pre_CCI')
(10, 'pre_total_cat')
(11, 'regionN')


### Potential Categorical Features That May Require One Hot Encoding 

- age_cat
- age_grpN
- idx_paytypN
- idx_prodtypeN
- num_er
- num_er_post
- num_ip
- num_ip_post
- pdc_cat
- pre_CCI
- pre_total_cat
- regionN

After viewing the data dictionary, we see that num_er (number of visits) and num_ip (number of stays) are continuous variables. Additionally, we have to remove num_er_post and num_ip_post, because we will remove all variables related to the post index period so that they do not affect our target variable `post_total_cost`.

In [186]:
# Resolve the entries to drop by name rather than by hard-coded position: the continuous
# counts (num_er, num_ip) and the post-index columns (num_er_post, num_ip_post).
# Looking the positions up keeps the cell idempotent -- on a re-run nothing matches,
# `index` is empty, and no further entries are deleted.
not_categorical = ['num_er', 'num_er_post', 'num_ip', 'num_ip_post']
index = np.flatnonzero(np.isin(possible_features, not_categorical))
possible_features = np.delete(possible_features, index)

In [187]:
possible_features

array(['age_cat', 'age_grpN', 'idx_paytypN', 'idx_prodtypeN', 'pdc_cat',
       'pre_CCI', 'pre_total_cat', 'regionN'], dtype='<U26')

In [189]:
# Subset the raw data to the candidate categorical columns, still in their original (un-encoded) values
un_encoded = data[possible_features]

In [190]:
# Map each column's categories to integer codes. The single encoder instance is
# re-fitted on every column in turn, so afterwards it is fitted on the last column only.
label_encoder = preprocessing.LabelEncoder()
cat_features2 = un_encoded.apply(lambda column: label_encoder.fit_transform(column))
cat_features2.head()

Unnamed: 0,age_cat,age_grpN,idx_paytypN,idx_prodtypeN,pdc_cat,pre_CCI,pre_total_cat,regionN
0,1,0,0,1,3,1,6,2
1,1,0,3,1,1,0,3,2
2,0,0,0,2,1,0,8,2
3,0,0,0,1,0,0,9,2
4,3,1,0,2,3,1,7,2


In [191]:
def one_hot_encode(df_column, data):
    """One-hot encode a single column of ``data``.

    Parameters
    ----------
    df_column : str
        Name of the column in ``data`` to encode. It is also used as the
        prefix of the generated column names.
    data : pandas.DataFrame
        Frame that contains ``df_column``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data`` and one indicator column per category
        found by the encoder. Columns are named ``<df_column><i>``, where
        ``i`` is the position of the category in the encoder's output.
    """
    # OneHotEncoder expects a 2-D (n_samples, 1) array. Using -1 lets NumPy
    # infer the number of rows, so the function works for a frame of any
    # length instead of only for exactly 2000 rows.
    feature = np.asarray(data[df_column]).reshape(-1, 1)

    enc = preprocessing.OneHotEncoder()
    # The encoder returns a sparse matrix; densify it to build the frame.
    onehotlabels = enc.fit_transform(feature).toarray()

    labels = [df_column + str(ii) for ii in range(onehotlabels.shape[1])]
    return pd.DataFrame(onehotlabels, columns=labels)

In [192]:
# One-hot encode each candidate column from its label-encoded form; yields one DataFrame per column
encoded = [one_hot_encode(column_name, cat_features2) for column_name in un_encoded.columns]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the On

In [211]:
# Container for the combined one-hot encoded columns
encoded_cats = pd.DataFrame()

In [212]:
# Join all one-hot frames column-wise in a single call. Growing a frame with concat inside
# a loop re-copies the accumulated columns on every pass and appends duplicate columns
# whenever the cell is re-run; building the result from `encoded` alone avoids both.
encoded_cats = pd.concat(encoded, axis=1) if encoded else pd.DataFrame()

In [213]:
encoded_cats.head()

Unnamed: 0,age_cat0,age_cat1,age_cat2,age_cat3,age_cat4,age_cat5,age_grpN0,age_grpN1,age_grpN2,idx_paytypN0,...,pre_total_cat4,pre_total_cat5,pre_total_cat6,pre_total_cat7,pre_total_cat8,pre_total_cat9,regionN0,regionN1,regionN2,regionN3
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [214]:
encoded_cats.shape

(2000, 46)

Now we must include the additional binary categorical variables alongside our encoded categorical variables. We can find them by selecting all variables that contain at most 2 unique values.

In [224]:
# Names of the columns with fewer than 3 distinct values.
# NOTE(review): this also admits columns with a single distinct value (constants),
# not only binary ones -- confirm that is intended.
other_cats = np.unique(possible_cat_features[(feat_unique_vals < 3)])

In [225]:
# Exclude any name already selected for one-hot encoding (result is still an array of column names)
other_cats = np.setdiff1d(other_cats, possible_features)

In [226]:
# From here on `other_cats` is a DataFrame (the selected columns of `data`), no longer an array of names
other_cats = data[other_cats]

In [228]:
# Discard every column whose name contains 'post'; the surviving labels are kept
# wrapped in a one-element list
is_post_column = other_cats.columns.str.contains('post')
other_cats = [other_cats.columns[~is_post_column]]

In [232]:
# Convert the one-element list into an array; the column names are then read as other_cats[0]
other_cats = np.array(other_cats)

In [234]:
other_cats[0]


array(['ALCOHOL_DRUG', 'ASTHMA', 'CARDIAC_ARRYTHMIA', 'CARDIAC_VALVULAR',
       'CEREBROVASCULAR', 'CHF', 'CHRONIC_KIDNEY', 'CHRONIC_PAIN_FIBRO',
       'COPD', 'Cancer_In_Situ', 'DEMENTIA', 'DEPRESSION', 'DIABETES',
       'DYSLIPIDEMIA', 'EPILEPSY_SEIZURE', 'HEPATITIS', 'HIV_AIDS',
       'HYPERTENSION', 'LIVER_GALLBLADDER_PANCREAS', 'Leukemia_Lymphoma',
       'MI_CAD', 'Metastatic', 'OSTEOARTHRITIS', 'Other_Cancer',
       'PARALYSIS', 'PEPTIC_ULCER', 'PERIPHERAL_VASCULAR',
       'RENAL_FAILURE', 'RHEUMATOLOGIC', 'SCHIZOPHRENIA',
       'SLEEP_DISORDERS', 'SMOKING', 'Solid_Tumor', 'THYROID',
       'drug_class', 'pdc_80_flag', 'pre_er_flag', 'pre_ip_flag', 'sexN'],
      dtype=object)

In [235]:
# Columns named in other_cats[0], taken from the raw data, followed by the one-hot encoded columns
low_cardinality_frame = data[other_cats[0]]
cat_vars = pd.concat([low_cardinality_frame, encoded_cats], axis=1)
cat_vars.shape

(2000, 85)

## Determining the continuous variables

We can follow the same process for our continuous variables by simply removing all categorical variables, the target variable, and the post index variables from the data.

In [247]:
# Everything that must not be treated as a continuous predictor:
# the one-hot candidates, the columns named in other_cats[0], and the target
non_continuous = [*possible_features, *other_cats[0], 'post_total_cost']

In [286]:
# Remove the categorical columns and the target from the raw data
continuous_df = data.drop(columns=non_continuous)

#remove post index variables
keep_mask = ~continuous_df.columns.str.contains('post')
cont_col = [continuous_df.columns[keep_mask]]


In [287]:
# Keep only the columns that survived the 'post' filter (cont_col[0] holds their names)
continuous_df = continuous_df[cont_col[0]]

In [288]:
#Store target y
# post_total_cost is excluded from the feature columns and carried separately as the target
target_var = data['post_total_cost']

In [289]:
# Feature matrix = continuous columns + one-hot encoded columns.
# NOTE(review): `cat_vars` (which additionally carries the columns named in other_cats[0]) is
# built earlier but not used here, and those columns were dropped from `continuous_df`, so
# they never reach `all_features` -- confirm this is intended.
all_features = pd.concat([continuous_df, encoded_cats], axis=1)
all_features.head()

Unnamed: 0,pdc,idx_copay,log_idx_copay,pre_ip_cost,pre_er_cost,pre_rx_cost,pre_op_cost,pre_total_cost,pre_medical_cost,num_ip,...,pre_total_cat4,pre_total_cat5,pre_total_cat6,pre_total_cat7,pre_total_cat8,pre_total_cat9,regionN0,regionN1,regionN2,regionN3
0,0.333333,40.4745,3.700672,0.0,0.0,3015.548793,1626.271037,4641.819829,1626.271037,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.866667,4.060471,1.401299,0.0,0.0,720.788173,479.3494,1200.137573,479.3494,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.938889,0.01,-4.60517,9794.292309,0.0,195.891492,781.50266,10771.68646,10575.79497,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.077778,10.203185,2.3227,43476.27646,0.0,105.816329,186.932041,43769.02483,43663.2085,2,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.444444,4.050485,1.398837,0.0,0.0,3895.769588,2302.981377,6198.750965,2302.981377,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [290]:
#Drop NA values
# Attach the target and discard incomplete rows in one expression. A target column left over
# from an earlier run of this cell is removed first, so re-running cannot produce a duplicate
# 'post_total_cost' column; dropna() returns a new frame instead of mutating in place.
all_features = pd.concat(
    [all_features.drop(columns=['post_total_cost'], errors='ignore'), target_var],
    axis=1,
).dropna()

In [292]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
# train_test_split comes from the import cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    all_features.drop(columns=['post_total_cost']),
    all_features['post_total_cost'],
    test_size=0.2,
    random_state=42,
)

In [293]:
# Persist each split to its own CSV file, without writing the index
for split, filename in (
    (X_train, "X_train.csv"),
    (X_test, "X_test.csv"),
    (y_train, "y_train.csv"),
    (y_test, "y_test.csv"),
):
    split.to_csv(filename, index=False)

### Use R for Feature Selection Using LASSO