# Supervised learning leveraging cluster outcomes

## Load the data and import pipeline package 

In [1]:
import pipeline #a package we have previously written + will use some functions from
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
import warnings
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from os import path
from IPython.display import Image
import pipeline
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.tree import DecisionTreeClassifier
import sk_models # a script we wrote to run classifiers below

warnings.simplefilter("ignore")
pd.set_option('display.max_columns', None)

In [2]:
#This comes from the output of our clustering analysis
df = pd.read_csv('clustered_data.csv')

In [3]:
df.head(1)

Unnamed: 0,year,SEQN,BMXBMI,BPQ020,RIDAGEYR,RIAGENDR,INDFMPIR,RIDRETH1,TKCAL,TPROT,TCARB,TSUGR,TTFAT,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,assignment_kmeans
0,2007-2008,41475,58.04,1,62,2,1.83,5,3057,139.31,348.69,160.48,125.33,-1.617823,1.919746,0.455475,-0.172202,0.297086,2.514255,-0.836439,-1.447478,1.210543,0.014087,3.585609,-1.442924,1.06467,-0.591428,1.113903,1.035296,-0.094012,0.249038,0.889505,-1.001357,1


In [4]:
#Initial exploration
#for col in df.columns:
 #   print(col,": ", type(col))

In [5]:
#Split the data using sklearn's default 80/20
(train, test) = sklearn.model_selection.train_test_split(df)

In [6]:
#Clean the datasets - updated package + included parameters for this dataset
clean_train, clean_test = pipeline.clean_split((train, test))

In [7]:
clean_train.head(1)

Unnamed: 0,cluster_1,has_had_hbp,is_obese,RIDAGEYR_scaled,INDFMPIR_scaled,RIAGENDR_1,RIAGENDR_2,RIDRETH1_1,RIDRETH1_2,RIDRETH1_3,RIDRETH1_4,RIDRETH1_5
6531,0,0,1,-1.418048,-1.5638,1,0,0,0,0,1,0


In [8]:
#Checking columns
clean_train.columns

Index(['cluster_1', 'has_had_hbp', 'is_obese', 'RIDAGEYR_scaled',
       'INDFMPIR_scaled', 'RIAGENDR_1', 'RIAGENDR_2', 'RIDRETH1_1',
       'RIDRETH1_2', 'RIDRETH1_3', 'RIDRETH1_4', 'RIDRETH1_5'],
      dtype='object')

In [9]:
#First let's grab BMI (obesity) as the output label - drop the vars and make obese the label

clean_train['label'] = clean_train['is_obese']
clean_train_o = clean_train.drop(['has_had_hbp','is_obese'], axis=1)

clean_test['label'] = clean_test['is_obese']
clean_test_o = clean_test.drop(['has_had_hbp','is_obese'], axis=1)


In [10]:
#Now let's grab blood pressure - drop the vars and make high blood pressure the label

clean_train['label'] = clean_train['has_had_hbp']
clean_train_b = clean_train.drop(['has_had_hbp','is_obese'], axis=1)

clean_test['label'] = clean_test['has_had_hbp']
clean_test_b = clean_test.drop(['has_had_hbp','is_obese'], axis=1)

In [11]:
#col_check
clean_test_b.columns

Index(['cluster_1', 'RIDAGEYR_scaled', 'INDFMPIR_scaled', 'RIAGENDR_1',
       'RIAGENDR_2', 'RIDRETH1_1', 'RIDRETH1_2', 'RIDRETH1_3', 'RIDRETH1_4',
       'RIDRETH1_5', 'label'],
      dtype='object')

In [12]:
#Now let's make dfs leveraging components instead of cluster assignments

clean_train_pca, clean_test_pca = pipeline.clean_split((train, test), components=True)

clean_train_pca['label'] = clean_train_pca['is_obese']
clean_train_o_pca = clean_train_pca.drop(['has_had_hbp','is_obese'], axis=1)

clean_test_pca['label'] = clean_test_pca['is_obese']
clean_test_o_pca = clean_test_pca.drop(['has_had_hbp','is_obese'], axis=1)

In [13]:
#Now let's make dfs leveraging components instead of cluster assignments

clean_train_pca, clean_test_pca = pipeline.clean_split((train, test), components=True)

clean_train_pca['label'] = clean_train_pca['has_had_hbp']
clean_train_b_pca = clean_train_pca.drop(['has_had_hbp','is_obese'], axis=1)

clean_test_pca['label'] = clean_test_pca['has_had_hbp']
clean_test_b_pca = clean_test_pca.drop(['has_had_hbp','is_obese'], axis=1)

In [14]:
#col_check
clean_test_o_pca.columns

Index(['cluster_1', 'RIDAGEYR_scaled', 'INDFMPIR_scaled', 'PC1_scaled',
       'PC2_scaled', 'PC3_scaled', 'PC4_scaled', 'PC5_scaled', 'PC6_scaled',
       'PC7_scaled', 'PC8_scaled', 'PC9_scaled', 'PC10_scaled', 'PC11_scaled',
       'PC12_scaled', 'PC13_scaled', 'PC14_scaled', 'PC15_scaled',
       'PC16_scaled', 'PC17_scaled', 'PC18_scaled', 'PC19_scaled',
       'PC20_scaled', 'RIAGENDR_1', 'RIAGENDR_2', 'RIDRETH1_1', 'RIDRETH1_2',
       'RIDRETH1_3', 'RIDRETH1_4', 'RIDRETH1_5', 'label'],
      dtype='object')

## Run models for predicting obesity 

In [16]:
#Define a function to find the best model

def find_best_model(output_list):
    max_o = 0
    best_t = 0
    
    for o in output_list:
        t, accuracy, _ = o
        
        if accuracy > max_o:
            max_o = accuracy
            best_t = t
    
    return best_t, max_o

In [39]:
thresholds = [.5, .6, .7, .8, .9, .95, .99]
rf_o_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_rf_model(clean_train_o, clean_test_o, ['label'], list(clean_train_o.drop(['label'],axis=1).columns),
          t)
    rf_o_outputs.append((t, accuracy, feature_importance))

In [40]:
find_best_model(rf_o_outputs)

(0.8, 0.6357056694813028)

In [41]:
#display feature imporance for best model:
rf_o_outputs[3][2]

Unnamed: 0,feature,importance
2,INDFMPIR_scaled,0.564746
1,RIDAGEYR_scaled,0.376222
0,cluster_1,0.013172
9,RIDRETH1_5,0.010427
8,RIDRETH1_4,0.009995
5,RIDRETH1_1,0.006727
7,RIDRETH1_3,0.00573
4,RIAGENDR_2,0.005238
3,RIAGENDR_1,0.004282
6,RIDRETH1_2,0.003461


In [42]:
thresholds = [.5, .6, .7, .8, .9, .95, .99]
svc_o_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_SVC_model(clean_train_o, clean_test_o, ['label'], list(clean_train_o.drop(['label'],axis=1).columns),
          t)
    svc_o_outputs.append((t, accuracy, feature_importance))

In [43]:
find_best_model(svc_o_outputs)

(0.5, 0.632086851628468)

In [44]:
#display feature imporance for best model:
svc_o_outputs[0][2]

Unnamed: 0,coef,value
8,RIDRETH1_4,0.000207
9,RIDRETH1_5,-0.000137
6,RIDRETH1_2,-0.00013
1,RIDAGEYR_scaled,-6e-05
7,RIDRETH1_3,5.2e-05
2,INDFMPIR_scaled,-2.1e-05
0,cluster_1,-1.6e-05
4,RIAGENDR_2,-8e-06
3,RIAGENDR_1,8e-06
5,RIDRETH1_1,7e-06


In [45]:
#Obesity - using PCA factors, rf

rf_o_pca_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_rf_model(clean_train_o_pca, clean_test_o_pca, ['label'], list(clean_train_o.drop(['label'],axis=1).columns),
          t)
    rf_o_pca_outputs.append((t, accuracy, feature_importance))

In [46]:
find_best_model(rf_o_pca_outputs)

(0.9, 0.632388419782871)

In [67]:
#display feature imporance for best model:
rf_o_pca_outputs[4][2]

Unnamed: 0,feature,importance
2,INDFMPIR_scaled,0.57589
1,RIDAGEYR_scaled,0.358121
0,cluster_1,0.01587
9,RIDRETH1_5,0.014144
8,RIDRETH1_4,0.008471
3,RIAGENDR_1,0.006506
4,RIAGENDR_2,0.006469
5,RIDRETH1_1,0.006257
7,RIDRETH1_3,0.005318
6,RIDRETH1_2,0.002955


In [48]:
#Obesity - using PCA factors, SVC
svc_o_pca_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_SVC_model(clean_train_o_pca, clean_test_o_pca, ['label'], list(clean_train_o_pca.drop(['label'],axis=1).columns),
          t)
    svc_o_pca_outputs.append((t, accuracy, feature_importance))

In [49]:
find_best_model(svc_o_pca_outputs)

(0.5, 0.9161640530759951)

In [68]:
#display feature imporance for best model:
svc_o_pca_outputs[0][2]

Unnamed: 0,coef,value
13,PC11_scaled,2.287461
22,PC20_scaled,-1.85739
20,PC18_scaled,1.232975
14,PC12_scaled,-1.163144
15,PC13_scaled,0.905786
8,PC6_scaled,0.62611
4,PC2_scaled,0.522993
9,PC7_scaled,0.499874
21,PC19_scaled,0.489022
18,PC16_scaled,-0.389593


## Run models for predicting high blood pressure

In [51]:
#Blood pressure - cluster, RF
rf_b_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_rf_model(clean_train_b, clean_test_b, ['label'], list(clean_train_b.drop(['label'],axis=1).columns),
          t)
    rf_b_outputs.append((t, accuracy, feature_importance))

In [52]:
find_best_model(rf_b_outputs)

(0.6, 0.678226779252111)

In [53]:
rf_b_outputs[1][2]

Unnamed: 0,feature,importance
1,RIDAGEYR_scaled,0.478776
2,INDFMPIR_scaled,0.46254
0,cluster_1,0.014595
8,RIDRETH1_4,0.011355
7,RIDRETH1_3,0.006433
4,RIAGENDR_2,0.005674
3,RIAGENDR_1,0.005455
9,RIDRETH1_5,0.005389
5,RIDRETH1_1,0.005295
6,RIDRETH1_2,0.004488


In [54]:
#Blood pressure - cluster, SVC
svc_b_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_SVC_model(clean_train_b, clean_test_b, ['label'], list(clean_train_b.drop(['label'],axis=1).columns),
          t)
    svc_b_outputs.append((t, accuracy, feature_importance))

In [55]:
find_best_model(svc_b_outputs)

(0.5, 0.7195416164053076)

In [58]:
svc_b_outputs[0][2]

Unnamed: 0,coef,value
1,RIDAGEYR_scaled,1.012307
8,RIDRETH1_4,0.489307
5,RIDRETH1_1,-0.194243
9,RIDRETH1_5,-0.153513
2,INDFMPIR_scaled,-0.141737
6,RIDRETH1_2,-0.089075
7,RIDRETH1_3,-0.052474
0,cluster_1,0.03124
3,RIAGENDR_1,-0.009131
4,RIAGENDR_2,0.009131


In [59]:
#Blood pressure - PCA, RF
rf_b_pca_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_rf_model(clean_train_b_pca, clean_test_b_pca, ['label'], list(clean_train_b_pca.drop(['label'],axis=1).columns),
          t)
    rf_b_pca_outputs.append((t, accuracy, feature_importance))

In [60]:
find_best_model(rf_b_pca_outputs)

(0.5, 0.9517490952955368)

In [61]:
rf_b_pca_outputs[0][2]

Unnamed: 0,feature,importance
13,PC11_scaled,0.302618
22,PC20_scaled,0.152438
1,RIDAGEYR_scaled,0.122005
19,PC17_scaled,0.056197
14,PC12_scaled,0.044658
15,PC13_scaled,0.038153
17,PC15_scaled,0.037288
18,PC16_scaled,0.031051
21,PC19_scaled,0.031045
9,PC7_scaled,0.021242


In [64]:
#Blood pressure - PCA, SVC
svc_b_pca_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_SVC_model(clean_train_b_pca, clean_test_b_pca, ['label'], list(clean_train_b_pca.drop(['label'],axis=1).columns),
          t)
    svc_b_pca_outputs.append((t, accuracy, feature_importance))

In [65]:
find_best_model(svc_b_pca_outputs)

(0.8, 0.9948733413751508)

In [66]:
svc_b_pca_outputs[3][2]

Unnamed: 0,coef,value
13,PC11_scaled,3.003884
22,PC20_scaled,1.909981
19,PC17_scaled,-1.187288
14,PC12_scaled,-1.08868
15,PC13_scaled,0.986114
17,PC15_scaled,-0.852897
18,PC16_scaled,-0.821494
10,PC8_scaled,-0.819879
21,PC19_scaled,-0.806007
9,PC7_scaled,0.788619
