# Supervised learning leveraging cluster outcomes

## Load the data and import pipeline package 

In [1]:
import pipeline #a package we have previously written + will use some functions from
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
import warnings
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from os import path
from IPython.display import Image
import pipeline
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.tree import DecisionTreeClassifier
import sk_models # a script we wrote to run classifiers below

warnings.simplefilter("ignore")
pd.set_option('display.max_columns', None)

In [2]:
#This comes from the output of our clustering analysis
df = pd.read_csv('clustered_data.csv')

In [3]:
df.head(1)

Unnamed: 0,year,SEQN,BMXBMI,BPQ020,RIDAGEYR,RIAGENDR,INDFMPIR,RIDRETH1,TKCAL,TPROT,TCARB,TSUGR,TTFAT,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,assignment_kmeans
0,2007-2008,41475,58.04,1,62,2,1.83,5,3057,139.31,348.69,160.48,125.33,-1.617823,1.919746,0.455475,-0.172202,0.297086,2.514255,-0.836439,-1.447478,1.210543,0.014087,3.585609,-1.442924,1.06467,-0.591428,1.113903,1.035296,-0.094012,0.249038,0.889505,-1.001357,1


In [4]:
#Initial exploration
#for col in df.columns:
 #   print(col,": ", type(col))

In [5]:
#Split the data using sklearn's default 80/20
(train, test) = sklearn.model_selection.train_test_split(df)

In [6]:
#Clean the datasets - updated package + included parameters for this dataset
train_1 = train.copy(deep=True)
test_1 = test.copy(deep=True)

clean_train, clean_test = pipeline.clean_split((train_1, test_1))

In [7]:
train.columns

Index(['year', 'SEQN', 'BMXBMI', 'BPQ020', 'RIDAGEYR', 'RIAGENDR', 'INDFMPIR',
       'RIDRETH1', 'TKCAL', 'TPROT', 'TCARB', 'TSUGR', 'TTFAT', 'PC1', 'PC2',
       'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12',
       'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20',
       'assignment_kmeans'],
      dtype='object')

In [8]:
clean_train.head(1)

Unnamed: 0,cluster_1,has_had_hbp,is_obese,RIDAGEYR_scaled,INDFMPIR_scaled,RIAGENDR_1,RIAGENDR_2,RIDRETH1_1,RIDRETH1_2,RIDRETH1_3,RIDRETH1_4,RIDRETH1_5
5158,0,0,0,-0.009176,-1.055111,1,0,0,0,1,0,0


In [9]:
#Checking columns
clean_train.columns

Index(['cluster_1', 'has_had_hbp', 'is_obese', 'RIDAGEYR_scaled',
       'INDFMPIR_scaled', 'RIAGENDR_1', 'RIAGENDR_2', 'RIDRETH1_1',
       'RIDRETH1_2', 'RIDRETH1_3', 'RIDRETH1_4', 'RIDRETH1_5'],
      dtype='object')

In [10]:
#First let's grab BMI (obesity) as the output label - drop the vars and make obese the label

clean_train['label'] = clean_train['is_obese']
clean_train_o = clean_train.drop(['has_had_hbp','is_obese'], axis=1)

clean_test['label'] = clean_test['is_obese']
clean_test_o = clean_test.drop(['has_had_hbp','is_obese'], axis=1)


In [11]:
#Now let's grab blood pressure - drop the vars and make high blood pressure the label

clean_train['label'] = clean_train['has_had_hbp']
clean_train_b = clean_train.drop(['has_had_hbp','is_obese'], axis=1)

clean_test['label'] = clean_test['has_had_hbp']
clean_test_b = clean_test.drop(['has_had_hbp','is_obese'], axis=1)

In [12]:
#col_check
clean_test_b.columns

Index(['cluster_1', 'RIDAGEYR_scaled', 'INDFMPIR_scaled', 'RIAGENDR_1',
       'RIAGENDR_2', 'RIDRETH1_1', 'RIDRETH1_2', 'RIDRETH1_3', 'RIDRETH1_4',
       'RIDRETH1_5', 'label'],
      dtype='object')

In [13]:
#Now let's make dfs leveraging components instead of cluster assignments

clean_train_pca, clean_test_pca = pipeline.clean_split((train, test), components=True)

clean_train_pca['label'] = clean_train_pca['is_obese']
clean_train_o_pca = clean_train_pca.drop(['has_had_hbp','is_obese'], axis=1)

clean_test_pca['label'] = clean_test_pca['is_obese']
clean_test_o_pca = clean_test_pca.drop(['has_had_hbp','is_obese'], axis=1)

In [14]:
#Now let's make dfs leveraging components instead of cluster assignments

clean_train_pca, clean_test_pca = pipeline.clean_split((train, test), components=True)

clean_train_pca['label'] = clean_train_pca['has_had_hbp']
clean_train_b_pca = clean_train_pca.drop(['has_had_hbp','is_obese'], axis=1)

clean_test_pca['label'] = clean_test_pca['has_had_hbp']
clean_test_b_pca = clean_test_pca.drop(['has_had_hbp','is_obese'], axis=1)

In [15]:
#col_check
df.columns

Index(['year', 'SEQN', 'BMXBMI', 'BPQ020', 'RIDAGEYR', 'RIAGENDR', 'INDFMPIR',
       'RIDRETH1', 'TKCAL', 'TPROT', 'TCARB', 'TSUGR', 'TTFAT', 'PC1', 'PC2',
       'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12',
       'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20',
       'assignment_kmeans'],
      dtype='object')

## Run models for predicting obesity 

In [16]:
#Define a function to find the best model

def find_best_model(output_list):
    max_o = 0
    best_t = 0
    
    for o in output_list:
        t, accuracy, _ = o
        
        if accuracy > max_o:
            max_o = accuracy
            best_t = t
    
    return best_t, max_o

In [17]:
thresholds = [.5, .6, .7, .8, .9, .95, .99]
rf_o_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_rf_model(clean_train_o, clean_test_o, ['label'], list(clean_train_o.drop(['label'],axis=1).columns),
          t)
    rf_o_outputs.append((t, accuracy, feature_importance))

In [18]:
find_best_model(rf_o_outputs)

(0.99, 0.6390229191797346)

In [19]:
#display feature imporance for best model:
rf_o_outputs[3][2]

Unnamed: 0,feature,importance
2,INDFMPIR_scaled,0.580662
1,RIDAGEYR_scaled,0.358324
9,RIDRETH1_5,0.016128
0,cluster_1,0.015704
8,RIDRETH1_4,0.007353
5,RIDRETH1_1,0.005475
7,RIDRETH1_3,0.004853
4,RIAGENDR_2,0.004531
6,RIDRETH1_2,0.003499
3,RIAGENDR_1,0.003473


In [20]:
thresholds = [.5, .6, .7, .8, .9, .95, .99]
svc_o_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_SVC_model(clean_train_o, clean_test_o, ['label'], list(clean_train_o.drop(['label'],axis=1).columns),
          t)
    svc_o_outputs.append((t, accuracy, feature_importance))

In [21]:
find_best_model(svc_o_outputs)

(0.5, 0.6366103739445115)

In [22]:
#display feature imporance for best model:
svc_o_outputs[0][2]

Unnamed: 0,coef,value
9,RIDRETH1_5,-0.000253
8,RIDRETH1_4,0.000198
6,RIDRETH1_2,8e-05
7,RIDRETH1_3,-6.6e-05
5,RIDRETH1_1,4e-05
0,cluster_1,4e-05
4,RIAGENDR_2,-3.7e-05
3,RIAGENDR_1,3.7e-05
1,RIDAGEYR_scaled,-1.4e-05
2,INDFMPIR_scaled,-1e-05


In [24]:
#Obesity - using PCA factors, rf

rf_o_pca_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_rf_model(clean_train_o_pca, clean_test_o_pca, ['label'], list(clean_train_o_pca.drop(['label'],axis=1).columns),
          t)
    rf_o_pca_outputs.append((t, accuracy, feature_importance))

In [25]:
find_best_model(rf_o_pca_outputs)

(0.5, 0.8642943305186972)

In [26]:
#display feature imporance for best model:
rf_o_pca_outputs[4][2]

Unnamed: 0,feature,importance
12,PC11_scaled,0.217108
21,PC20_scaled,0.151837
13,PC12_scaled,0.073537
14,PC13_scaled,0.063683
19,PC18_scaled,0.05457
3,PC2_scaled,0.037267
8,PC7_scaled,0.030589
7,PC6_scaled,0.030365
20,PC19_scaled,0.030025
0,RIDAGEYR_scaled,0.025732


In [27]:
#Obesity - using PCA factors, SVC
svc_o_pca_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_SVC_model(clean_train_o_pca, clean_test_o_pca, ['label'], list(clean_train_o_pca.drop(['label'],axis=1).columns),
          t)
    svc_o_pca_outputs.append((t, accuracy, feature_importance))

In [28]:
find_best_model(svc_o_pca_outputs)

(0.5, 0.919481302774427)

In [29]:
#display feature imporance for best model:
svc_o_pca_outputs[0][2]

Unnamed: 0,coef,value
12,PC11_scaled,2.265005
21,PC20_scaled,-1.821517
19,PC18_scaled,1.262638
13,PC12_scaled,-1.160041
14,PC13_scaled,0.925652
7,PC6_scaled,0.594863
20,PC19_scaled,0.521687
3,PC2_scaled,0.509389
8,PC7_scaled,0.493865
17,PC16_scaled,-0.395266


## Run models for predicting high blood pressure

In [31]:
#Blood pressure - cluster, RF
rf_b_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_rf_model(clean_train_b, clean_test_b, ['label'], list(clean_train_b.drop(['label'],axis=1).columns),
          t)
    rf_b_outputs.append((t, accuracy, feature_importance))

In [32]:
find_best_model(rf_b_outputs)

(0.6, 0.6878769601930036)

In [33]:
rf_b_outputs[1][2]

Unnamed: 0,feature,importance
2,INDFMPIR_scaled,0.469066
1,RIDAGEYR_scaled,0.468931
0,cluster_1,0.015243
8,RIDRETH1_4,0.011074
7,RIDRETH1_3,0.00795
3,RIAGENDR_1,0.006262
5,RIDRETH1_1,0.006137
4,RIAGENDR_2,0.005512
9,RIDRETH1_5,0.005013
6,RIDRETH1_2,0.004811


In [34]:
#Blood pressure - cluster, SVC
svc_b_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_SVC_model(clean_train_b, clean_test_b, ['label'], list(clean_train_b.drop(['label'],axis=1).columns),
          t)
    svc_b_outputs.append((t, accuracy, feature_importance))

In [35]:
find_best_model(svc_b_outputs)

(0.5, 0.7234620024125452)

In [36]:
svc_b_outputs[0][2]

Unnamed: 0,coef,value
1,RIDAGEYR_scaled,1.000069
8,RIDRETH1_4,0.4947
5,RIDRETH1_1,-0.185963
9,RIDRETH1_5,-0.170044
2,INDFMPIR_scaled,-0.159748
6,RIDRETH1_2,-0.113515
0,cluster_1,0.052477
7,RIDRETH1_3,-0.025179
3,RIAGENDR_1,0.000302
4,RIAGENDR_2,-0.000302


In [37]:
#Blood pressure - PCA, RF
rf_b_pca_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_rf_model(clean_train_b_pca, clean_test_b_pca, ['label'], list(clean_train_b_pca.drop(['label'],axis=1).columns),
          t)
    rf_b_pca_outputs.append((t, accuracy, feature_importance))

In [38]:
find_best_model(rf_b_pca_outputs)

(0.5, 0.9499396863691194)

In [39]:
rf_b_pca_outputs[0][2]

Unnamed: 0,feature,importance
12,PC11_scaled,0.270351
21,PC20_scaled,0.161029
0,RIDAGEYR_scaled,0.110315
18,PC17_scaled,0.056431
13,PC12_scaled,0.047349
14,PC13_scaled,0.044679
20,PC19_scaled,0.041564
16,PC15_scaled,0.040777
17,PC16_scaled,0.034691
8,PC7_scaled,0.024461


In [40]:
#Blood pressure - PCA, SVC
svc_b_pca_outputs = []

for t in thresholds:
    accuracy, feature_importance = sk_models.run_SVC_model(clean_train_b_pca, clean_test_b_pca, ['label'], list(clean_train_b_pca.drop(['label'],axis=1).columns),
          t)
    svc_b_pca_outputs.append((t, accuracy, feature_importance))

In [41]:
find_best_model(svc_b_pca_outputs)

(0.7, 0.994270205066345)

In [42]:
svc_b_pca_outputs[3][2]

Unnamed: 0,coef,value
12,PC11_scaled,3.035164
21,PC20_scaled,1.926036
18,PC17_scaled,-1.116412
14,PC13_scaled,1.010313
13,PC12_scaled,-1.000116
17,PC16_scaled,-0.96873
16,PC15_scaled,-0.923785
20,PC19_scaled,-0.882467
8,PC7_scaled,0.817959
9,PC8_scaled,-0.783379
