<b>

<p>
<center>
<font size="5">
Data Science Capstone, Summer 2020
</font>
</center>
</p>

<p>
<center>
<font size="4">
Predicting Depression
</font>
</center>
</p>

<p>
<center>
<font size="3">
Data Science, Columbian College of Arts & Sciences, George Washington University
</font>
</center>
</p>

<p>
<center>
<font size="3">
Author: Caroline Sklaver
</font>
</center>
</p>

</b>

In [1]:
path = ('/Users/carolinesklaver/Desktop/Capstone/NHANES/data/csv_data/')

import os
os.chdir(path)

In [2]:
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np

# Data Preprocessing

In [4]:
# Importing the data

df_raw = pd.read_csv('df_raw_v2.csv')

# bring year and target col to the beginning of df
year = df_raw.pop('year')
df_raw.insert(1, 'year', year)

dep = df_raw.pop('depressed')
df_raw.insert(2, 'depressed', dep)



# drop marijuana use
df_raw.drop(['used_marijuana'],axis=1, inplace=True)
# help!
df_raw.drop(['year'],axis=1, inplace=True)

df_raw.drop(['SEQN'],axis=1, inplace=True)

In [5]:
#continuous features
cont = ['#_ppl_household', 'age', 'triglyceride','caffeine', 'lifetime_partners',
       'glycohemoglobin', 'CRP', 'tot_cholesterol','systolic_BP','diastolic_BP', 'BMI', 'waist_C', '#meals_fast_food',
       'min_sedetary', 'bone_mineral_density']

# categorical features
cat = ['race_ethnicity', 'edu_level', 'gender', 'marital_status', 'annual_HI',
       'doc_diabetes', 'how_healthy_diet', 'used_CMH',
       'health_insurance', 'doc_asthma', 'doc_overweight', 'doc_arthritis',
       'doc_CHF', 'doc_CHD', 'doc_heart_attack', 'doc_stroke',
       'doc_chronic_bronchitis', 'doc_liver_condition', 'doc_thyroid_problem',
       'doc_cancer', 'difficult_seeing', 'doc_kidney', 'broken_hip',
       'doc_osteoporosis', 'vigorous_activity', 'moderate_activity',
       'doc_sleeping_disorder', 'smoker', 'sexual_orientation',
       'alcoholic','herpes_2', 'HIV', 'doc_HPV','difficult_hearing', 'doc_COPD']

# target binary feature
target = 'depressed'

# multi-class features
cat_encode = ['race_ethnicity', 'edu_level', 'gender', 'marital_status', 'annual_HI','how_healthy_diet',
              'sexual_orientation']


In [6]:
def nan_helper(df):
    """
    The NaN helper

    Parameters
    ----------
    df : dataframe
    
    Returns
    ----------
    The dataframe of variables with NaN (index), 
    raw number missing, and their proportion
    
    """
    
    
    # get the raw number of missing values & sort
    missing = df.isnull().sum().sort_values(ascending=True)
    
    # get the proportion of missing values (%)
    proportion = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=True)
    
    # create table of missing data
    nan_data = pd.concat([missing, proportion], axis=1, keys=['missing', 'proportion'])
    
    return nan_data


def missing_values(df, threshold_col, threshold_row, impute_type):
    """
    Handle Missing Values

    Parameters
    ----------
    df : dataframe
    threshold_col: the proportion of missing values at which  to drop whole column
    threshold_row: the proportion of missing values at which to drop rows
    impute_type: mean or median imputation for continuous variables
    
    Returns
    ----------
    The dataframe without missing values
    
    """
    
    # Dropping Cols and Rows
    # call NaN helper function
    df_nan = nan_helper(df)
        
    # drop columns with higher proportion missing than threshold col
    df = df.drop((df_nan[df_nan['proportion'] > threshold_col]).index,1)
    
    # drop rows with higher proportion missing than threshold row
    df_nan_2 = df_nan[df_nan['proportion']>threshold_row]
    df = df.dropna(subset=np.intersect1d(df_nan_2.index, df.columns),
                           inplace=False)
    

    
    # Imputing values
    # Impute continuous variables with mean 
    if impute_type == 'mean':
        for col in cont:
            if col in df.columns:
                df[col].fillna(df[col].mean(), inplace=True)
    # Impute continuous variables with median
    elif impute_type == 'median':
        for col in cont:
            if col in df.columns:
                df[col].fillna(df[col].median(), inplace=True)
    
    
    # Impute categorical variables with most frequent/mode
    for col in cat:
        if col in df.columns:
            df[col].fillna(df[col].value_counts().index[0], inplace=True)
    

    return df


df_mean = missing_values(df_raw, 0.65, 0.65, "mean")
df_median = missing_values(df_raw, 0.65, 0.65, "median")



In [7]:
nan_data = nan_helper(df_raw)
nan_data.head()

Unnamed: 0,missing,proportion
depressed,0,0.0
race_ethnicity,0,0.0
#_ppl_household,0,0.0
age,0,0.0
gender,0,0.0


## Encoding the data

### Combining the training, validation and testing data
The code below shows how to combine the training, validation and testing data (using pandas concat).

In [8]:
# # Combine df_train, df_valid and df_test
# df = pd.concat([df_train, df_valid, df_test], sort=False)

# # Print the unique dtype of variables in df
# pd.DataFrame(df.dtypes.unique(), columns=['dtype'])

### Encoding the categorical features
The code below shows how to encode the categorical features in the combined data (using pandas.get\_dummies).

In [9]:
# # One-hot-encode the categorical features in the combined data
# df = pd.get_dummies(df, columns=cat_encode)

# # Print the first 5 rows of df
# df.head()

### Encoding the categorical target
The code below shows how to encode the categorical target in the combined data (using sklearn.LabelEncoder).

In [10]:
# from sklearn.preprocessing import LabelEncoder

# # The LabelEncoder
# le = LabelEncoder()

# # Encode the categorical target in the combined data
# df[target] = le.fit_transform(df[target].astype(str))

# # Print the first 5 rows of df
# df.head()

### Separating the training, validation and testing data
The code below shows how to separate the training, validation and testing data.

In [11]:
# # Separating the training data
# df_train = df.iloc[:df_train.shape[0], :].copy(deep=True)

# # Separating the validation data
# df_valid = df.iloc[df_train.shape[0]:df_train.shape[0] + df_valid.shape[0], :].copy(deep=True)

# # Separating the testing data
# df_test = df.iloc[df_train.shape[0] + df_valid.shape[0]:, :].copy(deep=True)

In [12]:
# # Print the dimension of df_train
# pd.DataFrame([[df_train.shape[0], df_train.shape[1]]], columns=['# rows', '# columns'])

In [13]:
# # Print the dimension of df_valid
# pd.DataFrame([[df_valid.shape[0], df_valid.shape[1]]], columns=['# rows', '# columns'])

In [14]:
# # Print the dimension of df_test
# pd.DataFrame([[df_test.shape[0], df_test.shape[1]]], columns=['# rows', '# columns'])

## Scaling the data
The code below shows how to normalize the data (using sklearn MinMaxScaler). 

In [15]:
# from sklearn.preprocessing import MinMaxScaler

# # The MinMaxScaler
# mms = MinMaxScaler()

# # Normalize the training data
# X_train = mms.fit_transform(X_train)

# # Normalize the validation data
# X_valid = mms.transform(X_valid)

# # Normalize the testing data
# X_test = mms.transform(X_test)

# Running Simple Models

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
#from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier

In [17]:
# rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# # perform training
# rf.fit(X_train, y_train)

# y_pred = rf.predict(X_test)

# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

In [18]:
# svm = SVC(kernel='linear', C=1.0, random_state=0)
# s = svm.fit(X_train, y_train)


# y_pred = s.predict(X_test)

# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

In [19]:
# knn = KNeighborsClassifier(n_neighbors=1, p=2,metric='minkowski')
# k = knn.fit(X_train, y_train)

# y_pred = k.predict(X_test)

# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

## Function to compare different types of imputation and their results

In [35]:
# read in the knn imputed data so we do not have to run the function every time
knn_df = pd.read_csv('df_progressive_knn.csv')
knn_df.drop(['SEQN'],axis=1,inplace=True)
knn_df.drop(['year'],axis=1,inplace=True)
#knn_df.head()


mlp_df = pd.read_csv('df_progressive_mlp.csv')
mlp_df.drop(['SEQN'],axis=1,inplace=True)
mlp_df.drop(['year'],axis=1,inplace=True)
mlp_df.head()

Unnamed: 0,depressed,race_ethnicity,edu_level,#_ppl_household,age,gender,marital_status,annual_HI,caffeine,doc_diabetes,...,systolic_BP,diastolic_BP,BMI,waist_C,#meals_fast_food,min_sedetary,doc_HPV,bone_mineral_density,difficult_hearing,doc_COPD
0,0.0,4.0,4.0,4.0,44.0,2.0,1.0,11.0,13.0,0.0,...,144.0,74.0,30.9,96.0,2.093681,398.557696,0.0,0.845891,0.0,0.0
1,0.0,3.0,5.0,2.0,70.0,1.0,1.0,11.0,260.0,1.0,...,138.0,60.0,24.74,96.5,2.093681,384.781692,0.0,0.845891,0.0,0.0
2,0.0,3.0,3.0,2.0,73.0,1.0,1.0,6.0,142.0,0.0,...,130.0,68.0,30.63,117.1,2.093681,382.287784,0.0,0.845891,0.0,0.0
3,0.0,2.0,4.0,3.0,18.0,2.0,5.0,11.0,5.397605e-79,0.0,...,110.0,64.0,29.45,84.0,2.093681,387.8057,0.0,0.845891,0.0,0.0
4,0.0,3.0,4.0,3.0,19.0,1.0,5.0,11.0,5.397605e-79,0.0,...,108.0,62.0,22.57,84.2,2.093681,409.963013,0.0,0.845891,0.0,0.0


In [37]:

def impute_data(df_cleaned, impute_strategy=None, cols_to_standardize=None):
    """
    Impute Data

    Parameters
    ----------
    df_cleaned : dataframe without identifiers
    impute_strategy: mean, median, or progressive_knn imputation
    cols_to_standardize: continous variables
    
    Returns
    ----------
    The dataframe without missing values from chosen imputation
    
    """
    
    
    df = df_cleaned.copy()
    if impute_strategy == 'mean':
        df = missing_values(df, 0.7, 0.7, 'mean')
    elif impute_strategy == 'median':
        df = missing_values(df, 0.7, 0.7, 'mean')
    elif impute_strategy == 'progressive_knn':
        df = knn_df
    elif impute_strategy == 'progressive_mlp':
        df = mlp_df
    else:
        arr = SimpleImputer(missing_values=np.nan,strategy=impute_strategy).fit(
          df.values).transform(df.values)
        df = pd.DataFrame(data=arr, index=df.index.values, columns=df.columns.values)
    
    if cols_to_standardize != None:
        cols_to_standardize = list(set(cols_to_standardize) & set(df.columns.values))
        df[cols_to_standardize] = df[cols_to_standardize].astype('float')
        df[cols_to_standardize] = pd.DataFrame(data=MinMaxScaler().fit(
        df[cols_to_standardize]).transform(df[cols_to_standardize]), 
                                             index=df[cols_to_standardize].index.values,
                                             columns=df[cols_to_standardize].columns.values)
    
    return df


In [38]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from timeit import default_timer as timer
from sklearn.preprocessing import MinMaxScaler

# function for handling missing values 
# and fitting random forest on clean data
def random_forest(data, impute_strategy=None,
                        cols_to_standardize=None,
                        test_size=0.2,
                        random_state=42): 
    """
    Random Forest

    Parameters
    ----------
    data: dataframe
    impute_strategy: call impute_data() function for mean, median, or progressive_knn imputation
    cols_to_standardize: continous variables
    test_size: train-test split proportion
    
    Returns
    ----------
    prints confusion matrix
    train_score, test_score: Accuracy on training and testing set
    reports time elapsed
    
    """
    
    start = timer()
    
    # store original columns
    original_columns = data.columns.difference(['depressed'])
    df_imputed = impute_data(data, impute_strategy, cols_to_standardize)
    train_data, test_data = train_test_split(df_imputed, test_size=test_size,
                                             random_state=random_state)
    
    # note which predictor columns were dropped or kept
    kept_columns = df_imputed.columns.difference(['depressed'])
    dropped_columns = original_columns.difference(df_imputed.columns)
    original_columns = original_columns.difference(['depressed'])
    
    # prepare tensors
    X_train = train_data.drop(columns=['depressed'])
    y_train = train_data['depressed']
    X_test = test_data.drop(columns=['depressed'])
    y_test = test_data['depressed']
    
    # model training
    rf = RandomForestClassifier(class_weight='balanced', random_state=42).fit(
        X_train, y_train)
    
    # model evaluation
    train_score = accuracy_score(y_train, rf.predict(X_train))
    test_score = accuracy_score(y_test, rf.predict(X_test))
    duration = timer() - start
    y_pred = rf.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(np.unique(y_pred))
    #print("Classification rate on training data: {}".format(train_score))
    #print("Classification rate on test data: {}".format(test_score))
    print("Execution time: {}".format(duration))
    
    return {
        'imputation strategy': impute_strategy,
        'standardized': cols_to_standardize!=None,
        'model': rf,
        'train score': train_score,
        'test score': test_score,
        'execution time (s)': duration
    }
  
# list to store models' performance  
rf_results = []

# prepare data
df = df_raw
cols_to_standardize = cont

# fit logistic regression for each imputation strategy
# with and without standardizing features
for impute_strategy in ['mean', 'median', 'progressive_knn', 'progressive_mlp']:
    for cols in [None, cols_to_standardize]:   
        result = random_forest(df, impute_strategy=impute_strategy, cols_to_standardize=cols)
        rf_results.append(result)

# display random forest regression performance
rf_results_df = pd.DataFrame(rf_results)
rf_results_df.drop(['model'], axis=1).drop_duplicates()

[[5800    2]
 [ 468    2]]
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      5802
         1.0       0.50      0.00      0.01       470

    accuracy                           0.93      6272
   macro avg       0.71      0.50      0.48      6272
weighted avg       0.89      0.93      0.89      6272

[0. 1.]
Execution time: 4.581997180000144
[[5800    2]
 [ 467    3]]
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      5802
         1.0       0.60      0.01      0.01       470

    accuracy                           0.93      6272
   macro avg       0.76      0.50      0.49      6272
weighted avg       0.90      0.93      0.89      6272

[0. 1.]
Execution time: 4.532017135999922
[[5800    2]
 [ 468    2]]
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      5802
         1.0       0.50      0.00      0.01       470

    accuracy     

Unnamed: 0,imputation strategy,standardized,train score,test score,execution time (s)
0,mean,False,0.99996,0.925064,4.581997
1,mean,True,0.99996,0.925223,4.532017
2,median,False,0.99996,0.925064,5.185358
3,median,True,0.99996,0.925223,4.916819
4,progressive_knn,False,1.0,0.925223,5.416575
5,progressive_knn,True,1.0,0.925223,5.584785
6,progressive_mlp,False,1.0,0.924585,5.592291
7,progressive_mlp,True,1.0,0.925064,8.39383


In [39]:
# function for handling missing values 
# and fitting knn on clean data
def knn_model(data, impute_strategy=None,
                        cols_to_standardize=None,
                        test_size=0.2,
                        random_state=42):
    """
    K-Nearest Neighbors

    Parameters
    ----------
    data: dataframe
    impute_strategy: call impute_data() function for mean, median, or progressive_knn imputation
    cols_to_standardize: continous variables
    test_size: train-test split proportion
    
    Returns
    ----------
    prints confusion matrix
    train_score, test_score: Accuracy on training and testing set
    reports time elapsed
    
    """
    
    start = timer()
    
    # store original columns
    original_columns = data.columns.difference(['depressed'])
    df_imputed = impute_data(data, impute_strategy, cols_to_standardize)
    train_data, test_data = train_test_split(df_imputed, test_size=test_size,
                                             random_state=random_state)
    
    # note which predictor columns were dropped or kept
    kept_columns = df_imputed.columns.difference(['depressed'])
    dropped_columns = original_columns.difference(df_imputed.columns)
    original_columns = original_columns.difference(['depressed'])
    
    # prepare tensors
    X_train = train_data.drop(columns=['depressed'])
    y_train = train_data['depressed']
    X_test = test_data.drop(columns=['depressed'])
    y_test = test_data['depressed']
    
    # model training
    knn = KNeighborsClassifier(n_neighbors=3, p=2,metric='minkowski').fit(
        X_train, y_train)
    
    # model evaluation
    train_score = accuracy_score(y_train, knn.predict(X_train))
    test_score = accuracy_score(y_test, knn.predict(X_test))
    duration = timer() - start
    y_pred = knn.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    # print(classification_report(y_test, y_pred))
    #print("Classification rate on training data: {}".format(train_score))
    #print("Classification rate on test data: {}".format(test_score))
    print("Execution time: {}".format(duration))
    
    return {
        'imputation strategy': impute_strategy,
        'standardized': cols_to_standardize!=None,
        'model': knn,
        'train score': train_score,
        'test score': test_score,
        'execution time (s)': duration
    }
  
# list to store models' performance  
knn_results = []

# prepare data
df = df_raw
cols_to_standardize = cont

# fit logistic regression for each imputation strategy
# with and without standardizing features
for impute_strategy in ['mean', 'median', 'progressive_knn', 'progressive_mlp']:
    for cols in [None, cols_to_standardize]:   
        result = knn_model(df, impute_strategy=impute_strategy, cols_to_standardize=cols)
        knn_results.append(result)

# display knn performance
knn_results_df = pd.DataFrame(knn_results)
knn_results_df.drop(['model'], axis=1).drop_duplicates()

[[5709   93]
 [ 463    7]]
Execution time: 10.90446405900002
[[5737   65]
 [ 442   28]]
Execution time: 34.70661648999976
[[5709   93]
 [ 463    7]]
Execution time: 6.706074951000119
[[5737   65]
 [ 442   28]]
Execution time: 30.20394973099974
[[5740   62]
 [ 446   24]]
Execution time: 25.22012957900006
[[5740   62]
 [ 446   24]]
Execution time: 31.889374418999978
[[5733   69]
 [ 442   28]]
Execution time: 23.235407035000208
[[5733   69]
 [ 442   28]]
Execution time: 23.003710847000548


Unnamed: 0,imputation strategy,standardized,train score,test score,execution time (s)
0,mean,False,0.931951,0.911352,10.904464
1,mean,True,0.93813,0.919165,34.706616
2,median,False,0.931951,0.911352,6.706075
3,median,True,0.93813,0.919165,30.20395
4,progressive_knn,False,0.937851,0.919005,25.22013
5,progressive_knn,True,0.937851,0.919005,31.889374
6,progressive_mlp,False,0.938489,0.918527,23.235407
7,progressive_mlp,True,0.938489,0.918527,23.003711


In [40]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from timeit import default_timer as timer
from sklearn.preprocessing import MinMaxScaler

# function for handling missing values 
# and fitting logistic regression on clean data
def NB_model(data, impute_strategy=None,
                        cols_to_standardize=None,
                        test_size=0.2,
                        random_state=42):
    """
    K-Nearest Neighbors

    Parameters
    ----------
    data: dataframe
    impute_strategy: call impute_data() function for mean, median, or progressive_knn imputation
    cols_to_standardize: continous variables
    test_size: train-test split proportion
    
    Returns
    ----------
    prints confusion matrix
    train_score, test_score: Accuracy on training and testing set
    reports time elapsed
    
    """
    
    start = timer()
    
    # store original columns
    original_columns = data.columns.difference(['depressed'])
    df_imputed = impute_data(data, impute_strategy, cols_to_standardize)
    train_data, test_data = train_test_split(df_imputed, test_size=test_size,
                                             random_state=random_state)
    
    # note which predictor columns were dropped or kept
    kept_columns = df_imputed.columns.difference(['depressed'])
    dropped_columns = original_columns.difference(df_imputed.columns)
    original_columns = original_columns.difference(['depressed'])
    
    # prepare tensors
    X_train = train_data.drop(columns=['depressed'])
    y_train = train_data['depressed']
    X_test = test_data.drop(columns=['depressed'])
    y_test = test_data['depressed']
    
    # model training
    nbc = GaussianNB().fit(
        X_train, y_train)
    
    # model evaluation
    train_score = accuracy_score(y_train, nbc.predict(X_train))
    test_score = accuracy_score(y_test, nbc.predict(X_test))
    duration = timer() - start
    y_pred = nbc.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    # print(classification_report(y_test, y_pred))
    #print("Classification rate on training data: {}".format(train_score))
    #print("Classification rate on test data: {}".format(test_score))
    print("Execution time: {}".format(duration))
    
    return {
        'imputation strategy': impute_strategy,
        'standardized': cols_to_standardize!=None,
        'model': nbc,
        'train score': train_score,
        'test score': test_score,
        'execution time (s)': duration
    }
  
# list to store models' performance  
nbc_results = []

# prepare data
df = df_raw
cols_to_standardize = cont

# fit logistic regression for each imputation strategy
# with and without standardizing features
for impute_strategy in ['mean', 'median', 'progressive_knn', 'progressive_mlp']:
    for cols in [None, cols_to_standardize]:   
        result = NB_model(df, impute_strategy=impute_strategy, cols_to_standardize=cols)
        nbc_results.append(result)

# display logistic regression performance
nbc_results_df = pd.DataFrame(nbc_results)
nbc_results_df.drop(['model'], axis=1).drop_duplicates()

[[4780 1022]
 [ 257  213]]
Execution time: 0.2950997690004442
[[4779 1023]
 [ 257  213]]
Execution time: 0.1727525220003372
[[4780 1022]
 [ 257  213]]
Execution time: 0.17494639300002746
[[4779 1023]
 [ 257  213]]
Execution time: 0.18341273399983038
[[4787 1015]
 [ 261  209]]
Execution time: 0.0908743569998478
[[4787 1015]
 [ 261  209]]
Execution time: 0.1108400069997515
[[4796 1006]
 [ 261  209]]
Execution time: 0.07934491800006072
[[4796 1006]
 [ 261  209]]
Execution time: 0.12824778300000617


Unnamed: 0,imputation strategy,standardized,train score,test score,execution time (s)
0,mean,False,0.803947,0.796078,0.2951
1,mean,True,0.803867,0.795918,0.172753
2,median,False,0.803947,0.796078,0.174946
3,median,True,0.803867,0.795918,0.183413
4,progressive_knn,False,0.803428,0.796556,0.090874
5,progressive_knn,True,0.803428,0.796556,0.11084
6,progressive_mlp,False,0.807375,0.797991,0.079345
7,progressive_mlp,True,0.807375,0.797991,0.128248


In [41]:
from sklearn.linear_model import Perceptron

# function for handling missing values 
# and fitting logistic regression on clean data
def ppn_model(data, impute_strategy=None,
                        cols_to_standardize=None,
                        test_size=0.2,
                        random_state=42):
    """
    Simple Perceptron Model

    Parameters
    ----------
    data: dataframe
    impute_strategy: call impute_data() function for mean, median, or progressive_knn imputation
    cols_to_standardize: continous variables
    test_size: train-test split proportion
    
    Returns
    ----------
    prints confusion matrix
    train_score, test_score: Accuracy on training and testing set
    reports time elapsed
    
    """
    
    start = timer()
    
    # store original columns
    original_columns = data.columns.difference(['depressed'])
    df_imputed = impute_data(data, impute_strategy, cols_to_standardize)
    train_data, test_data = train_test_split(df_imputed, test_size=test_size,
                                             random_state=random_state)
    
    # note which predictor columns were dropped or kept
    kept_columns = df_imputed.columns.difference(['depressed'])
    dropped_columns = original_columns.difference(df_imputed.columns)
    original_columns = original_columns.difference(['depressed'])
    
    # prepare tensors
    X_train = train_data.drop(columns=['depressed'])
    y_train = train_data['depressed']
    X_test = test_data.drop(columns=['depressed'])
    y_test = test_data['depressed']
    
    # model training
    ppn = Perceptron(max_iter=40, eta0=0.1, random_state=0).fit(X_train, y_train)
    
    # model evaluation
    train_score = accuracy_score(y_train, ppn.predict(X_train))
    test_score = accuracy_score(y_test, ppn.predict(X_test))
    duration = timer() - start
    y_pred = ppn.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    # print(classification_report(y_test, y_pred))
    #print("Classification rate on training data: {}".format(train_score))
    #print("Classification rate on test data: {}".format(test_score))
    print("Execution time: {}".format(duration))
    
    return {
        'imputation strategy': impute_strategy,
        'standardized': cols_to_standardize!=None,
        'model': ppn,
        'train score': train_score,
        'test score': test_score,
        'execution time (s)': duration
    }
  
# list to store models' performance  
ppn_results = []

# prepare data
df = df_raw
cols_to_standardize = cont

# fit logistic regression for each imputation strategy
# with and without standardizing features
for impute_strategy in ['mean', 'median', 'progressive_knn', 'progressive_mlp']:
    for cols in [None, cols_to_standardize]:   
        result = ppn_model(df, impute_strategy=impute_strategy, cols_to_standardize=cols)
        ppn_results.append(result)

# display logistic regression performance
ppn_results_df = pd.DataFrame(ppn_results)
ppn_results_df.drop(['model'], axis=1).drop_duplicates()

[[5558  244]
 [ 434   36]]
Execution time: 0.2578275019995999
[[5800    2]
 [ 468    2]]
Execution time: 0.18169172200032335
[[5558  244]
 [ 434   36]]
Execution time: 0.20551894300024287
[[5800    2]
 [ 468    2]]
Execution time: 0.17338147800001025
[[5795    7]
 [ 468    2]]
Execution time: 0.0916129320003165
[[5795    7]
 [ 468    2]]
Execution time: 0.10219282300022314
[[5456  346]
 [ 338  132]]
Execution time: 0.0716464970000743
[[5456  346]
 [ 338  132]]
Execution time: 0.11942043199996988


Unnamed: 0,imputation strategy,standardized,train score,test score,execution time (s)
0,mean,False,0.892486,0.891901,0.257828
1,mean,True,0.925095,0.925064,0.181692
2,median,False,0.892486,0.891901,0.205519
3,median,True,0.925095,0.925064,0.173381
4,progressive_knn,False,0.925334,0.924267,0.091613
5,progressive_knn,True,0.925334,0.924267,0.102193
6,progressive_mlp,False,0.895914,0.890944,0.071646
7,progressive_mlp,True,0.895914,0.890944,0.11942


In [31]:
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [42]:
from sklearn.linear_model import Perceptron

# function for handling missing values 
# and fitting logistic regression on clean data
def keras_model(data, impute_strategy=None,
                        cols_to_standardize=None,
                        test_size=0.2,
                        random_state=42):
    """
    Keras Perceptron Model

    Parameters
    ----------
    data: dataframe
    impute_strategy: call impute_data() function for mean, median, or progressive_knn imputation
    cols_to_standardize: continous variables
    test_size: train-test split proportion
    
    Returns
    ----------
    prints confusion matrix
    train_score, test_score: Accuracy on training and testing set
    reports time elapsed
    
    """
    
    start = timer()
    batch_size = 128
    num_classes = 10
    epochs = 1
    
    # store original columns
    original_columns = data.columns.difference(['depressed'])
    df_imputed = impute_data(data, impute_strategy, cols_to_standardize)
    train_data, test_data = train_test_split(df_imputed, test_size=test_size,
                                             random_state=random_state)
    
    # note which predictor columns were dropped or kept
    kept_columns = df_imputed.columns.difference(['depressed'])
    dropped_columns = original_columns.difference(df_imputed.columns)
    original_columns = original_columns.difference(['depressed'])
    
    # prepare tensors
    X_train = train_data.drop(columns=['depressed'])
    y_train = train_data['depressed']
    X_test = test_data.drop(columns=['depressed'])
    y_test = test_data['depressed']
    
    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(len(kept_columns),)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(X_test, y_test))
    
    score = model.evaluate(X_test, y_test, verbose=0)
    
    y_pred = model.predict(X_test)
    # model evaulation
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    
    return {
        'imputation strategy': impute_strategy,
        'standardized': cols_to_standardize!=None,
        'model': keras_model,
        'Test loss' :  score[0],
        'Test accuracy' : score[1]
    }
  
# list to store models' performance  
keras_results = []

# prepare data
df = df_raw
cols_to_standardize = cont

# fit logistic regression for each imputation strategy
# with and without standardizing features
for impute_strategy in ['mean', 'median', 'progressive_knn', 'progressive_mlp']:
    for cols in [None, cols_to_standardize]:   
        result = keras_model(df, impute_strategy=impute_strategy, cols_to_standardize=cols)
        keras_results.append(result)

# display logistic regression performance
keras_results_df = pd.DataFrame(keras_results)
keras_results_df.drop(['model'], axis=1).drop_duplicates()

Train on 25085 samples, validate on 6272 samples
Epoch 1/1
Test loss: 0.28155736009381255
Test accuracy: 0.9250637888908386
Train on 25085 samples, validate on 6272 samples
Epoch 1/1
Test loss: 0.23096365437899924
Test accuracy: 0.9249043464660645
Train on 25085 samples, validate on 6272 samples
Epoch 1/1
Test loss: 0.29969514127136493
Test accuracy: 0.9250637888908386
Train on 25085 samples, validate on 6272 samples
Epoch 1/1
Test loss: 0.2470599681291045
Test accuracy: 0.9180484414100647
Train on 25085 samples, validate on 6272 samples
Epoch 1/1
Test loss: 0.23849957213946144
Test accuracy: 0.9245854616165161
Train on 25085 samples, validate on 6272 samples
Epoch 1/1
Test loss: 0.2332398233441066
Test accuracy: 0.9244260191917419
Train on 25085 samples, validate on 6272 samples
Epoch 1/1
Test loss: 0.236212224509491
Test accuracy: 0.9242665767669678
Train on 25085 samples, validate on 6272 samples
Epoch 1/1
Test loss: 0.25965843486542606
Test accuracy: 0.920918345451355


Unnamed: 0,imputation strategy,standardized,Test loss,Test accuracy
0,mean,False,0.281557,0.925064
1,mean,True,0.230964,0.924904
2,median,False,0.299695,0.925064
3,median,True,0.24706,0.918048
4,progressive_knn,False,0.2385,0.924585
5,progressive_knn,True,0.23324,0.924426
6,progressive_mlp,False,0.236212,0.924267
7,progressive_mlp,True,0.259658,0.920918


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from timeit import default_timer as timer
from sklearn.preprocessing import MinMaxScaler

# function for handling missing values 
# and fitting logistic regression on clean data
def svm_model(data, impute_strategy=None,
                        cols_to_standardize=None,
                        test_size=0.2,
                        random_state=42):
    """
    SVM

    Parameters
    ----------
    data: dataframe
    impute_strategy: call impute_data() function for mean, median, or progressive_knn imputation
    cols_to_standardize: continous variables
    test_size: train-test split proportion
    
    Returns
    ----------
    prints confusion matrix
    train_score, test_score: Accuracy on training and testing set
    reports time elapsed
    
    """
    
    start = timer()
    
    # store original columns
    original_columns = data.columns.difference(['depressed'])
    df_imputed = impute_data(data, impute_strategy, cols_to_standardize)
    train_data, test_data = train_test_split(df_imputed, test_size=test_size,
                                             random_state=random_state)
    
    # note which predictor columns were dropped or kept
    kept_columns = df_imputed.columns.difference(['depressed'])
    dropped_columns = original_columns.difference(df_imputed.columns)
    original_columns = original_columns.difference(['depressed'])
    
    # prepare tensors
    X_train = train_data.drop(columns=['depressed'])
    y_train = train_data['depressed']
    X_test = test_data.drop(columns=['depressed'])
    y_test = test_data['depressed']
    
    # model training
    svm = SVC(kernel='linear', class_weight='balanced', # penalize
            probability=True, random_state=1, gamma=0.2, C=1.0).fit(X_train, y_train)
    
    # model evaluation
    train_score = accuracy_score(y_train, svm.predict(X_train))
    test_score = accuracy_score(y_test, svm.predict(X_test))
    duration = timer() - start
    y_pred = svm.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    # print(classification_report(y_test, y_pred))
    #print("Classification rate on training data: {}".format(train_score))
    #print("Classification rate on test data: {}".format(test_score))
    print("Execution time: {}".format(duration))
    
    return {
        'imputation strategy': impute_strategy,
        'standardized': cols_to_standardize!=None,
        'model': ppn,
        'train score': train_score,
        'test score': test_score,
        'execution time (s)': duration
    }
  
# list to store models' performance  
svm_results = []

# prepare data
df = df_raw
cols_to_standardize = cont

# fit logistic regression for each imputation strategy
# with and without standardizing features
for impute_strategy in ['mean', 'median', 'progressive_knn']:
    for cols in [None, cols_to_standardize]:   
        result = svm_model(df, impute_strategy=impute_strategy, cols_to_standardize=cols)
        svm_results.append(result)

# display logistic regression performance
svm_results_df = pd.DataFrame(svm_results)
svm_results_df.drop(['model'], axis=1).drop_duplicates()

In [None]:
# best RF model for feature importance
# {'model__min_samples_leaf': 1, 'model__min_samples_split': 2}


In [None]:
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=2)
# generate a no skill prediction (majority class)
ns_probs = [0 for _ in range(len(testy))]
# fit a model
model = LogisticRegression(solver='lbfgs')
model.fit(trainX, trainy)
# predict probabilities
lr_probs = model.predict_proba(testX)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(testy, ns_probs)
lr_auc = roc_auc_score(testy, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(testy, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(testy, lr_probs)
# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()