In [249]:
import midus_varsum as mvs
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Gather the names of every variable returned by the 'pressure' and 'bp'
# searches (presumably label searches -- see midus_varsum.labsearch).
# pd.concat is used instead of DataFrame.append, which was removed in
# pandas 2.0; the stacked result and fresh 0..n-1 index are the same.
bp_vars = pd.concat([mvs.labsearch('pressure'), mvs.labsearch('bp')],
                    ignore_index=True)['Variable']
# NOTE(review): absolute, user-specific path -- the notebook will not run on
# another machine until this points at a local copy of the data file.
tpath = '/Users/alex/Documents/alexdatasci/data_files/MIDUS_1/ICPSR_02760' \
    + '/DS0001/02760-0001-Data.dta'
# Read only the selected columns from the Stata file.
bp_df = pd.read_stata(tpath,columns=bp_vars)
bp_df.head()
# Code the dependent variable from A1PA30 and A1PA31:
#   1   -> A1PA30 <= 12 and A1PA31 is 'SLIGHTLY RAISED' or 'HIGH'
#   NaN -> A1PA30 is 'DONT KNOW', 'NEVER', or greater than 12
#   0   -> every remaining row
# The outer condition wins when both would match because the inner np.where
# only supplies values for rows where the outer condition is False.
high_bp = np.where(
    (bp_df['A1PA30'] <= 12)
    & ((bp_df['A1PA31'] == 'SLIGHTLY RAISED') | (bp_df['A1PA31'] == 'HIGH')),
    1,
    np.where(
        (bp_df['A1PA30'] == 'DONT KNOW')
        | (bp_df['A1PA30'] == 'NEVER')
        | (bp_df['A1PA30'] > 12),
        np.nan,
        0,
    ),
)
# Tabulate the coded outcome, counting NaN as its own level.
pd.Series(high_bp).value_counts(dropna=False)
# Names of the constructed scale variables (these are left out of the
# predictors); rows of scaledf without a variable name are skipped.
scalevars = mvs.scaledf['Varname'][~mvs.scaledf['Varname'].isna()]
# Survey topics whose variables are used as predictors
topics = ['Life Satisfaction', 'Personal Beliefs', 'Work', 'Finances', 
          'Community Involvement', 'Your Neighborhood', 
          'Social Networks', 'Life Overall']
# varbytopic is a function I wrote that returns variables by topic by parsing XML
topicvars = [mvs.varbytopic(i)['Variable'] for i in topics]
# Flatten the per-topic lists into one Series of variable names
topicvars = pd.Series([i for j in topicvars for i in j])
# Drop the constructed scale variables
topicvars = topicvars[~topicvars.isin(scalevars)]
# Add two more variables by name. pd.concat replaces Series.append, which was
# removed in pandas 2.0; ignore_index=True renumbers the result 0..n-1 as before.
topicvars = pd.concat([topicvars, pd.Series(['A1PAGE_M2','A1PRSEX'])],
                      ignore_index=True)
# Import data from the STATA file.
df = pd.read_stata(tpath,columns=topicvars)
# Add the high_bp DV
df['high_bp'] = high_bp
# Get shape
df.shape


(7108, 475)

In [519]:
# Function for making nan_replace dict
def nan_replace(nan_labels=[],add_defaults=True):
    '''Build a dict whose keys are missing-data labels and whose values
    are all np.nan.

    nan_labels: extra labels to treat as missing. They are added after the
    defaults, or are the only keys when add_defaults is not True.

    add_defaults: when True, start from the built-in labels below. Each one
    is included as written, fully upper-cased, and with every
    whitespace-separated word capitalized (rest of the word lower-cased):
    ['missing', 'refused/missing', 'refused', 'inapp',
     'inapplicable', 'na', 'n/a', 'unknown', 'dont know',
     "don't know", 'does not apply', "doesn't apply",
     'not applicable', 'Refused/Missing', 'not calculated',
     'missing data']

    Duplicate labels collapse into a single key.'''
    labels = []
    if add_defaults==True:
        base_labels = ['missing', 'refused/missing', 'refused', 'inapp',
                       'inapplicable', 'na', 'n/a', 'unknown', 'dont know',
                       "don't know", 'does not apply', "doesn't apply",
                       'not applicable', 'Refused/Missing', 'not calculated',
                       'missing data']
        # As-written forms first, then the upper-cased forms ...
        cased_labels = base_labels + [label.upper() for label in base_labels]
        # ... then a word-capitalized form of every label collected so far.
        labels = cased_labels + [
            " ".join(word.capitalize() for word in label.split())
            for label in cased_labels
        ]
    labels.extend(nan_labels)
    # Keys keep first-seen order; every key maps to the same np.nan value.
    return dict.fromkeys(labels, np.nan)

# Function for removing categories included in nan_replace
def drop_nan_cats(series,nan_dict):
    '''Remove from a categorical Series every category listed in nan_dict.

    series: pandas Series with category dtype.

    nan_dict: dict whose keys are the labels to remove (see nan_replace);
    only the keys are used.

    Returns a new Series without those categories. Values that belonged to
    a removed category become NaN. The input Series is not modified.

    Raises TypeError if series does not have a category dtype.'''
    # The previous guard evaluated `series.dtype=='category'` and discarded
    # the result, so it never rejected anything; check the dtype explicitly.
    if not isinstance(getattr(series, 'dtype', None),
                      pd.api.types.CategoricalDtype):
        raise TypeError('MUST BE CATEGORY')
    categories = series.cat.categories
    # Boolean mask over the categories, True where the label marks missing data
    is_missing_label = categories.isin(list(nan_dict))
    return series.cat.remove_categories(categories[is_missing_label])

# Get usable df with options
def make_x(df,
           convert_ordered_cats=True,
           ordered_cats_exceptions=[],
           scale_continuous=True,
           drop_na=True
          ):
    '''Returns pandas dataframe with only continuous and dummy variables
    as specified in args. Rows keep the index of df.

    Columns are routed by dtype:
    - integer/float columns (any width, e.g. int8 or float32) with exactly
      two distinct non-missing values are passed through as dummies; the
      other integer/float columns are treated as continuous
    - object columns, and any column named in ordered_cats_exceptions, are
      dummy coded
    - the remaining category columns first lose their missing-data
      categories (see nan_replace / drop_nan_cats); those left with more
      than two categories are handled per convert_ordered_cats, the rest
      are dummy coded
    Dummy coding drops the first level of each variable.

    convert_ordered_cats: if true, categories are recoded to their
    existing integer codes (missing -> NaN); otherwise they are dummy
    coded

    ordered_cats_exceptions: variable names of categorical variables
    that should be dummy coded (only applicable if convert_ordered_cats=
    True)

    scale_continuous: converts continuous variables to z-scores (including
    the recoded ordered categories when convert_ordered_cats is true)

    drop_na: drop rows with any nan'''

    # Classify columns once, as plain boolean arrays so that frames with no
    # column of a given kind are handled without special cases.
    col_dtypes = list(df.dtypes)
    # Matching only 'float'/'int' would keep float64/int64 and silently drop
    # narrower numeric columns, so test the dtype kind instead.
    is_numeric = np.array(
        [pd.api.types.is_integer_dtype(t) or pd.api.types.is_float_dtype(t)
         for t in col_dtypes], dtype=bool)
    is_object = np.array(
        [pd.api.types.is_object_dtype(t) for t in col_dtypes], dtype=bool)
    is_category = np.array(
        [isinstance(t, pd.api.types.CategoricalDtype) for t in col_dtypes],
        dtype=bool)
    force_dummy = np.asarray(df.columns.isin(ordered_cats_exceptions),
                             dtype=bool)

    # Prelim subsets
    numeric_df = df.loc[:, is_numeric]
    ordinal_src = df.loc[:, is_category & ~force_dummy]
    dummy_src = df.loc[:, force_dummy | is_object]

    # Two-valued numeric columns are already dummies; the rest are continuous
    numeric_levels = _level_counts(numeric_df)
    binary_numeric = numeric_df.loc[:, numeric_levels == 2]
    continuous_numeric = numeric_df.loc[:, numeric_levels != 2]

    # Strip missing-data categories; the label map is built once, not per column
    nan_map = nan_replace()
    if ordinal_src.shape[1] > 0:
        ordinal_clean = ordinal_src.apply(
            lambda col: drop_nan_cats(col, nan_map), axis=0)
    else:
        ordinal_clean = ordinal_src
    # value_counts on a categorical column has one entry per category
    multi_level = _level_counts(ordinal_clean) > 2
    ordinal_multi = ordinal_clean.loc[:, multi_level]

    # Everything that gets dummy coded, followed by the pass-through dummies
    dummy_df = pd.concat([dummy_src, ordinal_clean.loc[:, ~multi_level]],
                         axis=1)
    dummy_df = dummy_df.dropna(axis=1, how='all')
    dummy_df = pd.get_dummies(dummy_df, prefix=dummy_df.columns,
                              drop_first=True)
    dummy_df = pd.concat([dummy_df, binary_numeric], axis=1)

    if convert_ordered_cats:
        if ordinal_multi.shape[1] > 0:
            # Code -1 marks a missing value in a categorical; turn it into NaN
            ordinal_codes = ordinal_multi.apply(
                lambda col: col.cat.codes, axis=0).replace(-1, np.nan)
        else:
            ordinal_codes = ordinal_multi
        continuous_df = pd.concat([continuous_numeric, ordinal_codes],
                                  axis=1)
        if scale_continuous:
            continuous_df = _zscore_frame(continuous_df)
    else:
        ordinal_dummies = pd.get_dummies(ordinal_multi,
                                         prefix=ordinal_multi.columns,
                                         drop_first=True)
        if scale_continuous:
            continuous_numeric = _zscore_frame(continuous_numeric)
        continuous_df = pd.concat([continuous_numeric, ordinal_dummies],
                                  axis=1)

    output = pd.concat([continuous_df, dummy_df], axis=1)
    if drop_na:
        output = output.dropna(axis=0)
    return output


def _level_counts(frame):
    '''Number of entries in value_counts() for each column of frame, as an
    int array in column order (empty array when frame has no columns).'''
    return np.array([len(frame.iloc[:, i].value_counts())
                     for i in range(frame.shape[1])], dtype=int)


def _zscore_frame(frame):
    '''Standardize the columns of frame with sklearn's preprocessing.scale,
    keeping its index and column labels; an empty frame is returned as is.'''
    if frame.empty:
        return frame
    # Re-attaching the original index matters: the caller concatenates the
    # result column-wise with frames that still carry df's index, and a fresh
    # 0..n-1 index would misalign rows whenever df is indexed differently.
    return pd.DataFrame(preprocessing.scale(frame),
                        index=frame.index,
                        columns=frame.columns)

In [520]:
# Build the predictor matrix with the multi-level categoricals dummy coded
# instead of converted to integer codes (convert_ordered_cats=False), and
# keep rows that contain NaN (drop_na=False).
make_x(df,convert_ordered_cats=False,drop_na=False)

Unnamed: 0,A1PD1_SOMEWHAT,A1PD1_A LITTLE,A1PD1_NOT AT ALL,A1PD2_SOMEWHAT,A1PD2_A LITTLE,A1PD2_NOT AT ALL,A1PD3_SOMEWHAT,A1PD3_A LITTLE,A1PD3_NOT AT ALL,A1PD8_VERY GOOD,...,A1SM21H2_NO,A1SM21H3_NO,A1SM21I1_NO,A1SM21I2_NO,A1SM21I3_NO,A1SM21J1_NO,A1SM21J2_NO,A1SM21J3_NO,A1PRSEX_FEMALE,high_bp
0,0,0,0,0,0,0,0,0,0,1,...,1,1,1,1,0,1,1,0,0,1.0
1,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,1,0,1,0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,1,0.0
4,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,1,0,0,1,1,1.0
5,1,0,0,1,0,0,1,0,0,0,...,1,1,0,1,0,0,1,1,1,0.0
6,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,0,1,0,1.0
7,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0.0
8,1,0,0,1,0,0,1,0,0,0,...,1,1,1,1,1,1,1,1,0,
9,0,0,0,0,0,0,1,0,0,0,...,1,0,1,0,0,1,1,0,1,0.0
