In [1]:
import numpy as np
import pandas as pd

In [4]:
# Load the topic-modeling table; the path is relative to the notebook's
# working directory.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the listing below
# look like index columns left over from earlier to_csv round-trips -- confirm
# before relying on them.
topics_df = pd.read_csv('../../DataPlus/dvd_topic_modeling_with_distributions.csv')

In [5]:
# Show every column name so the available features can be inspected.
topics_df.columns.tolist()

['Unnamed: 0',
 'Unnamed: 0.1',
 'Tx3',
 'Advice1',
 'Anx11',
 'Anx111',
 'Anx112',
 'Anx113',
 'Anx12',
 'Anx13',
 'Anx51',
 'Anx52',
 'Anx53',
 'Anx61',
 'Anx62',
 'Anx63',
 'Anx71',
 'Anx72',
 'Anx73',
 'Anx91',
 'Anx92',
 'Anx93',
 'Ask3',
 'Clear3',
 'DA1',
 'Dienot2',
 'Doc1',
 'Explain3',
 'Finaltx3',
 'ID',
 'Involve3',
 'MD_age',
 'MD_gender',
 'MD_number_pts_wk',
 'MD_percentpts',
 'MD_race',
 'MD_specialty',
 'MD_type',
 'MD_yrgrad',
 'Mdtxrec3',
 'Opinion3',
 'Raded2',
 'Radpee2',
 'Satis3',
 'Study',
 'Sured2',
 'Surpee2',
 'Talkda3',
 'Timeda2',
 'TxgotTx3cc',
 'Txlean3',
 'Wait2',
 'Whyww2',
 'Wwed2',
 'Wwpee2',
 'age',
 'arabme',
 'asian',
 'black',
 'education',
 'gleason',
 'hispanic',
 'irespmd',
 'marry',
 'mdrespme',
 'native',
 'pacific',
 'psa1',
 'raceother',
 'txgot',
 'txgot_binary',
 'white',
 'Convo_1',
 'Convo_2',
 'Doctor_1',
 'Doctor_2',
 'Convo_1_lemmatized',
 'Convo_1_corp',
 'radiation_topic',
 'active_surveillance_topic',
 'appt_topic',
 'surgery_topi

## Education

In [3]:
def edu_helper(x):
    """Map a numeric education code to a two-level label.

    Parameters
    ----------
    x : float
        Numeric education code; NaN marks a missing value.

    Returns
    -------
    str or float
        NaN when ``x`` is NaN, "College Degree" when ``x`` is strictly
        greater than 6, otherwise "No College Degree".
    """
    # Missing codes stay missing instead of being forced into a category.
    if np.isnan(x):
        return np.nan
    # NOTE(review): presumably codes above 6 denote a completed college
    # degree -- confirm against the survey codebook.
    return "College Degree" if x > 6 else "No College Degree"

In [6]:
def edu_binary(df):
    """Return a copy of ``df`` with an added 'edu_binary' column.

    The new column is produced by applying ``edu_helper`` to every value of
    the 'education' column. The input frame is left unmodified.
    """
    labels = df['education'].map(edu_helper)
    # assign() works on a copy, so the caller's frame is not touched.
    return df.assign(edu_binary=labels)

## Race

In [7]:
# combines race into one variable
def combine_race(df):
    """Collapse the one-hot race indicator columns into one label column.

    For each row, the first indicator column (in the fixed order listed in
    the body) whose value equals 1 supplies the label, which is that column's
    name. Rows where no indicator equals 1 receive NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'arabme', 'asian', 'black', 'hispanic',
        'native', 'pacific', 'white' and 'raceother'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added 'combined_race' column (object dtype).

    Raises
    ------
    KeyError
        If any of the indicator columns is missing.
    """
    race_cols = ['arabme', 'asian', 'black', 'hispanic', 'native', 'pacific', 'white', 'raceother']
    df = df.copy()

    # Work on a positional boolean matrix so the result does not depend on
    # the frame's index labels; df[col][i] is a label lookup and fails on
    # frames whose index is not 0..n-1 (e.g. after filtering).
    flags = df[race_cols].eq(1).to_numpy()

    # argmax on a boolean row returns the position of the first True, which
    # preserves the "first listed race wins" rule.
    labels = [race_cols[row.argmax()] if row.any() else np.nan for row in flags]

    # dtype=object keeps NaN as a real missing value; letting NumPy infer the
    # dtype of a mixed str/NaN list would turn NaN into the string 'nan'.
    df['combined_race'] = np.array(labels, dtype=object)
    return df

In [8]:
# because minority is sparse, we create a binary caucasian variable
def white_binary(df):
    """Add a 'white_binary' column derived from 'combined_race'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'combined_race' column holding race labels (strings)
        or missing values.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added 'white_binary' column: 'White' where
        'combined_race' is 'white', 'Not White' for any other label, and NaN
        where the race is missing.

    Raises
    ------
    KeyError
        If 'combined_race' is missing.
    """
    def label(value):
        # Non-string values are missing. The literal 'nan' is also treated as
        # missing: it is what NumPy produces when a NaN sits in a list next
        # to strings and the list is converted without dtype=object.
        if not isinstance(value, str) or value == 'nan':
            return np.nan
        return 'White' if value == 'white' else 'Not White'

    df = df.copy()
    # Series.map is index-aligned, so this works for any index, unlike
    # positional access through race_col[i].
    df['white_binary'] = df['combined_race'].map(label)
    return df

## Marriage

In [9]:
def marry_helper(x):
    """Map a numeric marital-status code to a two-level label.

    Parameters
    ----------
    x : float
        Numeric marital-status code; NaN marks a missing value.

    Returns
    -------
    str or float
        NaN when ``x`` is NaN, 'Married' when ``int(x)`` equals 1, otherwise
        'Not Married'.
    """
    # Missing codes stay missing instead of being forced into a category.
    if np.isnan(x):
        return np.nan
    # int() truncates, so any value in [1, 2) counts as code 1.
    return 'Married' if int(x) == 1 else 'Not Married'

In [10]:
def marry_binary(df):
    """Return a copy of ``df`` with an added 'marry_binary' column.

    The new column is produced by applying ``marry_helper`` to every value of
    the 'marry' column. The input frame is left unmodified.
    """
    labels = df['marry'].map(marry_helper)
    # assign() works on a copy, so the caller's frame is not touched.
    return df.assign(marry_binary=labels)

## Create Dataframe

In [12]:
# Ordered feature-engineering steps; each takes a DataFrame and returns one.
# white_binary reads the 'combined_race' column that combine_race adds, so
# combine_race has to stay ahead of it in this list.
PIPELINE = [edu_binary, combine_race, white_binary, marry_binary]

In [14]:
# create argument determines whether dataframe is exported
def feature_engineer(df, steps, create=False, output_path='../../DataPlus/topics_dataframe.csv'):
    """Run a sequence of transformation steps over a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied first and never modified.
    steps : iterable of callables
        Each callable receives the frame returned by the previous step and
        must return a DataFrame. Steps run in iteration order.
    create : bool, default False
        When True, the final frame is also written to ``output_path`` as CSV.
    output_path : str or path-like, default '../../DataPlus/topics_dataframe.csv'
        Destination of the CSV export. The default is relative to the current
        working directory.

    Returns
    -------
    pandas.DataFrame
        The frame produced by the last step (a copy of ``df`` when ``steps``
        is empty).
    """
    result = df.copy()

    for step in steps:
        result = step(result)

    if create:
        # NOTE(review): to_csv writes the index by default, which shows up as
        # an 'Unnamed: 0' column when the file is read back without
        # index_col -- confirm whether downstream readers expect that.
        result.to_csv(output_path)

    return result

In [16]:
# Uncomment to run the full pipeline on topics_df and export the result to CSV:
# topics_df = feature_engineer(topics_df, PIPELINE, create=True)