# Merging, Cleaning Data Sets and Adding Variables

This notebook explores merging the data sets, cleaning them (recoding coded missing values to `np.nan` and converting fields to the appropriate data type), and adding calculated variables. This code will be repeated in the exploratory and modeling notebooks.

In [1]:
import pandas as pd
import numpy as np

# Cross-sectional and baseline files are read individually.
cross = pd.read_csv('000_Cross-Sectional.csv', low_memory=False)
base = pd.read_csv('00_Baseline.csv', low_memory=False)

# Follow-up files are named '<two-digit visit>_Visit<visit>.csv' for visits 1-10;
# they are read in visit order and bound to visit1 ... visit10.
(visit1, visit2, visit3, visit4, visit5,
 visit6, visit7, visit8, visit9, visit10) = (
    pd.read_csv('%02d_Visit%d.csv' % (number, number), low_memory=False)
    for number in range(1, 11)
)

# Show up to 120 rows when a frame or series is displayed.
pd.set_option('display.max_rows', 120)

# The cross-sectional file names its key column 'ID'; rename it to 'SWANID',
# the column the later merges join on.
cross = cross.rename(columns={'ID': 'SWANID'})

## Merging Data Sets

In [2]:
# Start from cross-sectional + baseline. No `on=`/`how=` is given, so pandas
# joins on every column name the two frames share, using an inner join.
# NOTE(review): the follow-up merges below name 'SWANID' explicitly and use an
# outer join -- confirm the different join for this first step is intended.
data = pd.merge(cross, base)

# Attach each follow-up visit in order, keeping every SWANID seen on either side.
for followup in (visit1, visit2, visit3, visit4, visit5,
                 visit6, visit7, visit8, visit9, visit10):
    data = data.merge(followup, on='SWANID', how='outer')

In [3]:
# (rows, columns) of the merged frame.
data.shape

(3302, 9214)

That's a lot of fields.

## Cleaning Data Sets

**Changing coded values for missing/null data to np.NaN**

In [4]:
# String codes that stand for missing/null data in the raw files.
missing_codes = [' ', '-9', '-1', '-7', '-8']

# Replace every code with NaN in a single pass over the frame. Passing the
# codes as a list yields the same result as one replace() call per code, but
# scans the (very wide) frame once instead of five times.
# NOTE(review): these are string codes, so only string cells are matched; a
# column that was parsed as numeric would keep values such as -9. Confirm no
# numeric column carries these codes.
data.replace(missing_codes, np.nan, inplace=True)

**Correcting data types**

Discrimination scores

In [5]:
# The ten discrimination item stems; each visit's column is stem + visit number.
disc_items = ['COURTES', 'RESPECT', 'POORSER', 'NOTSMAR', 'AFRAIDO',
              'DISHONS', 'BETTER', 'INSULTE', 'HARASSE', 'IGNORED']

# Cast the items to float for the visits handled here (0, 1, 2, 3, 7 and 10).
for visit in ('0', '1', '2', '3', '7', '10'):
    visit_cols = [item + visit for item in disc_items]
    data[visit_cols] = data[visit_cols].astype(float)

Ages

In [6]:
# AGE0 ... AGE10: cast the age column of every visit (baseline through 10) to float.
age_cols = ['AGE%d' % visit for visit in range(11)]
data[age_cols] = data[age_cols].astype(float)

**Adding Calculated Variables**

*Average Discrimination Score*

In [7]:
# Item stems that make up the discrimination score; columns are stem + visit number.
disc_stems = ['COURTES', 'RESPECT', 'POORSER', 'NOTSMAR', 'AFRAIDO',
              'DISHONS', 'BETTER', 'INSULTE', 'HARASSE', 'IGNORED']

# DISC_SCORE<visit> = 5 - row mean of that visit's ten items.
# mean(axis=1) skips NaN by default, so a row with some items missing is
# averaged over the items that are present; a row with none is NaN.
for suffix in ('0', '1', '2', '3', '7', '10'):
    item_cols = [stem + suffix for stem in disc_stems]
    data.loc[:, 'DISC_SCORE' + suffix] = 5 - data[item_cols].mean(axis=1)

*Reason for Discrimination*

In [8]:
def _recode_binary(cols, mapping):
    """Recode one column of the global ``data`` frame and print its counts.

    cols    -- label of the column to recode (used as ``data[cols]``).
    mapping -- dict from original code to replacement value.

    Any value without a key in ``mapping`` (missing values included) becomes
    NaN. NaN is not filled with 0 here; whether that is appropriate may differ
    between variables. The frequency table, NaN included, is printed as a
    quick sanity check.
    """
    data[cols] = data[cols].map(mapping)
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print(data[cols].value_counts(dropna=False))


def convert_binary(cols):
    """Recode a string-coded column in ``data``: '1' -> 0, '2' -> 1, else NaN."""
    _recode_binary(cols, {'1': 0, '2': 1})


def convert_binary_float(cols):
    """Recode a numerically coded column in ``data``: 1 -> 0, 2 -> 1, else NaN."""
    _recode_binary(cols, {1: 0, 2: 1})

In [9]:
# Baseline flag built from MAINREA0: codes '1' and '2' become 1, codes '3'-'9'
# become 0, and anything else (including missing) has no key and maps to NaN.
# presumably '1'/'2' are the race/ethnicity answers -- confirm against the codebook.
main_reason_flags = dict.fromkeys(['1', '2'], 1)
main_reason_flags.update(dict.fromkeys(['3', '4', '5', '6', '7', '8', '9'], 0))
data['RACE_REASON0'] = data['MAINREA0'].map(main_reason_flags)

In [10]:
# Columns to recode from 1/2 to 0/1, for visits 1, 2, 3, 7 and 10.
race_conversion = ['BCRACE1', 'BCETHN1', 'BCRACE2', 'BCETHN2', 'BCRACE3', 'BCETHN3',
                   'BCRACE7', 'BCETHN7', 'BCRACE10', 'BCETHN10']

for x in race_conversion:
    # dtype.kind is 'i'/'u'/'f' for any integer or float column. Testing only
    # for float64 would send an integer (or float32) column to the string-key
    # mapper, where no value matches and the whole column silently becomes NaN.
    if data[x].dtype.kind in ('i', 'u', 'f'):
        convert_binary_float(x)
    else:
        convert_binary(x)

NaN    2005
 0      743
 1      554
Name: BCRACE1, dtype: int64
NaN    2021
 0      859
 1      422
Name: BCETHN1, dtype: int64
NaN    2097
 0      704
 1      501
Name: BCRACE2, dtype: int64
NaN    2103
 0      855
 1      344
Name: BCETHN2, dtype: int64
NaN    2208
 0      645
 1      449
Name: BCRACE3, dtype: int64
NaN    2217
 0      762
 1      323
Name: BCETHN3, dtype: int64
NaN    2333
 0      600
 1      369
Name: BCRACE7, dtype: int64
NaN    2338
 0      661
 1      303
Name: BCETHN7, dtype: int64
NaN    2479
 0      507
 1      316
Name: BCRACE10, dtype: int64
NaN    2480
 0      561
 1      261
Name: BCETHN10, dtype: int64


In [11]:
# RACE_REASON<visit>: add the visit's BCRACE and BCETHN columns, then collapse
# the sum so that 0 stays 0 and both 1 and 2 become 1.
# NOTE(review): a NaN in either input makes the sum NaN, which has no key in
# the mapping and stays NaN -- so a row with only one of the two answered gets
# no flag. Confirm that is intended.
for visit in ('1', '2', '3', '7', '10'):
    reason_col = 'RACE_REASON' + visit
    data.loc[:, reason_col] = data['BCRACE' + visit] + data['BCETHN' + visit]
    data[reason_col] = data[reason_col].map({0: 0, 1: 1, 2: 1})

In [14]:
# INTDAY1 ... INTDAY10: cast to float so they can be used as divisors later.
intday_cols = ['INTDAY%d' % visit for visit in range(1, 11)]
data[intday_cols] = data[intday_cols].astype(float)

In [24]:
# Visits for which each derived column is built below. The lipid columns are
# used for visits 0, 1 and 3-7 only; the BMI columns for every visit 0-10.
lipid_visits = [0, 1, 3, 4, 5, 6, 7]
bmi_visits = list(range(11))

# Difference between Lipid Vascular Age and actual age at each visit
for v in lipid_visits:
    data['LV_AGE_DIFF%d' % v] = data['LV_AGE%d' % v] - data['AGE%d' % v]

# Difference between BMI Vascular Age and actual age at each visit
for v in bmi_visits:
    data['BV_AGE_DIFF%d' % v] = data['BV_AGE%d' % v] - data['AGE%d' % v]

# Ratio of Change in Lipid Age from Visit X to Baseline to the number of days
# between Visit X and Baseline (baseline itself is skipped)
for v in lipid_visits[1:]:
    lipid_change = data['LV_AGE%d' % v] - data['LV_AGE0']
    data['LV_AGE_RATIO%d' % v] = lipid_change / data['INTDAY%d' % v]

# Ratio of Change in BMI Age from Visit X to Baseline to the number of days
# between Visit X and Baseline (baseline itself is skipped)
for v in bmi_visits[1:]:
    bmi_change = data['BV_AGE%d' % v] - data['BV_AGE0']
    data['BV_AGE_RATIO%d' % v] = bmi_change / data['INTDAY%d' % v]


In [23]:
# Spot-check the visit-1 lipid columns: first ten rows of the inputs
# (LV_AGE1, LV_AGE0, INTDAY1, AGE1) next to the derived ratio and difference.
data[['SWANID', 'LV_AGE1', 'LV_AGE0', 'INTDAY1', 'LV_AGE_RATIO1', 'AGE1', 'LV_AGE_DIFF1']].head(10)

Unnamed: 0,SWANID,LV_AGE1,LV_AGE0,INTDAY1,LV_AGE_RATIO1,AGE1,LV_AGE_DIFF1
0,87737,,64,,,,
1,33094,36.0,44,429.0,-0.018648,44.0,-8.0
2,23588,57.0,67,399.0,-0.025063,49.0,8.0
3,26343,38.0,34,355.0,0.011268,46.0,-8.0
4,37347,53.0,57,365.0,-0.010959,49.0,4.0
5,81061,36.0,45,441.0,-0.020408,44.0,-8.0
6,63273,58.0,75,455.0,-0.037363,47.0,11.0
7,79029,45.0,41,403.0,0.009926,49.0,-4.0
8,73009,27.0,23,364.0,0.010989,43.0,-16.0
9,19650,30.0,30,376.0,0.0,48.0,-18.0


In [21]:
# Summary statistics for LV_AGE_DIFF1 (LV_AGE1 minus AGE1).
# NOTE(review): the recorded output shows a maximum of 104 against a 75th
# percentile of 4 -- check for outliers or sentinel values in LV_AGE1.
data.LV_AGE_DIFF1.describe()

count    2731.000000
mean       -1.196997
std        13.642210
min       -25.000000
25%       -11.000000
50%        -4.000000
75%         4.000000
max       104.000000
Name: LV_AGE_DIFF1, dtype: float64

In [19]:
# Summary statistics for LV_AGE1.
# NOTE(review): the recorded output shows a maximum of 150 against a 75th
# percentile of 52 -- looks like a cap or sentinel value; confirm.
data.LV_AGE1.describe()

count    2731.000000
mean       45.719150
std        14.189038
min        21.000000
25%        36.000000
50%        43.000000
75%        52.000000
max       150.000000
Name: LV_AGE1, dtype: float64