In [1]:
cd dataSets

/Users/davidscroggins/Google Drive/DePaul/csc478_machineLearning/csc478_project/csc478_projectData/dataSets


In [2]:
ls

glucoseHomeostasis.tsv             wave2_publicUseContDB.tsv
inflammationAndImmuneFunction.tsv  wave3_inHomeQuest.tsv
lipids.tsv                         wave4_inHomeQuest.tsv
wave1_inHomeQuest.tsv              wave4_inHomeQuest_16b.tsv
wave1_publicUseContDB.tsv          wave4_inHomeQuest_16c.tsv
wave2_inHomeQuest.tsv


In [3]:
import pandas as pd

NB: There is some NumPy/pandas wonkiness going on behind the scenes here. You'll see it if you look at the data types of the data frames. Any column that contains a NaN is inferred as a float, because NaN is itself a floating-point value and NumPy's integer dtypes have no way to represent a missing value. We can't force the issue by specifying dtype='int' in read_csv, because then an error is thrown when the NaN can't be converted to an int.

I didn't drop the NaN values, because we won't be keeping all of these columns, and I didn't want to prematurely prune the dataset.

Obviously we don't want floats for categorical variables, but I don't think this will be an issue, since we'll be using get_dummies to generate the dummy variables. If worse comes to worst, we'll just have to cast all float data points to ints using to_numeric or something like that, which will be annoying, but not the end of the world.

Let me know if you spot a workaround I didn't think of.

#### Import biomarkers

In [4]:
# Load the three biomarker tables. Each file is tab-separated with a header
# row, and a single-space field is read as missing (NaN).
glucHome, infImmFunc, lipids = (
    pd.read_csv(tsv_name, sep='\t', header=0, na_values=' ')
    for tsv_name in (
        'glucoseHomeostasis.tsv',
        'inflammationAndImmuneFunction.tsv',
        'lipids.tsv',
    )
)

In [5]:
# glucHome.head()

In [6]:
# (rows, columns) of the glucose homeostasis biomarker frame.
glucHome.shape

(5114, 10)

In [7]:
# Preview the first rows. Note EBV_FLAG (CRP_FLAG looks the same): the flag
# values shown are NaN. We probably aren't going to use these flags, but if we
# do, the NaNs need to be filled in with zeros.
infImmFunc.head()

Unnamed: 0,AID,CRP,CRP_FLAG,EBV,EBV_FLAG,C_CRP,C_SUBCLN,C_INFECT,CRP_MED1,CRP_MED2,CRP_MED3,CRP_MED4,CRP_MED5,CRP_MED6,CRP_MED7,CRP_MED8
0,57101310,8.448,,90.0,,3.0,2,0,1,0,0,0,0,0,0,1
1,57103869,999.0,,9999.0,,9.0,0,0,0,0,0,0,0,0,0,0
2,57109625,1.204,,187.0,,2.0,1,0,0,0,0,0,0,0,0,0
3,57111071,0.905,,75.0,,1.0,0,0,0,0,0,0,0,0,0,0
4,57113943,5.363,,257.0,,3.0,3,1,1,0,0,0,0,0,0,1


In [8]:
# (rows, columns) of the inflammation and immune function biomarker frame.
infImmFunc.shape

(5114, 16)

In [9]:
# We probably won't use these flags, but fill their NaNs with zero anyway.
# These two were quick to do, but there are a lot of similar flag columns
# scattered throughout the In-Home Questionnaire sets. Something to keep an
# eye out for.
#
# Assign the filled columns back to the frame instead of calling
# fillna(inplace=True) on a column selection: the in-place form is chained
# assignment, which does not update the parent frame under pandas
# Copy-on-Write, and the `downcast` argument it relied on is deprecated.
# NOTE(review): astype(int) assumes the flags hold only whole numbers or NaN;
# confirm against the codebook.
flag_cols = ['CRP_FLAG', 'EBV_FLAG']
infImmFunc[flag_cols] = infImmFunc[flag_cols].fillna(0).astype(int)
infImmFunc.head()

Unnamed: 0,AID,CRP,CRP_FLAG,EBV,EBV_FLAG,C_CRP,C_SUBCLN,C_INFECT,CRP_MED1,CRP_MED2,CRP_MED3,CRP_MED4,CRP_MED5,CRP_MED6,CRP_MED7,CRP_MED8
0,57101310,8.448,0,90.0,0,3.0,2,0,1,0,0,0,0,0,0,1
1,57103869,999.0,0,9999.0,0,9.0,0,0,0,0,0,0,0,0,0,0
2,57109625,1.204,0,187.0,0,2.0,1,0,0,0,0,0,0,0,0,0
3,57111071,0.905,0,75.0,0,1.0,0,0,0,0,0,0,0,0,0,0
4,57113943,5.363,0,257.0,0,3.0,3,1,1,0,0,0,0,0,0,1


In [10]:
# lipids.head()

In [11]:
# (rows, columns) of the lipids biomarker frame.
lipids.shape

(5114, 14)

#### Importing Waves

NB: Wave 4 comes in multiple files. I included only the main In-Home Questionnaire file. Files 16b and 16c contain questions regarding intimate partners. I did not include them in light of previous discussions, but they can easily be brought in. I also did not include Sections 18, 19 and 20, which contain data on pregnancies, live births and children, per our discussion. We can easily include them if minds change.

In [12]:
# Load the In-Home Questionnaire table for each of the four waves. Each file
# is tab-separated with a header row, and a single-space field is read as
# missing (NaN).
wave1, wave2, wave3, wave4 = (
    pd.read_csv(tsv_name, sep='\t', header=0, na_values=' ')
    for tsv_name in (
        'wave1_inHomeQuest.tsv',
        'wave2_inHomeQuest.tsv',
        'wave3_inHomeQuest.tsv',
        'wave4_inHomeQuest.tsv',
    )
)

In [13]:
# wave1.head()

In [14]:
# (rows, columns) of the Wave 1 In-Home Questionnaire frame as loaded.
wave1.shape

(6504, 2794)

In [15]:
# Note flags
# wave2.head()

In [16]:
# (rows, columns) of the Wave 2 In-Home Questionnaire frame as loaded.
wave2.shape

(4834, 2532)

In [17]:
# wave3.head()

In [18]:
# (rows, columns) of the Wave 3 In-Home Questionnaire frame.
wave3.shape

(4882, 1831)

In [19]:
# wave4.head()

In [20]:
# (rows, columns) of the Wave 4 In-Home Questionnaire frame.
wave4.shape

(5114, 920)

#### Importing Public Use Contextual Databases

NB: These datasets contain rich demographic information for individuals. They exist only for Waves 1 and 2.

In [21]:
# Load the Public Use Contextual Database tables for Waves 1 and 2. Each file
# is tab-separated with a header row, and a single-space field is read as
# missing (NaN).
wave1_publicUse, wave2_publicUse = (
    pd.read_csv(tsv_name, sep='\t', header=0, na_values=' ')
    for tsv_name in ('wave1_publicUseContDB.tsv', 'wave2_publicUseContDB.tsv')
)

In [22]:
# wave1_publicUse.head()

In [23]:
# (rows, columns) of the Wave 1 public-use contextual frame.
wave1_publicUse.shape

(6504, 32)

In [24]:
# wave2_publicUse.head()

In [25]:
# (rows, columns) of the Wave 2 public-use contextual frame.
wave2_publicUse.shape

(4834, 32)

#### Merging Waves 1 and 2 with Public Use Contextual Databases

In [26]:
# Attach the public-use contextual columns to Waves 1 and 2, joining on the
# shared AID column with pandas' default inner join.
# NOTE(review): wave1 and wave2 are rebound to the merged result, so running
# this cell a second time would merge the contextual columns in again. Re-run
# the load cells first if this cell needs to be repeated.
wave1 = wave1.merge(wave1_publicUse, on='AID')
wave2 = wave2.merge(wave2_publicUse, on='AID')

In [27]:
# wave1.head()

In [28]:
# (rows, columns) of wave1 after merging in the public-use contextual columns.
wave1.shape

(6504, 2825)

In [29]:
# wave2.head()

In [30]:
# (rows, columns) of wave2 after merging in the public-use contextual columns.
wave2.shape

(4834, 2563)

#### Merging Waves with Biomarkers

NB: I created three merged datasets for each wave, one for each set of biomarkers. I suspect we'll be working discretely with different biomarkers. This way we can just pare down each dataset to the desired variables directly in one go. If you want a single dataset with all biomarkers, it'll be quick to do.

In [31]:
# Build one Wave 1 frame per biomarker set, joined on the shared AID column
# (default inner join, so only AIDs present in both frames are kept).
wave1_gluc, wave1_infImm, wave1_lipids = (
    pd.merge(wave1, biomarkers, on='AID')
    for biomarkers in (glucHome, infImmFunc, lipids)
)

In [32]:
# wave1_gluc.head()

In [33]:
# wave1_infImm.head()

In [34]:
# wave1_lipids.head()

In [35]:
# Report (rows, columns) for each Wave 1 + biomarker frame. The print() call
# form runs on both Python 2 and Python 3; the bare print statement is a
# SyntaxError on Python 3.
for merged in (wave1_gluc, wave1_infImm, wave1_lipids):
    print(merged.shape)

(5114, 2834)
(5114, 2840)
(5114, 2838)


In [36]:
# Build one Wave 2 frame per biomarker set, joined on the shared AID column
# (default inner join, so only AIDs present in both frames are kept).
wave2_gluc, wave2_infImm, wave2_lipids = (
    pd.merge(wave2, biomarkers, on='AID')
    for biomarkers in (glucHome, infImmFunc, lipids)
)

In [37]:
# wave2_gluc.head()

In [38]:
# wave2_infImm.head()

In [39]:
# wave2_lipids.head()

In [40]:
# Report (rows, columns) for each Wave 2 + biomarker frame. The print() call
# form runs on both Python 2 and Python 3; the bare print statement is a
# SyntaxError on Python 3.
for merged in (wave2_gluc, wave2_infImm, wave2_lipids):
    print(merged.shape)

(3924, 2572)
(3924, 2578)
(3924, 2576)


In [41]:
# Build one Wave 3 frame per biomarker set, joined on the shared AID column
# (default inner join, so only AIDs present in both frames are kept).
wave3_gluc, wave3_infImm, wave3_lipids = (
    pd.merge(wave3, biomarkers, on='AID')
    for biomarkers in (glucHome, infImmFunc, lipids)
)

In [42]:
# wave3_gluc.head()

In [43]:
# wave3_infImm.head()

In [44]:
# wave3_lipids.head()

In [45]:
# Report (rows, columns) for each Wave 3 + biomarker frame. The print() call
# form runs on both Python 2 and Python 3; the bare print statement is a
# SyntaxError on Python 3.
for merged in (wave3_gluc, wave3_infImm, wave3_lipids):
    print(merged.shape)

(4208, 1840)
(4208, 1846)
(4208, 1844)


In [46]:
# Build one Wave 4 frame per biomarker set, joined on the shared AID column
# (default inner join, so only AIDs present in both frames are kept).
wave4_gluc, wave4_infImm, wave4_lipids = (
    pd.merge(wave4, biomarkers, on='AID')
    for biomarkers in (glucHome, infImmFunc, lipids)
)

In [47]:
# wave4_gluc.head()

In [48]:
# wave4_infImm.head()

In [49]:
# wave4_lipids.head()

In [50]:
# Report (rows, columns) for each Wave 4 + biomarker frame. The print() call
# form runs on both Python 2 and Python 3; the bare print statement is a
# SyntaxError on Python 3.
for merged in (wave4_gluc, wave4_infImm, wave4_lipids):
    print(merged.shape)

(5114, 929)
(5114, 935)
(5114, 933)
