In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy import stats

%matplotlib inline

In [113]:
cross = pd.read_table('Cross.tsv', low_memory=False)
baseline = pd.read_table('Baseline.tsv', low_memory=False)

cross.rename(columns={'ID':'SWANID'}, inplace=True)

In [130]:
data = pd.merge(cross, baseline)

In [131]:
data.replace(' ', np.nan, inplace=True)
data.replace('-9', np.nan, inplace=True)
data.replace('-1', np.nan, inplace=True)
data.replace('-7', np.nan, inplace=True)
data.replace('-8', np.nan, inplace=True)

In [132]:
def convert_bin(cols):
    data[cols] = data[cols].map({'1':0, '2':1})
    data[cols].replace(np.nan, 0, inplace=True)
    print data[cols].value_counts(dropna=False)

def convert_bin_fl(cols):
    data[cols] = data[cols].map({1:0, 2:1})
    data[cols].replace(np.nan, 0, inplace=True)
    print data[cols].value_counts(dropna=False)

In [133]:
conversion = ['DIABETE', 'BP0', 'SMOKENO0']

for x in conversion:
    if(data[x].dtype == np.float64):
        convert_bin_fl(x)
    else:
        convert_bin(x)
        

0    3139
1     163
Name: DIABETE, dtype: int64
0    2913
1     389
Name: BP0, dtype: int64
0    2733
1     569
Name: SMOKENO0, dtype: int64


In [162]:
data[['SYSBP10', 'SYSBP20', 'SYSBP30', 'AGE0', 'CHOLRES0', 
      'HDLRESU0']] = data[['SYSBP10', 'SYSBP20', 'SYSBP30', 
                           'AGE0', 'CHOLRES0', 'HDLRESU0']].astype(float)

In [163]:
heart_cols = ['SWANID', 'DIABETE', 'BP0', 'SMOKENO0', 'AGE0', 'SYSBP10', 'SYSBP20', 'SYSBP30', 
               'CHOLRES0', 'HDLRESU0']

In [180]:
heart = data[heart_cols]
heart.dtypes

SWANID        int64
DIABETE     float64
BP0         float64
SMOKENO0    float64
AGE0        float64
SYSBP10     float64
SYSBP20     float64
SYSBP30     float64
CHOLRES0    float64
HDLRESU0    float64
dtype: object

In [181]:
heart.loc[:, 'SYSBP_AVG'] = heart[['SYSBP10', 'SYSBP20', 'SYSBP30']].mean(axis=1)

In [182]:
heart.SYSBP_AVG.describe()

count    3295.000000
mean      117.604552
std        16.986136
min        74.000000
25%       106.000000
50%       114.666667
75%       126.666667
max       224.000000
Name: SYSBP_AVG, dtype: float64

In [183]:
heart.dtypes

SWANID         int64
DIABETE      float64
BP0          float64
SMOKENO0     float64
AGE0         float64
SYSBP10      float64
SYSBP20      float64
SYSBP30      float64
CHOLRES0     float64
HDLRESU0     float64
SYSBP_AVG    float64
dtype: object

In [184]:
# drop records where Age0, CHOLRES0, HDLRESU0, and SYSBP_AVG are null

## data.dropna(subset = ['col1', 'col2', 'col4'])

heart.dropna(subset = ['AGE0'], inplace=True)
heart.dropna(subset = ['CHOLRES0'], inplace=True)
heart.dropna(subset = ['HDLRESU0'], inplace=True)
heart.dropna(subset = ['SYSBP_AVG'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [190]:
# setting constants

main = 192.4820096
age = 2.32888
choles = 1.20904
hdl = -.70833
sysbp = 2.76157
smoker = 0.52873
bpmed = 2.82263
diab = 0.69154

In [187]:
heart.dtypes

SWANID         int64
DIABETE      float64
BP0          float64
SMOKENO0     float64
AGE0         float64
SYSBP10      float64
SYSBP20      float64
SYSBP30      float64
CHOLRES0     float64
HDLRESU0     float64
SYSBP_AVG    float64
place1       float64
dtype: object

In [195]:
heart.loc[:, 'place1'] = (np.log(heart.AGE0)*age 
                          + np.log(heart.SYSBP_AVG)*sysbp 
                          + np.log(heart.CHOLRES0)*choles 
                          + np.log(heart.HDLRESU0)*hdl 
                          + heart.SMOKENO0*smoker 
                          + heart.DIABETE*diab)

count    3267.000000
mean       25.695608
std         0.629918
min        24.003383
25%        25.239932
50%        25.639648
75%        26.067781
max        28.533174
Name: place1, dtype: float64