This notebook performs the bivariate analysis of all the variables in the data against Disposition

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas.tools.pivot import pivot_table
from scipy.stats import fisher_exact, chi2_contingency, pointbiserialr
from scipy.stats.distributions import norm
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [15]:
data = pd.read_csv('../data/data-not_coded.csv', na_values='?')
categorical = [line.strip() for line in open('../data/categorical.txt').readlines()]
numerical = [line.strip() for line in open('../data/numerical.txt').readlines()]
depvar = 'Disposition'
outcomes = data[depvar].dropna().unique()
pvalue_threshold = 0.05
confidence = int(100 * 1 - pvalue_threshold)
critical_value = norm.ppf(1 - pvalue_threshold/2)

use_presentation = True
use_rounding = False

In [16]:
data_num = data[numerical]
data_cat = data[categorical]
for var in categorical:
    data_cat[var] = data_cat[var].astype('category')
data_cat.describe().to_csv('./bivariate/summary-categorical.csv')
data_num.describe().to_csv('./bivariate/summary-numerical.csv')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Remove variables that we don't want to include in the predictive model

In [17]:
excluded_vars = ['PatientKey']
excluded_vars += ['EbolaContact']
excluded_vars += ['Fluids']
excluded_vars += ['Travel']
excluded_vars += ['IllInFamily']
excluded_vars += ['VisitedSomeoneIll']
excluded_vars += ['SomebodyDiedRecently']
excluded_vars += ['BeenToFuneral']
excluded_vars += ['SuspicionEbola']
excluded_vars += ['PatientPregnant']
excluded_vars += ['PatientFarAlong']
excluded_vars += ['PatientHealthcareWorker']
excluded_vars += ['cycletime1']
excluded_vars += ['cycletime2']
excluded_vars += ['BMI']

for name in categorical:
    if '_R' in name:
        if not use_rounding:
            excluded_vars += [name]
    elif not use_presentation:
            excluded_vars += [name]

And define as subset of the data corresponding to patients who survived at least tdays:

In [18]:
fdr_threshold = 0.5
model_name = 'fdr15'
# Use -1 to use all data
tdays = -1
# data.dropna(subset=['Merged CT values'], how='all', inplace=True)
if 0 <= tdays:
    model_name = 'day' + str(tdays)    
    data = data[data['Days of treatment'] >= tdays]
    maxdays = int(data['Days of treatment'].max())
print "Sample size:", len(data[depvar])


Sample size: 470


Some utility functions:

In [19]:
# Sorts the table by P-value
def sort_by_pvalue(table):
    table.sort_values(by=['P-value'], inplace=True)
    m = table.index.shape[0]
    table.index = np.arange(1, m + 1)
    return table

# Calculates the univariate odds-ratio for variable var using 
# a single-variable LR model. Using the statsmodels library as
# explained here:
# http://blog.yhathq.com/posts/logistic-regression-and-python.html  
def calc_odds_ratio(data, var, is_cat, cat_dict={'Yes':1.0, 'No':0.0}):
    dat = data.copy()
    dat['intercept'] = 1.0
    dat.replace('Died', 1.0, inplace=True)
    dat.replace('Survived', 0.0, inplace=True)
    if is_cat:
        for k in cat_dict:
            dat.replace(k, cat_dict[k], inplace=True)
        
    train_cols = ['intercept', var]
    logit = sm.Logit(dat[depvar], dat[train_cols])
    # fit the model
    result = logit.fit()
    #print result.summary()
    
    # Get the odds-ratios and the CI
    
    # Scaling OR to represent more meaningful changes in the predictor variables.
    scale = 1
    if not is_cat:
        if var == 'Patient age': 
            # Scale by closes power-of-ten
            n = np.floor(np.log10(dat[var].max()))
            scale = np.power(10, n)
        else:            
            # Scaling by inter-quartile range 
            scale = dat[var].quantile(0.75) - dat[var].quantile(0.25)
            if scale < 1: scale = 1.0/scale
            
    params = result.params
    conf = result.conf_int()
    conf['OR'] = params                                     
    conf.columns = ['2.5%', '97.5%', 'OR']    
    oddrat = np.exp(scale * conf)

    return [oddrat['OR'][1], oddrat['2.5%'][1], oddrat['97.5%'][1]]


### Calculating Associations between outcome and all binary variables

We iterate over all the categorical variables, use [pivot_table](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.pivot_table.html) to get the contingency table, skipping those with more than two categories, and then compute the odds-ratio and P-value using the [two-tailed Exact Fisher](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html) test, and the Confidence Interval for the Odds Ratio, as described [here](http://www.biostat.umn.edu/~susant/Fall10ph6414/Lesson14_complete.pdf) and [here](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2938757/). The [Wikipedia article](https://en.wikipedia.org/wiki/Odds_ratio) on odds ratio is also very informative.

In [20]:
names = []
perc = []
pvalues = []
odd_ratios = []
odd_ratios_ci = []
for var in categorical:
    if var in excluded_vars: continue

    dat = data.loc[:,(var, depvar)].dropna()
    dat["VALUES"] = pd.Series(np.ones(len(dat[var])), index=dat.index)

    try:
        counts = pivot_table(dat, values="VALUES", index=[var], columns=[depvar], aggfunc=np.sum, fill_value=0)
    except:
        print 'Cannot generate counts table for',var
        continue
        
    if counts.shape[0] > 2 or counts.shape[1] > 2: 
        continue
  
    # Sort in descending order so the odds ratios are properly computed 
    counts.sort_index(ascending=False, inplace=True)

    tcount = dat[var].count()
    vcount = dat[var].value_counts()
    if len(vcount) < 2:
        print 'Cannot generate counts table for',var
        continue
        
    if var == 'PatientSex':
        f = float(vcount[1]) / float(tcount)
        perc.append(str(int(100 * f)) + ' male')
    else:
        f = float(vcount[1]) / float(tcount)
        perc.append(str(int(100 * f)) + '')
    
#     odds, pvalue = fisher_exact(counts)
    odds, pvalue, dof, ex = chi2_contingency(counts, correction=True)
    ci = critical_value * np.sqrt((1.0 / counts).sum().sum())
    
    names.append(var)
    pvalues.append(pvalue)
    
    #odds_ci = [np.exp(np.log(odds) - ci), np.exp(np.log(odds) + ci)]
    #odd_ratios.append(odds)
    #odd_ratios_ci.append("(%.2f, %.2f)" % (lr_odds[0][0], odds_ci[1]))

    # Calculate OR from univariate LR
    if var == 'malaria1':
        odds_lr = calc_odds_ratio(dat, var, True, {'Negative':0.0, 'Positive':1.0})
    else:
        odds_lr = calc_odds_ratio(dat, var, True)

    odd_ratios.append(odds_lr[0])
    odd_ratios_ci.append("(%.2f, %.2f)" % (odds_lr[1], odds_lr[2]))

cat_table = pd.DataFrame({'Name':pd.Series(np.array(names)), 
                          '%':pd.Series(np.array(perc)),
                          'P-value':pd.Series(np.array(pvalues)),
                          'Odd ratio':pd.Series(np.array(odd_ratios)),
                          'OR 2.5&97.5% CI':pd.Series(np.array(odd_ratios_ci))}, 
                         columns=['Name', '%', 'P-value', 'Odd ratio', 'OR 2.5&97.5% CI'])

sort_by_pvalue(cat_table)
cat_table.to_csv("./bivariate/correlation-categorical.csv")
cat_table

Optimization terminated successfully.
         Current function value: 0.681020
         Iterations 4
Cannot generate counts table for Disposition
Optimization terminated successfully.
         Current function value: 0.681722
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.679085
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.671733
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.681285
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.671746
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.683170
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690125
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690192
         Iterations 4
Optimization terminated successfully.

Unnamed: 0,Name,%,P-value,Odd ratio,OR 2.5&97.5% CI
1,Jaundice,5,0.015825,3.906883,"(1.31, 11.62)"
2,HaemorrhagicEyes,27,0.054368,0.6551724,"(0.43, 0.99)"
3,Coma,2,0.095833,4870270000000000.0,"(0.00, inf)"
4,Confusion,8,0.119885,2.855422,"(0.88, 9.23)"
5,Breathlessness,23,0.133192,1.439542,"(0.92, 2.24)"
6,Headache,57,0.142351,0.7417694,"(0.51, 1.08)"
7,Bleeding,5,0.148409,2.079493,"(0.86, 5.05)"
8,AstheniaWeakness,71,0.187215,1.344009,"(0.89, 2.02)"
9,Diarrhoea,54,0.303988,1.245833,"(0.85, 1.83)"
10,malaria1,21,0.330524,1.464052,"(0.76, 2.83)"


### Calculating Associations between outcome and all numerical variables

There are different ways to calculate correlation between ordinal and numerical variables. [This](https://www.andrews.edu/~calkins/math/edrm611/edrm13.htm#WHY) is a nice table summarizing independency tests available for different combinations of variable types. It seems that the [Biserial Correlation Coefficient](https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient) should be adequate to test the independency between numerical and categorical. Available in scipy through the [pointbiserialr](http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pointbiserialr.html) function. The [T-test](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html) for the mean of two samples, or [one-way ANOVA](http://www.biostathandbook.com/onewayanova.html), seem like good options to test that the mean of the numerical variable is different across two categories.

In [21]:
names = []
pearson = []
pvalues = []
mean_std_rec = []
mean_std_died = []
odd_ratios = []
odd_ratios_ci = []
for var in numerical:
    if var in excluded_vars: continue
        
    dat = data.loc[:,(var, depvar)].dropna()
    cat = dat[depvar] == outcomes[1]
    r, pvalue = pointbiserialr(cat, dat[var])

    mean0 = dat[dat[depvar] == outcomes[0]][var].mean()
    std0 = dat[dat[depvar] == outcomes[0]][var].std()
    mean1 = dat[dat[depvar] == outcomes[1]][var].mean()
    std1 = dat[dat[depvar] == outcomes[1]][var].std()    
    
    mean_std_rec.append("%.2f (%.2f)" % (mean0, std0))
    mean_std_died.append("%.2f (%.2f)" % (mean1, std1))
    
    odds_lr = calc_odds_ratio(dat, var, False)
    odd_ratios.append(odds_lr[0])
    odd_ratios_ci.append("(%.2f, %.2f)" % (odds_lr[1], odds_lr[2]))    
    
    names.append(var)
    pearson.append(r)
    pvalues.append(pvalue)
    
num_table = pd.DataFrame({'Name':pd.Series(np.array(names)), 
                          'Mean (dev) Rec.':pd.Series(np.array(mean_std_rec)),
                          'Mean (dev) Died':pd.Series(np.array(mean_std_died)),                          
                          'P-value':pd.Series(np.array(pvalues)),
                          'Pearson\'s R':pd.Series(np.array(pearson)),
                          'Odd ratio':pd.Series(np.array(odd_ratios)),
                          'OR 2.5&97.5% CI':pd.Series(np.array(odd_ratios_ci))},
                         columns=['Name', 'Mean (dev) Rec.', 'Mean (dev) Died', 'P-value', 'Pearson\'s R',
                                  'Odd ratio', 'OR 2.5&97.5% CI'])
sort_by_pvalue(num_table)
num_table.to_csv("./bivariate/correlation-numerical.csv")
num_table

Optimization terminated successfully.
         Current function value: 0.677408
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.669011
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.664496
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611534
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612211
         Iterations 5


Unnamed: 0,Name,Mean (dev) Rec.,Mean (dev) Died,P-value,Pearson's R,Odd ratio,OR 2.5&97.5% CI
1,cycletime,26.72 (5.51),22.23 (5.64),3.61966e-12,-0.370894,0.330947,"(0.23, 0.47)"
2,WellnessScale,2.49 (0.86),3.20 (1.03),3.132048e-07,0.335946,4.571731,"(2.44, 8.58)"
3,PatientAge,28.49 (15.42),32.03 (20.44),0.04250099,0.094525,1.325657,"(1.01, 1.74)"
4,FeverTemperature,37.41 (0.97),37.67 (1.19),0.09924309,0.116318,1.390675,"(0.94, 2.06)"
5,FeverNoDays,3.44 (2.19),3.56 (2.22),0.7897575,0.025134,1.048481,"(0.74, 1.48)"
