# Univariate Analysis

The following cells load the data and perform a series of basic univariate analysis comparing each variable in the dataset with the outcome variable, Disposition.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact, chi2_contingency, pointbiserialr
from scipy.stats.distributions import norm
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
data = pd.read_csv('data.csv', na_values='?')
categorical = [line.strip() for line in open('categorical.txt').readlines()]
numerical = [line.strip() for line in open('numerical.txt').readlines()]
depvar = 'Disposition'
outcomes = data[depvar].dropna().unique()
pvalue_threshold = 0.05
confidence = int(100 * 1 - pvalue_threshold)
critical_value = norm.ppf(1 - pvalue_threshold/2)

use_presentation = True
use_rounding = False

print("Sample size:", len(data[depvar]))

Sample size: 470


In [3]:
data_num = data[numerical]
data_cat = data[categorical]
for var in categorical:
    data_cat[var] = data_cat[var].astype('category')
data_cat.describe().to_csv('summary-categorical.csv')
data_num.describe().to_csv('summary-numerical.csv')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Remove variables that we don't want to include in the predictive model

In [4]:
excluded_vars = ['PatientKey']
excluded_vars += ['EbolaContact']
excluded_vars += ['Fluids']
excluded_vars += ['Travel']
excluded_vars += ['IllInFamily']
excluded_vars += ['VisitedSomeoneIll']
excluded_vars += ['SomebodyDiedRecently']
excluded_vars += ['BeenToFuneral']
excluded_vars += ['SuspicionEbola']
excluded_vars += ['PatientPregnant']
excluded_vars += ['PatientFarAlong']
excluded_vars += ['PatientHealthcareWorker']
excluded_vars += ['cycletime1']
excluded_vars += ['cycletime2']
excluded_vars += ['FeverNoDays']
excluded_vars += ['DaysSinceSymptomOnset']
excluded_vars += ['Anorexia']
excluded_vars += ['LossAppetite']
excluded_vars += ['BMI']

for name in categorical:
    if '_R' in name:
        if not use_rounding:
            excluded_vars += [name]
    elif not use_presentation:
            excluded_vars += [name]

Some utility functions:

In [5]:
# Sorts the table by P-value
def sort_by_pvalue(table):
    table.sort_values(by=['P-value'], inplace=True)
    m = table.index.shape[0]
    table.index = np.arange(1, m + 1)
    return table

# Calculates the univariate odds-ratio for variable var using 
# a single-variable LR model. Using the statsmodels library as
# explained here:
# http://blog.yhathq.com/posts/logistic-regression-and-python.html  
def calc_odds_ratio(data, var, is_cat, cat_dict={'Yes':1.0, 'No':0.0}):
    dat = data.copy()
    dat['intercept'] = 1.0
    dat.replace('Died', 1.0, inplace=True)
    dat.replace('Survived', 0.0, inplace=True)
    if is_cat:
        for k in cat_dict:
            dat.replace(k, cat_dict[k], inplace=True)
        
    train_cols = ['intercept', var]
    logit = sm.Logit(dat[depvar], dat[train_cols], missing='drop')
    # fit the model
#     result = logit.fit(method='newton') # Might throw LinAlgError: Singular matrix 
    result = logit.fit(method='bfgs')
#     result = logit.fit(method='powell')
    #print result.summary()
    
    # Get the odds-ratios and the CI
    
    # Scaling OR to represent more meaningful changes in the predictor variables.
    scale = 1
    if not is_cat:
        if var == 'Patient age': 
            # Scale by closes power-of-ten
            n = np.floor(np.log10(dat[var].max()))
            scale = np.power(10, n)
        else:            
            # Scaling by inter-quartile range 
            scale = dat[var].quantile(0.75) - dat[var].quantile(0.25)
            if scale < 1: scale = 1.0/scale
            
    params = result.params
    conf = result.conf_int()
    conf['OR'] = params                                     
    conf.columns = ['2.5%', '97.5%', 'OR']    
    oddrat = np.exp(scale * conf)

    val = min(oddrat['OR'][1], 100)
    ci0 = min(oddrat['2.5%'][1], 100)    
    ci1 = min(oddrat['97.5%'][1], 100)
    
    return [val, ci0, ci1]

### Calculating Associations between outcome and all binary variables

We iterate over all the categorical variables, use [pivot_table](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.pivot_table.html) to get the contingency table, skipping those with more than two categories, and then compute the odds-ratio and P-value using the [two-tailed Exact Fisher](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html) test, and the Confidence Interval for the Odds Ratio, as described [here](http://www.biostat.umn.edu/~susant/Fall10ph6414/Lesson14_complete.pdf) and [here](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2938757/). The [Wikipedia article](https://en.wikipedia.org/wiki/Odds_ratio) on odds ratio is also very informative.

In [7]:
names = []
perc = []
total_frac = []
total_perc = []
pvalues = []
miss_frac = []
miss_perc = []
odd_ratios = []
odd_ratios_ci = []
surv_frac = []
surv_perc = []
died_frac = []
died_perc = []
tot_count = len(data)
for var in categorical:
    if var in excluded_vars: continue
    print(var)
    
    dat = data.loc[:,(var, depvar)].dropna()
    dat["VALUES"] = pd.Series(np.ones(len(dat[var])), index=dat.index)

    try:
        counts = pd.pivot_table(dat, values="VALUES", index=[var], columns=[depvar], aggfunc=np.sum, fill_value=0)
    except:
        print('Cannot generate counts table for', var)
        continue
        
    if counts.shape[0] > 2 or counts.shape[1] > 2: 
        continue
  
    # Sort in descending order so the odds ratios are properly computed 
    counts.sort_index(ascending=False, inplace=True)

    tcount = dat[var].count()
    vcount = dat[var].value_counts()
    if len(vcount) < 2:
        print('Cannot generate counts table for', var) 
        continue

    f = float(vcount[1]) / float(tcount)        
    total_frac.append(str(vcount[1]) + "/" + str(tcount))
    if var == 'PatientSex':
        total_perc.append(str(int(100 * f)) + ' male')
    else:
        total_perc.append(str(vcount[1]) + "/" + str(tcount) + " (" + str(int(100 * f)) + ")")
        
    ndiedt = len(dat[(dat['Disposition'] == 'Died')])
    nsurvt = len(dat[(dat['Disposition'] == 'Survived')])
    if var == 'malaria1':    
        ndied1 = len(dat[(dat['Disposition'] == 'Died') & (dat[var] == 'Positive')])
        nsurv1 = len(dat[(dat['Disposition'] == 'Survived') & (dat[var] == 'Positive')])
    else:
        ndied1 = len(dat[(dat['Disposition'] == 'Died') & (dat[var] == 1)])
        nsurv1 = len(dat[(dat['Disposition'] == 'Survived') & (dat[var] == 1)])

    died_frac.append(str(ndied1) + "/" + str(ndiedt))
    died_perc.append(str(ndied1) + "/" + str(ndiedt) + " (" + str(int(float(100 * ndied1) / ndiedt)) + ")")
    surv_frac.append(str(nsurv1) + "/" + str(nsurvt))    
    surv_perc.append(str(nsurv1) + "/" + str(nsurvt) + " (" + str(int(float(100 * nsurv1) / nsurvt)) + ")")
    
    #     odds, pvalue = fisher_exact(counts)
    odds, pvalue, dof, ex = chi2_contingency(counts, correction=True)
    ci = critical_value * np.sqrt((1.0 / counts).sum().sum())
    
    names.append(var)
    pvalues.append(pvalue)
    
#     odds_ci = [np.exp(np.log(odds) - ci), np.exp(np.log(odds) + ci)]
#     odd_ratios.append(odds)
#     odd_ratios_ci.append("(%.2f, %.2f)" % (odds_ci[0], odds_ci[1]))
    
    # Calculate OR from univariate LR
    if var == 'malaria1':
        odds_lr = calc_odds_ratio(dat, var, True, {'Negative':0.0, 'Positive':1.0})
    else:
        odds_lr = calc_odds_ratio(dat, var, True)
    odd_ratios.append("%.3f" % odds_lr[0])
    odd_ratios_ci.append("(%.2f, %.2f)" % (odds_lr[1], odds_lr[2]))
    
    miss_count = data[var].isnull().values.ravel().sum()
    miss_frac.append(str(miss_count) + "/" + str(tot_count))
    miss_perc.append(str(miss_count) + "/" + str(tot_count) + 
                     " (" + str(int(100.0 * float(miss_count)/tot_count)) + ")")

cat_table = pd.DataFrame({'Name':pd.Series(np.array(names)),
                          'Prevalence (Surv)':pd.Series(np.array(surv_perc)),
                          'Prevalence (Died)':pd.Series(np.array(died_perc)),
                          'Prevalence (overall)':pd.Series(np.array(total_perc)),
                          'P-value':pd.Series(np.array(pvalues)),
                          'Odd ratio':pd.Series(np.array(odd_ratios)),
                          'OR 95% CI':pd.Series(np.array(odd_ratios_ci)),
                          'Missing':pd.Series(np.array(miss_perc))}, 
                         columns=['Name', 'Prevalence (Surv)', 
                                          'Prevalence (Died)', 
                                          'Prevalence (overall)', 
                                          'P-value', 
                                          'Odd ratio', 'OR 95% CI', 'Missing'])

sort_by_pvalue(cat_table)
cat_table.to_csv("correlation-categorical.csv")
cat_table

ETUKey
PatientSex
Optimization terminated successfully.
         Current function value: 0.681020
         Iterations: 9
         Function evaluations: 10
         Gradient evaluations: 10
Disposition
Cannot generate counts table for Disposition
Fever
Optimization terminated successfully.
         Current function value: 0.681722
         Iterations: 8
         Function evaluations: 10
         Gradient evaluations: 10
Headache
Optimization terminated successfully.
         Current function value: 0.679085
         Iterations: 9
         Function evaluations: 10
         Gradient evaluations: 10
Nausea
Optimization terminated successfully.
         Current function value: 0.671733
         Iterations: 10
         Function evaluations: 11
         Gradient evaluations: 11
Vomit
Optimization terminated successfully.
         Current function value: 0.681285
         Iterations: 7
         Function evaluations: 9
         Gradient evaluations: 9
BloodyVomit
Optimization terminated success

Unnamed: 0,Name,Prevalence (Surv),Prevalence (Died),Prevalence (overall),P-value,Odd ratio,OR 95% CI,Missing
1,Jaundice,4/197 (2),20/267 (7),24/464 (5),0.015825,3.907,"(1.31, 11.62)",1/470 (0)
2,HaemorrhagicEyes,64/197 (32),64/267 (23),128/464 (27),0.054368,0.655,"(0.43, 0.99)",1/470 (0)
3,Coma,0/83 (0),5/95 (5),5/178 (2),0.095833,100.0,"(0.00, 100.00)",292/470 (62)
4,Confusion,4/83 (4),12/95 (12),16/178 (8),0.119885,2.855,"(0.88, 9.23)",292/470 (62)
5,Breathlessness,39/197 (19),70/267 (26),109/464 (23),0.133192,1.44,"(0.92, 2.24)",1/470 (0)
6,Headache,122/197 (61),146/267 (54),268/464 (57),0.142351,0.742,"(0.51, 1.08)",1/470 (0)
7,Bleeding,7/197 (3),19/267 (7),26/464 (5),0.148409,2.079,"(0.86, 5.05)",1/470 (0)
8,AstheniaWeakness,135/197 (68),199/267 (74),334/464 (71),0.187215,1.344,"(0.89, 2.02)",1/470 (0)
9,Diarrhoea,96/187 (51),138/243 (56),234/430 (54),0.303988,1.246,"(0.85, 1.83)",35/470 (7)
10,malaria1,17/94 (18),32/131 (24),49/225 (21),0.330524,1.464,"(0.76, 2.83)",241/470 (51)


### Calculating Associations between outcome and all numerical variables

There are different ways to calculate correlation between ordinal and numerical variables. [This](https://www.andrews.edu/~calkins/math/edrm611/edrm13.htm#WHY) is a nice table summarizing independency tests available for different combinations of variable types. It seems that the [Biserial Correlation Coefficient](https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient) should be adequate to test the independency between numerical and categorical. Available in scipy through the [pointbiserialr](http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pointbiserialr.html) function. The [T-test](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html) for the mean of two samples, or [one-way ANOVA](http://www.biostathandbook.com/onewayanova.html), seem like good options to test that the mean of the numerical variable is different across two categories.

In [8]:
names = []
pearson = []
pvalues = []
mean_surv = []
mean_died = []
ci_surv = []
ci_died = []
mean_died = []

odd_ratios = []
odd_ratios_ci = []
# ci_critical_value = 1.645 # For 90% CI (assuming Gaussian distribution)
ci_critical_value = 1.96 # For 95% CI 
miss_frac = []
miss_perc = []

tot_count = len(data)
for var in numerical:
    if var in excluded_vars: continue
        
    dat = data.loc[:,(var, depvar)].dropna()
    cat = dat[depvar] == outcomes[1]
    r, pvalue = pointbiserialr(cat, dat[var])

    mean0 = dat[dat[depvar] == outcomes[0]][var].mean()
    std0 = dat[dat[depvar] == outcomes[0]][var].std()
    mean1 = dat[dat[depvar] == outcomes[1]][var].mean()
    std1 = dat[dat[depvar] == outcomes[1]][var].std()    
    
#     mean_std_rec.append("%.2f (%.2f)" % (mean0, std0))
#     mean_std_died.append("%.2f (%.2f)" % (mean1, std1))
    
    mean_surv.append("%.2f" % mean0)
    mean_died.append("%.2f" % mean1)
    ci_surv.append("(%.2f, %.2f)" % (max(0, mean0 - ci_critical_value * std0), mean0 + ci_critical_value * std0))
    ci_died.append("(%.2f, %.2f)" % (max(0, mean1 - ci_critical_value * std1), mean1 + ci_critical_value * std1))        
    
    odds_lr = calc_odds_ratio(dat, var, False)
    odd_ratios.append("%.3f" % odds_lr[0])
    odd_ratios_ci.append("(%.2f, %.2f)" % (odds_lr[1], odds_lr[2]))    
    
    names.append(var)
    pearson.append("%.2f" % r)
    pvalues.append(pvalue)
    
    miss_count = data[var].isnull().values.ravel().sum()
    miss_frac.append(str(miss_count) + "/" + str(tot_count))
    miss_perc.append(str(miss_count) + "/" + str(tot_count) + " (" + str(int(100.0 * float(miss_count)/tot_count)) + ")")
    
num_table = pd.DataFrame({'Name':pd.Series(np.array(names)), 
                          'Mean Surv':pd.Series(np.array(mean_surv)),
                          '95% CI Surv':pd.Series(np.array(ci_surv)),                          
                          'Mean Died':pd.Series(np.array(mean_died)),                          
                          '95% CI Died':pd.Series(np.array(ci_died)),                          
                          'P-value':pd.Series(np.array(pvalues)),
                          'Pearson\'s R':pd.Series(np.array(pearson)),
                          'Odd ratio':pd.Series(np.array(odd_ratios)),
                          'OR 95% CI':pd.Series(np.array(odd_ratios_ci)),                       
                          'Missing':pd.Series(np.array(miss_perc))},
                         columns=['Name', 'Mean Surv', '95% CI Surv', 
                                  'Mean Died', '95% CI Died', 'P-value', 
                                  'Pearson\'s R', 'Odd ratio', 'OR 95% CI',
                                  'Missing'])
sort_by_pvalue(num_table)
num_table.to_csv("correlation-numerical.csv")
num_table

Optimization terminated successfully.
         Current function value: 0.677408
         Iterations: 9
         Function evaluations: 12
         Gradient evaluations: 12
Optimization terminated successfully.
         Current function value: 0.678970
         Iterations: 7
         Function evaluations: 9
         Gradient evaluations: 9
Optimization terminated successfully.
         Current function value: 0.664496
         Iterations: 19
         Function evaluations: 23
         Gradient evaluations: 23
Optimization terminated successfully.
         Current function value: 0.611534
         Iterations: 8
         Function evaluations: 13
         Gradient evaluations: 13
Optimization terminated successfully.
         Current function value: 0.612211
         Iterations: 8
         Function evaluations: 11
         Gradient evaluations: 11
Optimization terminated successfully.
         Current function value: 0.677777
         Iterations: 7
         Function evaluations: 9
         G

Unnamed: 0,Name,Mean Surv,95% CI Surv,Mean Died,95% CI Died,P-value,Pearson's R,Odd ratio,OR 95% CI,Missing
1,cycletime,26.72,"(15.92, 37.52)",22.23,"(11.18, 33.28)",3.61966e-12,-0.37,0.331,"(0.23, 0.47)",137/470 (29)
2,WellnessScale,2.49,"(0.82, 4.17)",3.2,"(1.17, 5.22)",3.132048e-07,0.34,4.572,"(2.44, 8.58)",247/470 (52)
3,PatientAge,28.49,"(0.00, 58.72)",32.03,"(0.00, 72.10)",0.04250099,0.09,1.326,"(1.01, 1.74)",4/470 (0)
4,FeverTemperature,37.41,"(35.50, 39.32)",37.67,"(35.34, 40.01)",0.09924309,0.12,1.391,"(0.94, 2.06)",265/470 (56)
5,ReferralTime,4.9,"(0.00, 19.17)",4.23,"(0.00, 10.26)",0.2832211,-0.06,0.904,"(0.75, 1.10)",172/470 (36)
6,DaysSinceSymptomFeverOnset,4.28,"(0.00, 12.03)",4.2,"(0.00, 9.90)",0.8278132,-0.01,0.973,"(0.76, 1.25)",108/470 (22)
