In [23]:
from IPython.display import HTML

HTML('''<h3>Model Interpretation for ANOVA:</h3>

<p>When examining the association between current number of cigarettes smoked (quantitative response) and past year nicotine dependence 
(categorical explanatory), an Analysis of Variance (ANOVA) revealed that among daily, young adult smokers (my sample), those with nicotine 
dependence reported smoking significantly more cigarettes per day (Mean=14.6, s.d. ±9.15) compared to those without nicotine dependence 
(Mean=11.4, s.d. ±7.43), F(1, 1313)=44.68, p&lt;0001.</p>

<p>Note that the degrees of freedom (that I report in parentheses) following ‘F’ can be found in the OLS table as the DF model and DF 
residuals. In this example 44.68 is the actual F value from the OLS table and we commonly report a very small p value as simply &lt;.0001.</p>

<h3>Model Interpretation for post hoc ANOVA results:</h3>

<p>ANOVA revealed that among daily, young adult smokers (my sample), number of cigarettes smoked per day (collapsed into 5 ordered categories, 
which is the categorical explanatory variable) and number of nicotine dependence symptoms (quantitative response variable) were significantly 
associated, F (4, 1308)=11.79, p=0001. Post hoc comparisons of mean number of nicotine dependence symptoms by pairs of cigarettes per day 
categories revealed that those individuals smoking more than 10 cigarettes per day (i.e. 11 to 15, 16 to 20 and &gt;20) reported significantly 
more nicotine dependence symptoms compared to those smoking 10 or fewer cigarettes per day (i.e. 1 to 5 and 6 to 10). All other comparisons 
were statistically similar.</p>''')

In [24]:

import numpy
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi 

nesarc_pds_df = pd.read_csv('../../data/csv/nesarc_pds.csv', low_memory=False)
nesarc_pds_df.columns

Index(['ETHRACE2A', 'ETOTLCA2', 'IDNUM', 'PSU', 'STRATUM', 'WEIGHT', 'CDAY',
       'CMON', 'CYEAR', 'REGION',
       ...
       'SOL12ABDEP', 'SOLP12ABDEP', 'HAL12ABDEP', 'HALP12ABDEP', 'MAR12ABDEP',
       'MARP12ABDEP', 'HER12ABDEP', 'HERP12ABDEP', 'OTHB12ABDEP',
       'OTHBP12ABDEP'],
      dtype='object', length=3008)

In [25]:

# Set variables you will be working with to numeric
working_column_list = ['AGE', 'S3AQ3B1', 'S3AQ3C1', 'CHECK321']
for column in working_column_list:
    nesarc_pds_df[column] = pd.to_numeric(nesarc_pds_df[column], errors='coerce')

In [26]:

# Subset nesarc_pds_df to young adults age 18 to 25 who have smoked in the past 12 months
mask_series = (nesarc_pds_df['AGE']>=18) & (nesarc_pds_df['AGE']<=25) & (nesarc_pds_df['CHECK321']==1)
sub1_df = nesarc_pds_df[mask_series].copy()

In [27]:

sub1_df['S3AQ3B1'].unique()
sub1_df['S3AQ3C1'].unique()

array([ 1.,  2.,  4.,  3.,  5.,  6.,  9.])

array([  3.,  10.,  20.,   5.,   8.,   1.,   2.,  98.,  30.,   4.,  12.,
        99.,   6.,  13.,   7.,  15.,  40.,  14.,  11.,  60.,  17.,  25.,
        16.,  27.,  80.,   9.,  35.,  24.,  19.,  18.,  28.])

In [28]:

# SET MISSING nesarc_pds_df
mask_series = sub1_df['S3AQ3B1'].isnull()
sub1_df.loc[sub1_df[mask_series].index, 'S3AQ3B1'] = 9
mask_series = sub1_df['S3AQ3C1'].isnull()
sub1_df.loc[sub1_df[mask_series].index, 'S3AQ3C1'] = 99

In [29]:

# Recode number of days smoked in the past month
recode1 = {1: 30, 2: 22, 3: 14, 4: 5, 5: 2.5, 6: 1}
sub1_df['USFREQMO'] = sub1_df['S3AQ3B1'].map(recode1)

In [30]:

# Convert new variable USFREQMMO to numeric
sub1_df['USFREQMO'] = pd.to_numeric(sub1_df['USFREQMO'], errors='coerce')

In [31]:

# Create a secondary variable multiplying the days smoked/month and the number of cig/per day
sub1_df['NUMCIGMO_EST'] = sub1_df['USFREQMO'] * sub1_df['S3AQ3C1']

In [32]:

sub1_df['NUMCIGMO_EST'] = pd.to_numeric(sub1_df['NUMCIGMO_EST'], errors='coerce')

In [33]:

ct1_group = sub1_df.groupby('NUMCIGMO_EST').size()
#print(ct1_group)

In [34]:

# Use ols function for calculating the F-statistic and associated p value
model1_ols = smf.ols(formula='NUMCIGMO_EST ~ C(MAJORDEPLIFE)', data=sub1_df)
results1_fitted = model1_ols.fit()
print(results1_fitted.summary())

                            OLS Regression Results                            
Dep. Variable:           NUMCIGMO_EST   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     3.567
Date:                Wed, 29 Nov 2017   Prob (F-statistic):             0.0591
Time:                        13:01:26   Log-Likelihood:                -12197.
No. Observations:                1703   AIC:                         2.440e+04
Df Residuals:                    1701   BIC:                         2.441e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept              320.6635 

In [36]:

column_list = ['NUMCIGMO_EST', 'MAJORDEPLIFE']
sub2_df = sub1_df[column_list].dropna().copy()

In [37]:

print('means for numcigmo_est by major depression status')
m1_df = sub2_df.groupby('MAJORDEPLIFE').mean()
print(m1_df)

means for numcigmo_est by major depression status
              NUMCIGMO_EST
MAJORDEPLIFE              
0               320.663484
1               353.162556


In [38]:

print('standard deviations for numcigmo_est by major depression status')
sd1_df = sub2_df.groupby('MAJORDEPLIFE').std()
print(sd1_df)

# I will call it sub3
sub3_df = sub1_df[['NUMCIGMO_EST', 'ETHRACE2A']].dropna()

standard deviations for numcigmo_est by major depression status
              NUMCIGMO_EST
MAJORDEPLIFE              
0               302.827616
1               337.299279


In [45]:
from IPython.display import HTML

HTML('''<h3>Model Interpretation for ANOVA:</h3>

<p>When examining the association between estimated number of cigarettes smoked per month (quantitative response) and major lifetime 
depression (categorical explanatory), an Analysis of Variance (ANOVA) revealed that those with major lifetime depression reported smoking 
insignificantly more cigarettes per day (Mean=''' + 
     str('%.1f' % m1_df.loc[1, 'NUMCIGMO_EST']) + ''', s.d. ±''' + 
     str('%.1f' % sd1_df.loc[1, 'NUMCIGMO_EST']) + ''') compared to those without major lifetime depression (Mean=''' + 
     str('%.1f' % m1_df.loc[0, 'NUMCIGMO_EST']) + ''', s.d. ±''' + 
     str('%.1f' % sd1_df.loc[0, 'NUMCIGMO_EST']) + '''), F(''' + 
     str('%d' % results1_fitted.df_model) + ''', ''' + 
     str('%d' % results1_fitted.df_resid) + ''')=''' + 
     str('%.2f' % results1_fitted.fvalue) + ''', p=''' + 
     str('%.3f' % results1_fitted.f_pvalue) + '''.</p>''')

In [47]:

model2_fitted = smf.ols(formula='NUMCIGMO_EST ~ C(ETHRACE2A)', data=sub3_df).fit()
print(model2_fitted.summary())

                            OLS Regression Results                            
Dep. Variable:           NUMCIGMO_EST   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.039
Method:                 Least Squares   F-statistic:                     18.29
Date:                Wed, 29 Nov 2017   Prob (F-statistic):           1.00e-14
Time:                        13:28:27   Log-Likelihood:                -12163.
No. Observations:                1703   AIC:                         2.434e+04
Df Residuals:                    1698   BIC:                         2.436e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept           377.8724      9.41

In [48]:

print('means for numcigmo_est by ethnic group')
m2_df = sub3_df.groupby('ETHRACE2A').mean()
print(m2_df)

means for numcigmo_est by ethnic group
           NUMCIGMO_EST
ETHRACE2A              
1            377.872401
2            259.273810
3            310.988095
4            244.258621
5            236.177612


In [49]:

print('standard deviations for numcigmo_est by ethnic group')
sd2_df = sub3_df.groupby('ETHRACE2A').std()
print(sd2_df)

standard deviations for numcigmo_est by ethnic group
           NUMCIGMO_EST
ETHRACE2A              
1            317.996738
2            278.677392
3            260.116964
4            195.076441
5            305.792022


In [50]:

mc1 = multi.MultiComparison(sub3_df['NUMCIGMO_EST'], sub3_df['ETHRACE2A'])
res1 = mc1.tukeyhsd()
print(res1.summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2  meandiff   lower    upper   reject
-------------------------------------------------
  1      2    -118.5986 -181.7829 -55.4143  True 
  1      3     -66.8843 -198.4768 64.7082  False 
  1      4    -133.6138 -246.4056 -20.822   True 
  1      5    -141.6948 -194.1287 -89.2608  True 
  2      3     51.7143   -89.6593 193.0879 False 
  2      4     -15.0152 -139.0793 109.0489 False 
  2      5     -23.0962  -96.7116 50.5192  False 
  3      4     -66.7295 -236.1881 102.7291 False 
  3      5     -74.8105 -211.7176 62.0966  False 
  4      5      -8.081  -127.0304 110.8684 False 
-------------------------------------------------


In [113]:
import io

output = io.StringIO(initial_value=res1._results_table.as_csv().strip())
res1_df = pd.read_csv(output).reset_index()
output.close()
res1_df.columns = res1_df.iloc[0]
res1_df = res1_df.reindex(res1_df.index.drop(0))
res1_df['reject'] = res1_df['reject'].map(lambda x: x.strip() == 'True')
res1_df['group1'] = res1_df['group1'].map(lambda x: int(x.strip()))
res1_df['group2'] = res1_df['group2'].map(lambda x: int(x.strip()))
mask_series = (res1_df['reject'] == True)
column_list = ['group1', 'group2']
race_dict = {1: 'White, Not Hispanic or Latino',
             2: 'Black, Not Hispanic or Latino',
             3: 'American Indian/Alaska Native, Not Hispanic or Latino',
             4: 'Asian/Native Hawaiian/Pacific Islander, Not Hispanic or Latino',
             5: 'Hispanic or Latino'}

def f(row):
    left_race = race_dict[row['group1']]
    right_race = race_dict[row['group2']]
    
    return('<code>' + left_race + 
           '</code> individuals reported significantly more estimated number of cigarettes smoked per month compared to <code>' + 
           right_race + '</code> individuals')

if model2_fitted.f_pvalue < 0.0001:
    f_pvalue_statement = ', p<0.0001'
else:
    f_pvalue_statement = ', p=' + str('%.4f' % model2_fitted.f_pvalue)

In [114]:
from IPython.display import HTML

HTML('''<h3>Model Interpretation for post hoc ANOVA results:</h3>

<p>ANOVA revealed that among smokers, imputed race/ethnicity (5 categories, which is the categorical explanatory variable) and estimated 
number of cigarettes smoked per month (quantitative response variable) were significantly associated, F(''' + 
     str('%d' % model2_fitted.df_model) + ''', ''' + 
     str('%d' % model2_fitted.df_resid) + ''')=''' + 
     str('%.2f' % model2_fitted.fvalue) + 
     f_pvalue_statement + '''. Post hoc comparisons of estimated number of cigarettes smoked per month by pairs of imputed 
     race/ethnicity categories revealed that ''' + 
     ', and '.join(res1_df[mask_series][column_list].apply(f, axis=1).tolist()) + 
     '''. All other comparisons were statistically similar.</p>''')