In [1]:

import numpy
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi 

nesarc_pds_df = pd.read_csv('../../data/csv/nesarc_pds.csv', low_memory=False)
nesarc_pds_df.columns

Index(['ETHRACE2A', 'ETOTLCA2', 'IDNUM', 'PSU', 'STRATUM', 'WEIGHT', 'CDAY',
       'CMON', 'CYEAR', 'REGION',
       ...
       'SOL12ABDEP', 'SOLP12ABDEP', 'HAL12ABDEP', 'HALP12ABDEP', 'MAR12ABDEP',
       'MARP12ABDEP', 'HER12ABDEP', 'HERP12ABDEP', 'OTHB12ABDEP',
       'OTHBP12ABDEP'],
      dtype='object', length=3008)

In [2]:

# Set variables you will be working with to numeric
working_column_list = ['S3AQ3B1', 'S3AQ3C1', 'CHECK321']
for column in working_column_list:
    nesarc_pds_df[column] = pd.to_numeric(nesarc_pds_df[column], errors='coerce')

In [3]:

# Subset nesarc_pds_df to young adults age 18 to 25 who have smoked in the past 12 months
match_series = (nesarc_pds_df['AGE']>=18) & (nesarc_pds_df['AGE']<=25) & (nesarc_pds_df['CHECK321']==1)
sub1_df = nesarc_pds_df[match_series].copy()

In [4]:

sub1_df['S3AQ3B1'].unique()
sub1_df['S3AQ3C1'].unique()

array([ 1.,  2.,  4.,  3.,  5.,  6.,  9.])

array([  3.,  10.,  20.,   5.,   8.,   1.,   2.,  98.,  30.,   4.,  12.,
        99.,   6.,  13.,   7.,  15.,  40.,  14.,  11.,  60.,  17.,  25.,
        16.,  27.,  80.,   9.,  35.,  24.,  19.,  18.,  28.])

In [5]:

# SET MISSING nesarc_pds_df
match_series = sub1_df['S3AQ3B1'].isnull()
sub1_df.loc[sub1_df[match_series].index, 'S3AQ3B1'] = 9
match_series = sub1_df['S3AQ3C1'].isnull()
sub1_df.loc[sub1_df[match_series].index, 'S3AQ3C1'] = 99

In [6]:

# Recode number of days smoked in the past month
recode1 = {1: 30, 2: 22, 3: 14, 4: 5, 5: 2.5, 6: 1}
sub1_df['USFREQMO'] = sub1_df['S3AQ3B1'].map(recode1)

In [7]:

# Convert new variable USFREQMMO to numeric
sub1_df['USFREQMO'] = pd.to_numeric(sub1_df['USFREQMO'], errors='coerce')

In [8]:

# Create a secondary variable multiplying the days smoked/month and the number of cig/per day
sub1_df['NUMCIGMO_EST'] = sub1_df['USFREQMO'] * sub1_df['S3AQ3C1']

In [9]:

sub1_df['NUMCIGMO_EST'] = pd.to_numeric(sub1_df['NUMCIGMO_EST'], errors='coerce')

In [10]:

ct1_group = sub1_df.groupby('NUMCIGMO_EST').size()
print(ct1_group)

NUMCIGMO_EST
1.0        29
2.0        14
2.5        11
3.0        12
4.0         2
5.0        34
6.0         1
7.5        12
8.0         1
10.0       38
12.5        9
14.0        3
15.0       14
17.5        1
20.0       13
22.0        4
24.0        1
25.0       14
28.0       17
30.0       25
35.0        2
42.0       19
44.0        9
50.0        7
56.0       15
60.0       28
66.0       14
70.0       22
84.0        3
88.0        6
         ... 
154.0       3
176.0       3
180.0      47
210.0      39
220.0      12
240.0      36
270.0       6
280.0       1
300.0     350
330.0       4
360.0      25
390.0       7
420.0       2
450.0      97
480.0       5
510.0       2
540.0       3
570.0       1
600.0     357
750.0      13
810.0       1
840.0       1
900.0      38
1050.0      1
1200.0     29
1800.0      2
2178.0      1
2400.0      1
2940.0      1
2970.0      5
Length: 68, dtype: int64


In [11]:

# Use ols function for calculating the F-statistic and associated p value
model1_ols = smf.ols(formula='NUMCIGMO_EST ~ C(MAJORDEPLIFE)', data=sub1_df)
results1_fitted = model1_ols.fit()
print(results1_fitted.summary())

                            OLS Regression Results                            
Dep. Variable:           NUMCIGMO_EST   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     3.567
Date:                Wed, 29 Nov 2017   Prob (F-statistic):             0.0591
Time:                        11:41:45   Log-Likelihood:                -12197.
No. Observations:                1703   AIC:                         2.440e+04
Df Residuals:                    1701   BIC:                         2.441e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept              320.6635 

In [12]:

column_list = ['NUMCIGMO_EST', 'MAJORDEPLIFE']
sub2_df = sub1_df[column_list].dropna().copy()

In [13]:

print('means for numcigmo_est by major depression status')
m1_df = sub2_df.groupby('MAJORDEPLIFE').mean()
print(m1_df)

means for numcigmo_est by major depression status
              NUMCIGMO_EST
MAJORDEPLIFE              
0               320.663484
1               353.162556


In [14]:

print('standard deviations for numcigmo_est by major depression status')
sd1_df = sub2_df.groupby('MAJORDEPLIFE').std()
print(sd1_df)

# I will call it sub3
sub3_df = sub1_df[['NUMCIGMO_EST', 'ETHRACE2A']].dropna()

standard deviations for numcigmo_est by major depression status
              NUMCIGMO_EST
MAJORDEPLIFE              
0               302.827616
1               337.299279


In [15]:

model2_fitted = smf.ols(formula='NUMCIGMO_EST ~ C(ETHRACE2A)', data=sub3_df).fit()
print(model2_fitted.summary())

                            OLS Regression Results                            
Dep. Variable:           NUMCIGMO_EST   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.039
Method:                 Least Squares   F-statistic:                     18.29
Date:                Wed, 29 Nov 2017   Prob (F-statistic):           1.00e-14
Time:                        11:42:16   Log-Likelihood:                -12163.
No. Observations:                1703   AIC:                         2.434e+04
Df Residuals:                    1698   BIC:                         2.436e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept           377.8724      9.41

In [16]:

print('means for numcigmo_est by ethnic group')
m2_df = sub3_df.groupby('ETHRACE2A').mean()
print(m2_df)

means for numcigmo_est by ethnic group
           NUMCIGMO_EST
ETHRACE2A              
1            377.872401
2            259.273810
3            310.988095
4            244.258621
5            236.177612


In [17]:

print('standard deviations for numcigmo_est by ethnic group')
sd2_df = sub3_df.groupby('ETHRACE2A').std()
print(sd2_df)

standard deviations for numcigmo_est by ethnic group
           NUMCIGMO_EST
ETHRACE2A              
1            317.996738
2            278.677392
3            260.116964
4            195.076441
5            305.792022


In [18]:

mc1 = multi.MultiComparison(sub3_df['NUMCIGMO_EST'], sub3_df['ETHRACE2A'])
res1 = mc1.tukeyhsd()
print(res1.summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2  meandiff   lower    upper   reject
-------------------------------------------------
  1      2    -118.5986 -181.7829 -55.4143  True 
  1      3     -66.8843 -198.4768 64.7082  False 
  1      4    -133.6138 -246.4056 -20.822   True 
  1      5    -141.6948 -194.1287 -89.2608  True 
  2      3     51.7143   -89.6593 193.0879 False 
  2      4     -15.0152 -139.0793 109.0489 False 
  2      5     -23.0962  -96.7116 50.5192  False 
  3      4     -66.7295 -236.1881 102.7291 False 
  3      5     -74.8105 -211.7176 62.0966  False 
  4      5      -8.081  -127.0304 110.8684 False 
-------------------------------------------------
