# Anxiety/depression and chi-squared statistical test

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

# Suppress every warning category for the rest of the notebook.
# NOTE(review): this blanket filter also hides pandas / scikit-learn warnings
# emitted by later cells -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Load the interim pulse2020 PUF extract (path is relative to the notebook's folder).
pulse_csv_path = '../data/interim/pulse2020_puf_all.csv'
df_pulse = pd.read_csv(pulse_csv_path)

In [3]:
# Summarise index range, column count, dtypes and memory footprint of the loaded frame.
df_pulse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197365 entries, 0 to 1197364
Columns: 207 entries, SCRAM to depression_disorder
dtypes: bool(2), float64(203), object(2)
memory usage: 1.8+ GB


In [4]:
# List the column labels of the loaded frame.
df_pulse.columns

Index(['SCRAM', 'WEEK', 'EST_ST', 'EST_MSA', 'PWEIGHT', 'ABIRTH_YEAR',
       'EGENDER', 'AGENDER', 'RHISPANIC', 'AHISPANIC',
       ...
       'PSWHYCHG3', 'PSWHYCHG4', 'PSWHYCHG5', 'PSWHYCHG6', 'PSWHYCHG7',
       'PSWHYCHG8', 'PSWHYCHG9', 'AGE', 'anxiety_disorder',
       'depression_disorder'],
      dtype='object', length=207)

### Dataset post-data wrangling

In [5]:
# Preview the first rows of the loaded frame (head() defaults to five rows).
df_pulse.head()

Unnamed: 0,SCRAM,WEEK,EST_ST,EST_MSA,PWEIGHT,ABIRTH_YEAR,EGENDER,AGENDER,RHISPANIC,AHISPANIC,...,PSWHYCHG3,PSWHYCHG4,PSWHYCHG5,PSWHYCHG6,PSWHYCHG7,PSWHYCHG8,PSWHYCHG9,AGE,anxiety_disorder,depression_disorder
0,V010000001S10011099370111,2020-04-23,47.0,,1973.489532,2.0,2.0,2.0,1.0,2.0,...,,,,,,,,34.0,True,False
1,V010000001S10011900470112,2020-04-23,1.0,,1929.488419,2.0,1.0,2.0,1.0,2.0,...,,,,,,,,65.0,True,True
2,V010000001S18010744940111,2020-04-23,26.0,,1526.61238,2.0,2.0,2.0,1.0,2.0,...,,,,,,,,44.0,False,False
3,V010000001S37010301340112,2020-04-23,1.0,,2734.635354,2.0,1.0,2.0,1.0,2.0,...,,,,,,,,56.0,True,True
4,V010000001S37010480340111,2020-04-23,1.0,,681.211425,2.0,2.0,2.0,1.0,2.0,...,,,,,,,,57.0,False,False


In [6]:
# (rows, columns) of the loaded frame.
df_pulse.shape

(1197365, 207)

For a quick analysis, we are going to ignore the secondary variables and use only those variables asked of the whole sample:

In [7]:
def _numbered_columns(prefix, count):
    """Return the column names ``prefix1`` .. ``prefix<count>`` in ascending order."""
    return ['{}{}'.format(prefix, i) for i in range(1, count + 1)]


# Secondary (contextual) variables that are dropped from the analysis frame.
# Numbered families are generated; the overall order matches the original listing.
secondary_columns = [
    'KINDWORK',
    'RSNNOWRK',
    'UNEMPPAY',
    *_numbered_columns('FOODSUFRSN', 5),
    *_numbered_columns('WHEREFREE', 7),
    'FOODCONF',
    'MORTLMTH',
    'MORTCONF',
    *_numbered_columns('TEACH', 5),
    'COMPAVAIL',
    *_numbered_columns('COMP', 3),
    'INTRNTAVAIL',
    *_numbered_columns('INTRNT', 3),
    'TSCHLHRS',
    'TTCH_HRS',
]

In [8]:
# Report how many columns are listed for removal.
n_secondary = len(secondary_columns)
print('Number of contextual variables: {}'.format(n_secondary))

Number of contextual variables: 33


Deleting contextual variables

In [9]:
# New frame without the secondary columns; df_pulse itself is left unchanged.
df_universal = df_pulse.drop(secondary_columns, axis='columns')

In [10]:
# Confirm the column count and memory footprint after removing the secondary columns.
df_universal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197365 entries, 0 to 1197364
Columns: 174 entries, SCRAM to depression_disorder
dtypes: bool(2), float64(170), object(2)
memory usage: 1.5+ GB


We are going to do a chi-squared test to find the universal variables most related to the Anxiety and Depression outcomes, which were calculated previously from the variables `ANXIOUS`, `INTEREST`, `DOWN` and `WORRY`.

Drop columns with more than 25% missing values (i.e., keep only the columns that have at least 75% non-null values)

In [11]:
# Minimum number of non-null values a column must have to be kept:
# 75% of the row count, truncated to an integer.
min_non_null_share = 0.75
n_rows = df_universal.shape[0]
threshold = int(min_non_null_share * n_rows)
threshold

898023

In [12]:
# Keep only the columns that reach `threshold` non-null values, then list what survived.
df_universal1 = df_universal.dropna(axis='columns', thresh=threshold)
df_universal1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197365 entries, 0 to 1197364
Data columns (total 46 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   SCRAM                1197365 non-null  object 
 1   WEEK                 1197365 non-null  object 
 2   EST_ST               1197365 non-null  float64
 3   PWEIGHT              1197365 non-null  float64
 4   ABIRTH_YEAR          1197365 non-null  float64
 5   EGENDER              1197365 non-null  float64
 6   AGENDER              1197365 non-null  float64
 7   RHISPANIC            1197365 non-null  float64
 8   AHISPANIC            1197365 non-null  float64
 9   RRACE                1197365 non-null  float64
 10  ARACE                1197365 non-null  float64
 11  EEDUC                1197365 non-null  float64
 12  AEDUC                1197365 non-null  float64
 13  MS                   1188120 non-null  float64
 14  THHLD_NUMPER         1197365 non-null  float64
 15

Drop the ID (`SCRAM`), `WEEK`, and the variables used to calculate the outcomes of the chi-squared test (`ANXIOUS` and `WORRY` were used to estimate the ANXIETY DISORDER outcome; `DOWN` and `INTEREST` were used to score the DEPRESSION DISORDER outcome).

In [13]:
# Remove the identifier, the survey week and the four items the outcome flags
# were derived from, so they cannot be scored as predictors of those outcomes.
# Rebind instead of using inplace=True: df_universal1 was produced by dropna(),
# and mutating such a derived frame in place can raise SettingWithCopyWarning.
columns_to_exclude = ['SCRAM', 'WEEK', 'ANXIOUS', 'WORRY', 'DOWN', 'INTEREST']
df_universal1 = df_universal1.drop(columns=columns_to_exclude)

Drop duplicates and missing datapoints

In [14]:
# Remove fully duplicated rows; rebinding (rather than inplace=True) keeps the
# step free of in-place mutation of a derived frame.
df_universal1 = df_universal1.drop_duplicates()

In [15]:
# Remove every row that still has at least one missing value; rebinding
# (rather than inplace=True) avoids in-place mutation of a derived frame.
df_universal1 = df_universal1.dropna()

In [16]:
# Inspect the rows and columns that remain after removing duplicates and incomplete rows.
df_universal1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 817105 entries, 0 to 1197364
Data columns (total 40 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   EST_ST               817105 non-null  float64
 1   PWEIGHT              817105 non-null  float64
 2   ABIRTH_YEAR          817105 non-null  float64
 3   EGENDER              817105 non-null  float64
 4   AGENDER              817105 non-null  float64
 5   RHISPANIC            817105 non-null  float64
 6   AHISPANIC            817105 non-null  float64
 7   RRACE                817105 non-null  float64
 8   ARACE                817105 non-null  float64
 9   EEDUC                817105 non-null  float64
 10  AEDUC                817105 non-null  float64
 11  MS                   817105 non-null  float64
 12  THHLD_NUMPER         817105 non-null  float64
 13  AHHLD_NUMPER         817105 non-null  float64
 14  THHLD_NUMKID         817105 non-null  float64
 15  AHHLD_NUMKID    

## Selecting K Best features for Anxiety disorders

In [17]:
# Seed NumPy's global RNG; the split below is additionally pinned via random_state.
np.random.seed(100)

# Both outcome flags are removed from the predictors; the anxiety flag is the target here.
outcome_columns = ['anxiety_disorder', 'depression_disorder']
X = df_universal1.drop(columns=outcome_columns)
y = df_universal1['anxiety_disorder']
# NOTE(review): X still contains PWEIGHT (presumably the survey weight rather than a
# respondent attribute) -- confirm whether it should be a chi-squared candidate.
print(X.shape)

# Hold out 30% of the rows; the feature scoring is fitted on the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
print(X_train.shape)

(817105, 38)
(571973, 38)


In [18]:
# Number of top-scoring features to flag as selected.
K = 20
# NOTE(review): scikit-learn's chi2 expects non-negative features such as counts or
# frequencies; the columns here look like numeric category codes, so the scores
# presumably depend on how each variable is coded -- confirm this is intended.
sel_chi2 = SelectKBest(score_func=chi2, k=K)
# Fit on the training split, then reduce that same split to the K selected columns.
X_train_chi2 = sel_chi2.fit(X_train, y_train).transform(X_train)

In [19]:
# Boolean flags, one per input column: True where the column is among the K selected.
set_chi2_list = list(sel_chi2.get_support())
# Take the names straight from the matrix the selector was fitted on. This keeps
# names and scores aligned and avoids re-running a full drop() on the large
# frame just to read its column labels.
columns_list = list(X_train.columns)

In [20]:
# One row per candidate variable: name, chi-squared score (3 decimals), selection flag.
anxiety_scores = np.round(sel_chi2.scores_, 3)
dictionary_anxiety_variables = {
    'variable': columns_list,
    'chi-squared scores': anxiety_scores,
    'selected': set_chi2_list,
}
df_chiSelectionAnxiety = pd.DataFrame(dictionary_anxiety_variables)

In [21]:
# Rank the variables from highest to lowest chi-squared score.
df_chiSelectionAnxiety.sort_values(by='chi-squared scores', ascending=False)

Unnamed: 0,variable,chi-squared scores,selected
1,PWEIGHT,2800628.42,True
37,AGE,78597.324,True
23,TSPNDFOOD,21723.779,True
25,HLTHSTATUS,17556.675,True
21,CURFOODSUF,14106.833,True
36,INCOME,13901.842,True
11,MS,9692.085,True
20,PRIFOODSUF,7443.217,True
33,DELAY,3900.684,True
34,NOTGET,3549.19,True


## Select K-Best features for Depression Disorders

In [22]:
np.random.seed(100)

# Same predictors as for the anxiety outcome; the depression flag is the target here.
outcome_columns = ['anxiety_disorder', 'depression_disorder']
X = df_universal1.drop(columns=outcome_columns)
y = df_universal1['depression_disorder']
print(X.shape)

# Hold out 30% of the rows with the same random_state used for the anxiety split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
print(X_train.shape)

(817105, 38)
(571973, 38)


In [23]:
# Fresh selector for the depression target (replaces the anxiety selector held in sel_chi2).
sel_chi2 = SelectKBest(score_func=chi2, k=K)
# Fit on the training split, then reduce that same split to the K selected columns.
X_train_chi2 = sel_chi2.fit(X_train, y_train).transform(X_train)

In [24]:
# Boolean flags, one per input column: True where the column is among the K selected.
support_mask = sel_chi2.get_support()
set_chi2_list = list(support_mask)

In [25]:
# One row per candidate variable: name, chi-squared score (3 decimals), selection flag.
# The results go into a depression-specific dict so the anxiety dict built earlier
# is not overwritten, and the names come from the matrix this selector was
# fitted on rather than from a list created in the anxiety section.
dictionary_depression_variables = {
    'variable': list(X_train.columns),
    'chi-squared scores': np.round(sel_chi2.scores_, 3),
    'selected': set_chi2_list,
}
df_chiSelectionDepression = pd.DataFrame(dictionary_depression_variables)

In [26]:
# Rank the variables from highest to lowest chi-squared score.
df_chiSelectionDepression.sort_values(by='chi-squared scores', ascending=False)

Unnamed: 0,variable,chi-squared scores,selected
1,PWEIGHT,16343110.0,True
37,AGE,43408.84,True
36,INCOME,22568.4,True
25,HLTHSTATUS,21850.51,True
21,CURFOODSUF,16030.3,True
11,MS,14945.16,True
20,PRIFOODSUF,9162.641,True
23,TSPNDFOOD,5485.259,True
9,EEDUC,3589.818,True
35,TENURE,3065.05,True
