In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.linear_model import LassoCV, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [23]:
# Configuration: where the notebook reads its input data and writes its results.
# Exactly one of the two setups below should be active.

# --- Option A: Google Colab -------------------------------------------------
# Mount Google Drive and attach it to the Colab runtime for data I/O.
# from google.colab import drive
# drive.mount('/content/drive')
# input_dir = '/content/drive/My Drive/covid19_argentina/data/input/'
# output_dir = '/content/drive/My Drive/covid19_argentina/data/output/'

# --- Option B: local checkout (active) --------------------------------------
# Both values keep their trailing slash because file names are appended to
# them by plain string concatenation further down.
output_dir = './data/output/'
input_dir = './data/input/'



In [2]:
def add_top_column(df, top_col, inplace=True):
    """Nest every column of ``df`` under a single top-level header.

    The columns are replaced by a two-level MultiIndex whose first level is
    ``top_col`` and whose second level is the existing column labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are re-labelled.
    top_col : hashable
        Label used for the new top level.
    inplace : bool, default True
        When true, ``df`` itself is modified; otherwise a copy is modified
        and ``df`` is left untouched.

    Returns
    -------
    pandas.DataFrame
        The re-labelled frame (``df`` itself when ``inplace`` is true).
    """
    target = df.copy() if not inplace else df
    nested_labels = pd.MultiIndex.from_product([[top_col], target.columns])
    target.columns = nested_labels
    return target

In [3]:
# Hyperparameter tuning

# Parameters of pipelines can be set using ‘__’ separated parameter names:

def lasso_standardscaler_alpha(X,y,cv=10):
    """Pick the Lasso regularisation strength by cross-validated grid search.

    A ``StandardScaler -> Lasso`` pipeline is evaluated for 30 candidate
    ``alpha`` values, log-spaced between 1e-4 and 1, and the alpha of the
    best-ranked candidate is returned.

    Parameters
    ----------
    X : array-like
        Covariates, passed unchanged to ``GridSearchCV.fit``.
    y : array-like
        Target, passed unchanged to ``GridSearchCV.fit``.
    cv : int or cross-validation generator, default 10
        Forwarded to ``GridSearchCV`` as its ``cv`` argument.

    Returns
    -------
    float
        The selected ``alpha``.
    """
    # 30 values equally spaced on a log scale from 10**-4 (0.0001) to 10**0 (1)
    alphas = np.logspace(-4, 0, 30)
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])
    # Pipeline parameters are addressed as '<step name>__<parameter name>'.
    param_grid = {
        'model__alpha': alphas,
    }
    search = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=-1)
    search.fit(X, y)
    # best_params_ always describes exactly one candidate. Filtering
    # cv_results_ on rank_test_score == 1 can return several rows when
    # candidates tie, and float() of a multi-element array raises.
    return float(search.best_params_['model__alpha'])

In [4]:
# Load data
# The input file is a semicolon-delimited CSV located in `input_dir`.
df = pd.read_csv(f'{input_dir}Emotional symptoms COVID19_Arg_May20_v2.csv', sep=';')
df

Unnamed: 0,Date,Age,Age_groups,Gender,Education_raw,Family_Income,Province,Region,Mental_health_tx,PHQ9_1,...,Riskofcontagion,Lockdown_adherence,Lockdown_difficulty,Financialworry_present,Financialworry_future,Daily_stress_Index,IU_total,UCLA_LS_Total,Loneliness_unidimensional,Negat_thinking
0,5/21/2020,24,1,1,Posgrado o especializaci�n de menos de 2 a�os,2,C�rdoba,0,1,1,...,3,9,7,4,4,1,27,9,2,1
1,5/21/2020,25,1,2,Universitario completo,1,Pcia. Buenos Aires (conurbano/AMBA),1,0,2,...,4,10,8,7,4,2,43,21,10,1
2,5/21/2020,20,1,1,Universitario incompleto,3,Pcia. Buenos Aires (conurbano/AMBA),1,0,1,...,1,10,1,10,10,2,13,0,0,1
3,5/21/2020,19,1,1,Universitario incompleto,2,Pcia. Buenos Aires (conurbano/AMBA),1,0,1,...,1,10,7,5,3,-4,27,9,5,1
4,5/21/2020,25,1,1,Terciario incompleto,2,CABA,1,0,1,...,2,10,7,8,6,-1,35,12,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3612,6/3/2020,66,4,1,Universitario incompleto,1,Pcia. Buenos Aires (conurbano/AMBA),1,1,1,...,7,9,4,9,5,-1,37,18,7,1
3613,6/3/2020,67,4,1,Universitario completo,2,CABA,1,0,2,...,6,6,7,8,6,2,25,5,4,0
3614,6/3/2020,75,4,1,Posgrado de m�s de 2 a�os (Maestr�as o similar),1,CABA,1,0,0,...,7,8,7,7,5,3,27,6,4,1
3615,6/4/2020,70,4,1,Posgrado de m�s de 2 a�os (Maestr�as o similar),2,Pcia. Buenos Aires (conurbano/AMBA),1,1,1,...,7,8,7,9,7,-4,22,18,7,1


In [5]:
# Inspect the column names of the loaded dataframe before selecting covariates
df.columns

Index(['Date', 'Age', 'Age_groups', 'Gender', 'Education_raw', 'Family_Income',
       'Province', 'Region', 'Mental_health_tx', 'PHQ9_1', 'PHQ9_2', 'PHQ9_3',
       'PHQ9_4', 'PHQ9_5', 'PHQ9_6', 'PHQ9_7', 'PHQ9_8', 'PHQ9_9', 'GAD7_1',
       'GAD7_2', 'GAD7_3', 'GAD7_4', 'GAD7_5', 'GAD7_6', 'GAD7_7', 'UCLA_LS_1',
       'UCLA_LS_2', 'UCLA_LS_3', 'UCLA_LS_4', 'UCLA_LS_5', 'UCLA_LS_6',
       'UCLA_LS_7', 'UCLA_LS_8', 'UCLA_LS_9', 'UCLA_LS_10', 'COGN_PLAN',
       'COGN_CONC', 'COGN_ERROR', 'COGN_WM', 'COGN_OLV', 'COGN_MWAND',
       'COGN_IMPROV', 'COGN_DIFFICULTIES_index', 'Dailystress1',
       'Dailystress2', 'Dailystress3', 'Dailystress4', 'Dailystress5',
       'Dailystress6', 'FAS3', 'FAS6', 'FAS7', 'FAS8', 'FAS9', 'IUS12_1',
       'IUS12_2', 'IUS12_3', 'IUS12_4', 'IUS12_5', 'IUS12_6', 'IUS12_7',
       'IUS12_8', 'IUS12_9', 'IUS12_10', 'IUS12_11', 'IUS12_12', 'PHQ9_dx',
       'PHQ9_Severity', 'GAD7_dx', 'GAD7_severity', 'PHQ9_Total', 'GAD7_Total',
       'Mental_fatigue', 'Thr

In [6]:
# Following the correlation analysis, a few variables were added to the set.
# X holds every modelled variable, including the three outcomes
# (PHQ9_Total, GAD7_Total, Lockdown_adherence); the outcome is removed from
# the covariates when each model is built.
X = df[[
        'PHQ9_Total',
        'GAD7_Total',
        'Lockdown_adherence',
        'Age',
        'Gender', #added (binary)
        'Family_Income',  #added (ordinal)
        'Negat_thinking', #added (binary)
        'COGN_DIFFICULTIES_index',
        'Mental_fatigue', 'Threat', 'Riskofcontagion', 
       'Lockdown_difficulty', 'Financialworry_present',
       'Financialworry_future', 'Daily_stress_Index', 'IU_total',
       'UCLA_LS_Total',          
       ]]

# Create dummy variables for gender.
# dtype=int pins the dummies to 0/1 integers: recent pandas versions return
# boolean dummies by default, and boolean columns mixed with the integer
# covariates would turn `X.values` into an object array downstream.
gender = pd.get_dummies(X['Gender'], prefix='Gender', dtype=int)
print(gender)
print(gender.sum())


# Replace the coded Gender column with two explicit indicator columns.
X = X.drop('Gender', axis=1)
X['Gender_female'] = gender['Gender_1']
X['Gender_male'] = gender['Gender_2']
# other genders were not included since there were only 9. 

# Column names of X, in order (outcomes first, gender indicators last).
variables = X.columns.values


      Gender_1  Gender_2  Gender_3
0            1         0         0
1            0         1         0
2            1         0         0
3            1         0         0
4            1         0         0
...        ...       ...       ...
3612         1         0         0
3613         1         0         0
3614         1         0         0
3615         1         0         0
3616         1         0         0

[3617 rows x 3 columns]
Gender_1    3083
Gender_2     525
Gender_3       9
dtype: int64


In [8]:
# Build a covariate matrix and a target vector for each of the three outcomes.
# For every outcome the target column is removed from X and all remaining
# columns (including the other two outcomes) are used as covariates.

# Depression (target: PHQ9_Total)

_covariates = X.drop(columns='PHQ9_Total')
variables_depression, X_depression = _covariates.columns.values, _covariates.values
y_depression = X['PHQ9_Total'].values
print('depression covariates', variables_depression)
print(X_depression.shape, y_depression.shape)
print('\n\n')

# Anxiety (target: GAD7_Total)

_covariates = X.drop(columns='GAD7_Total')
variables_anxiety, X_anxiety = _covariates.columns.values, _covariates.values
y_anxiety = X['GAD7_Total'].values
print('Anxiety covariates', variables_anxiety)
print(X_anxiety.shape, y_anxiety.shape)
print('\n\n')


# Lockdown adherence (target: Lockdown_adherence)

_covariates = X.drop(columns='Lockdown_adherence')
variables_lockdown, X_lockdown = _covariates.columns.values, _covariates.values
y_lockdown = X['Lockdown_adherence'].values
print('Lockdown Adherence covariates', variables_lockdown)
print(X_lockdown.shape, y_lockdown.shape)
print('\n\n')



depression covariates ['GAD7_Total' 'Lockdown_adherence' 'Age' 'Family_Income' 'Negat_thinking'
 'COGN_DIFFICULTIES_index' 'Mental_fatigue' 'Threat' 'Riskofcontagion'
 'Lockdown_difficulty' 'Financialworry_present' 'Financialworry_future'
 'Daily_stress_Index' 'IU_total' 'UCLA_LS_Total' 'Gender_female'
 'Gender_male']
(3617, 17) (3617,)



Anxiety covariates ['PHQ9_Total' 'Lockdown_adherence' 'Age' 'Family_Income' 'Negat_thinking'
 'COGN_DIFFICULTIES_index' 'Mental_fatigue' 'Threat' 'Riskofcontagion'
 'Lockdown_difficulty' 'Financialworry_present' 'Financialworry_future'
 'Daily_stress_Index' 'IU_total' 'UCLA_LS_Total' 'Gender_female'
 'Gender_male']
(3617, 17) (3617,)



Lockdown Adherence covariates ['PHQ9_Total' 'GAD7_Total' 'Age' 'Family_Income' 'Negat_thinking'
 'COGN_DIFFICULTIES_index' 'Mental_fatigue' 'Threat' 'Riskofcontagion'
 'Lockdown_difficulty' 'Financialworry_present' 'Financialworry_future'
 'Daily_stress_Index' 'IU_total' 'UCLA_LS_Total' 'Gender_female'
 'Gender_male']

In [9]:
# Observe the range of the covariates.
# PHQ9_Total is dropped, so the summary covers the same columns as the
# depression covariate set.
X.drop('PHQ9_Total',axis=1).describe()

Unnamed: 0,GAD7_Total,Lockdown_adherence,Age,Family_Income,Negat_thinking,COGN_DIFFICULTIES_index,Mental_fatigue,Threat,Riskofcontagion,Lockdown_difficulty,Financialworry_present,Financialworry_future,Daily_stress_Index,IU_total,UCLA_LS_Total,Gender_female,Gender_male
count,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0,3617.0
mean,7.033177,8.441803,47.310755,1.840752,0.735693,2.969035,10.442908,6.617639,5.447885,5.279513,6.909594,5.864805,0.425491,31.962123,8.664086,0.852364,0.145148
std,5.171025,1.831086,12.765485,0.77543,0.441025,2.189863,4.095191,2.417796,2.502014,2.908373,2.45566,2.402,2.930999,10.75401,6.675201,0.354788,0.352299
min,0.0,1.0,18.0,1.0,0.0,0.0,5.0,1.0,0.0,1.0,1.0,1.0,-8.0,12.0,0.0,0.0,0.0
25%,3.0,8.0,38.0,1.0,0.0,1.0,8.0,5.0,4.0,3.0,5.0,4.0,-2.0,24.0,3.0,1.0,0.0
50%,6.0,9.0,46.0,2.0,1.0,3.0,10.0,7.0,5.0,5.0,7.0,6.0,0.0,31.0,8.0,1.0,0.0
75%,10.0,10.0,56.0,2.0,1.0,5.0,12.0,8.0,7.0,8.0,9.0,8.0,2.0,39.0,13.0,1.0,0.0
max,21.0,10.0,95.0,3.0,1.0,7.0,25.0,10.0,10.0,10.0,10.0,10.0,10.0,60.0,30.0,1.0,1.0


In [13]:
# Provide clean names for manuscript table.
# Show the raw column names of X, in column order, that need readable labels.
variables

array(['PHQ9_Total', 'GAD7_Total', 'Lockdown_adherence', 'Age',
       'Family_Income', 'Negat_thinking', 'COGN_DIFFICULTIES_index',
       'Mental_fatigue', 'Threat', 'Riskofcontagion',
       'Lockdown_difficulty', 'Financialworry_present',
       'Financialworry_future', 'Daily_stress_Index', 'IU_total',
       'UCLA_LS_Total', 'Gender_female', 'Gender_male'], dtype=object)

In [14]:
# Readable labels for the manuscript table, keyed by raw column name.
# The mapping is written out explicitly instead of being zipped against
# `variables`, so it does not depend on the current value or ordering of that
# array and the cell gives the same result whenever it is re-run.
clean_names = {
    'PHQ9_Total': 'PHQ9',
    'GAD7_Total': 'GAD7',
    'Lockdown_adherence': 'Lockdown adherence',
    'Age': 'Age',
    'Family_Income': 'Family income',
    'Negat_thinking': 'Negative thinking',
    'COGN_DIFFICULTIES_index': 'Cognitive troubles',
    'Mental_fatigue': 'Mental fatigue',
    'Threat': 'Perceived threat',
    'Riskofcontagion': 'Perceived risk',
    'Lockdown_difficulty': 'Lockdown difficulty',
    'Financialworry_present': 'Financial worries (present)',
    'Financialworry_future': 'Financial worries (future)',
    'Daily_stress_Index': 'Daily stress',
    'IU_total': 'Intolerance of uncertainty',
    'UCLA_LS_Total': 'Loneliness scale',
    'Gender_female': 'Female',
    'Gender_male': 'Male',
}

clean_names

{'PHQ9_Total': 'PHQ9',
 'GAD7_Total': 'GAD7',
 'Lockdown_adherence': 'Lockdown adherence',
 'Age': 'Age',
 'Family_Income': 'Family income',
 'Negat_thinking': 'Negative thinking',
 'COGN_DIFFICULTIES_index': 'Cognitive troubles',
 'Mental_fatigue': 'Mental fatigue',
 'Threat': 'Perceived threat',
 'Riskofcontagion': 'Perceived risk',
 'Lockdown_difficulty': 'Lockdown difficulty',
 'Financialworry_present': 'Financial worries (present)',
 'Financialworry_future': 'Financial worries (future)',
 'Daily_stress_Index': 'Daily stress',
 'IU_total': 'Intolerance of uncertainty',
 'UCLA_LS_Total': 'Loneliness scale',
 'Gender_female': 'Female',
 'Gender_male': 'Male'}

In [19]:
# Train and test
#
# For each outcome: hold out 20% of the rows, tune alpha by cross-validation
# on the training part only, refit the scaler + Lasso pipeline with that
# alpha, score R^2 on the held-out part and export a ranked coefficient table.

coefs_all= []
r2_all=[]


# The loop variable is called `variables_i`, not `variables`, so that the
# loop does not overwrite the notebook-level `variables` array (all column
# names of X) that other cells read.
for X_i,y_i,variables_i,name in [
    [X_depression, y_depression, variables_depression,'depression'],
    [X_anxiety, y_anxiety, variables_anxiety,'anxiety'],
    [X_lockdown, y_lockdown,  variables_lockdown,'lockdown'],
                            ]:
    #Train test split
    X_train, X_test, y_train, y_test = train_test_split(X_i, y_i, test_size=0.2, random_state=42)

    # Hyperparameter tuning     
    best_alpha = lasso_standardscaler_alpha(X_train,y_train,cv=10) # Find best alpha through CV
    
    # Train final model      
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()), 
        ('model', Lasso(alpha = best_alpha))
    ])
    pipe.fit(X_train,y_train)
    
    # Test     
    y_pred = pipe.predict(X_test)
    r2 = r2_score(y_test,y_pred)
    
    # Obtain the Lasso coefficients. The covariates are standardized by the
    # scaler step; the target is left on its original scale.
    coefs= pd.DataFrame(pipe['model'].coef_,index=variables_i, columns= ['Coef.'])
    
    # Clean dataframe
    # Fall back to the raw column name when no readable label is defined,
    # instead of silently producing a None label.
    coefs.index = [clean_names.get(n, n) for n in coefs.index.values]
    coefs['Abs. Coef.'] = coefs['Coef.'].abs()
    # Largest absolute coefficient first; the 1-based rank becomes 'Importance'.
    coefs= coefs.sort_values('Abs. Coef.')[::-1].reset_index()
    coefs.index +=1
    coefs= coefs.reset_index().round(2)
    coefs.columns= ['Importance', 'Covariate', 'Coef.','Abs. Coef.']
    # Strip the leading zero ('0.49' -> '.49', '-0.29' -> '-.29'). The dot is
    # escaped so only a literal decimal point matches, and the optional sign
    # is captured so the back-reference is always defined.
    coefs['Coef.'] = coefs['Coef.'].astype(str).replace(r'^(-?)0\.',r'\1.',regex=True)
    if name == 'depression':
        # Keep the rank column once; it serves as the row label of the combined table.
        coefs= coefs.drop(['Abs. Coef.'], axis=1)#drop abs value
    else:
        coefs= coefs.drop(['Abs. Coef.','Importance'], axis=1)#drop abs value
    
    # Put the outcome name and its test R^2 above the table as a top-level header.
    coefs = add_top_column(coefs,f'{name.capitalize()} R^2 = {np.round(r2,2)}' )
    coefs.to_csv(output_dir+f'coefs_{name}.csv',index=False)
    coefs_all.append(coefs)
    r2_all.append([name,np.round(r2,2)])



In [20]:
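# Alternative (kept for reference, not run): LassoCV performs the cross-validated
# alpha search internally; the selected value is available afterwards as `reg.alpha_`.
# Caveats before re-enabling: `alphas` must be defined in this scope, and
# `normalize=True` is not equivalent to the StandardScaler step used above
# (the argument has been removed from recent scikit-learn releases).
#     reg = LassoCV(cv=10,alphas = alphas,normalize=True, random_state=0).fit(X_train, y_train) #normalize
#     r2 = reg.score(X_test, y_test)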

In [22]:
# Concatenate 3 dataframes
# The per-outcome coefficient tables are placed side by side (column-wise),
# written to a single CSV in `output_dir`, and displayed.
coefs_all_clean = pd.concat(coefs_all, axis='columns')
coefs_all_clean.to_csv(f'{output_dir}coefs_all.csv', index=False)
coefs_all_clean

Unnamed: 0_level_0,Depression R^2 = 0.69,Depression R^2 = 0.69,Depression R^2 = 0.69,Anxiety R^2 = 0.63,Anxiety R^2 = 0.63,Lockdown R^2 = 0.1,Lockdown R^2 = 0.1
Unnamed: 0_level_1,Importance,Covariate,Coef.,Covariate,Coef.,Covariate,Coef.
0,1,GAD7,2.57,PHQ9,2.62,Perceived threat,0.39
1,2,Mental fatigue,1.53,Intolerance of uncertainty,0.83,Female,0.12
2,3,Cognitive troubles,0.68,Lockdown difficulty,0.48,Financial worries (present),0.1
3,4,Loneliness scale,0.49,Cognitive troubles,0.32,Perceived risk,0.09
4,5,Daily stress,-0.29,Loneliness scale,0.31,Negative thinking,-0.08
5,6,Lockdown difficulty,0.22,Family income,0.29,Intolerance of uncertainty,0.08
6,7,Financial worries (present),-0.17,Negative thinking,0.29,Cognitive troubles,-0.07
7,8,Family income,-0.15,Daily stress,-0.21,Daily stress,0.05
8,9,Intolerance of uncertainty,0.14,Financial worries (present),-0.19,Family income,0.05
9,10,Age,-0.12,Financial worries (future),-0.19,GAD7,-0.05
