In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.linear_model import LassoCV, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Config: choose the data directories depending on whether the notebook runs
# on Google Colab or locally. Keep exactly one of the two sections active.

# On Google Colab (uncomment this section and comment out the local one):
# mount Google Drive and attach it to the Colab session for data I/O.
# from google.colab import drive
# drive.mount('/content/drive')
# input_dir = '/content/drive/My Drive/covid19_argentina/data/input/'
# output_dir = '/content/drive/My Drive/covid19_argentina/data/output/'

# Locally: paths are relative to the current working directory. The trailing
# slash is required because file names are appended by string concatenation.
input_dir = './data/input/'
output_dir = './data/output/'



In [None]:
def add_top_column(df, top_col, inplace=True):
    """Nest all columns of `df` under a single top-level header.

    The columns are replaced by a two-level MultiIndex whose first level is
    `top_col` and whose second level is the existing column labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are re-labelled.
    top_col : hashable
        Label used for the new top level.
    inplace : bool, default True
        If True the frame passed in is modified; otherwise a copy is modified
        and the original is left untouched.

    Returns
    -------
    pandas.DataFrame
        The re-labelled frame (the same object as `df` when `inplace` is True).
    """
    target = df if inplace else df.copy()
    nested_columns = pd.MultiIndex.from_product([[top_col], target.columns])
    target.columns = nested_columns
    return target

In [None]:
# Hyperparameter tuning

# Parameters of pipelines can be set using ‘__’ separated parameter names:

def lasso_standardscaler_alpha(X,y,cv=10):
    """Select the Lasso regularisation strength by cross-validated grid search.

    A ``StandardScaler`` -> ``Lasso`` pipeline is evaluated for 30 values of
    ``alpha``. Because the scaler is a pipeline step, it is fitted inside the
    search rather than on the full data beforehand. Candidates are scored with
    the pipeline's default ``score`` method (no ``scoring`` is passed).

    Parameters
    ----------
    X : array-like
        Covariate matrix passed to ``GridSearchCV.fit``.
    y : array-like
        Target values passed to ``GridSearchCV.fit``.
    cv : int or cross-validation splitter, default 10
        Forwarded unchanged to ``GridSearchCV``.

    Returns
    -------
    float
        The ``alpha`` of the best-ranked candidate.
    """
    # 30 values equally spaced on a log scale from 10**-4 (0.0001) to 10**0 (1)
    alphas = np.logspace(-4, 0, 30)
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', Lasso()),
    ])
    # Pipeline parameters are addressed as '<step name>__<parameter name>'
    param_grid = {
        'model__alpha': alphas,
    }
    search = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=-1)
    search.fit(X, y)
    # best_params_ always describes exactly one candidate. Filtering
    # cv_results_ on rank_test_score == 1 can return several rows when alphas
    # tie, which made the previous float(<array>) conversion fail.
    return float(search.best_params_['model__alpha'])

In [None]:
# Load the survey data (semicolon-separated CSV) from the input directory

df = pd.read_csv(
    input_dir + 'Emotional symptoms COVID19_Arg_May20_v2.csv',
    sep=';',
)
df

In [None]:
# List the column names of the loaded dataset
df.columns

In [None]:
# Select the outcomes and covariates used by the models. Compared with the
# correlation analysis, a few variables were added (marked '#added' below).
X = df[[
        'PHQ9_Total',
        'GAD7_Total',
        'Lockdown_adherence',
        'Age',
        'Gender', #added (binary)
        'Family_Income',  #added (ordinal)
        'Negat_thinking', #added (binary)
        'COGN_DIFFICULTIES_index',
        'Mental_fatigue', 'Threat', 'Riskofcontagion', 
       'Lockdown_difficulty', 'Financialworry_present',
       'Financialworry_future', 'Daily_stress_Index', 'IU_total',
       'UCLA_LS_Total',          
       ]]

# Create dummy variables for gender (one indicator column per gender code)
gender = pd.get_dummies(X['Gender'], prefix='Gender')
print(gender)
print(gender.sum())

# Replace the raw gender code by two 0/1 indicators; codes 1 and 2 are treated
# as female and male respectively.
# NOTE(review): assumes 'Gender' is stored as integers so the dummy columns are
# named 'Gender_1' / 'Gender_2' - confirm against the data file.
# The indicators are cast to int because recent pandas versions return boolean
# dummies; mixing bool with numeric columns turns `X.values` into an object
# array and makes `describe()` skip the indicators.
X = X.drop('Gender', axis=1)
X['Gender_female'] = gender['Gender_1'].astype(int)
X['Gender_male'] = gender['Gender_2'].astype(int)
# other genders were not included since there were only 9. 

# Column names of the full design frame (outcomes + covariates), in order
variables = X.columns.values


In [None]:
# Create the covariate matrix and target vector for each outcome (dependent
# variable). Every outcome is modelled from all the remaining columns of X, so
# the other two outcomes are kept as covariates.

def _split_outcome(data, outcome, label):
    """Separate the `outcome` column of `data` from the remaining columns.

    Prints the covariate names and the array shapes (prefixed by `label`) and
    returns ``(covariate_names, covariate_values, outcome_values)``.
    """
    covariates = data.drop(outcome, axis=1)
    names = covariates.columns.values
    # Cast to float so the matrix is numeric even when some columns are
    # boolean indicators (bool + numeric columns otherwise give dtype=object).
    features = covariates.to_numpy(dtype=float)
    target = data[outcome].values
    print(f'{label} covariates', names)
    print(features.shape, target.shape)
    print('\n\n')
    return names, features, target


# Depression
variables_depression, X_depression, y_depression = _split_outcome(X, 'PHQ9_Total', 'depression')

# Anxiety
variables_anxiety, X_anxiety, y_anxiety = _split_outcome(X, 'GAD7_Total', 'Anxiety')

# Lockdown Adherence
variables_lockdown, X_lockdown, y_lockdown = _split_outcome(X, 'Lockdown_adherence', 'Lockdown Adherence')



In [None]:
# Observe the range of the covariates: summary statistics of every column of X
# except PHQ9_Total (by default describe() only reports numeric columns)
X.drop('PHQ9_Total',axis=1).describe()

In [None]:
# Show the raw variable names that need clean labels for the manuscript table
variables

In [None]:
# Map each raw column name to the label used in the manuscript table.
# The mapping is written out explicitly instead of zipping a label list with
# `variables`: a positional zip silently truncates or misaligns the labels
# whenever the column list changes or `variables` is rebound by another cell.
clean_names = {
    'PHQ9_Total': 'PHQ9',
    'GAD7_Total': 'GAD7',
    'Lockdown_adherence': 'Lockdown adherence',
    'Age': 'Age',
    'Family_Income': 'Family income',
    'Negat_thinking': 'Negative thinking',
    'COGN_DIFFICULTIES_index': 'Cognitive troubles',
    'Mental_fatigue': 'Mental fatigue',
    'Threat': 'Perceived threat',
    'Riskofcontagion': 'Perceived risk',
    'Lockdown_difficulty': 'Lockdown difficulty',
    'Financialworry_present': 'Financial worries (present)',
    'Financialworry_future': 'Financial worries (future)',
    'Daily_stress_Index': 'Daily stress',
    'IU_total': 'Intolerance of uncertainty',
    'UCLA_LS_Total': 'Loneliness scale',
    'Gender_female': 'Female',
    'Gender_male': 'Male',
}

clean_names

In [None]:
# Train and test: fit one Lasso model per outcome and export its coefficients

coefs_all = []  # formatted coefficient table of each outcome
r2_all = []     # [outcome name, test-set R^2] pairs

# The covariate names are bound to `covariate_names`, not `variables`, so that
# the loop does not overwrite the full column list stored in `variables`.
for X_i, y_i, covariate_names, name in [
    [X_depression, y_depression, variables_depression, 'depression'],
    [X_anxiety, y_anxiety, variables_anxiety, 'anxiety'],
    [X_lockdown, y_lockdown, variables_lockdown, 'lockdown'],
]:
    # Train test split (20 % held out, fixed seed so the split is repeatable)
    X_train, X_test, y_train, y_test = train_test_split(X_i, y_i, test_size=0.2, random_state=42)

    # Hyperparameter tuning: find best alpha through CV on the training set only
    best_alpha = lasso_standardscaler_alpha(X_train, y_train, cv=10)

    # Train final model
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', Lasso(alpha=best_alpha)),
    ])
    pipe.fit(X_train, y_train)

    # Test
    y_pred = pipe.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    # Obtain the coefficients. The covariates are standardized by the pipeline;
    # the outcome is left in its original units.
    coefs = pd.DataFrame(pipe['model'].coef_, index=covariate_names, columns=['Coef.'])

    # Clean dataframe
    # Fall back to the raw name so that an unmapped covariate is never shown as None
    coefs.index = [clean_names.get(n, n) for n in coefs.index.values]
    coefs['Abs. Coef.'] = coefs['Coef.'].abs()
    # Largest absolute coefficient first; the 1-based position becomes 'Importance'
    coefs = coefs.sort_values('Abs. Coef.')[::-1].reset_index()
    coefs.index += 1
    coefs = coefs.reset_index().round(2)
    coefs.columns = ['Importance', 'Covariate', 'Coef.', 'Abs. Coef.']
    # Drop the leading zero ('0.25' -> '.25', '-0.25' -> '-.25'); the dot is
    # escaped so that only a literal '0.' prefix is rewritten.
    coefs['Coef.'] = coefs['Coef.'].astype(str).replace(r'^(-?)0\.', r'\1.', regex=True)
    if name == 'depression':
        # Only the first table keeps the rank column
        coefs = coefs.drop(['Abs. Coef.'], axis=1)
    else:
        coefs = coefs.drop(['Abs. Coef.', 'Importance'], axis=1)

    # Put the outcome name and its test R^2 as a header above the table
    coefs = add_top_column(coefs, f'{name.capitalize()} R^2 = {np.round(r2,2)}')
    coefs.to_csv(output_dir + f'coefs_{name}.csv', index=False)
    coefs_all.append(coefs)
    r2_all.append([name, np.round(r2, 2)])



In [None]:
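# Alternative (not used): LassoCV runs the cross-validation itself; the selected alpha is available afterwards as `reg.alpha_`.
# NOTE: the `normalize` argument has been removed from recent scikit-learn releases; standardize the inputs in a Pipeline instead.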
#     reg = LassoCV(cv=10,alphas = alphas,normalize=True, random_state=0).fit(X_train, y_train) #normalize
#     r2 = reg.score(X_test, y_test)

In [None]:
# Place the three per-outcome coefficient tables side by side, export the
# combined table and display it
coefs_all_clean = pd.concat(coefs_all, axis='columns')
coefs_all_clean.to_csv(
    output_dir + 'coefs_all.csv',
    index=False,
)
coefs_all_clean