In [8]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Load the raw responses: the file is tab-separated, and malformed lines are
# skipped instead of aborting the read.
data = pd.read_csv(
    'DASS/data.csv',
    sep='\t',
    on_bad_lines='skip',
)



In [9]:
# cleaning
# Remove exact duplicate rows, then every row that has a missing value in any column.
# NOTE(review): dropna() is called without `subset=`, so a gap in a column this
# notebook never reads still discards the whole row — confirm that is intended.
data_complete = data.drop_duplicates().dropna()

columns_to_check = [f'Q{i}E' for i in range(1, 43)]
checked = data_complete[columns_to_check]

# Both row filters are computed as vectorised boolean masks rather than with a
# Python lambda per row; the selected rows are the same.
# Keep rows whose 42 Q*E values all lie within [1000, 60000].
within_bounds = (checked.ge(1000) & checked.le(60000)).all(axis=1)
# Flag rows whose 42 Q*E values are all equal to the first one (Q1E).
# NOTE(review): this check runs on the Q*E columns, not on the Q*A answers —
# confirm that identical Q*E values (and not identical answers) is what should
# be excluded.
all_identical = checked.eq(checked.iloc[:, 0], axis=0).all(axis=1)

# .copy() makes data_clean an independent frame, so the column assignments below
# cannot raise SettingWithCopyWarning on a filtered slice.
data_clean = data_complete.loc[within_bounds & ~all_identical].copy()

# Each score is the row-wise sum of the 14 Q*A columns listed for that scale.
data_clean['depression_score'] = data_clean[[f'Q{i}A' for i in [3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42]]].sum(axis=1)
data_clean['anxiety_score'] = data_clean[[f'Q{i}A' for i in [2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41]]].sum(axis=1)
data_clean['stress_score'] = data_clean[[f'Q{i}A' for i in [1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39]]].sum(axis=1)



In [10]:
# Explanatory variables: the 42 answer columns Q1A..Q42A.
features = [
    f'Q{i}A'
    for i in range(1, 43)
]
X = data_clean.loc[:, features]

# Response variables: the three scale scores, in the order depression, anxiety, stress.
# NOTE(review): each score is defined in the cleaning cell as the sum of 14 of the
# Q*A columns that are also in X, so every target is an exact function of the
# features — confirm this is the intended modelling task.
y = data_clean.loc[:, ['depression_score', 'anxiety_score', 'stress_score']]


In [11]:
# Split into training and test sets: 20% of the rows are held out for testing,
# with a fixed random_state so the shuffle is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [12]:
# LightGBM
from sklearn.multioutput import MultiOutputRegressor

# Hyper-parameters passed to the base LightGBM regressor.
params = dict(
    objective='regression',
    metric='rmse',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

# Wrap the base regressor so that one model is fitted per column of y_train
# (the training log shows three separate LightGBM fits).
lgb_regressor = lgb.LGBMRegressor(**params)
model = MultiOutputRegressor(estimator=lgb_regressor)
model.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000818 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210
[LightGBM] [Info] Number of data points in the train set: 16016, number of used features: 42
[LightGBM] [Info] Start training from score 34.096341
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210
[LightGBM] [Info] Number of data points in the train set: 16016, number of used features: 42
[LightGBM] [Info] Start training from score 29.111201
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

In [13]:
# Predict on the test set
from sklearn.metrics import mean_squared_error


# One prediction column per target, in the same column order as y_train.
y_pred = model.predict(X_test)

# Model evaluation: mean squared error per target, pairing each y_test column
# with the prediction column at the same position.
mse_depression, mse_anxiety, mse_stress = (
    mean_squared_error(y_test[target], y_pred[:, position])
    for position, target in enumerate(['depression_score', 'anxiety_score', 'stress_score'])
)

print(f'MSE for Depression: {mse_depression}')
print(f'MSE for Anxiety: {mse_anxiety}')
print(f'MSE for Stress: {mse_stress}')


MSE for Depression: 0.3403872734386352
MSE for Anxiety: 0.4130153433893837
MSE for Stress: 0.4145043827284424
