In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, accuracy_score

data = pd.read_csv('DASS/data.csv', sep='\t')


In [10]:
# Drop unnecessary columns
data = data.drop(['major', 'country'], axis=1)

# duplicates and  null values
data = data.drop_duplicates().dropna()

# 3-sigma 
def three_sigma_filter(data, column):
    mean = data[column].mean()
    std = data[column].std()
    lower_bound = mean - 3 * std
    upper_bound = mean + 3 * std
    return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

data = three_sigma_filter(data, 'testelapse')
data = three_sigma_filter(data, 'surveyelapse')

# identical answers
n = list(range(0, 126, 3)) 
question_cols = data.columns[n]
data = data[~data[question_cols].apply(lambda row: row.nunique() == 1, axis=1)]


In [11]:
def calculate_scores(df, questions):
    return df[[f'Q{i}A' for i in questions]].sum(axis=1)

data['depression_score'] = calculate_scores(data, [3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42])
data['anxiety_score'] = calculate_scores(data, [2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41])
data['stress_score'] = calculate_scores(data, [1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39])

def map_to_levels(score, thresholds):
    for i, threshold in enumerate(thresholds):
        if score <= threshold:
            return i
    return len(thresholds)

data['depression_level'] = data['depression_score'].apply(lambda x: map_to_levels(x, [9, 13, 20, 27]))
data['anxiety_level'] = data['anxiety_score'].apply(lambda x: map_to_levels(x, [7, 9, 14, 19]))
data['stress_level'] = data['stress_score'].apply(lambda x: map_to_levels(x, [14, 18, 25, 33]))

data['total_emotion_level'] = data.apply(lambda row: round((row['depression_level'] + 
                                                            row['anxiety_level'] + 
                                                            row['stress_level']) * 11 / 13), axis=1)


In [12]:
# Multioutput  
X = data[[f'Q{i}A' for i in range(1, 43)]]
y = data[['depression_score', 'anxiety_score', 'stress_score']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

lgb_regressor = LGBMRegressor(num_leaves=31, learning_rate=0.05, n_estimators=100)
model = MultiOutputRegressor(lgb_regressor)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
print(f'Mean Squared Error: Depression: {mse[0]}, Anxiety: {mse[1]}, Stress: {mse[2]}')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210
[LightGBM] [Info] Number of data points in the train set: 29597, number of used features: 42
[LightGBM] [Info] Start training from score 35.028246
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000754 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210
[LightGBM] [Info] Number of data points in the train set: 29597, number of used features: 42
[LightGBM] [Info] Start training from score 29.999932
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001462 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

In [13]:
# Using Classifier
from lightgbm import LGBMClassifier

y_class = data['total_emotion_level']
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y_class, test_size=0.25, random_state=42)

gbm_classifier = LGBMClassifier(num_leaves=31, learning_rate=0.05, n_estimators=100)
gbm_classifier.fit(X_train_class, y_train_class)

y_pred_class = gbm_classifier.predict(X_test_class)
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f'Classification Accuracy: {accuracy}')

print('Feature importances:', list(gbm_classifier.feature_importances_))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001365 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210
[LightGBM] [Info] Number of data points in the train set: 29597, number of used features: 42
[LightGBM] [Info] Start training from score -6.684510
[LightGBM] [Info] Start training from score -4.533377
[LightGBM] [Info] Start training from score -3.447423
[LightGBM] [Info] Start training from score -3.016799
[LightGBM] [Info] Start training from score -2.781174
[LightGBM] [Info] Start training from score -1.689492
[LightGBM] [Info] Start training from score -1.778035
[LightGBM] [Info] Start training from score -0.709945
Classification Accuracy: 0.8448205959862153
Feature importances: [np.int32(710), np.int32(640), np.int32(424), np.int32(296), np.int32(540), np.int32(654), np.int32(290), np.int32(681), np.int32(733), np.int32(533), 