In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the raw mental-health risk dataset and display it
# (pandas truncates the rendered frame on its own).
df = pd.read_csv(
    filepath_or_buffer="../data/mental_health_risk_dataset.csv",
)
df

Unnamed: 0,age,gender,marital_status,education_level,employment_status,sleep_hours,physical_activity_hours_per_week,screen_time_hours_per_day,social_support_score,work_stress_level,...,depression_score,stress_level,mood_swings_frequency,concentration_difficulty_level,panic_attack_history,family_history_mental_illness,previous_mental_health_diagnosis,therapy_history,substance_use,mental_health_risk
0,56,Other,Single,Bachelor,Unemployed,8.6,2.8,9.6,7,10,...,4,8,8,3,1,0,1,1,1,1
1,47,Male,Single,Bachelor,Unemployed,4.5,2.7,3.0,10,6,...,7,4,9,3,0,0,0,0,0,0
2,56,Female,Divorced,Bachelor,Student,3.1,14.1,7.2,10,5,...,3,1,4,2,1,1,1,1,1,2
3,59,Other,Married,Bachelor,Employed,7.0,0.5,10.3,2,10,...,8,5,2,5,1,1,0,1,1,2
4,58,Male,Single,High School,Self-Employed,5.1,2.5,1.2,8,1,...,8,3,3,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,36,Female,Single,Bachelor,Unemployed,3.3,8.4,4.6,4,3,...,1,2,5,1,0,0,1,0,0,0
24996,49,Female,Divorced,Master,Student,4.7,6.6,2.1,6,6,...,2,8,7,1,1,0,0,0,1,1
24997,35,Female,Married,PhD,Self-Employed,4.0,12.8,11.3,4,10,...,7,3,3,4,1,0,1,0,1,2
24998,44,Male,Divorced,High School,Student,5.1,2.0,3.0,5,4,...,7,7,8,1,0,1,1,0,1,0


In [3]:
# No missing values were found (compare the "Non-Null Count" column of the
# summary with the total number of entries).
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 25 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   age                               25000 non-null  int64  
 1   gender                            25000 non-null  str    
 2   marital_status                    25000 non-null  str    
 3   education_level                   25000 non-null  str    
 4   employment_status                 25000 non-null  str    
 5   sleep_hours                       25000 non-null  float64
 6   physical_activity_hours_per_week  25000 non-null  float64
 7   screen_time_hours_per_day         25000 non-null  float64
 8   social_support_score              25000 non-null  int64  
 9   work_stress_level                 25000 non-null  int64  
 10  academic_pressure_level           25000 non-null  int64  
 11  job_satisfaction_score            25000 non-null  int64  
 12  financial_stres

С помощью расширения Data Wrangler в VS Code заметим, что в датасете есть 4 категориальных признака: у пола и семейного положения по 3 уникальных значения, у уровня образования и статуса занятости — по 4 уникальных значения.

При анализе числовых признаков я понял, что есть 10 признаков с целочисленными значениями от 1 до 10 (10-балльная шкала); возраст от 18 до 60 лет с достаточно равномерным распределением по возрастным группам (если считать возраст группами с шагом в 3 года); сон от 3 до 10 часов; физическая активность от 3 до 15 часов в неделю (в выводе таблицы выше встречаются и значения меньше 3, например 0.5, поэтому нижнюю границу стоит перепроверить); время, проведённое за экраном, от 1 до 12 часов в день; количество рабочих часов в неделю от 20 до 70; и 5 признаков с бинарными значениями (0 и 1).

In [4]:
# Separate the target column from the feature matrix; both are independent
# copies so later edits never touch the raw frame.
y = df["mental_health_risk"].copy()
X = df.drop("mental_health_risk", axis=1).copy()

In [5]:
# Share of each target class, computed from the data rather than typed in by
# hand, so the numbers stay correct if the dataset file changes.
# The raw target column of `df` is used (codes 0 / 1 / 2 = low / moderate /
# high risk), which keeps this cell valid no matter whether `y` has already
# been relabelled to strings.
class_shares = df["mental_health_risk"].value_counts(normalize=True)

# `.get(..., 0.0)` reports an absent class as a zero share instead of raising;
# float() gives plain Python floats for the f-string below.
low_risk = float(class_shares.get(0, 0.0))
moderate_risk = float(class_shares.get(1, 0.0))
high_risk = float(class_shares.get(2, 0.0))

print(f"Распределения классов: Низкий риск: {low_risk}, Умеренный риск: {moderate_risk}, Высокий риск: {high_risk}")

Распределения классов: Низкий риск: 0.37428, Умеренный риск: 0.47292, Высокий риск: 0.1528


In [6]:
# Replace the numeric target codes with readable labels so results are easier
# to interpret.
# The mapping is applied to the raw column of `df`, not to `y` itself:
# `y = y.map(...)` would turn every label into NaN if this cell were run a
# second time, because the already-mapped strings are not keys of the mapping.
y = df["mental_health_risk"].map({0: "Low", 1: "Moderate", 2: "High"})

# Fail loudly if the data contains a target code outside the mapping.
assert y.notna().all(), "unexpected value in mental_health_risk"

In [7]:
# Split the dataset into training and test sets (test share 0.2), stratified
# on the target so both parts keep the same class proportions; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Feature columns grouped by the kind of preprocessing they receive.

# 0/1 indicator columns.
bin_columns = [
    "panic_attack_history",
    "family_history_mental_illness",
    "previous_mental_health_diagnosis",
    "therapy_history",
    "substance_use",
]

# String-valued categorical columns.
cat_columns = [
    "gender",
    "marital_status",
    "education_level",
    "employment_status",
]

# Numeric columns (age, hours and score-type features).
num_columns = [
    "age",
    "sleep_hours",
    "physical_activity_hours_per_week",
    "screen_time_hours_per_day",
    "work_stress_level",
    "social_support_score",
    "academic_pressure_level",
    "job_satisfaction_score",
    "financial_stress_level",
    "working_hours_per_week",
    "anxiety_score",
    "depression_score",
    "stress_level",
    "mood_swings_frequency",
    "concentration_difficulty_level",
]

In [9]:
# Column-wise preprocessing:
#   - numeric columns are standardised,
#   - categorical columns are one-hot encoded as a dense array, dropping the
#     first level of each feature,
#   - binary columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", StandardScaler(), num_columns),
        (
            "categorical",
            OneHotEncoder(drop="first", sparse_output=False),
            cat_columns,
        ),
        ("binary", "passthrough", bin_columns),
    ]
)

In [10]:
# Pipeline that first preprocesses the features and then fits a logistic
# regression classifier (iteration cap of 500).
pipeline_log_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=500)),
    ]
)

In [11]:
# Train the pipeline on the training split. fit() returns the pipeline itself,
# so no re-assignment is needed; the trailing semicolon keeps the cell from
# echoing the estimator.
pipeline_log_reg.fit(X_train, y_train);

In [12]:
# Predict class labels for the test split with the fitted pipeline.
predicts = pipeline_log_reg.predict(X_test)

In [13]:
# Evaluate the test-set predictions. Precision, recall and F1 all use
# "weighted" averaging, so they are computed in one pass over the three
# scorers.
accuracy = accuracy_score(y_test, predicts)
precision, recall, f1 = (
    scorer(y_test, predicts, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)
print(f"accuracy: {accuracy}, precision: {precision}, recall: {recall}, f1: {f1}")

accuracy: 0.7488, precision: 0.7504959305429445, recall: 0.7488, f1: 0.7484927960831512
