In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestRegressor

In [2]:
import os

# Location of the raw CSV. The default is the original author's local path, so the
# notebook behaves as before; set the DEPRESSION_DATA_PATH environment variable to
# run it on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "DEPRESSION_DATA_PATH",
    r"C:\Users\HP\Documents\capstone(2)\data\depression_data.csv",
)

# Raw dataset; column names are normalised in a later cell.
df = pd.read_csv(DATA_PATH)

In [3]:
# Missing values per column (isna is the same check as isnull).
df.isna().sum()

Name                            0
Age                             0
Marital Status                  0
Education Level                 0
Number of Children              0
Smoking Status                  0
Physical Activity Level         0
Employment Status               0
Income                          0
Alcohol Consumption             0
Dietary Habits                  0
Sleep Patterns                  0
History of Mental Illness       0
History of Substance Abuse      0
Family History of Depression    0
Chronic Medical Conditions      0
dtype: int64

In [4]:
# First five rows, transposed so each column reads as a row.
# NOTE(review): the rendered output includes the `Name` column; redact before
# sharing if these are real people.
df.head(5).transpose()


Unnamed: 0,0,1,2,3,4
Name,Christine Barker,Jacqueline Lewis,Shannon Church,Charles Jordan,Michael Rich
Age,31,55,78,58,18
Marital Status,Married,Married,Widowed,Divorced,Single
Education Level,Bachelor's Degree,High School,Master's Degree,Master's Degree,High School
Number of Children,2,1,1,3,0
Smoking Status,Non-smoker,Non-smoker,Non-smoker,Non-smoker,Non-smoker
Physical Activity Level,Active,Sedentary,Sedentary,Moderate,Sedentary
Employment Status,Unemployed,Employed,Employed,Unemployed,Unemployed
Income,26265.67,42710.36,125332.79,9992.78,8595.08
Alcohol Consumption,Moderate,High,Low,Moderate,Low


In [5]:
# Snake-case the column names: lower-case first, then spaces become underscores.
lowered_names = df.columns.str.lower()
df.columns = lowered_names.str.replace(' ', '_')

In [6]:
# Column dtypes after renaming; used below to tell text columns from numeric ones.
df.dtypes

name                             object
age                               int64
marital_status                   object
education_level                  object
number_of_children                int64
smoking_status                   object
physical_activity_level          object
employment_status                object
income                          float64
alcohol_consumption              object
dietary_habits                   object
sleep_patterns                   object
history_of_mental_illness        object
history_of_substance_abuse       object
family_history_of_depression     object
chronic_medical_conditions       object
dtype: object

In [7]:
# Two-stage split: hold out 20% for test, then take 25% of the remainder for
# validation (0.25 * 0.8 = 0.2 of the full data).
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [8]:
# Column names grouped by kind, used by the plotting cells further down.
# select_dtypes understands the generic 'number' selector; comparing df.dtypes to
# the string 'number' does not match concrete dtypes such as int64/float64, which
# left numerical_columns empty and made the plotting loops draw nothing.
categorical_columns = df.select_dtypes(include='object').columns.tolist()
numerical_columns = df.select_dtypes(include='number').columns.tolist()

In [9]:
# Row counts in the order (test, train, validation).
tuple(len(part) for part in (df_test, df_train, df_val))

(82754, 248260, 82754)

In [10]:
# Pull the target column out of each split.
target_col = 'family_history_of_depression'
y_train, y_test, y_val = (
    part[target_col] for part in (df_train, df_test, df_val)
)

In [11]:
# Feature frames: every column except the target.
X_train = df_train.drop(columns=['family_history_of_depression'])
X_test = df_test.drop(columns=['family_history_of_depression'])
X_val = df_val.drop(columns=['family_history_of_depression'])



In [12]:
# Keep only the non-object columns, so the models below see numeric features only.
X_train, X_test, X_val = (
    features.select_dtypes(exclude=['object'])
    for features in (X_train, X_test, X_val)
)


## Summary statistics

In [13]:
# Count, mean, spread and quartiles for the numeric columns.
df.describe()

Unnamed: 0,age,number_of_children,income
count,413768.0,413768.0,413768.0
mean,49.000713,1.298972,50661.707971
std,18.158759,1.237054,40624.100565
min,18.0,0.0,0.41
25%,33.0,0.0,21001.03
50%,49.0,1.0,37520.135
75%,65.0,2.0,76616.3
max,80.0,4.0,209995.22


In [14]:
# Per-column non-null counts and dtypes for the full frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413768 entries, 0 to 413767
Data columns (total 16 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   name                          413768 non-null  object 
 1   age                           413768 non-null  int64  
 2   marital_status                413768 non-null  object 
 3   education_level               413768 non-null  object 
 4   number_of_children            413768 non-null  int64  
 5   smoking_status                413768 non-null  object 
 6   physical_activity_level       413768 non-null  object 
 7   employment_status             413768 non-null  object 
 8   income                        413768 non-null  float64
 9   alcohol_consumption           413768 non-null  object 
 10  dietary_habits                413768 non-null  object 
 11  sleep_patterns                413768 non-null  object 
 12  history_of_mental_illness     413768 non-nul

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# One histogram (with a KDE overlay) per numeric column.
for feature in numerical_columns:
    fig, ax = plt.subplots()
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()

In [17]:
# One horizontal boxplot per numeric column, to eyeball spread and outliers.
for feature in numerical_columns:
    fig, ax = plt.subplots()
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    plt.show()

In [18]:
# Scatter each numeric column against the target column.
target = 'family_history_of_depression'
for feature in numerical_columns:
    if feature == target:
        # Never plot the target against itself.
        continue
    fig, ax = plt.subplots()
    sns.scatterplot(data=df, x=feature, y=target, ax=ax)
    ax.set_title(f'{feature} vs {target}')
    plt.show()

In [19]:
# Row-wise dict view of each feature frame (one dict per record).
X_train_dicts, X_val_dicts, X_test_dicts = (
    features.to_dict(orient='records')
    for features in (X_train, X_val, X_test)
)

## Linear Regression

In [20]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping on the training targets only, then apply the
# same mapping to every split.
# NOTE(review): the models below are regressors fit on these integer codes;
# confirm that regression rather than classification is intended.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
y_val = le.transform(y_val)


In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Baseline: median imputation -> standardisation -> ordinary least squares.
baseline_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
]
baseline_model = Pipeline(steps=baseline_steps)

baseline_model.fit(X_train, y_train)

In [22]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Validation-set error of the linear baseline.
y_pred = baseline_model.predict(X_val)

mse, mae, r2 = (
    score(y_val, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = mse ** 0.5  # square root of the MSE

rmse, mae, r2

(0.44077800649671917, 0.38765550373776064, 0.015263574558066928)

## Decision Tree Regressor

In [23]:
from sklearn.tree import DecisionTreeRegressor

# Same preprocessing as the baseline, with a single unpruned decision tree on top.
dt_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeRegressor(random_state=42)),
]
dt_model = Pipeline(steps=dt_steps)

dt_model.fit(X_train, y_train)

In [36]:
# Validation-set error of the decision tree.
# In the recorded output R² is negative, i.e. worse than predicting the mean.
y_pred = dt_model.predict(X_val)

mse, mae, r2 = (
    score(y_val, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = mse ** 0.5  # square root of the MSE

rmse, mae, r2

(0.6239934979951922, 0.3893890325543176, -0.9735143959366672)

## Random Forest Regressor

In [25]:
from sklearn.ensemble import RandomForestRegressor

# Random forest: 200 fully grown trees, fit in parallel on all cores.
rf_estimator = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)
rf_model = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('rf', rf_estimator),
])

rf_model.fit(X_train, y_train)

In [35]:
# Validation-set error of the random forest.
# In the recorded output R² is negative, i.e. worse than predicting the mean.
y_pred = rf_model.predict(X_val)

mse, mae, r2 = (
    score(y_val, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = mse ** 0.5  # square root of the MSE

rmse, mae, r2

(0.48859874302655176, 0.3901806654864619, -0.20999858027457186)

## Gradient Boosting Regressor

In [31]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting: 200 shallow (depth-3) trees with a small learning rate.
gb_estimator = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
)
gb_model = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('gb', gb_estimator),
])

gb_model.fit(X_train, y_train)

In [34]:
# Validation-set error of the gradient-boosting model.
y_pred = gb_model.predict(X_val)

mse, mae, r2 = (
    score(y_val, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = mse ** 0.5  # square root of the MSE

rmse, mae, r2

(0.439884795579857, 0.3860551434215645, 0.019250552612398164)

## KNN Regressor

In [29]:
from sklearn.neighbors import KNeighborsRegressor

# k-NN on standardised features: 5 neighbours, weighted by inverse distance.
knn_estimator = KNeighborsRegressor(n_neighbors=5, weights='distance')
knn_model = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('knn', knn_estimator),
])

knn_model.fit(X_train, y_train)

In [32]:
# Validation-set error of the k-NN model.
# In the recorded output R² is negative, i.e. worse than predicting the mean.
y_pred = knn_model.predict(X_val)

mse, mae, r2 = (
    score(y_val, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = mse ** 0.5  # square root of the MSE

rmse, mae, r2

(0.5159138321381138, 0.3860831371176072, -0.34907009010356016)