# Data Preprocessing

## 1. Load Data

In [47]:
import pandas as pd
import numpy as np

# Read the raw dataset; the first CSV column is used as the row index.
RAW_CSV_PATH = 'data/student_depression_dataset.csv'
df = pd.read_csv(RAW_CSV_PATH, index_col=0)

# Displayed as the cell result: (rows, columns) of the raw frame.
df.shape

(27901, 17)

## 2. Data Quality Check

In [48]:
# Exploration notes (ad hoc checks made while inspecting the raw frame):
#   - 'Dietary Habits' and 'Sleep Duration' contain an 'Others' category;
#     rows with that value are dropped during cleaning.
#   - Rows where 'Profession' is not 'Student' are dropped during cleaning.
#   - unique() / value_counts() were also inspected for 'City',
#     'Job Satisfaction', 'Academic Pressure', and 'Degree'.

# 'Work Pressure': show its distinct values and how many rows equal 2.
work_pressure = df['Work Pressure']
print(work_pressure.unique())
print(work_pressure.eq(2).sum())

[0. 5. 2.]
1


In [49]:
# Count rows carrying the catch-all 'Others' label in each column,
# then the rows where both columns carry it at once.
diet_is_other = df['Dietary Habits'].eq('Others')
sleep_is_other = df['Sleep Duration'].eq('Others')

print(diet_is_other.sum())
print(sleep_is_other.sum())
print((diet_is_other & sleep_is_other).sum())

12
18
0


## 3. Data Cleaning

**Remove irrelevant columns:** Work Pressure and Job Satisfaction are predominantly zero (not applicable to students).

**Filter to students only:** Keep only the rows where Profession is 'Student'.

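**Drop Profession column:** Now redundant after filtering.

**Drop ambiguous rows:** The small number of rows with 'Others' in Dietary Habits or Sleep Duration, or '?' in Financial Stress, are removed; Financial Stress is then converted to float.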

In [50]:
# Build the cleaned frame from `df` in one pass, without mutating `df` and
# without in-place edits on a filtered slice (the pattern that raises
# SettingWithCopyWarning in pandas versions without copy-on-write).
print(df.shape)

# Rows to keep:
#   - neither 'Dietary Habits' nor 'Sleep Duration' is the catch-all 'Others'
#   - 'Profession' is 'Student'
#   - 'Financial Stress' is not the '?' placeholder
keep_rows = (
    df['Dietary Habits'].ne('Others')
    & df['Sleep Duration'].ne('Others')
    & df['Profession'].eq('Student')
    & df['Financial Stress'].ne('?')
)

filtered_df = (
    df.loc[keep_rows]
    # 'Work Pressure' / 'Job Satisfaction' are mostly 0; 'Profession' is
    # constant once only students remain.
    .drop(columns=['Work Pressure', 'Job Satisfaction', 'Profession'])
    # With the '?' rows gone the remaining values can be converted to float.
    .astype({'Financial Stress': float})
)

print(filtered_df.shape)

(27901, 17)
(27837, 14)


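## 3. Data Cleaning (continued): Encode Features

With Work Pressure and Job Satisfaction removed (predominantly zeros), the data filtered to students only, and the redundant Profession column dropped, this step recodes an Academic Pressure of 0 as 1, converts Dietary Habits and Sleep Duration to ordinal codes, and replaces Gender, suicidal thoughts, and family history of mental illness with 0/1 indicator columns.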

In [51]:
def _encode_ordinal(values, mapping):
    """Translate category labels into ordinal codes.

    Parameters
    ----------
    values : pd.Series
        Column holding the labels to encode.
    mapping : dict
        Label -> integer code.

    Returns
    -------
    pd.Series
        The encoded column. A column that is already numeric is returned
        unchanged, so re-running this cell does not turn previously encoded
        values into NaN.

    Raises
    ------
    ValueError
        If a non-null label has no entry in ``mapping`` (``Series.map`` would
        otherwise silently turn it into NaN).
    """
    if pd.api.types.is_numeric_dtype(values):
        return values
    encoded = values.map(mapping)
    unmapped = values[encoded.isna() & values.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped categories in '{values.name}': {list(unmapped)}")
    return encoded


def _add_binary_flag(frame, source_col, flag_col, positive_label):
    """Replace ``source_col`` with a 0/1 column ``flag_col`` (1 where the value equals ``positive_label``).

    Returns ``frame`` unchanged when ``source_col`` is absent, which is the
    case when the cell has already been run.
    """
    if source_col not in frame.columns:
        return frame
    flag = frame[source_col].eq(positive_label).astype(int)
    return frame.assign(**{flag_col: flag}).drop(columns=[source_col])


# Ordinal codes: larger value = healthier diet / longer sleep.
diet_map = {
    'Unhealthy': 0,
    'Moderate': 1,
    'Healthy': 2
}

# The keys include the single quotes on purpose: they are part of the label text being matched.
sleep_map = {
    "'Less than 5 hours'": 0,
    "'5-6 hours'": 1,
    "'7-8 hours'": 2,
    "'More than 8 hours'": 3
}

filtered_df = filtered_df.assign(**{
    # An Academic Pressure of 0 is recoded as 1.
    # NOTE(review): presumably 0 is outside the intended rating scale — confirm.
    'Academic Pressure': filtered_df['Academic Pressure'].replace(0.0, 1.0),
    'Dietary Habits': _encode_ordinal(filtered_df['Dietary Habits'], diet_map),
    'Sleep Duration': _encode_ordinal(filtered_df['Sleep Duration'], sleep_map),
})

# (source column, new 0/1 column, label encoded as 1)
binary_flags = [
    ('Gender', 'Sex', 'Male'),
    ('Have you ever had suicidal thoughts ?', 'Suicidal_Thoughts', 'Yes'),
    ('Family History of Mental Illness', 'Family_Mental_Illness', 'Yes'),
]
for source_col, flag_col, positive_label in binary_flags:
    filtered_df = _add_binary_flag(filtered_df, source_col, flag_col, positive_label)

## 4. Clean City Column

Correcting obvious typos in city names and marking invalid entries as 'Unknown'.

In [52]:
# City names accepted as-is; anything else is relabelled 'Unknown' below.
valid_cities = {
    'Agra', 'Ahmedabad', 'Bangalore', 'Bhopal', 'Chennai', 'Delhi',
    'Faridabad', 'Ghaziabad', 'Hyderabad', 'Indore', 'Jaipur', 'Kalyan',
    'Kanpur', 'Kolkata', 'Lucknow', 'Ludhiana', 'Meerut', 'Mumbai',
    'Nagpur', 'Nashik', 'Patna', 'Pune', 'Rajkot', 'Srinagar', 'Surat',
    'Thane', 'Vadodara', 'Varanasi', 'Vasai-Virar', 'Visakhapatnam',
}

# Known misspelling -> canonical name.
city_corrections = {'Khaziabad': 'Ghaziabad'}

# Fix the known typo first, then flag whatever is still not a recognised city.
corrected_city = filtered_df['City'].replace(city_corrections)
invalid_mask = ~corrected_city.isin(valid_cities)
filtered_df['City'] = corrected_city.mask(invalid_mask, 'Unknown')

print(filtered_df['City'].value_counts())

City
Kalyan           1563
Srinagar         1370
Hyderabad        1337
Vasai-Virar      1287
Lucknow          1154
Thane            1139
Ludhiana         1107
Agra             1090
Surat            1078
Kolkata          1064
Jaipur           1033
Patna            1006
Pune              968
Visakhapatnam     967
Ahmedabad         946
Bhopal            933
Chennai           884
Meerut            820
Rajkot            815
Bangalore         765
Delhi             765
Ghaziabad         744
Mumbai            697
Vadodara          692
Varanasi          684
Nagpur            649
Indore            642
Kanpur            606
Nashik            547
Faridabad         460
Unknown            25
Name: count, dtype: int64


In [53]:
# Sanity check: list the distinct codes present in each encoded column.
for encoded_col in ('Dietary Habits', 'Sleep Duration'):
    print(filtered_df[encoded_col].unique())

[2 1 0]
[1 0 2 3]


In [54]:
# Show the dtype of every column in the cleaned frame.
filtered_df.dtypes

Age                      float64
City                      object
Academic Pressure        float64
CGPA                     float64
Study Satisfaction       float64
Sleep Duration             int64
Dietary Habits             int64
Degree                    object
Work/Study Hours         float64
Financial Stress         float64
Depression                 int64
Sex                        int32
Suicidal_Thoughts          int32
Family_Mental_Illness      int32
dtype: object

## 5. Save Cleaned Data

In [55]:
# The export is currently disabled; uncomment the line below to write the cleaned frame to disk.
# index=False leaves the DataFrame's row index out of the written CSV.
# filtered_df.to_csv("data/cleaned_student_depression_dataset.csv", index=False)