In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [3]:
train_dataframe = pd.read_csv("data/train.csv")

In [6]:
#dataset is not balanced, so for the evaluation our focus will be on F1 Score

depression_percentages = train_dataframe['Depression'].value_counts(normalize=True) * 100

print(f"Percentage of 'Depression' = 0: {depression_percentages.get(0, 0):.2f}%")
print(f"Percentage of 'Depression' = 1: {depression_percentages.get(1, 0):.2f}%")


Percentage of 'Depression' = 0: 81.83%
Percentage of 'Depression' = 1: 18.17%


# First Analysis

In [67]:
train_dataframe

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,140695,Vidya,Female,18.0,Ahmedabad,Working Professional,,,5.0,,,4.0,5-6 hours,Unhealthy,Class 12,No,2.0,4.0,Yes,1
140696,140696,Lata,Female,41.0,Hyderabad,Working Professional,Content Writer,,5.0,,,4.0,7-8 hours,Moderate,B.Tech,Yes,6.0,5.0,Yes,0
140697,140697,Aanchal,Female,24.0,Kolkata,Working Professional,Marketing Manager,,3.0,,,1.0,More than 8 hours,Moderate,B.Com,No,4.0,4.0,No,0
140698,140698,Prachi,Female,49.0,Srinagar,Working Professional,Plumber,,5.0,,,2.0,5-6 hours,Moderate,ME,Yes,10.0,1.0,No,0


In [68]:
train_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             104070 non-null  object 
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   

In [69]:
#How many missing values are there?

total_rows = len(train_dataframe)
missing_percentage = (train_dataframe.isnull().sum() / total_rows) * 100
print(missing_percentage.round(5))

id                                        0.00000
Name                                      0.00000
Gender                                    0.00000
Age                                       0.00000
City                                      0.00000
Working Professional or Student           0.00000
Profession                               26.03412
Academic Pressure                        80.17271
Work Pressure                            19.84222
CGPA                                     80.17200
Study Satisfaction                       80.17271
Job Satisfaction                         19.83653
Sleep Duration                            0.00000
Dietary Habits                            0.00284
Degree                                    0.00142
Have you ever had suicidal thoughts ?     0.00000
Work/Study Hours                          0.00000
Financial Stress                          0.00284
Family History of Mental Illness          0.00000
Depression                                0.00000


In [70]:
# Checking the number of null values ​​in specific columns
print(train_dataframe.isnull().sum())

id                                            0
Name                                          0
Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64


In [72]:
# workers have 'Working Professional or Student' == 'Working Professional'
workers_df = train_dataframe[train_dataframe['Working Professional or Student'] == 'Working Professional']
#there are employed and unemployed workers, this can be detected from the 'Profession' column, which is empty for unemployed workers
unemployed_df = workers_df[workers_df['Profession'].isnull()]
employed_df = workers_df[workers_df['Profession'].notnull()]

#printing statistics
total_workers = len(workers_df)
total_employed = len(employed_df)
total_unemployed = len(unemployed_df)

employed_percentage = (total_employed / total_workers) * 100
unemployed_percentage = (total_unemployed / total_workers) * 100
workers_percentage = (total_workers / total_rows) * 100

print("Persentage of workers (considering the whole train set):", round(workers_percentage, 2), "%")
print("Percentage of employed workers (considering total workers):", round(employed_percentage, 2), "%")
print("Percentage of unemployed workers (considering total workers):", round(unemployed_percentage, 2), "%")

Persentage of workers (considering the whole train set): 80.17 %
Percentage of employed workers (considering total workers): 92.23 %
Percentage of unemployed workers (considering total workers): 7.77 %


In [73]:
#students have 'Working Professional or Student' == 'Student'
students_df = train_dataframe[train_dataframe['Working Professional or Student'] == 'Student']

#there are employed and unemployed students, this can be detected from the 'Profession' column, which is empty for unemployed students
unemployed_students_df = students_df[students_df['Profession'].isnull()]
employed_students_df = students_df[students_df['Profession'].notnull()]

#printing statistics
total_students = len(students_df)
total_employed_students = len(employed_students_df)
total_unemployed_students = len(unemployed_students_df)

employed_students_percentage = (total_employed_students / total_students) * 100
unemployed_students_percentage = (total_unemployed_students / total_students) * 100
students_percentage = (total_students / total_rows) * 100

print("Persentage of students (considering the whole train set):", round(students_percentage, 2), "%")
print("Percentage of employed students (considering total students):", round(employed_students_percentage, 2), "%")
print("Percentage of unemployed students (considering total students):", round(unemployed_students_percentage, 2), "%")

Persentage of students (considering the whole train set): 19.83 %
Percentage of employed students (considering total students): 0.12 %
Percentage of unemployed students (considering total students): 99.88 %


In [74]:
#total percentages
print("Persentage of students (considering the whole train set):", round(students_percentage, 2), "%")
print("\tOf which employed:", round((total_employed_students / total_rows) * 100, 2), "%")
print("\tOf which UNemployed:", round((total_unemployed_students / total_rows) * 100, 2), "%\n")

print("Persentage of workers (considering the whole train set):", round(workers_percentage, 2), "%")
print("\tOf which employed:", round((total_employed / total_rows) * 100, 2), "%")
print("\tOf which UNemployed:", round((total_unemployed / total_rows) * 100, 2), "%\n")

Persentage of students (considering the whole train set): 19.83 %
	Of which employed: 0.02 %
	Of which UNemployed: 19.81 %

Persentage of workers (considering the whole train set): 80.17 %
	Of which employed: 73.94 %
	Of which UNemployed: 6.23 %



### How is depression affecting those groups?

In [75]:
#workers
workers_with_depression = len(workers_df[workers_df['Depression'] == 1])
workers_without_depression = total_workers - workers_with_depression

workers_with_depression = workers_with_depression / total_workers * 100
workers_without_depression = workers_without_depression / total_workers * 100

employed_with_depression = len(employed_df[employed_df['Depression'] == 1])
employed_without_depression = total_employed - employed_with_depression

employed_with_depression = employed_with_depression / total_employed * 100
employed_without_depression = employed_without_depression / total_employed * 100

unemployed_with_depression = len(unemployed_df[unemployed_df['Depression'] == 1])
unemployed_without_depression = total_unemployed - unemployed_with_depression

unemployed_with_depression = unemployed_with_depression / total_unemployed * 100
unemployed_without_depression = unemployed_without_depression / total_unemployed * 100

print("Workers: \n")
print("Percentage of workers with depression:", round(workers_with_depression, 2), "%")
print("Percentage of workers without depression:", round(workers_without_depression, 2), "%")
print("Percentage of employed workers with depression:", round(employed_with_depression, 2), "%")
print("Percentage of employed workers without depression:", round(employed_without_depression, 2), "%")
print("Percentage of unemployed workers with depression:", round(unemployed_with_depression, 2), "%")
print("Percentage of unemployed workers without depression:", round(unemployed_without_depression, 2), "%")
print("\n\n")

#students
students_with_depression = len(students_df[students_df['Depression'] == 1])
students_without_depression = total_students - students_with_depression

students_with_depression = students_with_depression / total_students * 100
students_without_depression = students_without_depression / total_students * 100

employed_students_with_depression = len(employed_students_df[employed_students_df['Depression'] == 1])
employed_students_withouth_depression = total_employed_students - employed_students_with_depression

employed_students_with_depression = employed_students_with_depression / total_employed_students * 100
employed_students_withouth_depression = employed_students_withouth_depression / total_employed_students * 100

unemployed_students_with_depression = len(unemployed_students_df[unemployed_students_df['Depression'] == 1])
unemployed_students_without_depression = total_unemployed_students - unemployed_students_with_depression

unemployed_students_with_depression = unemployed_students_with_depression / total_unemployed_students * 100
unemployed_students_without_depression = unemployed_students_without_depression / total_unemployed_students * 100

print("Students: \n")
print("Percentage of students with depression:", round(students_with_depression, 2), "%")
print("Percentage of students without depression:", round(students_without_depression, 2), "%")
print("Percentage of employed students with depression:", round(employed_students_with_depression, 2), "%")
print("Percentage of employed students without depression:", round(employed_students_withouth_depression, 2), "%")
print("Percentage of unemployed students with depression:", round(unemployed_students_with_depression, 2), "%")
print("Percentage of unemployed students without depression:", round(unemployed_students_without_depression, 2), "%")

Workers: 

Percentage of workers with depression: 8.18 %
Percentage of workers without depression: 91.82 %
Percentage of employed workers with depression: 5.71 %
Percentage of employed workers without depression: 94.29 %
Percentage of unemployed workers with depression: 37.51 %
Percentage of unemployed workers without depression: 62.49 %



Students: 

Percentage of students with depression: 58.55 %
Percentage of students without depression: 41.45 %
Percentage of employed students with depression: 82.35 %
Percentage of employed students without depression: 17.65 %
Percentage of unemployed students with depression: 58.52 %
Percentage of unemployed students without depression: 41.48 %


### Unique values

In [76]:
for column in train_dataframe.columns:
    if column=='id':
        continue
    
    unique_values = train_dataframe[column].unique()
    unique_count = len(unique_values)
    
    nan_count = train_dataframe[column].isna().sum()
    
    print(f"Column '{column}' has {unique_count} unique values (including NaN if present).")
    print(f"Number of NaN values: {nan_count}")
    
    sorted_unique_values = sorted(map(str, unique_values))
    print("\nSorted unique values:")
    print(sorted_unique_values)
    
    print("\nCounts of each unique value:")
    value_counts = train_dataframe[column].value_counts(dropna=False)
    for value, count in value_counts.items():
        print(f"{value}: {count}")
    
    print("\n")

Column 'Name' has 422 unique values (including NaN if present).
Number of NaN values: 0

Sorted unique values:
['18', 'A.Ed', 'Aadhya', 'Aahana', 'Aakash', 'Aam', 'Aan', 'Aanchal', 'Aani', 'Aanket', 'Aanya', 'Aaradhya', 'Aarand', 'Aarani', 'Aaransh', 'Aaranya', 'Aarash', 'Aarat', 'Aarav', 'Aariket', 'Aariv', 'Aarla', 'Aarohi', 'Aarsh', 'Aarsush', 'Aarti', 'Aarun', 'Aarush', 'Aarvi', 'Aarya', 'Aavya', 'Abarav', 'Abhinav', 'Abhishek', 'Abishma', 'Adachi', 'Aditi', 'Aditya', 'Adiya', 'Advait', 'Aieter', 'Aikash', 'Airav', 'Aisha', 'Aishwarya', 'Aiya', 'Amit', 'Anahk', 'Anakash', 'Anand', 'Anar', 'Anariv', 'Anarush', 'Anaya', 'Anh', 'Anhil', 'Ani', 'Anika', 'Aniket', 'Anil', 'Anirudh', 'Anish', 'Anisha', 'Anishi', 'Aniv', 'Anjali', 'Anjun', 'Anohi', 'Ansh', 'Anupal', 'Anushka', 'Anvi', 'Anya', 'Aohi', 'Apoorva', 'Arav', 'Ariti', 'Arjun', 'Armaan', 'Arnar', 'Arnav', 'Arsha', 'Arvik', 'Arya', 'Aryan', 'Asha', 'Atharv', 'Avni', 'Ayaan', 'Ayansh', 'Ayash', 'Ayhan', 'Ayoub', 'Ayush', 'Ayut', 'A

# Data Preprocessing

In [77]:
#replace gender with 0 (female) and 1 (male)
train_dataframe['Gender'] = train_dataframe['Gender'].map({'Female': 0, 'Male': 1})

#replace Working Professional or Student with 0 (student) and 1 (worker)    
train_dataframe['Working Professional or Student'] = train_dataframe['Working Professional or Student'].map({'Working Professional': 1, 'Student': 0})

In [79]:
#add new category for cities
#cities with less than a 1000 frequency on the dataset are mapped to category "other"

frequency = train_dataframe['City'].value_counts()
category_mapping = {category: ('Others' if count < 1000 else category) for category, count in frequency.items()}
train_dataframe['City'] = train_dataframe['City'].map(category_mapping)

In [80]:
#add other category for professions
#professions with less than a 1000 frequency on the dataset are mapped to category "other"

frequency = train_dataframe['Profession'].value_counts()
category_mapping = {category: ('Others' if count < 300 else category) for category, count in frequency.items()}
train_dataframe['Profession'] = train_dataframe['Profession'].map(category_mapping)

In [81]:
#sleep duration

#turn 
# Less than 5 hours: 38784
# 7-8 hours: 36969
# More than 8 hours: 32726
# 5-6 hours: 32142

#into

# Less than 5 hours
# 5-8 hours
# More than 8 hours

# Define the mapping dictionary
mapping = {
    '3-4 hours': 'Less than 5 hours',
    '4-5 hours': 'Less than 5 hours',
    '4-6 hours': 'Less than 5 hours',
    '2-3 hours': 'Less than 5 hours',
    'Less than 5 hours': 'Less than 5 hours',
    '5-6 hours': '5-8 hours',
    '6-7 hours': '5-8 hours',
    '7-8 hours': '5-8 hours',
    'More than 8 hours': 'More than 8 hours'
}

# Apply the mapping
train_dataframe['Sleep Duration'] = train_dataframe['Sleep Duration'].map(mapping)

# Map all remaining rows (NaN after mapping) to 'Less than 5 hours'
train_dataframe['Sleep Duration'] = train_dataframe['Sleep Duration'].fillna('Less than 5 hours')


In [82]:
#mapping diatary habits less than 1000 to Unhealthy

frequency = train_dataframe['Dietary Habits'].value_counts()
category_mapping = {category: ('Unhealthy' if count < 1000 else category) for category, count in frequency.items()}
train_dataframe['Dietary Habits'] = train_dataframe['Dietary Habits'].map(category_mapping)

In [83]:
#degree

frequency = train_dataframe['Degree'].value_counts()
category_mapping = {category: ('Others' if count < 1000 else category) for category, count in frequency.items()}
train_dataframe['Degree'] = train_dataframe['Degree'].map(category_mapping)

In [84]:
#mapping suicidal thoughths to 0 (no) and 1 (yes)

train_dataframe['Have you ever had suicidal thoughts ?'] = train_dataframe['Have you ever had suicidal thoughts ?'].map({'No': 0, 'Yes': 1})
train_dataframe['Family History of Mental Illness'] = train_dataframe['Family History of Mental Illness'].map({'No': 0, 'Yes': 1})

In [85]:
#if 'Working Professional or Student' is 'Student' (0) and profession is NaN, replace it with 'Student'
train_dataframe.loc[
    (train_dataframe['Working Professional or Student'] == 0) & (train_dataframe['Profession'].isna()), 
    'Profession'
] = 'Student'

#fill NaN professions with 'Unknown'
train_dataframe['Profession'] = train_dataframe['Profession'].fillna('Unknown')

In [86]:
#replacing last NaN values
#NaN degree is replaced with 'Others'
train_dataframe['Degree'] = train_dataframe['Degree'].fillna('Others')

#NaN Financial Stress is replaced with column mean
train_dataframe['Financial Stress'] = train_dataframe['Financial Stress'].fillna(int(train_dataframe['Financial Stress'].mean()))

#NaN Dietary Habits is replaced with 'Moderate'
train_dataframe['Dietary Habits'] = train_dataframe['Dietary Habits'].fillna('Moderate')

### Normalizing in [0,1]

In [94]:
from sklearn.preprocessing import MinMaxScaler

numeric_columns = train_dataframe.select_dtypes(include=['int', 'float', 'int64', 'float64']).columns


scaler = MinMaxScaler()
train_dataframe[numeric_columns] = scaler.fit_transform(train_dataframe[numeric_columns])

In [95]:
cleaned_dataframe = train_dataframe.copy()

In [97]:
cleaned_dataframe

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0.000000,Aaradhya,0.0,0.738095,Ludhiana,1.0,Chef,,1.00,,,0.25,More than 8 hours,Healthy,BHM,0.0,0.083333,0.25,0.0,0.0
1,0.000007,Vivan,1.0,0.190476,Varanasi,1.0,Teacher,,0.75,,,0.50,Less than 5 hours,Unhealthy,LLB,1.0,0.583333,0.50,0.0,1.0
2,0.000014,Yuvraj,1.0,0.357143,Visakhapatnam,0.0,Student,1.00,,0.792757,0.25,,5-8 hours,Healthy,B.Pharm,1.0,0.250000,0.00,0.0,1.0
3,0.000021,Yuvraj,1.0,0.095238,Mumbai,1.0,Teacher,,1.00,,,0.00,Less than 5 hours,Moderate,BBA,1.0,0.833333,0.00,1.0,1.0
4,0.000028,Rhea,0.0,0.285714,Kanpur,1.0,Business Analyst,,0.00,,,0.00,5-8 hours,Unhealthy,BBA,1.0,0.750000,0.75,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,0.999972,Vidya,0.0,0.000000,Ahmedabad,1.0,Unknown,,1.00,,,0.75,5-8 hours,Unhealthy,Class 12,0.0,0.166667,0.75,1.0,1.0
140696,0.999979,Lata,0.0,0.547619,Hyderabad,1.0,Content Writer,,1.00,,,0.75,5-8 hours,Moderate,B.Tech,1.0,0.500000,1.00,1.0,0.0
140697,0.999986,Aanchal,0.0,0.142857,Kolkata,1.0,Marketing Manager,,0.50,,,0.00,More than 8 hours,Moderate,B.Com,0.0,0.333333,0.75,0.0,0.0
140698,0.999993,Prachi,0.0,0.738095,Srinagar,1.0,Plumber,,1.00,,,0.25,5-8 hours,Moderate,ME,1.0,0.833333,0.00,0.0,0.0


In [98]:
#missing elements are not random
#that means that missing elements carry some sort of information
#for example if CGPA is NaN, then it's highly likely that the person is a worker and not a student

#replacing all numerical NaN values with -1
cleaned_dataframe = cleaned_dataframe.fillna(-1)

In [99]:
print(cleaned_dataframe.isnull().sum())

id                                       0
Name                                     0
Gender                                   0
Age                                      0
City                                     0
Working Professional or Student          0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64


### One Hot Encoding

In [35]:
#one hot encoding for categorical features

categorical_columns = ['City', 'Profession', 'Dietary Habits', 'Degree', 'Sleep Duration']

# Define the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop=None, sparse_output=False), categorical_columns)
    ],
    remainder='passthrough'  # Keep other columns as they are
)

In [30]:
# Apply the transformer to the data
transformed_data = preprocessor.fit_transform(cleaned_dataframe)

# Extract feature names for the encoded columns
encoded_columns = preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_columns).tolist()

# Convert the transformed data into a DataFrame
transformed_df = pd.DataFrame(transformed_data, columns=encoded_columns + [col for col in cleaned_dataframe.columns if col not in categorical_columns])

columns_order = ['id', 'Name'] + [col for col in transformed_df.columns if col not in ['id', 'Name']]
reordered_df = transformed_df[columns_order]
    
# converting all columns to int except for CGPA
columns_to_convert = [col for col in reordered_df.columns if col not in ['CGPA', 'Name']]
reordered_df[columns_to_convert] = reordered_df[columns_to_convert].astype(int)

In [31]:
# Save the reordered DataFrame to a CSV file
reordered_df.to_csv('transformed_data.csv', index=False)

In [32]:
reordered_df

Unnamed: 0,id,Name,City_Agra,City_Ahmedabad,City_Bangalore,City_Bhopal,City_Chennai,City_Delhi,City_Faridabad,City_Ghaziabad,...,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,0,0,0,0,0,0,0,0,...,-1,5,-1.0,-1,2,0,1,2,0,0
1,1,Vivan,0,0,0,0,0,0,0,0,...,-1,4,-1.0,-1,3,1,7,3,0,1
2,2,Yuvraj,0,0,0,0,0,0,0,0,...,5,-1,0.792757,2,-1,1,3,1,0,1
3,3,Yuvraj,0,0,0,0,0,0,0,0,...,-1,5,-1.0,-1,1,1,10,1,1,1
4,4,Rhea,0,0,0,0,0,0,0,0,...,-1,1,-1.0,-1,1,1,9,4,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,140695,Vidya,0,1,0,0,0,0,0,0,...,-1,5,-1.0,-1,4,0,2,4,1,1
140696,140696,Lata,0,0,0,0,0,0,0,0,...,-1,5,-1.0,-1,4,1,6,5,1,0
140697,140697,Aanchal,0,0,0,0,0,0,0,0,...,-1,3,-1.0,-1,1,0,4,4,0,0
140698,140698,Prachi,0,0,0,0,0,0,0,0,...,-1,5,-1.0,-1,2,1,10,1,0,0


### Creating Dataset with NaN values replaced with mean

In [127]:
cleaned_dataframe = train_dataframe.copy()
cleaned_dataframe

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0.000000,Aaradhya,0.0,0.738095,Ludhiana,1.0,Chef,,1.00,,,0.25,More than 8 hours,Healthy,BHM,0.0,0.083333,0.25,0.0,0.0
1,0.000007,Vivan,1.0,0.190476,Varanasi,1.0,Teacher,,0.75,,,0.50,Less than 5 hours,Unhealthy,LLB,1.0,0.583333,0.50,0.0,1.0
2,0.000014,Yuvraj,1.0,0.357143,Visakhapatnam,0.0,Student,1.00,,0.792757,0.25,,5-8 hours,Healthy,B.Pharm,1.0,0.250000,0.00,0.0,1.0
3,0.000021,Yuvraj,1.0,0.095238,Mumbai,1.0,Teacher,,1.00,,,0.00,Less than 5 hours,Moderate,BBA,1.0,0.833333,0.00,1.0,1.0
4,0.000028,Rhea,0.0,0.285714,Kanpur,1.0,Business Analyst,,0.00,,,0.00,5-8 hours,Unhealthy,BBA,1.0,0.750000,0.75,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,0.999972,Vidya,0.0,0.000000,Ahmedabad,1.0,Unknown,,1.00,,,0.75,5-8 hours,Unhealthy,Class 12,0.0,0.166667,0.75,1.0,1.0
140696,0.999979,Lata,0.0,0.547619,Hyderabad,1.0,Content Writer,,1.00,,,0.75,5-8 hours,Moderate,B.Tech,1.0,0.500000,1.00,1.0,0.0
140697,0.999986,Aanchal,0.0,0.142857,Kolkata,1.0,Marketing Manager,,0.50,,,0.00,More than 8 hours,Moderate,B.Com,0.0,0.333333,0.75,0.0,0.0
140698,0.999993,Prachi,0.0,0.738095,Srinagar,1.0,Plumber,,1.00,,,0.25,5-8 hours,Moderate,ME,1.0,0.833333,0.00,0.0,0.0


In [128]:
# Convert all 'int' columns to 'float'
int_columns = cleaned_dataframe.select_dtypes(include=['int']).columns
cleaned_dataframe[int_columns] = cleaned_dataframe[int_columns].astype(float)

# Compute the mean and replace NaN only for 'float' columns
float_columns = cleaned_dataframe.select_dtypes(include=['float']).columns
cleaned_dataframe[float_columns] = cleaned_dataframe[float_columns].fillna(cleaned_dataframe[float_columns].mean())

cleaned_dataframe

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0.000000,Aaradhya,0.0,0.738095,Ludhiana,1.0,Chef,0.535568,1.00000,0.528901,0.486235,0.250000,More than 8 hours,Healthy,BHM,0.0,0.083333,0.25,0.0,0.0
1,0.000007,Vivan,1.0,0.190476,Varanasi,1.0,Teacher,0.535568,0.75000,0.528901,0.486235,0.500000,Less than 5 hours,Unhealthy,LLB,1.0,0.583333,0.50,0.0,1.0
2,0.000014,Yuvraj,1.0,0.357143,Visakhapatnam,0.0,Student,1.000000,0.49975,0.792757,0.250000,0.493601,5-8 hours,Healthy,B.Pharm,1.0,0.250000,0.00,0.0,1.0
3,0.000021,Yuvraj,1.0,0.095238,Mumbai,1.0,Teacher,0.535568,1.00000,0.528901,0.486235,0.000000,Less than 5 hours,Moderate,BBA,1.0,0.833333,0.00,1.0,1.0
4,0.000028,Rhea,0.0,0.285714,Kanpur,1.0,Business Analyst,0.535568,0.00000,0.528901,0.486235,0.000000,5-8 hours,Unhealthy,BBA,1.0,0.750000,0.75,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,0.999972,Vidya,0.0,0.000000,Ahmedabad,1.0,Unknown,0.535568,1.00000,0.528901,0.486235,0.750000,5-8 hours,Unhealthy,Class 12,0.0,0.166667,0.75,1.0,1.0
140696,0.999979,Lata,0.0,0.547619,Hyderabad,1.0,Content Writer,0.535568,1.00000,0.528901,0.486235,0.750000,5-8 hours,Moderate,B.Tech,1.0,0.500000,1.00,1.0,0.0
140697,0.999986,Aanchal,0.0,0.142857,Kolkata,1.0,Marketing Manager,0.535568,0.50000,0.528901,0.486235,0.000000,More than 8 hours,Moderate,B.Com,0.0,0.333333,0.75,0.0,0.0
140698,0.999993,Prachi,0.0,0.738095,Srinagar,1.0,Plumber,0.535568,1.00000,0.528901,0.486235,0.250000,5-8 hours,Moderate,ME,1.0,0.833333,0.00,0.0,0.0


In [129]:
mean_filled_dataframe = cleaned_dataframe.copy()

### KNN for filling values

In [36]:
def OH_encoding(cleaned_dataframe):
    categorical_columns = ['City', 'Profession', 'Dietary Habits', 'Degree', 'Sleep Duration']

    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(drop=None, sparse_output=False), categorical_columns)
        ],
        remainder='passthrough' 
    )

    transformed_data = preprocessor.fit_transform(cleaned_dataframe)
    encoded_columns = preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_columns).tolist()

    transformed_df = pd.DataFrame(transformed_data, columns=encoded_columns + [col for col in cleaned_dataframe.columns if col not in categorical_columns])

    columns_order = ['id', 'Name'] + [col for col in transformed_df.columns if col not in ['id', 'Name']]
    reordered_df = transformed_df[columns_order]
        
    columns_to_convert = [col for col in reordered_df.columns if col not in ['CGPA', 'Name', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Financial Stress']]
    reordered_df[columns_to_convert] = reordered_df[columns_to_convert].astype(int)
    
    columns_to_convert = ['Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Financial Stress']
    reordered_df[columns_to_convert] = reordered_df[columns_to_convert].apply(pd.to_numeric, errors='coerce')

    
    return reordered_df

In [37]:
from sklearn.impute import KNNImputer
import pandas as pd

knn_df = train_dataframe.copy()
knn_df = OH_encoding(knn_df)

In [44]:
excluded_columns = ['id', 'Name', 'Depression']
numeric_columns = knn_df.select_dtypes(include=['float', 'int']).columns.difference(excluded_columns)

# Apply KNNImputer only on numerical columns
imputer = KNNImputer(n_neighbors=5
knn_df[numeric_columns] = imputer.fit_transform(knn_df[numeric_columns])

In [45]:
#print knn_df clomuns that are objects
knn_df.select_dtypes(include=['object']).columns

Index(['Name'], dtype='object')

In [46]:
numeric_columns

Index(['Academic Pressure', 'Age', 'CGPA', 'City_Agra', 'City_Ahmedabad',
       'City_Bangalore', 'City_Bhopal', 'City_Chennai', 'City_Delhi',
       'City_Faridabad',
       ...
       'Profession_Travel Consultant', 'Profession_UX/UI Designer',
       'Profession_Unknown', 'Sleep Duration_5-8 hours',
       'Sleep Duration_Less than 5 hours', 'Sleep Duration_More than 8 hours',
       'Study Satisfaction', 'Work Pressure', 'Work/Study Hours',
       'Working Professional or Student'],
      dtype='object', length=114)

In [47]:
knn_df

Unnamed: 0,id,Name,City_Agra,City_Ahmedabad,City_Bangalore,City_Bhopal,City_Chennai,City_Delhi,City_Faridabad,City_Ghaziabad,...,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.4,5.0,0.341247,3.6,2.0,0.0,1.0,2,0.0,0
1,1,Vivan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,4.0,0.462374,1.6,3.0,1.0,7.0,3,0.0,1
2,2,Yuvraj,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,4.0,0.792757,2.0,2.0,1.0,3.0,1,0.0,1
3,3,Yuvraj,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,5.0,0.433803,2.0,1.0,1.0,10.0,1,1.0,1
4,4,Rhea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.6,1.0,0.624950,2.2,1.0,1.0,9.0,4,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,140695,Vidya,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,5.0,0.442656,4.4,4.0,0.0,2.0,4,1.0,1
140696,140696,Lata,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.8,5.0,0.467203,3.6,4.0,1.0,6.0,5,1.0,0
140697,140697,Aanchal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,3.0,0.512676,3.2,1.0,0.0,4.0,4,0.0,0
140698,140698,Prachi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.2,5.0,0.567404,2.4,2.0,1.0,10.0,1,0.0,0


### One Hot Encoding of these 2 datasets

In [105]:
datasets = {
    "mean_dataset": pd.read_csv('datasets/mean_data.csv'),
    "knn_dataset": pd.read_csv('datasets/knn_data.csv'),
    "missing_dataset": pd.read_csv('datasets/marking_missing_data.csv'),
    "merged_features_dataset": pd.read_csv("datasets\merged_features_data.csv")
}

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [116]:
dataset2 = knn_df
dataset3 = OH_encoding(cleaned_dataframe)

In [118]:
numeric_columns = dataset2.select_dtypes(include=['int', 'float', 'int64', 'float64']).columns

# Apply MinMaxScaler to normalize values in the range [0, 1]
scaler = MinMaxScaler()
dataset2[numeric_columns] = scaler.fit_transform(dataset2[numeric_columns])