### EDA, Data Visualization, and Feature Engineering


In [1]:
import copy
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the three CSV files from ../data into DataFrames:
# the training set, the test set, and the sample submission file.
train = pd.read_csv(filepath_or_buffer="../data/train.csv")
test = pd.read_csv(filepath_or_buffer="../data/test.csv")
submission = pd.read_csv(filepath_or_buffer="../data/sample_submission.csv")

In [4]:
# Print a structural summary of the training frame: index range, and for each
# column its non-null count and dtype (useful for spotting missing data early).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             104070 non-null  object 
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   

In [7]:
# Split the training columns by dtype so numeric and text-like features can be
# explored separately. Both results are plain Python lists of column names.
# NOTE(review): `catagorical_col` is misspelled ("categorical"); the name is kept
# as-is because other cells may reference it.
numerical_col = list(train.select_dtypes(include=['number']).columns)
catagorical_col = list(train.select_dtypes(include=['category', 'object']).columns)

# One list per line, numeric columns first.
print(numerical_col, catagorical_col, sep="\n")

['id', 'Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress', 'Depression']
['Name', 'Gender', 'City', 'Working Professional or Student', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']


#### Exploring numerical features

In [8]:

# Count the missing values in each numeric column of the training set.
train.loc[:, numerical_col].isnull().sum()

id                         0
Age                        0
Academic Pressure     112803
Work Pressure          27918
CGPA                  112802
Study Satisfaction    112803
Job Satisfaction       27910
Work/Study Hours           0
Financial Stress           4
Depression                 0
dtype: int64

#### Checking for null counts across all columns

In [11]:
# Build a per-column summary of missing data in `train` (absolute null count and
# percentage of rows that are null), save it as a CSV, and print it.
# `os` and `pd` are provided by the notebook's import cell at the top; imports
# are deliberately not repeated here so all dependencies live in one place.

# Number of null entries in each column (Series indexed by column name).
null_counts = train.isna().sum()

# Same counts expressed as a percentage of the total number of rows.
null_percentage = (null_counts / len(train)) * 100

# One row per column, ordered from most to least missing. The row labels are the
# original column positions, which is why the printed index is not sequential.
null_analysis = pd.DataFrame({
    'Column': null_counts.index,
    'Null Count': null_counts.values,
    'Null Percentage': null_percentage.values
}).sort_values(by='Null Percentage', ascending=False)

# Persist the summary so it can be inspected outside the notebook.
output_dir = "../data/analysis"
os.makedirs(output_dir, exist_ok=True)  # exist_ok: re-running the cell must not fail

output_file = os.path.join(output_dir, "null_analysis.csv")
null_analysis.to_csv(output_file, index=False)

print(null_analysis)


                                   Column  Null Count  Null Percentage
10                     Study Satisfaction      112803        80.172708
7                       Academic Pressure      112803        80.172708
9                                    CGPA      112802        80.171997
6                              Profession       36630        26.034115
8                           Work Pressure       27918        19.842217
11                       Job Satisfaction       27910        19.836532
13                         Dietary Habits           4         0.002843
17                       Financial Stress           4         0.002843
14                                 Degree           2         0.001421
18       Family History of Mental Illness           0         0.000000
16                       Work/Study Hours           0         0.000000
15  Have you ever had suicidal thoughts ?           0         0.000000
0                                      id           0         0.000000
12    