In [None]:
import pandas as pd

# Remote location of the raw student-depression CSV.
file_url = "https://raw.githubusercontent.com/fufi-tuti/dataMiningprj/main/Dataset/Student_Depression_Dataset.csv"

# The file separates its fields with semicolons, so the separator is passed
# explicitly (`sep` is the primary spelling of the `delimiter` option).
df = pd.read_csv(file_url, sep=";")

# Print the leading five rows as a quick check that the columns parsed correctly.
print(df.head(5))



   id  Gender  Age           City Profession  Academic Pressure  \
0   2    Male   33  Visakhapatnam    Student                  5   
1   8  Female   24      Bangalore    Student                  2   
2  26    Male   31       Srinagar    Student                  3   
3  30  Female   28       Varanasi    Student                  3   
4  32  Female   25         Jaipur    Student                  4   

   Work Pressure  CGPA  Study Satisfaction  Job Satisfaction  \
0              0  8.97                   2                 0   
1              0  5.90                   5                 0   
2              0  7.03                   5                 0   
3              0  5.59                   2                 0   
4              0  8.13                   3                 0   

      Sleep Duration Dietary Habits   Degree  \
0          5-6 hours        Healthy  B.Pharm   
1          5-6 hours       Moderate      BSc   
2  Less than 5 hours        Healthy       BA   
3          7-8 hours

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Trim stray whitespace around the column labels so the names listed below
# match the frame's columns exactly.
df.columns = df.columns.str.strip()

# Columns that get rescaled.
numerical_features = [
    'Age',
    'Academic Pressure',
    'Work Pressure',
    'CGPA',
    'Study Satisfaction',
    'Job Satisfaction',
    'Work/Study Hours',
]

# Min-max scaling with the scaler's default settings: learn each column's
# range first, then overwrite the columns with their rescaled values.
scaler = MinMaxScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

# Announce completion and preview the transformed frame.
print("✅ Normalization Complete! Here are the first 5 rows:", df.head(), sep="\n")


✅ Normalization Complete! Here are the first 5 rows:
   id  Gender       Age           City Profession  Academic Pressure  \
0   2    Male  0.365854  Visakhapatnam    Student                1.0   
1   8  Female  0.146341      Bangalore    Student                0.4   
2  26    Male  0.317073       Srinagar    Student                0.6   
3  30  Female  0.243902       Varanasi    Student                0.6   
4  32  Female  0.170732         Jaipur    Student                0.8   

   Work Pressure   CGPA  Study Satisfaction  Job Satisfaction  \
0            0.0  0.897                 0.4               0.0   
1            0.0  0.590                 1.0               0.0   
2            0.0  0.703                 1.0               0.0   
3            0.0  0.559                 0.4               0.0   
4            0.0  0.813                 0.6               0.0   

      Sleep Duration Dietary Habits   Degree  \
0          5-6 hours        Healthy  B.Pharm   
1          5-6 hours       

In [None]:
# Trim whitespace at both ends of every column label, then print the
# resulting Index so the cleanup can be checked by eye.
clean_labels = df.columns.str.strip()
df.columns = clean_labels
print(df.columns)


Index(['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression'],
      dtype='object')


In [None]:
# Print every column label as a plain Python list (no truncation or dtype footer).
print(list(df.columns))


['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness', 'Depression']


In [None]:
# Trim whitespace at both ends of every column label.
df.columns = df.columns.str.strip()
# Collapse every run of two or more spaces into a single space. A literal
# "  " -> " " replacement makes only one left-to-right pass, so a label with
# three consecutive spaces would still be left with a double space. `regex`
# is passed explicitly because its default differs between pandas versions.
df.columns = df.columns.str.replace(r" {2,}", " ", regex=True)
print(df.columns.tolist())  # Verify the cleaned labels


['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness', 'Depression']


In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

# ✅ 1. Label Encoding for Binary Features
# NOTE(review): 'Financial Stress' is label-encoded together with the two-level
# features — confirm it really has only two levels; otherwise it belongs with
# the ordinal features below.
binary_features = ['Gender', 'Have you ever had suicidal thoughts ?', 'Financial Stress',
                   'Family History of Mental Illness', 'Depression']

# Fit a separate encoder per column and keep each one, so the integer codes of
# every feature can still be translated back through
# `label_encoders[name].classes_`. Refitting one shared encoder would keep only
# the classes of the last column. `label_encoder` stays bound to the encoder of
# the last feature, as before.
label_encoders = {}
for feature in binary_features:
    label_encoder = LabelEncoder()
    df[feature] = label_encoder.fit_transform(df[feature])
    label_encoders[feature] = label_encoder

# ✅ 2. Ordinal Encoding for Ordered Categories
sleep_duration_mapping = {'Less than 5 hours': 0, '5-6 hours': 1, '7-8 hours': 2, 'More than 8 hours': 3}
degree_mapping = {'Class 12': 0, 'B.Sc': 1, 'M.Tech': 2, 'Ph.D': 3}
# TODO(review): degrees outside these four (e.g. 'B.Pharm' and 'BA', visible in
# the raw preview) have no ordinal level assigned and therefore come out as
# NaN; extend `degree_mapping` once their levels are decided.


def _category_key(label):
    """Return `label` as a lookup key: whitespace and dots removed, case-folded.

    This makes spelling variants of one category (e.g. 'B.Sc' and 'BSc')
    resolve to the same key.
    """
    return "".join(str(label).split()).replace(".", "").casefold()


def _ordinal_encode(values, mapping):
    """Translate category labels into ordinal codes.

    Parameters:
        values: Series of category labels.
        mapping: dict from category label to ordinal code.

    Returns:
        Series aligned with `values` holding the mapped codes; labels without
        an entry in `mapping` become NaN and are reported on stdout instead of
        disappearing silently.
    """
    lookup = {_category_key(label): code for label, code in mapping.items()}
    encoded = values.map(_category_key).map(lookup)
    # Rows that had a label but received no code (already-missing rows are not reported).
    unmapped = values[encoded.isna() & values.notna()]
    if not unmapped.empty:
        labels = sorted(unmapped.astype(str).unique())
        print(f"⚠️ '{values.name}': {len(unmapped)} value(s) not in the mapping were left as NaN: {labels}")
    return encoded


df['Sleep Duration'] = _ordinal_encode(df['Sleep Duration'], sleep_duration_mapping)
df['Degree'] = _ordinal_encode(df['Degree'], degree_mapping)

# ✅ 3. One-Hot Encoding for Cities and Dietary Habits
df = pd.get_dummies(df, columns=['City', 'Dietary Habits'])

# ✅ Display Transformed Dataset
print("✅ Categorical Encoding Complete! Here are the first 5 rows:")
print(df.head())


✅ Categorical Encoding Complete! Here are the first 5 rows:
   id  Gender       Age Profession  Academic Pressure  Work Pressure   CGPA  \
0   2       1  0.365854    Student                1.0            0.0  0.897   
1   8       0  0.146341    Student                0.4            0.0  0.590   
2  26       1  0.317073    Student                0.6            0.0  0.703   
3  30       0  0.243902    Student                0.6            0.0  0.559   
4  32       0  0.170732    Student                0.8            0.0  0.813   

   Study Satisfaction  Job Satisfaction  Sleep Duration  ...  City_Thane  \
0                 0.4               0.0             1.0  ...       False   
1                 1.0               0.0             1.0  ...       False   
2                 1.0               0.0             0.0  ...       False   
3                 0.4               0.0             2.0  ...       False   
4                 0.6               0.0             1.0  ...       False   

   City_

In [None]:
import pandas as pd
import numpy as np

# Restrict the frame to the columns that have a numeric dtype.
numerical_df = df.select_dtypes(include='number')

# Skewness of each numeric column.
skew_values = numerical_df.skew()

# Report the skewness of every column.
print("Skewness of Numerical Columns:", skew_values, sep="\n")

# List only the columns whose skewness lies outside the interval [-1, 1].
print("\nRecommended for Discretization (Skew > 1 or Skew < -1):")
print(skew_values[(skew_values > 1) | (skew_values < -1)])


Skewness of Numerical Columns:
id                                        -0.005380
Gender                                    -0.230407
Age                                        0.132239
Academic Pressure                         -0.135165
Work Pressure                            108.594361
CGPA                                      -0.113063
Study Satisfaction                         0.010423
Job Satisfaction                          74.105663
Sleep Duration                             0.081650
Degree                                     2.029520
Have you ever had suicidal thoughts ?     -0.551059
Work/Study Hours                          -0.454769
Financial Stress                          -0.130023
Family History of Mental Illness           0.064192
Depression                                -0.347126
dtype: float64

Recommended for Discretization (Skew > 1 or Skew < -1):
Work Pressure       108.594361
Job Satisfaction     74.105663
Degree                2.029520
dtype: float64


In [None]:
import pandas as pd
import numpy as np

# Cut each of these columns into 4 equal-width bins labelled 1..4. A column
# with at most one distinct value cannot be split into bins, so it is left
# unchanged and a notice is printed instead.
for column in ('Work Pressure', 'Job Satisfaction'):
    if df[column].nunique() > 1:
        df[column] = pd.cut(df[column], bins=4, labels=[1, 2, 3, 4], duplicates='drop')
    else:
        print(f"⚠️ Skipping '{column}' discretization as all values are the same.")

# Manual binning for 'Degree' since it's ordinal: each code 0..3 falls into its
# own interval and keeps its value as the bin label; the result is categorical.
df['Degree'] = pd.cut(df['Degree'], bins=[-1, 0.5, 1.5, 2.5, 3.5], labels=[0, 1, 2, 3])

# Fill missing 'Degree' values with the most common category. The mode is
# always one of the existing categories, so no category has to be added before
# filling. When the column has no non-missing value at all there is no mode,
# so the fill is skipped rather than failing on an empty result.
degree_modes = df['Degree'].mode()
if degree_modes.empty:
    print("⚠️ Skipping 'Degree' imputation as the column has no non-missing values.")
else:
    degree_mode = degree_modes.iloc[0]  # Most common category
    df['Degree'] = df['Degree'].fillna(degree_mode)

# Verify results
print("✅ Discretization Completed Successfully! Preview:")
print(df[['Work Pressure', 'Job Satisfaction', 'Degree']].head())


✅ Discretization Completed Successfully! Preview:
  Work Pressure Job Satisfaction Degree
0             1                1      0
1             1                1      0
2             1                1      0
3             1                1      0
4             1                1      2


In [None]:
import pandas as pd
from scipy import stats

# Columns screened for outliers.
outlier_columns = ['Work Pressure', 'Job Satisfaction', 'Degree']

# Convert the columns to numeric (if they are not already); values that cannot
# be parsed become NaN.
for column in outlier_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Drop any rows still holding NaN in one of the screened columns.
df = df.dropna(subset=outlier_columns)

# Z-score of every value within its own column.
z_scores = stats.zscore(df[outlier_columns])

# Keep the rows whose z-scores all lie strictly inside (-3, 3). A column with
# zero variance produces NaN z-scores, and `NaN < 3` is False, which would
# discard every row; such values carry no outlier information, so NaN is
# treated as "within range".
z_frame = pd.DataFrame(z_scores, index=df.index, columns=outlier_columns)
within_range = z_frame.abs().lt(3) | z_frame.isna()
df_cleaned = df[within_range.all(axis=1)]

# Print the number of removed noisy data points
print(f"✅ Removed {len(df) - len(df_cleaned)} noisy data points.")

# Update the dataset
df = df_cleaned

# Verify results
print("✅ Cleaned Data Preview:")
print(df[outlier_columns].head())


✅ Removed 1029 noisy data points.
✅ Cleaned Data Preview:
   Work Pressure  Job Satisfaction  Degree
0              1                 1     0.0
1              1                 1     0.0
2              1                 1     0.0
3              1                 1     0.0
5              1                 1     0.0


In [None]:
# Write the processed frame to a CSV file in the working directory, leaving
# out the row index, then confirm on stdout.
df.to_csv(path_or_buf="Processed_Student_Depression_Dataset.csv", index=False)
print("✅ Processed dataset saved successfully!")


✅ Processed dataset saved successfully!
