# =============================================================================
# Step 1: Import Libraries and Load Dataset
# =============================================================================


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Read the student-depression CSV from the Kaggle input directory into the
# working DataFrame that every later cell operates on.
df = pd.read_csv(
    '/kaggle/input/student-depression-dataset/Student Depression Dataset.csv'
)

# Quick sanity check: a header line followed by the first rows of the frame.
print("Initial Data Preview:", df.head(), sep="\n")

# =============================================================================
# Step 2: Data Cleaning and Preprocessing
# =============================================================================

In [None]:
# Purpose of this cell: turn the raw survey columns into model-ready dtypes.
#   * Gender / City                      -> pandas 'category'
#   * Yes/No questions                   -> 1 / 0
#   * textual "Sleep Duration" buckets   -> numeric hours (Sleep_Duration_Num)
#   * remaining numeric-looking columns  -> numeric, invalid entries become NaN

# Convert appropriate columns to 'category' types
df['Gender'] = df['Gender'].astype('category')
df['City'] = df['City'].astype('category')

# Rename the column for suicidal thoughts to a simpler name and convert it using a mapping.
# Any answer other than 'Yes'/'No' becomes NaN (Series.map leaves unmapped values as NaN).
df.rename(columns={'Have you ever had suicidal thoughts ?': 'Suicidal_Thoughts'}, inplace=True)
df['Suicidal_Thoughts'] = df['Suicidal_Thoughts'].map({'Yes': 1, 'No': 0})

# Convert Yes/No in 'Family History of Mental Illness' to binary
df['Family History of Mental Illness'] = df['Family History of Mental Illness'].map({'Yes': 1, 'No': 0})

# ----- Clean "Sleep Duration" column -----
# Remove any double quotes and single quotes, plus extra whitespace.
# Whitespace is stripped both before and after the single quotes so that a
# quote surrounded by outer spaces is removed as well.
df['Sleep Duration'] = (
    df['Sleep Duration']
    .str.replace('"', '', regex=False)
    .str.strip()
    .str.strip("'")
    .str.strip()
)

# Debug: Print distinct cleaned values in Sleep Duration
print("\nDistinct values in 'Sleep Duration' after cleaning:")
print(df['Sleep Duration'].unique())

# Convert textual sleep duration descriptions to numeric values using a mapping.
sleep_mapping = {
    'Less than 5 hours': 4.0,
    '5-6 hours': 5.5,
    '7-8 hours': 7.5,
    'More than 8 hours': 9.0
}

# Series.map (rather than Series.replace) is used deliberately: every label
# that is not a key of sleep_mapping -- "Others" as well as any other
# unexpected text -- becomes NaN, so the column is numeric from the start and
# no stray string can break the median / float conversion below.
df['Sleep_Duration_Num'] = df['Sleep Duration'].map(sleep_mapping)

# Fill missing sleep duration values (from "Others" or other mapping issues) with the median.
median_sleep = df['Sleep_Duration_Num'].median()
df['Sleep_Duration_Num'] = df['Sleep_Duration_Num'].fillna(median_sleep)

# Explicitly convert Sleep_Duration_Num to float
df['Sleep_Duration_Num'] = df['Sleep_Duration_Num'].astype(float)

# Convert other numeric columns to proper numeric types.
# errors='coerce' turns unparsable entries into NaN instead of raising.
numeric_cols = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
                'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours',
                'Financial Stress', 'Depression']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Also ensure 'Family History of Mental Illness' is numeric.
df['Family History of Mental Illness'] = pd.to_numeric(df['Family History of Mental Illness'], errors='coerce')

# Display dataset information after cleaning.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Info After Cleaning:")
df.info()
print("\nData Preview After Cleaning:")
print(df.head())

# =============================================================================
# Pre-EDA Data Integrity Checks
# =============================================================================

In [None]:
# 1. Missing-value counts and dtypes, each printed under its own heading.
print("\nMissing Values per Column:", df.isna().sum(), sep="\n")
print("\nData Types:", df.dtypes, sep="\n")

# 2. Descriptive statistics as reported by DataFrame.describe().
print("\nSummary Statistics for Numeric Features:", df.describe(), sep="\n")

# 3. Distribution checks: one grid of histograms, then one grid of boxplots.
numeric_columns = [
    'Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
    'Study Satisfaction', 'Job Satisfaction', 'Sleep_Duration_Num',
    'Work/Study Hours', 'Financial Stress',
    'Family History of Mental Illness', 'Depression'
]


def _draw_histogram(column_name):
    """Draw a 20-bin histogram of one column (NaNs dropped) on the current axes."""
    plt.hist(df[column_name].dropna(), bins=20, edgecolor='black')


def _draw_boxplot(column_name):
    """Draw a vertical boxplot of one column on the current axes."""
    sns.boxplot(y=df[column_name])


# Both figures share the same layout (4x3 grid, one panel per column, column
# name as panel title); only the per-panel drawing routine differs.
for draw_panel in (_draw_histogram, _draw_boxplot):
    plt.figure(figsize=(12, 10))
    for panel_no, column_name in enumerate(numeric_columns, start=1):
        plt.subplot(4, 3, panel_no)
        draw_panel(column_name)
        plt.title(column_name)
    plt.tight_layout()
    plt.show()

# 4. Preliminary correlation matrix over the numeric columns listed above,
#    rendered as an annotated heatmap.
plt.figure(figsize=(10, 8))
corr_pre = df.loc[:, numeric_columns].corr()
sns.heatmap(data=corr_pre, annot=True, cmap='coolwarm')
plt.title('Preliminary Correlation Matrix')
plt.show()


# =============================================================================
# Step 3: Exploratory Data Analysis (EDA)
# =============================================================================

In [None]:
# Univariate analysis: one frequency histogram per feature, each in its own
# figure. The second tuple element is the number of bins for that feature.
for feature, bin_count in (('CGPA', 20), ('Age', 15)):
    plt.figure(figsize=(8, 4))
    plt.hist(df[feature].dropna(), bins=bin_count, edgecolor='black')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.title(f'Distribution of {feature}')
    plt.show()

# Bivariate analysis: scatter of Academic Pressure (x) against CGPA (y).
x_feature, y_feature = 'Academic Pressure', 'CGPA'
plt.figure(figsize=(8, 4))
plt.scatter(df[x_feature], df[y_feature])
plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.title(f'{x_feature} vs. {y_feature}')
plt.show()

# Correlation analysis: pairwise correlations among the key numeric features,
# drawn as an annotated heatmap.
numeric_features_eda = [
    'Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
    'Study Satisfaction', 'Job Satisfaction', 'Sleep_Duration_Num',
    'Work/Study Hours', 'Financial Stress',
    'Family History of Mental Illness', 'Depression'
]

corr_matrix = df.loc[:, numeric_features_eda].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# =============================================================================
# Step 4: Data Synthesizing (Feature Engineering)
# =============================================================================

In [None]:
# Purpose of this cell: derive two composite features used by the clustering
# and modelling steps.
#   * Stress_Score      - mean of Academic Pressure and Work Pressure
#   * Lifestyle_Balance - weighted sum of standardized sleep / satisfaction scores

# a. Create a "Stress_Score" by combining Academic Pressure and Work Pressure.
df['Stress_Score'] = (df['Academic Pressure'] + df['Work Pressure']) / 2

# b. Synthesize a "Lifestyle_Balance" metric using Sleep Duration, Study Satisfaction, and Job Satisfaction.
lifestyle_features = ['Sleep_Duration_Num', 'Study Satisfaction', 'Job Satisfaction']

# Standardize the three inputs so the weights below act on comparable scales.
scaler = StandardScaler()
lifestyle_scaled = scaler.fit_transform(df[lifestyle_features])

# Create a DataFrame for scaled values with the same index as df.
lifestyle_scaled_df = pd.DataFrame(
    lifestyle_scaled,
    columns=[f'{feat}_scaled' for feat in lifestyle_features],
    index=df.index
)

# Attach the scaled columns to df by assignment rather than pd.concat.
# Assignment overwrites a column that already exists, so re-running this cell
# does not create duplicate '<feature>_scaled' labels (duplicates would make
# df['..._scaled'] return a DataFrame and break the weighted sum below).
for scaled_name in lifestyle_scaled_df.columns:
    df[scaled_name] = lifestyle_scaled_df[scaled_name]

# Create the composite Lifestyle Balance metric with adjusted weights:
df['Lifestyle_Balance'] = (
    0.4 * df['Sleep_Duration_Num_scaled'] +
    0.3 * df['Study Satisfaction_scaled'] +
    0.3 * df['Job Satisfaction_scaled']
)

print("\nNew columns created in Step 4:")
print(df[['Stress_Score', 'Lifestyle_Balance']].head())

# =============================================================================
# Step 5: Advanced Analysis — Clustering and Cluster Profiling
# =============================================================================

In [None]:


# Purpose of this block: partition the students into three K-Means clusters
# and project the clustering features onto two principal components so the
# clusters can be drawn on a 2-D scatter plot.

# Select features for clustering (including synthesized features)
cluster_features = ['Age', 'CGPA', 'Sleep_Duration_Num', 'Stress_Score', 'Lifestyle_Balance']

# NOTE(review): the features are clustered on their raw scales, so the ones
# with the largest numeric spread dominate the Euclidean distance. Standardizing
# first would change the partition, and with it the meaning of cluster ids
# 0/1/2 used for labelling further down -- confirm before changing.

# Perform K-Means clustering.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so relying on the default gives a different number of
# restarts (and potentially a different partition / cluster numbering)
# depending on the installed version, and emits a FutureWarning on 1.2-1.3.
# Presumably the hard-coded cluster labels were derived under the old
# default of 10 -- TODO confirm.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(df[cluster_features])

# Use PCA to reduce dimensions to 2 for visualization.
pca = PCA(n_components=2)
components = pca.fit_transform(df[cluster_features])
df['Component1'] = components[:, 0]
df['Component2'] = components[:, 1]

# Plot the PCA scatter plot with clusters (points coloured by cluster id).
plt.figure(figsize=(8, 6))
scatter = plt.scatter(df['Component1'], df['Component2'], c=df['Cluster'], cmap='viridis', alpha=0.6)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('Clusters Based on Synthesized & Adjusted Features')
plt.colorbar(scatter, ticks=[0, 1, 2])
plt.show()

# =======================
# Cluster Profiling
# =======================
# Group once by cluster id and reuse the grouped view for both summaries.
features_by_cluster = df.groupby('Cluster')[cluster_features]

# Per-cluster means of the clustering features.
cluster_profile_mean = features_by_cluster.mean()
print("Cluster Profiling (Mean Values):", cluster_profile_mean, sep="\n")

# Per-cluster standard deviations of the clustering features.
cluster_profile_std = features_by_cluster.std()
print("\nCluster Profiling (Standard Deviations):", cluster_profile_std, sep="\n")

# -----------------------
# Domain-Specific Labeling
# -----------------------
# Human-readable names chosen from the printed profiles:
#   - Cluster 0 shows high Stress_Score, moderate Sleep_Duration_Num, and high CGPA;
#   - Cluster 1 shows moderate Stress_Score, high Sleep_Duration_Num, and lower CGPA;
#   - Cluster 2 shows lower Stress_Score, moderate Sleep_Duration_Num, and moderate CGPA.
cluster_labels = {
    0: "High Stress / Moderate Sleep / High CGPA",
    1: "Moderate Stress / High Sleep / Lower CGPA",
    2: "Low Stress / Moderate Sleep / Moderate CGPA"
}

# Store the readable label next to each row's numeric cluster id.
df['Cluster_Label'] = df['Cluster'].map(cluster_labels)

print("\nSample with Cluster Labels:", df[['Cluster', 'Cluster_Label']].head(), sep="\n")

# Redraw the PCA scatter with one series per cluster so the legend can carry
# the domain-specific labels; ids without a label fall back to "Cluster <id>".
plt.figure(figsize=(8, 6))
for clus in sorted(df['Cluster'].unique()):
    members = df.loc[df['Cluster'] == clus]
    legend_text = cluster_labels.get(clus, f"Cluster {clus}")
    plt.scatter(members['Component1'], members['Component2'],
                label=legend_text, alpha=0.6)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('Clusters with Domain-Specific Labels')
plt.legend()
plt.show()

# =============================================================================
# Step 6: Predictive Modeling with Logistic Regression
# =============================================================================

In [None]:
# Purpose of this cell: fit a logistic regression that predicts the binary
# 'Depression' column from the original and synthesized features, then report
# accuracy and a per-class classification report on a held-out 20% test set.

# Define predictor columns (original + synthesized features).
predictor_cols = [
    'Age',
    'Academic Pressure',
    'Work Pressure',
    'CGPA',
    'Study Satisfaction',
    'Job Satisfaction',
    'Sleep_Duration_Num',
    'Stress_Score',
    'Lifestyle_Balance',
    'Work/Study Hours',
    'Financial Stress',
    'Family History of Mental Illness'
]

X = df[predictor_cols]
y = df['Depression']  #(0 or 1)

# Split the dataset into training and testing sets BEFORE imputing, so that
# no statistic computed from test rows influences the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute any remaining missing values using the median of the training
# partition only; the same training medians are applied to the test partition
# to avoid leaking test-set information into the model inputs.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Build and train the logistic regression model.
# max_iter is raised above the library default to give the solver room to
# converge on these unscaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate model performance.
print("Logistic Regression Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


**In this analysis, a logistic regression model was developed to predict the binary outcome of student depression. The final model achieved an accuracy of approximately 78.6%, with class-specific performance indicating a precision of 80% and a recall of 84% for the depressed class.**

**These results suggest that the model reliably identifies students at risk while maintaining a balanced performance across classes. Additionally, K-Means clustering was applied using key features (Age, CGPA, Sleep Duration, Stress Score, and Lifestyle Balance) and visualized via PCA, enabling the segmentation of the student population into three distinct groups. This clustering facilitates the identification of inherent patterns within the data that may inform targeted intervention strategies.**

**Overall, the integration of unsupervised clustering with supervised logistic regression provides a complementary framework for both profiling and predictive analysis, offering valuable insights for early detection and potential intervention in student depression.**
