In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

df = pd.read_csv('data/cleaned_student_depression_dataset.csv')

In [67]:
# print(df['Degree'].unique())
print(df['City'].unique())

degree_category_map = {
    "Class 12": "School",
    "BA": "Arts_Commerce_Management",
    "MA": "Arts_Commerce_Management",
    "B.Com": "Arts_Commerce_Management",
    "M.Com": "Arts_Commerce_Management",
    "BBA": "Arts_Commerce_Management",
    "MBA": "Arts_Commerce_Management",
    "BSc": "Science_IT",
    "MSc": "Science_IT",
    "BCA": "Science_IT",
    "MCA": "Science_IT",
    "BE": "Engineering_Architecture",
    "B.Tech": "Engineering_Architecture",
    "ME": "Engineering_Architecture",
    "M.Tech": "Engineering_Architecture",
    "B.Arch": "Engineering_Architecture",
    "MBBS": "Medical_Pharma",
    "MD": "Medical_Pharma",
    "B.Pharm": "Medical_Pharma",
    "M.Pharm": "Medical_Pharma",
    "LLB": "Law_Education_Hospitality",
    "LLM": "Law_Education_Hospitality",
    "B.Ed": "Law_Education_Hospitality",
    "M.Ed": "Law_Education_Hospitality",
    "BHM": "Law_Education_Hospitality",
    "MHM": "Law_Education_Hospitality",
    "PhD": "Doctorate_Others",
    "Others": "Doctorate_Others"
}

city_to_region = {
    
    'Bangalore': 'South',
    'Chennai': 'South',
    'Hyderabad': 'South',
    'Visakhapatnam': 'South',


    'Srinagar': 'North',
    'Varanasi': 'North',
    'Jaipur': 'North',
    'Lucknow': 'North',
    'Meerut': 'North',
    'Agra': 'North',
    'Ghaziabad': 'North',
    'Faridabad': 'North',
    'Delhi': 'North',
    'Kanpur': 'North',
    'Patna': 'North',
    'Ludhiana': 'North',


    'Mumbai': 'West',
    'Thane': 'West',
    'Pune': 'West',
    'Nashik': 'West',
    'Vadodara': 'West',
    'Kalyan': 'West',
    'Rajkot': 'West',
    'Ahmedabad': 'West',
    'Surat': 'West',
    'Indore': 'West',  

    
    'Kolkata': 'East',
    
    'Bhopal': 'Central',

    'Unknown': 'Unknown'
}

['Visakhapatnam' 'Bangalore' 'Srinagar' 'Varanasi' 'Jaipur' 'Pune' 'Thane'
 'Chennai' 'Nagpur' 'Nashik' 'Vadodara' 'Kalyan' 'Rajkot' 'Ahmedabad'
 'Kolkata' 'Mumbai' 'Lucknow' 'Indore' 'Surat' 'Ludhiana' 'Bhopal'
 'Meerut' 'Agra' 'Ghaziabad' 'Hyderabad' 'Vasai-Virar' 'Kanpur' 'Patna'
 'Faridabad' 'Delhi' 'Unknown']


In [68]:
# Prepare 3 datasets to try logistic regression on to figure out how to deal with City and Degree. Base - drop region, Grouped - group and one hot encode, Freq - use frequency encoding
df_base = df.copy()
df_grouped = df.copy()
df_freq = df.copy()

df_base = df_base.drop(columns=['City', 'Degree'])

In [69]:
# Map Degree and Region into broader groups
df_grouped['Grouped_Degree']=df_grouped['Degree'].map(degree_category_map)
#print(df_grouped['Grouped_Degree'].value_counts())
df_grouped = df_grouped.drop(columns=['Degree'])

df_grouped['Region']=df_grouped['City'].map(city_to_region)
#print(df_grouped['Region'].value_counts())
df_grouped = df_grouped.drop(columns=['City'])

# One hot encode for each of the grouped variables
df_grouped = pd.get_dummies(
    df_grouped,
    columns=['Grouped_Degree', 'Region'],
    drop_first=True,
    dtype=int
)
df_grouped.head()

Unnamed: 0,Age,Academic Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Work/Study Hours,Financial Stress,Depression,Sex,...,Grouped_Degree_Doctorate_Others,Grouped_Degree_Engineering_Architecture,Grouped_Degree_Law_Education_Hospitality,Grouped_Degree_Medical_Pharma,Grouped_Degree_Science_IT,Region_East,Region_North,Region_South,Region_Unknown,Region_West
0,33.0,5.0,8.97,2.0,1,2,3.0,1.0,1,1,...,0,0,0,1,0,0,0,1,0,0
1,24.0,2.0,5.9,5.0,1,1,3.0,2.0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,31.0,3.0,7.03,5.0,0,2,9.0,1.0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,28.0,3.0,5.59,2.0,2,1,4.0,5.0,1,0,...,0,0,0,0,1,0,1,0,0,0
4,25.0,4.0,8.13,3.0,1,1,1.0,1.0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [70]:
#Frequency encode Degree and Region
degree_freq = df_freq['Degree'].value_counts(normalize=True)
df_freq['Degree_freq'] = df_freq['Degree'].map(degree_freq)

city_freq = df_freq['City'].value_counts(normalize=True)
df_freq['City_freq'] = df_freq['City'].map(city_freq)

df_freq = df_freq.drop(columns=['Degree', 'City'])
df_freq.head()

Unnamed: 0,Age,Academic Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Work/Study Hours,Financial Stress,Depression,Sex,Suicidal_Thoughts,Family_Mental_Illness,Degree_freq,City_freq
0,33.0,5.0,8.97,2.0,1,2,3.0,1.0,1,1,1,0,0.029026,0.034738
1,24.0,2.0,5.9,5.0,1,1,3.0,2.0,0,0,0,1,0.03172,0.027481
2,31.0,3.0,7.03,5.0,0,2,9.0,1.0,0,1,0,1,0.02141,0.049215
3,28.0,3.0,5.59,2.0,2,1,4.0,5.0,1,0,1,1,0.05137,0.024572
4,25.0,4.0,8.13,3.0,1,1,1.0,1.0,0,0,1,0,0.036642,0.037109


In [71]:
def train_logistic(df, target='Depression'):
    X = df.drop(columns=[target])
    y = df[target]
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    model = LogisticRegression(max_iter=1000, class_weight='balanced')
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    
    return model

In [74]:
print("=== BASELINE MODEL ===")
model_baseline = train_logistic(df_base)


=== BASELINE MODEL ===
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.84      0.82      2310
           1       0.88      0.85      0.87      3258

    accuracy                           0.85      5568
   macro avg       0.84      0.85      0.84      5568
weighted avg       0.85      0.85      0.85      5568

Confusion Matrix:
 [[1942  368]
 [ 479 2779]]


In [75]:
print("\n=== GROUPED MODEL ===")
model_grouped = train_logistic(df_grouped)


=== GROUPED MODEL ===
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.84      0.82      2310
           1       0.88      0.85      0.87      3258

    accuracy                           0.85      5568
   macro avg       0.84      0.85      0.84      5568
weighted avg       0.85      0.85      0.85      5568

Confusion Matrix:
 [[1940  370]
 [ 475 2783]]


In [77]:
print("\n=== FREQUENCY ENCODING MODEL ===")
model_freq = train_logistic(df_freq)


=== FREQUENCY ENCODING MODEL ===
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.84      0.82      2310
           1       0.88      0.85      0.87      3258

    accuracy                           0.85      5568
   macro avg       0.84      0.85      0.84      5568
weighted avg       0.85      0.85      0.85      5568

Confusion Matrix:
 [[1941  369]
 [ 477 2781]]
