In [None]:
#CS 513: Knowledge Discovery and Data Mining
#Group Members: Danica Lacuesta, Joelle An, and Raj Rana 
#Author: Danica Lacuesta
#20014543
#"I pledge my honor that I have abided by the Stevens Honor System"
#Purpose: This project analyzes digital activity patterns to classify user stress levels and identify the behavioral features that most strongly impact predictive accuracy.

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the dataset from a path relative to the notebook's working directory
# and preview the first rows.
csv_path = "data/df_modified.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Binarize the target: scores of 6 or higher become 1, everything else 0.
# The raw score column is dropped afterwards so only the binary label remains.
is_high_stress = df["Stress_Level(1-10)"] >= 6
df = (
    df
    .assign(Stress_Binary=is_high_stress.astype(int))
    .drop(columns="Stress_Level(1-10)")
)

df.head()

In [None]:
def clip_outliers(df, column, factor=1.5):
    """Clip one column of ``df`` to its interquartile-range fences, in place.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the column's 25th and 75th percentiles. Values outside the
    fences are replaced by the nearest fence; values inside are untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified directly: ``df[column]`` is
        overwritten with the clipped values.
    column : str
        Label of the numeric column to clip.
    factor : float, default 1.5
        Multiple of the IQR added beyond each quartile to place the fences.
        ``0`` clips to the quartiles themselves.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``factor`` is negative, since the lower fence could then end up
        above the upper one.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1, q3 = df[column].quantile([0.25, 0.75])
    margin = factor * (q3 - q1)
    df[column] = df[column].clip(q1 - margin, q3 + margin)

# Columns whose extreme values get pulled in to their IQR fences.
numeric_cols = [
    "Daily_Screen_Time(hrs)",
    "Sleep_Quality(1-10)",
    "Happiness_Index(1-10)",
    "Days_Without_Social_Media",
    "Exercise_Frequency(week)",
    "Age",
]

# clip_outliers overwrites each column on df directly, so nothing is reassigned here.
for col in numeric_cols:
    clip_outliers(df, col)

# Summary statistics of the clipped columns.
df.loc[:, numeric_cols].describe()

In [None]:
# Separate the binary label from the remaining columns, which become the features.
target_name = "Stress_Binary"
y = df[target_name]
X = df.drop(columns=[target_name])

X.head(), y.head()

In [None]:
# Standardize every feature column using statistics computed from all rows of X.
# NOTE(review): because the scaler sees every row, anything later split off from
# X_scaled as a test set has already influenced the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# balance the same in both parts, and the fixed seed makes the split repeatable.
# The split is done on the unscaled features so the held-out rows stay unseen
# by every fitted preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply those same statistics
# to the test rows. Standardizing all rows before splitting would leak the
# test rows' means and variances into training. `scaler` is (re)bound here so
# that it always matches the features the model is trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

X_train.shape, X_test.shape

In [None]:
# Train the classifier; max_iter is raised to 1000 iterations for the solver.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out rows and measure how many match.
log_pred = log_model.predict(X_test)
log_accuracy = accuracy_score(y_test, log_pred)

print("Logistic Regression Accuracy:", log_accuracy)
print("\nClassification Report:")
print(classification_report(y_test, log_pred))


In [None]:
# Counts of test rows by actual label (rows) and predicted label (columns).
cm_log = confusion_matrix(y_test, log_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_log, annot=True, fmt="d", cmap="Purples", ax=ax)
ax.set_title("Logistic Regression Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Pair each fitted coefficient with the name of the feature column it belongs to.
log_coeffs = pd.Series(log_model.coef_[0], index=X.columns)

# Horizontal bars sorted by coefficient value, smallest at the bottom.
fig, ax = plt.subplots(figsize=(8, 8))
log_coeffs.sort_values().plot(kind="barh", ax=ax)
ax.set_title("Influence of Features on Stress Prediction (Logistic Regression)")
ax.set_xlabel("Coefficient Value")
ax.set_ylabel("Feature")
plt.show()

log_coeffs

In [None]:
from sklearn.metrics import roc_curve, auc

# Predicted probability of the positive class (stress = 1) for each test row.
log_proba = log_model.predict_proba(X_test)[:, 1]

# ROC points over the decision thresholds, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, log_proba)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve against the chance-level diagonal.
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve — Logistic Regression")
plt.legend()

# savefig does not create missing folders, so make sure the output directory
# exists; otherwise a fresh checkout fails here with FileNotFoundError.
roc_path = "visuals_logreg/roc_curve.png"
os.makedirs(os.path.dirname(roc_path), exist_ok=True)

# Save before showing: saving after plt.show() can write out an empty figure.
plt.savefig(roc_path, dpi=300, bbox_inches="tight")
plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Precision/recall pairs over the decision thresholds, plus the average
# precision summary score. Note: `thresholds` is rebound here and no longer
# holds the values returned by roc_curve.
precision, recall, thresholds = precision_recall_curve(y_test, log_proba)
avg_precision = average_precision_score(y_test, log_proba)

plt.figure(figsize=(6,5))
plt.plot(recall, precision, label=f"AP = {avg_precision:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision–Recall Curve — Logistic Regression")
plt.legend()

# savefig does not create missing folders, so make sure the output directory
# exists; otherwise a fresh checkout fails here with FileNotFoundError.
pr_path = "visuals_logreg/precision_recall_curve.png"
os.makedirs(os.path.dirname(pr_path), exist_ok=True)

# Save before showing: saving after plt.show() can write out an empty figure.
plt.savefig(pr_path, dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Raw fitted coefficients, labelled with their feature names.
coeffs = pd.Series(log_model.coef_[0], index=X.columns)

# exp(coefficient) is the multiplicative change in the odds of stress = 1 for
# a one-unit increase in that feature. The model was trained on standardized
# features, so "one unit" here means one standard deviation.
odds_ratios = np.exp(coeffs)

plt.figure(figsize=(8,8))
odds_ratios.sort_values().plot(kind="barh")
plt.title("Odds Ratios — Logistic Regression")
plt.xlabel("Odds Ratio (exp(coefficient))")
plt.ylabel("Feature")  # same y-axis label as the coefficient plot

# savefig does not create missing folders, so make sure the output directory
# exists; otherwise a fresh checkout fails here with FileNotFoundError.
odds_path = "visuals_logreg/odds_ratios.png"
os.makedirs(os.path.dirname(odds_path), exist_ok=True)

# Save before showing: saving after plt.show() can write out an empty figure.
plt.savefig(odds_path, dpi=300, bbox_inches="tight")

plt.show()

odds_ratios


In [None]:
# Histogram of the predicted positive-class probabilities on the test rows,
# in 20 bins, to show how confidently the model separates the two classes.
plt.figure(figsize=(6,5))
sns.histplot(log_proba, bins=20, kde=False)
plt.title("Distribution of Predicted Stress Probabilities")
plt.xlabel("Predicted Probability (Stress = 1)")
plt.ylabel("Count")

# savefig does not create missing folders, so make sure the output directory
# exists; otherwise a fresh checkout fails here with FileNotFoundError.
hist_path = "visuals_logreg/probability_histogram.png"
os.makedirs(os.path.dirname(hist_path), exist_ok=True)

# Save before showing: saving after plt.show() can write out an empty figure.
plt.savefig(hist_path, dpi=300, bbox_inches="tight")
plt.show()
