# In [1]:
import os
from itertools import combinations
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu

# ============================================
# 1) FILE PATH (EDIT THIS)
# After you upload the CSV to Colab, it will be in /content/
# ============================================

# Parent of the current working directory. Used as a second place to look for
# the data folder (e.g. when the notebook is run from a sub-folder of the
# project) -- NOTE(review): confirm this matches the actual project layout.
BASE_DIR = Path.cwd().parent

# Location of the raw CSV relative to whichever root contains the data folder.
_RAW_RELATIVE = Path("data") / "raw" / "academic_stress.csv"

# Default to the original CWD-relative path so the value (and any resulting
# error message) is unchanged when the file cannot be found anywhere.
RAW_PATH = "data/raw/academic_stress.csv"

# Prefer the CWD-relative location; fall back to BASE_DIR so that launching
# from a sub-folder no longer fails with FileNotFoundError.
for _candidate in (_RAW_RELATIVE, BASE_DIR / _RAW_RELATIVE):
    if _candidate.is_file():
        RAW_PATH = str(_candidate)
        break

# ============================================
# 2) OUTPUT FOLDERS
# ============================================
# Folders (relative to the working directory) that receive the cleaned data
# and the analysis artefacts.
OUT_PROCESSED_DIR = "data/processed"
OUT_ANALYSIS_DIR  = "data/analysis"

# Create both folders up front; existing folders are left untouched.
for _out_dir in (OUT_PROCESSED_DIR, OUT_ANALYSIS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Cleaned dataset.
CLEAN_CSV_PATH = os.path.join(OUT_PROCESSED_DIR, "student_stress_clean.csv")

# Analysis outputs: descriptive table, omnibus test, post-hoc table, figure.
DESC_TABLE_PATH = os.path.join(
    OUT_ANALYSIS_DIR, "peer_pressure_by_environment.csv"
)
KRUSKAL_TXT_PATH = os.path.join(OUT_ANALYSIS_DIR, "kruskal_results.txt")
POSTHOC_CSV_PATH = os.path.join(
    OUT_ANALYSIS_DIR, "posthoc_mannwhitney_bonferroni.csv"
)
BOXPLOT_PNG_PATH = os.path.join(
    OUT_ANALYSIS_DIR, "boxplot_peer_pressure_by_environment.png"
)

# ============================================
# 3) LOAD DATA
# ============================================
# Read the raw CSV into a DataFrame.
df = pd.read_csv(RAW_PATH)

# ============================================
# 4) CLEAN COLUMN NAMES
# ============================================
# Normalise the headers one step at a time:
#   1. cast to str, trim surrounding whitespace, lower-case
#   2. drop every character that is neither a word character nor whitespace
#   3. collapse each whitespace run into a single underscore
normalized_columns = df.columns.astype(str).str.strip().str.lower()
normalized_columns = normalized_columns.str.replace(r"[^\w\s]", "", regex=True)
normalized_columns = normalized_columns.str.replace(r"\s+", "_", regex=True)
df.columns = normalized_columns

# ============================================
# 5) KEEP ONLY THE COLUMNS WE NEED
# ============================================
# Work on a copy restricted to the two analysis columns so the full frame
# `df` is left untouched.
dfa = df[["peer_pressure", "study_environment"]].copy()

# Convert peer_pressure to numeric; unparseable entries become NaN and are
# dropped below.
dfa["peer_pressure"] = pd.to_numeric(dfa["peer_pressure"], errors="coerce")

# Clean study_environment (categorical): trim, lower-case, and collapse
# internal whitespace runs to a single space.
raw_env = dfa["study_environment"]
clean_env = (
    raw_env.astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

# astype(str) turns real missing values into the text "nan", which dropna()
# cannot see. Restore them as NaN, and also treat labels that are blank after
# stripping as missing so they do not form a spurious "" group downstream.
has_label = raw_env.notna() & ~clean_env.isin(["", "nan"])
dfa["study_environment"] = clean_env.where(has_label)

# Remove rows with a missing score or a missing environment label.
dfa = dfa.dropna(subset=["peer_pressure", "study_environment"])

# Ensure peer_pressure is between 1 and 5 (both ends included).
dfa = dfa[dfa["peer_pressure"].between(1, 5, inclusive="both")]

# Save clean dataset
dfa.to_csv(CLEAN_CSV_PATH, index=False)

# ============================================
# 6) DESCRIPTIVE STATS BY ENVIRONMENT
# ============================================
# Peer-pressure scores grouped by study environment.
pressure_by_env = dfa.groupby("study_environment")["peer_pressure"]

# One row per environment with count / median / mean / std, ordered from the
# highest mean to the lowest.
desc = pressure_by_env.agg(count="count", median="median", mean="mean", std="std")
desc = desc.reset_index()
desc = desc.sort_values("mean", ascending=False)

# Persist the table, then echo it to the console.
desc.to_csv(DESC_TABLE_PATH, index=False)
print("Descriptive table saved:", DESC_TABLE_PATH)
print(desc)

# ============================================
# 7) BOXPLOT
# ============================================
# Create the figure/axes explicitly and hand the axes to pandas. Without
# `ax=`, DataFrame.boxplot(by=...) opens its own figure, which leaves a blank
# 8x5 figure behind and ignores the requested size.
fig, ax = plt.subplots(figsize=(8, 5))
dfa.boxplot(column="peer_pressure", by="study_environment", ax=ax)

# pandas sets its own axes title and a "Boxplot grouped by ..." suptitle;
# replace the former and blank out the latter.
ax.set_title("Peer Pressure by Study Environment")
fig.suptitle("")
ax.set_xlabel("Study Environment")
ax.set_ylabel("Peer Pressure (1–5)")
fig.tight_layout()

# Save before show(): some backends release the figure once it is displayed.
fig.savefig(BOXPLOT_PNG_PATH)
plt.show()
print("Boxplot saved:", BOXPLOT_PNG_PATH)

# ============================================
# 8) KRUSKAL-WALLIS TEST (non-parametric)
# ============================================
# One array of peer-pressure scores per study environment.
groups = [g["peer_pressure"].values for _, g in dfa.groupby("study_environment")]

n = len(dfa)
k = dfa["study_environment"].nunique()

# The omnibus test is undefined for fewer than two groups; fail with a
# message that names the actual problem.
if k < 2:
    raise ValueError(
        f"Kruskal-Wallis needs at least two study_environment groups, found {k}."
    )

H, p_value = kruskal(*groups)

# Effect sizes for Kruskal-Wallis:
#   epsilon-squared     = H / (n - 1)
#   eta-squared (H)     = (H - k + 1) / (n - k)
# The second expression was previously reported under the epsilon-squared
# label; it is kept here under its correct name so earlier values can still
# be reproduced.
epsilon_squared = H / (n - 1) if n > 1 else float("nan")
eta_squared_h = (H - k + 1) / (n - k) if (n > k) else float("nan")

with open(KRUSKAL_TXT_PATH, "w", encoding="utf-8") as f:
    f.write("Kruskal-Wallis Test\n")
    f.write(f"H statistic: {H}\n")
    f.write(f"p-value: {p_value}\n")
    f.write(f"Epsilon-squared: {epsilon_squared}\n")
    f.write(f"Eta-squared (H): {eta_squared_h}\n")

print("\nKruskal-Wallis test")
print("H:", H)
print("p-value:", p_value)
print("Effect size (epsilon-squared):", epsilon_squared)
print("Effect size (eta-squared, H-based):", eta_squared_h)
print("Saved stats:", KRUSKAL_TXT_PATH)

# ============================================
# 9) POST-HOC: MANN-WHITNEY + BONFERRONI
# ============================================
# Environments in order of first appearance; this fixes which member of each
# pair is reported as group_1 vs group_2.
envs = list(dfa["study_environment"].unique())

# Slice each group's scores once instead of re-filtering the frame per pair.
samples = {
    env: dfa.loc[dfa["study_environment"] == env, "peer_pressure"]
    for env in envs
}

# Two-sided Mann-Whitney U test for every unordered pair of environments.
pairs = []
for e1, e2 in combinations(envs, 2):
    U, p = mannwhitneyu(samples[e1], samples[e2], alternative="two-sided")
    pairs.append({"group_1": e1, "group_2": e2, "U": U, "p_value": p})

# Declare the columns explicitly so the table is well-formed (and the
# correction below does not raise KeyError) when there are no pairs at all.
posthoc = pd.DataFrame(pairs, columns=["group_1", "group_2", "U", "p_value"])

# Bonferroni correction: multiply by the number of comparisons, cap at 1.
m = len(posthoc)
posthoc["p_adj_bonferroni"] = (
    posthoc["p_value"].astype(float) * m
).clip(upper=1.0)

posthoc = posthoc.sort_values("p_adj_bonferroni")
posthoc.to_csv(POSTHOC_CSV_PATH, index=False)

print("\nPost-hoc Mann–Whitney (Bonferroni corrected)")
print(posthoc)
print("Saved post-hoc:", POSTHOC_CSV_PATH)

print("\n✅ Done.")


# Recorded cell output:
# FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/academic_stress.csv'