# Foreign Student Mental Health ML

This project uses machine learning techniques to learn about influences on mental health of international and domestic students at Ritsumeikan Asia Pacific University in Japan and aims to build a model to predict students at risk of mental health issues such as depression and suicidal ideation.

### 1. Imports and Configuration

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# ML imports
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_error,
    roc_auc_score,
    precision_recall_curve,
    auc,
)

from sklearn.inspection import permutation_importance
import xgboost as xgb
import shap

# Other
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Input file locations (relative to the notebook's working directory)
DATA_PATH = Path("../data/data.csv")
CODEBOOK_PATH = Path("../data/feature_target_explanations.xlsx")

# Fixed seed, also applied to NumPy's global random number generator
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Analysis switches
INCLUDE_FSCORE2 = False  # True -> also use variables whose feature score is 2
# FLAGGING FOR LATER: Consider removing
SHAP_SAMPLE_N = 300  # number of rows sampled for SHAP
SHAP_TOP_K = 5  # number of top features to explain with SHAP

NOTE: apply the following normalization to the column names in the data CSV, and also to the corresponding variable names in the first column of the codebook.

df.columns = (df.columns.str.strip().str.lower().str.replace(" ", "_"))

### 2. Quick file existence check and preview

In [None]:
# Report whether both input files are present before trying to load anything.
print("Data exists:", DATA_PATH.exists())
print("Codebook exists:", CODEBOOK_PATH.exists())

# Load the dataset and preview it. The frame is bound to `data`, so the
# shape/head preview must read from `data` as well (not an undefined `df`).
data = pd.read_csv(DATA_PATH)
print("Data shape:", data.shape)
display(data.head())

### 3. Load codebook and inspect - is this necessary?

In [None]:
# Read the two codebook sheets (numeric and categorical variable descriptions)
# from the same workbook, then show the first rows of each.
cb_numeric, cb_categorical = (
    pd.read_excel(CODEBOOK_PATH, sheet_name=sheet)
    for sheet in ("numeric_variables", "cat_variables")
)
for codebook_sheet in (cb_numeric, cb_categorical):
    display(codebook_sheet.head())

### 4. Clean column and variable names *
TODO: confirm that datatypes are correct and that 0s are handled correctly (see plan).

In [None]:
def clean_up(s: pd.Index | pd.Series) -> pd.Index | pd.Series:
    """Cleans by stripping whitespace, converting to lowercase,
    and replacing spaces with underscores."""
    return s.str.strip().str.lower().str.replace(" ", "_", regex=False)


# Normalize the column headers of the data and of both codebook sheets
for frame in (data, cb_numeric, cb_categorical):
    frame.columns = clean_up(frame.columns)

# Normalize the variable names listed in each codebook sheet in the same way,
# so they can be compared against the cleaned data column names
for codebook_sheet in (cb_numeric, cb_categorical):
    codebook_sheet["coded_name"] = clean_up(codebook_sheet["coded_name"])

### 5. Aggregate codebook variables for analysis
Based on numeric variables + categorical variables that don't have a numeric equivalent

In [None]:
# Codebook fields carried forward for every candidate variable
codebook_fields = ["coded_name", "feature_score", "target_score", "filter_score"]

# All numeric variables are candidates
potential_numeric_vars = cb_numeric[codebook_fields]

# Categorical variables are candidates only when the codebook marks them as
# having no numeric version ("N")
lacks_numeric_version = cb_categorical["numeric_version_exists"] == "N"
potential_cat_vars = cb_categorical.loc[lacks_numeric_version, codebook_fields]

# Stack both groups into one table with a fresh 0..n-1 index
potential_vars_df = pd.concat(
    [potential_numeric_vars, potential_cat_vars],
    ignore_index=True,
)

### Define targets (does this need its own cell?)

In [2]:
# Maps a readable target label to its column name in `data`.
# The column names are written in cleaned form (lowercase, no spaces) because
# the data columns are normalized with clean_up(); mixed-case names such as
# "ToDep" would never match a cleaned column.
TARGETS = {"depression": "todep", "ideation": "suicide", "acculturative_stress": "toas"}

### 5. Build feature list based on "Feature Score" attribute *

In [None]:
# Build the candidate feature set from the codebook's feature_score column:
# variables scored 3 are always included; variables scored 2 are added only
# when the INCLUDE_FSCORE2 toggle is set.
feature_scores = potential_vars_df["feature_score"]
fs3 = set(potential_vars_df.loc[feature_scores == 3, "coded_name"])
fs2 = set(potential_vars_df.loc[feature_scores == 2, "coded_name"])
# Always build a new set so fs3/fs2 themselves are never mutated below.
candidate_features = (fs3 | fs2) if INCLUDE_FSCORE2 else set(fs3)

# A target column must never be used as a predictor.
target_columns = set(TARGETS.values())
candidate_features -= target_columns

# Validate against the data with explicit errors rather than `assert`:
# asserts are removed under `python -O`, and naming the missing columns makes
# a codebook/data mismatch much quicker to diagnose.
missing_features = sorted(candidate_features.difference(data.columns), key=str)
if missing_features:
    raise ValueError(
        f"Some candidate features not in data columns: {missing_features}"
    )

missing_targets = sorted(target_columns.difference(data.columns), key=str)
if missing_targets:
    raise ValueError(
        f"Some target variables not in data columns: {missing_targets}"
    )