# Foreign Student Mental Health ML

This project uses machine learning to investigate the factors that influence the mental health of international and domestic students at Ritsumeikan Asia Pacific University in Japan. It aims to build a model that identifies students at risk of mental health issues such as depression and suicidal ideation.

### 1. Imports and Configuration

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# ML imports
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_error,
    roc_auc_score,
    precision_recall_curve,
    auc,
)
from sklearn.inspection import permutation_importance
import xgboost as xgb
import shap

# Other
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Single seed for the notebook. It is applied to NumPy's global RNG here;
# presumably it is also passed as `random_state` to estimators / CV splitters
# in later cells — confirm against their usage.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# File paths (relative, so they resolve against the working directory the
# notebook kernel is running in)
DATA_PATH = Path("../data/data.csv")
CODEBOOK_PATH = Path("../data/feature_target_explanations.xlsx")

# Toggles
INCLUDE_FSCORE2 = False  # set True to include feature-score == 2 variables
# TODO(review): flagged for later — consider removing. It is not clear from
# this cell alone whether the flag targets the toggle above or the SHAP
# settings below.
SHAP_SAMPLE_N = 300  # sample size for SHAP
SHAP_TOP_K = 5  # how many top features to explain with SHAP

**NOTE:** Add the following step to normalize the column names in the data CSV, and apply the same normalization to the corresponding names in the first column of the codebook.

```python
df.columns = (df.columns.str.strip().str.lower().str.replace(" ", "_"))
```

### 2. Quick file existence check and preview

In [None]:
# Report whether each input file is present. These checks are informational
# only: the read below runs regardless and will raise if the CSV is missing.
print("Data exists:", DATA_PATH.exists())
print("Codebook exists:", CODEBOOK_PATH.exists())

# Load the raw data with pandas' default CSV settings, then show its
# dimensions and first rows. `display` is not imported in this file; it is
# expected to be supplied by the IPython/Jupyter session.
df = pd.read_csv(DATA_PATH)
print("Data shape:", df.shape)
display(df.head())

### 3. Load cleaned codebook and inspect expected columns *

In [None]:
# Load the two codebook sheets and inspect the columns they provide.
# Expect columns: 'coded_name', 'feature_score', 'target_score', 'category_filter' (optional)
cb_numeric = pd.read_excel(CODEBOOK_PATH, sheet_name="numeric")
cb_categorical = pd.read_excel(CODEBOOK_PATH, sheet_name="categorical")

# The original cell referenced `cb` without ever assigning it, which raised
# NameError. Stack the two sheets into a single frame so that `cb` and
# `cb_columns` are defined for this cell and for any later cell that uses them.
# NOTE(review): assumes both sheets share the column layout described above and
# that a combined frame is what `cb` was meant to be — confirm.
cb = pd.concat([cb_numeric, cb_categorical], ignore_index=True)
cb_columns = list(cb.columns)
print("Codebook columns:", cb_columns)

# Preview each sheet separately: the head of the combined frame would only
# show rows from the numeric sheet.
display(cb_numeric.head())
display(cb_categorical.head())