In [None]:
import os

base_path = "/content/drive/MyDrive/Sentinel-AI/data/raw"
os.listdir(base_path)


['survey.csv', 'student-por.csv', 'student-mat.csv']

In [None]:
# load the csv files
import pandas as pd

student_df = pd.read_csv(
    "/content/drive/MyDrive/Sentinel-AI/data/raw/student-mat.csv",
)

mental_df = pd.read_csv(
    "/content/drive/MyDrive/Sentinel-AI/data/raw/survey.csv"
)


In [None]:
student_df.head()


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [None]:
student_df.shape


(395, 33)

In [None]:
student_df.columns


Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [None]:
mental_df.head()


Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [None]:
mental_df.shape


(1259, 27)

In [None]:
mental_df.columns


Index(['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments'],
      dtype='object')

### **Data Understanding and Feature Mapping**

In [None]:
# selecting final columns
import pandas as pd

student_features = [
    'studytime', 'failures', 'absences',
    'health', 'Dalc', 'Walc',
    'G1', 'G2', 'G3'
]

student_clean = student_df[student_features].copy()


In [None]:
# missing value check
student_clean.isnull().sum()

Unnamed: 0,0
studytime,0
failures,0
absences,0
health,0
Dalc,0
Walc,0
G1,0
G2,0
G3,0


In [None]:
# Normalizing numeric features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
student_scaled = scaler.fit_transform(student_clean)

student_scaled_df = pd.DataFrame(
    student_scaled,
    columns=student_features
)

In [None]:
# save the output
student_scaled_df.to_csv(
    "/content/drive/MyDrive/Sentinel-AI/data/processed/student_behavior_clean.csv",
    index=False
)


In [None]:
mental_df.columns

Index(['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments'],
      dtype='object')

In [None]:
# text cleaning
text_col = 'comments'

import re
def clean_text(text):
  text = str(text).lower()
  text = re.sub(r"http\S+","", text)
  text = re.sub(r"[^a-z\s]","", text)
  return text.strip()
mental_df['clean_text'] = mental_df[text_col].apply(clean_text)


In [None]:
# removing empty or short text
mental_clean = mental_df[mental_df['clean_text'].str.len() > 15]

In [None]:
# saving the output
mental_clean[['clean_text']].to_csv(
    "/content/drive/MyDrive/Sentinel-AI/data/processed/mental_text_clean.csv",
    index=False
)

In [None]:
# loading clean data
import pandas as pd

student_data = pd.read_csv(
    "/content/drive/MyDrive/Sentinel-AI/data/processed/student_behavior_clean.csv"
)

student_data.head()


Unnamed: 0,studytime,failures,absences,health,Dalc,Walc,G1,G2,G3
0,-0.042286,-0.449944,0.036424,-0.399289,-0.540699,-1.003789,-1.782467,-1.254791,-0.964934
1,-0.042286,-0.449944,-0.213796,-0.399289,-0.540699,-1.003789,-1.782467,-1.520979,-0.964934
2,-0.042286,3.589323,0.536865,-0.399289,0.583385,0.5511,-1.179147,-0.722415,-0.090739
3,1.150779,-0.449944,-0.464016,1.04107,-0.540699,-1.003789,1.234133,0.874715,1.002004
4,-0.042286,-0.449944,-0.213796,1.04107,-0.540699,-0.226345,-1.480807,-0.190038,-0.090739


In [None]:
# simulating user ids
student_data['user_id'] = range(len(student_data))

In [None]:
# create time steps
import numpy as np
time_steps = []
for _, row in student_data.iterrows():
  time_steps.append([row['G1'],row['G2'], row['G3']])

student_data['grade_sequence'] = time_steps

In [None]:
# statistical baseline profile
baseline_profiles = student_data.groupby('user_id').agg({
    'studytime': ['mean', 'std'],
    'absences': ['mean', 'std'],
    'health': ['mean', 'std'],
    'Dalc': ['mean', 'std'],
    'Walc': ['mean', 'std']
})

baseline_profiles.head()


Unnamed: 0_level_0,studytime,studytime,absences,absences,health,health,Dalc,Dalc,Walc,Walc
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0,-0.042286,,0.036424,,-0.399289,,-0.540699,,-1.003789,
1,-0.042286,,-0.213796,,-0.399289,,-0.540699,,-1.003789,
2,-0.042286,,0.536865,,-0.399289,,0.583385,,0.5511,
3,1.150779,,-0.464016,,1.04107,,-0.540699,,-1.003789,
4,-0.042286,,-0.213796,,1.04107,,-0.540699,,-0.226345,


In [None]:
# save
baseline_profiles.to_csv(
    "/content/drive/MyDrive/Sentinel-AI/data/processed/user_baselines.csv"
)


In [None]:
# Feature selection for Anamoly detection
from sklearn.preprocessing import StandardScaler

features = [
    'studytime',
    'absences',
    'health',
    'Dalc',
    'Walc'
]

X = student_data[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
# train isolation forest
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(
    n_estimators=150,
    contamination=0.1,   # 10% expected abnormal
    random_state=42
)

iso_forest.fit(X_scaled)


In [None]:
# generate anamoly scores
student_data['anomaly_score'] = iso_forest.decision_function(X_scaled)
student_data['anomaly_label'] = iso_forest.predict(X_scaled)


In [None]:
def risk_level(score):
    if score < -0.15:
        return "HIGH"
    elif score < 0:
        return "MEDIUM"
    else:
        return "LOW"

student_data['risk_level'] = student_data['anomaly_score'].apply(risk_level)


In [None]:
student_data[['user_id', 'anomaly_score', 'risk_level']].head(10)


Unnamed: 0,user_id,anomaly_score,risk_level
0,0,0.159248,LOW
1,1,0.162784,LOW
2,2,0.107528,LOW
3,3,0.123344,LOW
4,4,0.153469,LOW
5,5,0.131349,LOW
6,6,0.164363,LOW
7,7,0.09811,LOW
8,8,0.124332,LOW
9,9,0.160051,LOW


In [None]:
# save
student_data.to_csv(
    "/content/drive/MyDrive/Sentinel-AI/data/processed/student_anomaly_scores.csv",
    index=False
)


In [None]:
# define behavioral profile
normal_students = student_data[student_data['anomaly_label'] == 1]

normal_means = normal_students[features].mean()


In [None]:
# calculate deviation from normal
import numpy as np

def feature_deviation(row):
    deviations = {}
    for feature in features:
        deviations[feature] = abs(row[feature] - normal_means[feature])
    return deviations

student_data['deviation'] = student_data.apply(feature_deviation, axis=1)


In [None]:
# identify top risk factors
def top_risk_factors(dev_dict, top_n=2):
    sorted_features = sorted(
        dev_dict.items(),
        key=lambda x: x[1],
        reverse=True
    )
    return [f[0] for f in sorted_features[:top_n]]

student_data['top_risk_factors'] = student_data['deviation'].apply(top_risk_factors)


In [None]:
def generate_explanation(row):
    factors = row['top_risk_factors']
    level = row['risk_level']

    if level == "HIGH":
        return f"High risk due to unusually high {factors[0]} and {factors[1]} compared to peers."
    elif level == "MEDIUM":
        return f"Moderate risk due to deviation in {factors[0]}."
    else:
        return "Behavior within normal range."

student_data['explanation'] = student_data.apply(generate_explanation, axis=1)


In [None]:
# inspect
student_data[['user_id', 'risk_level', 'explanation']].head(10)


Unnamed: 0,user_id,risk_level,explanation
0,0,LOW,Behavior within normal range.
1,1,LOW,Behavior within normal range.
2,2,LOW,Behavior within normal range.
3,3,LOW,Behavior within normal range.
4,4,LOW,Behavior within normal range.
5,5,LOW,Behavior within normal range.
6,6,LOW,Behavior within normal range.
7,7,LOW,Behavior within normal range.
8,8,LOW,Behavior within normal range.
9,9,LOW,Behavior within normal range.


In [None]:
student_data.to_csv(
    "/content/drive/MyDrive/Sentinel-AI/data/processed/student_explainable_risk.csv",
    index=False
)


In [56]:
feature_map = {
    'studytime': 'study duration',
    'absences': 'school absenteeism',
    'health': 'self-reported health status',
    'Dalc': 'weekday alcohol consumption',
    'Walc': 'weekend alcohol consumption'
}


In [63]:
# report genrator function
def generate_clinical_report(row):
    risk = row['risk_level']
    f1 = feature_map[row['top_risk_factors'][0]]
    f2 = feature_map[row['top_risk_factors'][1]] if len(row['top_risk_factors']) > 1 else None

    report = f"""
CONFIDENTIAL STUDENT WELL-BEING ASSESSMENT REPORT

1. Case Overview
This report presents an automated behavioral risk screening based on academic and lifestyle indicators.
The purpose is early identification of students who may benefit from additional support.

Overall Risk Classification: {risk}

2. Observed Behavioral Patterns
Analysis indicates noticeable deviation in {f1}"""

    if f2:
        report += f" and {f2}"

    report += """. These patterns differ significantly from peer norms and warrant attention.

3. Risk Interpretation
"""

    if risk == "HIGH":
        report += (
            "The student demonstrates substantial behavioral irregularities. "
            "Such deviations may reflect heightened stress levels, reduced academic engagement, "
            "or challenges in maintaining healthy routines."
        )
    elif risk == "MEDIUM":
        report += (
            "The student exhibits moderate deviations that may represent emerging concerns. "
            "While not immediately alarming, these trends could intensify if left unaddressed."
        )
    else:
        report += (
            "The student’s behavioral indicators remain within expected ranges. "
            "No immediate psychological or academic risk is inferred at this time."
        )

    report += """

4. Contributing and Protective Factors
- Identified contributing factors are derived solely from behavioral data.
- Protective factors such as peer support, family environment, and personal resilience are not directly measured and may moderate risk.

5. Recommendations
"""

    if risk == "HIGH":
        report += (
            "- Initiate a confidential one-on-one counseling session.\n"
            "- Collaborate with academic mentors to review workload and attendance.\n"
            "- Encourage healthy routines and supportive peer interactions.\n"
            "- Monitor behavioral indicators regularly."
        )
    elif risk == "MEDIUM":
        report += (
            "- Provide periodic check-ins with a counselor or advisor.\n"
            "- Encourage time-management and stress-reduction strategies.\n"
            "- Reassess indicators in subsequent academic terms."
        )
    else:
        report += (
            "- Continue standard academic guidance.\n"
            "- Reinforce positive habits and consistent engagement."
        )

    report += """

6. Ethical Disclaimer
This report is generated using an AI-based screening system and is intended to support,
not replace, professional judgment. It does not constitute a medical or psychological diagnosis.


"""
    return report



In [64]:
student_data['clinical_report'] = student_data.apply(generate_clinical_report, axis=1)


In [65]:
# sample
print(student_data['clinical_report'].iloc[0])



CONFIDENTIAL STUDENT WELL-BEING ASSESSMENT REPORT

1. Case Overview
This report presents an automated behavioral risk screening based on academic and lifestyle indicators. 
The purpose is early identification of students who may benefit from additional support.

Overall Risk Classification: LOW

2. Observed Behavioral Patterns
Analysis indicates noticeable deviation in weekend alcohol consumption and self-reported health status. These patterns differ significantly from peer norms and warrant attention.

3. Risk Interpretation
The student’s behavioral indicators remain within expected ranges. No immediate psychological or academic risk is inferred at this time.

4. Contributing and Protective Factors
- Identified contributing factors are derived solely from behavioral data.
- Protective factors such as peer support, family environment, and personal resilience are not directly measured and may moderate risk.

5. Recommendations
- Continue standard academic guidance.
- Reinforce positive

In [67]:
# save
student_data.to_csv(
    "/content/drive/MyDrive/Sentinel-AI/data/final/student_clinical_reports.csv",
    index=False
)

