In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load Preprocessing Objects and Model
# Loads the five artifacts the rest of the script relies on: the stacking
# classifier, the scaler, the PCA transformer, the per-column label encoders,
# and the target label encoder. All are read from the working directory.
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts
# that come from a trusted source.
try:
    stacking_clf = joblib.load('stacking_classifier_model.pkl')
    scaler = joblib.load('scaler.pkl')
    pca = joblib.load('pca.pkl')
    label_encoders = joblib.load('label_encoders.pkl')
    le_target = joblib.load('le_target.pkl')
    print("Loaded model and preprocessing objects.")
except FileNotFoundError as e:
    # The file names listed here must match the paths loaded above, otherwise
    # the hint sends the user looking for a file that is never read. The
    # original exception is chained so the traceback keeps the failing path.
    raise FileNotFoundError(f"Missing file: {e}. Ensure 'stacking_classifier_model.pkl', 'scaler.pkl', 'pca.pkl', 'label_encoders.pkl', and 'le_target.pkl' are in the working directory.") from e

# Optional historical dataset used as context for the rolling statistics.
# When the CSV is absent only a warning is printed and `df` is left
# undefined; the rolling-statistics step checks for that case.
try:
    df = pd.read_csv('final_synthetic_stress_dataset.csv')
except FileNotFoundError:
    print("Warning: Dataset not found. Rolling statistics will be estimated for new users.")
else:
    # Parse timestamps once so they can be concatenated with the new row later.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    print("Loaded dataset for rolling statistics.")

# Step 2: Define Features and Columns
# `features` is the ordered list of columns selected from the engineered row
# before encoding and scaling.
features = [
    # Columns taken directly from the input row
    'HR', 'HRV', 'SpO2', 'Steps', 'Distance', 'Calories', 'ActiveTime',
    'SleepDuration', 'SleepEfficiency',
    'Age', 'Sex', 'DrinkingHabits', 'SmokingHabits', 'PastMedicalHistory',
    'Depression', 'Context',
    # Columns derived from the timestamp
    'Hour', 'DayOfWeek', 'IsWeekend', 'TimeOfDay',
    # Rolling statistics of HR / HRV / SpO2
    'HR_RollingMean', 'HR_RollingStd', 'HRV_RollingMean', 'HRV_RollingStd',
    'SpO2_RollingMean', 'SpO2_RollingStd',
    # Interaction and binned activity columns
    'HR_Steps_Interaction', 'HRV_SleepDuration_Interaction',
    'ActivityIntensity',
]

# Columns that go through the label encoders instead of the scaler.
categorical_cols = [
    'Sex', 'DrinkingHabits', 'SmokingHabits', 'PastMedicalHistory',
    'Depression', 'Context', 'TimeOfDay', 'ActivityIntensity',
]

# Everything else is numerical; the order of `features` is preserved.
numerical_cols = list(filter(lambda name: name not in categorical_cols, features))

# Step 3: Prepare a Single Row of New Data
def validate_required_columns(frame: pd.DataFrame, required: list) -> None:
    """Check that *frame* contains every column named in *required*.

    Args:
        frame: The DataFrame to inspect.
        required: Column names that must be present in ``frame.columns``.

    Raises:
        ValueError: If any required column is absent. The message lists the
            missing columns first, then the full required set.
    """
    missing = [col for col in required if col not in frame.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}. Required: {list(required)}")


# Example new data (replace with your actual row)
new_data = {
    'Timestamp': '2025-05-08 09:00:00',
    'UserID': 'U999',
    'HR': 85.0,
    'HRV': 40.0,
    'SpO2': 97.0,
    'Steps': 50,
    'Distance': 0.04,
    'Calories': 3.5,
    'ActiveTime': 1.0,
    'SleepDuration': 7.5,
    'SleepEfficiency': 90.0,
    'Height': 170.0,
    'Weight': 70.0,
    'Age': 35,
    'Sex': 'Male',
    'DrinkingHabits': 'Occasional',
    'SmokingHabits': 'Non-smoker',
    'PastMedicalHistory': 'Other',
    'Depression': 'No',
    'Context': 'Work'
}

# Convert to DataFrame (a single-row frame; each dict key becomes a column)
new_df = pd.DataFrame([new_data])

# Validate input: fail early, naming the missing columns, before any feature
# engineering touches the row.
required_cols = ['Timestamp', 'UserID', 'HR', 'HRV', 'SpO2', 'Steps', 'Distance', 'Calories', 'ActiveTime',
                'SleepDuration', 'SleepEfficiency', 'Height', 'Weight', 'Age', 'Sex', 'DrinkingHabits',
                'SmokingHabits', 'PastMedicalHistory', 'Depression', 'Context']
validate_required_columns(new_df, required_cols)

# Step 4: Preprocess the Single Row
# Feature Engineering: derive calendar features from the row's timestamp.
new_df['Timestamp'] = pd.to_datetime(new_df['Timestamp'])
timestamp_parts = new_df['Timestamp'].dt
new_df['Hour'] = timestamp_parts.hour
new_df['DayOfWeek'] = timestamp_parts.dayofweek

# dayofweek numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
weekend_days = [5, 6]
new_df['IsWeekend'] = new_df['DayOfWeek'].isin(weekend_days).astype(int)

# Bins are right-closed; include_lowest makes hour 0 fall into 'Night'.
hour_bins = [0, 6, 12, 18, 24]
hour_labels = ['Night', 'Morning', 'Afternoon', 'Evening']
new_df['TimeOfDay'] = pd.cut(new_df['Hour'], bins=hour_bins, labels=hour_labels, include_lowest=True)

# Rolling Statistics
# Three sources, tried in order:
#   1. the user's own history (plus the new row) when the user is in the dataset,
#   2. dataset-wide mean/std when the dataset is loaded but the user is unknown,
#   3. hard-coded defaults when no dataset is available at all.
window = 6
user_id = new_df['UserID'].iloc[0]
vital_cols = ['HR', 'HRV', 'SpO2']

# `df` exists only if the optional dataset load succeeded earlier.
reference_df = globals().get('df')

if reference_df is not None and user_id in reference_df['UserID'].values:
    user_history = reference_df[reference_df['UserID'] == user_id][['Timestamp', 'HR', 'HRV', 'SpO2']].copy()
    # Append the new row so the last rolling window ends on it.
    user_history = pd.concat([user_history, new_df[['Timestamp', 'HR', 'HRV', 'SpO2']]], ignore_index=True)
    for col in vital_cols:
        rolled = user_history[col].rolling(window=window, min_periods=1)
        new_df[f'{col}_RollingMean'] = rolled.mean().iloc[-1]
        new_df[f'{col}_RollingStd'] = rolled.std().iloc[-1]
elif reference_df is not None:
    # Fallback: Estimate rolling statistics from the whole dataset
    for col in vital_cols:
        new_df[f'{col}_RollingMean'] = reference_df[col].mean()
        new_df[f'{col}_RollingStd'] = reference_df[col].std()
else:
    # If dataset is unavailable, use reasonable defaults as (mean, std) pairs
    defaults = {'HR': (70.0, 15.0), 'HRV': (50.0, 15.0), 'SpO2': (97.0, 1.5)}
    for col in vital_cols:
        default_mean, default_std = defaults[col]
        new_df[f'{col}_RollingMean'] = default_mean
        new_df[f'{col}_RollingStd'] = default_std
    print("Warning: Dataset unavailable. Using default values for rolling statistics.")

# Interaction Features: element-wise products of paired columns.
new_df['HR_Steps_Interaction'] = new_df['HR'].mul(new_df['Steps'])
new_df['HRV_SleepDuration_Interaction'] = new_df['HRV'].mul(new_df['SleepDuration'])

# Bin Steps into three intensity levels. Values outside [-1, 600] match no
# bin, so pd.cut yields NaN for them.
step_bins = [-1, 100, 400, 600]
intensity_labels = ['Low', 'Moderate', 'High']
new_df['ActivityIntensity'] = pd.cut(new_df['Steps'], bins=step_bins, labels=intensity_labels, include_lowest=True)

# Encode categorical variables
# Work on an explicit copy: the columns are overwritten below, and writing
# into a selection of `new_df` is a chained-assignment hazard that the global
# warnings filter would otherwise hide. `new_df` keeps its original labels.
X_new = new_df[features].copy()
for col in categorical_cols:
    try:
        X_new[col] = label_encoders[col].transform(X_new[col])
    except ValueError as err:
        # transform raises ValueError for labels the encoder has not seen;
        # re-raise with the accepted classes and keep the original cause.
        raise ValueError(f"Invalid value for {col}. Must be one of {label_encoders[col].classes_}") from err

# Scale numerical features in place with the loaded scaler.
scaled_values = scaler.transform(X_new[numerical_cols])
X_new[numerical_cols] = scaled_values

# Apply PCA to the scaled numerical columns; components are named PC1..PCk.
X_new_pca = pca.transform(X_new[numerical_cols])
component_names = [f'PC{idx}' for idx in range(1, X_new_pca.shape[1] + 1)]
X_new_pca_df = pd.DataFrame(X_new_pca, columns=component_names)

# Combine PCA components with categorical features. The index is reset so
# both parts align row-for-row during the column-wise concat.
categorical_part = X_new[categorical_cols].reset_index(drop=True)
X_new_final = pd.concat([X_new_pca_df, categorical_part], axis=1)

# Step 5: Predict with Stacking Classifier
y_pred = stacking_clf.predict(X_new_final)
y_proba = stacking_clf.predict_proba(X_new_final)

# Decode prediction
predicted_class = le_target.inverse_transform(y_pred)[0]

# The columns of predict_proba follow the order of stacking_clf.classes_, so
# label each probability through classes_ rather than by column position.
# Positional labelling is only correct when the encoded classes happen to be
# exactly 0..n-1.
class_names = le_target.inverse_transform(stacking_clf.classes_)
proba_dict = dict(zip(class_names, y_proba[0]))

print("\n=== Prediction for Single Row ===")
print(f"Predicted StressLevel: {predicted_class}")
print("Class Probabilities:")
for class_name, prob in proba_dict.items():
    print(f"{class_name}: {prob:.4f}")

# Step 6: Save Results
# Attach the decoded prediction and one probability column per stress level
# to the engineered row, then write that row to CSV.
new_df['Predicted_StressLevel'] = predicted_class
for level in ('Low', 'Medium', 'High'):
    new_df[f'Prob_{level}'] = proba_dict[level]

new_df.to_csv('single_row_prediction.csv', index=False)
print("\nPrediction saved to 'single_row_prediction.csv'.")

print("\nTesting on Single Row Completed.")

Loaded model and preprocessing objects.

=== Prediction for Single Row ===
Predicted StressLevel: Medium
Class Probabilities:
High: 0.3484
Low: 0.1505
Medium: 0.5011

Prediction saved to 'single_row_prediction.csv'.

Testing on Single Row Completed.
