In [35]:
import pandas as pd

# Read the CSV from the working directory into a DataFrame
df = pd.read_csv("Sleep_health_and_lifestyle_dataset.csv")

# Report, per column, how many entries are missing
missing_per_column = df.isna().sum()
print(missing_per_column)

# Define the mapping function for three activity levels
def label_activity(row):
    """Bucket a row's 'Physical Activity Level' into 'Low', 'Good' or 'High'.

    Args:
        row: Any object that supports ``row['Physical Activity Level']``
            (for example a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns:
        'Low' when the value is <= 30, 'Good' when it lies in (30, 60],
        and 'High' for everything else.
    """
    level = row['Physical Activity Level']

    if level <= 30:
        return 'Low'
    if 30 < level <= 60:
        return 'Good'
    # Anything that failed both range checks falls through to the top bucket
    return 'High'

# Derive the text label for every row from its numeric activity score
df['Activity Level'] = df.apply(label_activity, axis=1)

# Spot-check the numeric score next to the label it was given
preview_columns = ['Physical Activity Level', 'Activity Level']
print(df[preview_columns].head())

Person ID                    0
Gender                       0
Age                          0
Occupation                   0
Sleep Duration               0
Quality of Sleep             0
Physical Activity Level      0
Stress Level                 0
BMI Category                 0
Blood Pressure               0
Heart Rate                   0
Daily Steps                  0
Sleep Disorder             219
dtype: int64
   Physical Activity Level Activity Level
0                       42           Good
1                       60           Good
2                       60           Good
3                       30            Low
4                       30            Low


In [36]:
# Remove exact duplicate rows. drop_duplicates() returns a new DataFrame and
# leaves the caller untouched, so the result has to be assigned back; the
# bare call previously had no effect.
# NOTE(review): 'Person ID' is still present at this point, so rows can only be
# duplicates if an ID repeats — confirm whether de-duplication was meant to run
# after the ID column is dropped.
df = df.drop_duplicates()

# Columns that the later cells in this notebook do not read
columns_to_drop = ["Occupation", "Sleep Duration", "Quality of Sleep", "BMI Category", "Daily Steps", "Sleep Disorder", "Person ID", "Gender"]

# Drop the columns from the DataFrame (raises KeyError if the cell is re-run,
# because the columns are already gone by then)
df = df.drop(columns=columns_to_drop)

# Display the first few rows to confirm changes
df.head()

Unnamed: 0,Age,Physical Activity Level,Stress Level,Blood Pressure,Heart Rate,Activity Level
0,27,42,6,126/83,77,Good
1,28,60,8,125/80,75,Good
2,28,60,8,125/80,75,Good
3,28,30,8,140/90,85,Low
4,28,30,8,140/90,85,Low


In [37]:
# Break each "systolic/diastolic" string into its two parts (one column per part)
bp_parts = df['Blood Pressure'].str.split('/', expand=True)

# Store both parts as numeric columns on the frame
df[['Systolic BP', 'Diastolic BP']] = bp_parts.apply(pd.to_numeric)

# The combined text column is no longer needed
df = df.drop(columns=['Blood Pressure'])

print("Blood Pressure column fixed. Here's a preview:")
print(df.head())

Blood Pressure column fixed. Here's a preview:
   Age  Physical Activity Level  Stress Level  Heart Rate Activity Level  \
0   27                       42             6          77           Good   
1   28                       60             8          75           Good   
2   28                       60             8          75           Good   
3   28                       30             8          85            Low   
4   28                       30             8          85            Low   

   Systolic BP  Diastolic BP  
0          126            83  
1          125            80  
2          125            80  
3          140            90  
4          140            90  


In [38]:
from sklearn.preprocessing import MinMaxScaler

# Columns that go through min-max scaling, plus the target column
features = ["Stress Level", "Systolic BP", "Diastolic BP", "Heart Rate", "Age"]
X, y = df[features], df["Activity Level"]

# Fit the scaler on X and transform it in the same call
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Wrap the scaled array in a DataFrame so the columns keep their names
X_scaled_df = pd.DataFrame(data=X_scaled, columns=features)

print("Normalized/Scaled features:")
print(X_scaled_df.head())

Normalized/Scaled features:
   Stress Level  Systolic BP  Diastolic BP  Heart Rate      Age
0           0.6     0.407407          0.40    0.571429  0.00000
1           1.0     0.370370          0.25    0.476190  0.03125
2           1.0     0.370370          0.25    0.476190  0.03125
3           1.0     0.925926          0.75    0.952381  0.03125
4           1.0     0.925926          0.75    0.952381  0.03125


In [39]:
# Preview the working frame; the scaling cell above wrote to new variables and did not assign to df
df.head()

Unnamed: 0,Age,Physical Activity Level,Stress Level,Heart Rate,Activity Level,Systolic BP,Diastolic BP
0,27,42,6,77,Good,126,83
1,28,60,8,75,Good,125,80
2,28,60,8,75,Good,125,80
3,28,30,8,85,Low,140,90
4,28,30,8,85,Low,140,90


In [40]:
# Integer codes for the activity labels: Low -> 0, Good -> 1, High -> 2
activity_map = {label: code for code, label in enumerate(('Low', 'Good', 'High'))}

# Swap each text label in 'Activity Level' for its integer code.
# Series.map yields NaN for values that are not keys of the dict, so running
# this cell a second time (on already-encoded values) would blank the column.
df['Activity Level'] = df['Activity Level'].map(activity_map)

# Show the raw score beside the encoded label to confirm the mapping
print(df[['Physical Activity Level', 'Activity Level']].head())

   Physical Activity Level  Activity Level
0                       42               1
1                       60               1
2                       60               1
3                       30               0
4                       30               0


In [41]:
# Check the distribution of activity levels: number of rows per encoded class
# (0 = Low, 1 = Good, 2 = High, per activity_map)
df['Activity Level'].value_counts()

Activity Level
1    163
2    143
0     68
Name: count, dtype: int64

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Model inputs (stress level plus both blood-pressure readings) and the
# integer-encoded target (0, 1, 2)
features = df[["Stress Level", "Systolic BP", "Diastolic BP"]]
labels = df["Activity Level"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified although the class counts are
# uneven (class 0 is the smallest) — confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit the standardiser on the training rows only, then reuse that same fitted
# transform for the held-out rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler (the notebook's stated purpose: later use in Flask)
joblib.dump(scaler, 'scaler_3.pkl')

# Per-class weights: class 0 counts double, classes 1 and 2 keep weight 1
class_weights = {0: 2, 1: 1, 2: 1}  # Adjust weights as needed

# Build a 100-tree random forest with those class weights and fit it
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight=class_weights,
)
rf_model.fit(X_train_scaled, y_train)

# Score the held-out rows
y_pred = rf_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

# Persist the trained model (the notebook's stated purpose: later use in Flask).
# As the cell's last expression, joblib's return value is also displayed.
joblib.dump(rf_model, 'random_forest_model_2.pkl')

Model Accuracy: 0.8933333333333333


['random_forest_model_2.pkl']

In [48]:
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# import joblib

# # Map the short labels ('Low', 'Good', 'High') to their long-form names
# activity_map = {
#     'Low': 'Low Activity Level',
#     'Good': 'Good Activity Level',
#     'High': 'High Activity Level'
# }

# df['Activity Level'] = df['Activity Level'].map(activity_map)

# # Select the relevant features: Stress Level and Blood Pressure (Systolic and Diastolic)
# features = df[['Stress Level', 'Systolic BP', 'Diastolic BP']]  # Use only Stress Level and BP
# labels = df['Activity Level']  # Labels are now the long-form strings (not yet numeric)

# # Convert labels to numeric: 'Low Activity Level' = 0, 'Good Activity Level' = 1, 'High Activity Level' = 2
# label_map = {'Low Activity Level': 0, 'Good Activity Level': 1, 'High Activity Level': 2}
# df['Activity Level'] = df['Activity Level'].map(label_map)
# labels = df['Activity Level']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# # Scale the features using StandardScaler
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Save the scaler for later use in Flask
# joblib.dump(scaler, 'scaler.pkl')

# # Train the Random Forest model
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_model.fit(X_train_scaled, y_train)

# # Evaluate the model
# y_pred = rf_model.predict(X_test_scaled)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Model Accuracy: {accuracy}")

# # Classification report for evaluation
# from sklearn.metrics import classification_report
# print("Classification Report:\n", classification_report(y_test, y_pred))

# # Save the model for later use
# joblib.dump(rf_model, 'activity_level_model.pkl')

In [49]:
import pandas as pd

# Hand-written sample for a quick sanity check of the trained model.
# Inputs must be on the training scale: 'Stress Level' in this dataset is a
# small integer rating (the preview rows hold 6 and 8, and the min-max output
# earlier in the notebook implies an observed range of 3-8), so the former
# value of 45 lay far outside anything the model was fitted on.
test_data = pd.DataFrame({
    'Stress Level': [5],         # Moderate stress on the dataset's scale
    'Systolic BP': [130],        # Systolic reading
    'Diastolic BP': [85]         # Diastolic reading
})

# Scale the sample with the already-fitted scaler; the column names and order
# match the three feature columns used for training
test_data_scaled = scaler.transform(test_data)

# Predict the encoded activity class for the sample
predicted_activity_level = rf_model.predict(test_data_scaled)

# Translate the encoded class (0/1/2) back into a readable label
activity_level_map = {0: 'Low Activity Level', 1: 'Good Activity Level', 2: 'High Activity Level'}
predicted_label = activity_level_map[predicted_activity_level[0]]

print(f"Predicted Activity Level: {predicted_label}")

Predicted Activity Level: Good Activity Level


In [50]:
import pandas as pd

# Hand-written low-stress sample, intended to probe the "high activity" end.
# Inputs must be on the training scale: 'Stress Level' in this dataset is a
# small integer rating (the preview rows hold 6 and 8, and the min-max output
# earlier in the notebook implies an observed range of 3-8). The former value
# of 20 was far above that range, i.e. it described extreme stress rather than
# the low stress the sample was meant to represent.
test_data_high = pd.DataFrame({
    'Stress Level': [3],         # Lowest stress value observed in the data
    'Systolic BP': [120],        # Systolic reading
    'Diastolic BP': [75]         # Diastolic reading
})

# Scale the sample with the already-fitted scaler; the column names and order
# match the three feature columns used for training
test_data_scaled = scaler.transform(test_data_high)

# Predict the encoded activity class for the sample
predicted_activity_level = rf_model.predict(test_data_scaled)

# Translate the encoded class (0/1/2) back into a readable label
activity_level_map = {0: 'Low Activity Level', 1: 'Good Activity Level', 2: 'High Activity Level'}
predicted_label = activity_level_map[predicted_activity_level[0]]

print(f"Predicted Activity Level: {predicted_label}")

Predicted Activity Level: Low Activity Level


In [51]:
# Print the number of rows per value of the 'Activity Level' column
print(df['Activity Level'].value_counts())

Activity Level
1    163
2    143
0     68
Name: count, dtype: int64
