<a href="https://colab.research.google.com/github/drihn/Personal/blob/main/Untitled25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
dataset_url = 'https://raw.githubusercontent.com/drihn/Personal/refs/heads/main/Sleep_health_and_lifestyle_dataset.csv'
df = pd.read_csv(dataset_url)

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Handle missing values
for column in df.columns:
    if df[column].dtype == 'object':
        df[column].fillna(df[column].mode()[0], inplace=True)  # Fill categorical NaNs with the mode
    else:
        df[column].fillna(df[column].median(), inplace=True)  # Fill numerical NaNs with the median

# After handling missing values, check again
missing_values_after = df.isnull().sum()
print("Missing values after handling:\n", missing_values_after)

# Identify features and target variable
X = df.drop('Sleep Disorder', axis=1)  # Use 'Sleep Disorder' as the target column name
y = df['Sleep Disorder']  # Categorical target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preprocessing pipeline
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Define a function to evaluate a model with error handling
def evaluate_model(model):
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')  # Changed to weighted
        recall = recall_score(y_test, y_pred, average='weighted')  # Changed to weighted
        f1 = f1_score(y_test, y_pred, average='weighted')  # Changed to weighted
        return accuracy, precision, recall, f1
    except Exception as e:
        print(f"Error occurred while evaluating model: {e}")
        return None

# Models to evaluate
models = {
    'Logistic Regression': Pipeline(steps=[('preprocessor', preprocessor),
                                           ('classifier', LogisticRegression(max_iter=1000))]),
    'K-Nearest Neighbors': Pipeline(steps=[('preprocessor', preprocessor),
                                           ('classifier', KNeighborsClassifier())]),
    'Decision Tree': Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', DecisionTreeClassifier())]),
    'Random Forest': Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', RandomForestClassifier())]),
    'Support Vector Machine': Pipeline(steps=[('preprocessor', preprocessor),
                                              ('classifier', SVC())])
}

# Evaluate each model and print the results
results = {}
for name, model in models.items():
    result = evaluate_model(model)
    if result:  # Only store results if evaluation was successful
        results[name] = result

# Display the results
for model_name, metrics in results.items():
    print(f"{model_name}: Accuracy: {metrics[0]:.2f}, Precision: {metrics[1]:.2f}, "
          f"Recall: {metrics[2]:.2f}, F1 Score: {metrics[3]:.2f}")

# Select the best performing model based on accuracy
if results:  # Check if there are results to compare
    best_model_name = max(results, key=lambda k: results[k][0])
    best_model_score = results[best_model_name]
    print(f"The best model is {best_model_name} with an accuracy of {best_model_score[0]:.2f}")
else:
    print("No models were evaluated successfully.")


Missing values in each column:
 Person ID                    0
Gender                       0
Age                          0
Occupation                   0
Sleep Duration               0
Quality of Sleep             0
Physical Activity Level      0
Stress Level                 0
BMI Category                 0
Blood Pressure               0
Heart Rate                   0
Daily Steps                  0
Sleep Disorder             219
dtype: int64
Missing values after handling:
 Person ID                  0
Gender                     0
Age                        0
Occupation                 0
Sleep Duration             0
Quality of Sleep           0
Physical Activity Level    0
Stress Level               0
BMI Category               0
Blood Pressure             0
Heart Rate                 0
Daily Steps                0
Sleep Disorder             0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)  # Fill numerical NaNs with the median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)  # Fill categorical NaNs with the mode


Logistic Regression: Accuracy: 0.95, Precision: 0.95, Recall: 0.95, F1 Score: 0.95
K-Nearest Neighbors: Accuracy: 0.93, Precision: 0.93, Recall: 0.93, F1 Score: 0.93
Decision Tree: Accuracy: 0.93, Precision: 0.93, Recall: 0.93, F1 Score: 0.93
Random Forest: Accuracy: 0.95, Precision: 0.95, Recall: 0.95, F1 Score: 0.95
Support Vector Machine: Accuracy: 0.94, Precision: 0.94, Recall: 0.94, F1 Score: 0.94
The best model is Logistic Regression with an accuracy of 0.95
