In [1]:
import pandas as pd

# Correctly format the file path
file_path = "C:\\Users\\user\\Desktop\\Healthcare\\healthcare dataset.csv"

# Load the dataset
data = pd.read_csv(file_path)
df = pd.DataFrame(data)

In [2]:
# Convert date columns to datetime format, ensuring errors are handled as NaT for invalid entries
df['Admission Date'] = pd.to_datetime(df['Admission Date'], format='%d-%m-%Y', errors='coerce')
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'], format='%d-%m-%Y', errors='coerce')
df['Scheduled Date'] = pd.to_datetime(df['Scheduled Date'], format='%d-%m-%Y', errors='coerce')

# Check if there are any invalid or missing values in 'Scheduled Date'
if df['Scheduled Date'].isnull().sum() > 0:
    print(f"Warning: Some 'Scheduled Date' entries are invalid and have been set to NaT.")
    print(f"Total invalid 'Scheduled Date' entries: {df['Scheduled Date'].isnull().sum()}")
    # Optionally, forward fill or use other imputation methods
    df['Scheduled Date'] = df['Scheduled Date'].fillna(method='bfill')  # Backward fill

# Reference date for converting Scheduled Date to numeric (days since reference date)
reference_date = pd.to_datetime('2020-01-01')

# Calculate the number of days between the reference date and the Scheduled Date
df['Scheduled Days'] = (df['Scheduled Date'] - reference_date).dt.days

# Check if 'Scheduled Days' has been calculated correctly
if 'Scheduled Days' not in df.columns:
    raise ValueError("'Scheduled Days' column is missing after calculation.")


In [3]:
# Example of creating additional feature: Days since admission
df['Days Since Admission'] = (df['Scheduled Date'] - df['Admission Date']).dt.days

# Handle missing values in the new feature
df['Days Since Admission'] = df['Days Since Admission'].fillna(0)  # Replace NaN with 0
df['Days Since Admission'] = df['Days Since Admission'].clip(lower=0)  # Ensure no negative days

# One-hot encoding categorical features like Gender, Blood Type, Medical Condition, Admission Type
# Check cardinality before encoding
categorical_cols = ['Gender', 'Blood Type', 'Medical Condition', 'Admission Type']
for col in categorical_cols:
    if col in df.columns:
        unique_values = df[col].nunique()
        print(f"{col}: {unique_values} unique values")
        if unique_values > 20:  # Threshold for high cardinality
            print(f"High cardinality detected in {col}. Consider alternative encodings.")
        else:
            df = pd.get_dummies(df, columns=[col], drop_first=True)


Gender: 2 unique values
Blood Type: 8 unique values
Medical Condition: 6 unique values
Admission Type: 3 unique values


In [4]:
# Drop the target column and any non-relevant columns like 'Name', 'Patient_ID', 'Admission Date', 'Discharge Date'
columns_to_drop = ['Name', 'Patient_ID', 'Admission Date', 'Discharge Date']
existing_columns = [col for col in columns_to_drop if col in df.columns]
df = df.drop(columns=existing_columns, errors='ignore')  # Drop non-relevant columns

# Define the target variable 'y' as 'Scheduled Date'
if 'Scheduled Date' in df.columns:
    y = df['Scheduled Date']  # Target variable
    X = df.drop(columns=['Scheduled Date'], errors='ignore')  # Drop target column from features
else:
    raise KeyError("'Scheduled Date' column is missing, so it cannot be used as the target variable.")


In [5]:
# Identify already encoded columns
non_numeric_columns = X.select_dtypes(include=['object']).columns.tolist()
print("Non-numeric columns in X:", non_numeric_columns)

# Check for high cardinality and apply encoding only once
for col in non_numeric_columns:
    unique_values = X[col].nunique()
    print(f"{col}: {unique_values} unique values")
    if unique_values > 20:  # Threshold for high cardinality
        print(f"Skipping one-hot encoding for {col} due to high cardinality.")
        # Optional: Use frequency or target encoding instead
        X[col + '_freq'] = X[col].map(X[col].value_counts())
        X = X.drop(columns=[col], errors='ignore')
    else:
        X = pd.get_dummies(X, columns=[col], drop_first=True)

# Fill missing numerical values with the median (after encoding)
X = X.fillna(X.median())

# Ensure the target variable 'y' has no missing values (optional, depends on your target)
y = y.fillna(y.median())  # Replace with `.mode()[0]` if `y` is categorical (for mode-based filling)

# Check the shape after filling missing values and encoding categorical variables
print("Size after filling missing values and encoding:", X.shape)

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the size of training and testing sets
print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Non-numeric columns in X: ['Medication', 'Test Results', 'Hospital', 'Email']
Medication: 5 unique values
Test Results: 3 unique values
Hospital: 2055 unique values
Skipping one-hot encoding for Hospital due to high cardinality.
Email: 476 unique values
Skipping one-hot encoding for Email due to high cardinality.
Size after filling missing values and encoding: (2110, 28)
Training set size: (1688, 28)
Test set size: (422, 28)


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming 'X' is your feature DataFrame and 'y' is your target variable

# Step 1: Handle missing values for numerical columns by filling with the median
numerical_columns = X.select_dtypes(include=['number']).columns
X[numerical_columns] = X[numerical_columns].fillna(X[numerical_columns].median())

# Step 2: Handle missing values for categorical columns by filling with the mode (most frequent value)
categorical_columns = X.select_dtypes(exclude=['number']).columns
for col in categorical_columns:
    X[col] = X[col].fillna(X[col].mode()[0])

# Step 3: Handle missing values for categorical columns and apply one-hot encoding or frequency encoding
for col in categorical_columns:
    if col in X.columns:
        unique_values = X[col].nunique()
        print(f"{col}: {unique_values} unique values")
        if unique_values > 20:  # High cardinality threshold
            print(f"Skipping one-hot encoding for {col} due to high cardinality. Applying frequency encoding instead.")
            X[col + '_freq'] = X[col].map(X[col].value_counts())
            X = X.drop(columns=[col], errors='ignore')  # Drop original column
        else:
            X = pd.get_dummies(X, columns=[col], drop_first=True)

# Step 4: Identify specific columns for one-hot encoding that exist in the DataFrame
one_hot_columns = ['Medication', 'Test Results', 'Hospital']  # Desired columns for one-hot encoding
existing_one_hot_columns = [col for col in one_hot_columns if col in X.columns]

# Step 5: Apply one-hot encoding only to the specific existing columns
if existing_one_hot_columns:
    X = pd.get_dummies(X, columns=existing_one_hot_columns, drop_first=True)
else:
    print("No specific columns available for one-hot encoding.")

# Step 6: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of the resulting dataset to confirm the changes
print("Shape of X after encoding:", X.shape)
print("Training data shape:", X_train.shape)
print("Test data shape:", X_test.shape)


Gender_Male: 2 unique values
Blood Type_A-: 2 unique values
Blood Type_AB+: 2 unique values
Blood Type_AB-: 2 unique values
Blood Type_B+: 2 unique values
Blood Type_B-: 2 unique values
Blood Type_O+: 2 unique values
Blood Type_O-: 2 unique values
Medical Condition_Asthma: 2 unique values
Medical Condition_Cancer: 2 unique values
Medical Condition_Diabetes: 2 unique values
Medical Condition_Hypertension: 2 unique values
Medical Condition_Obesity: 2 unique values
Admission Type_Emergency: 2 unique values
Admission Type_Urgent: 2 unique values
Medication_Ibuprofen: 2 unique values
Medication_Lipitor: 2 unique values
Medication_Paracetamol: 2 unique values
Medication_Penicillin: 2 unique values
Test Results_Inconclusive: 2 unique values
Test Results_Normal: 2 unique values
No specific columns available for one-hot encoding.
Shape of X after encoding: (2110, 28)
Training data shape: (1688, 28)
Test data shape: (422, 28)


In [7]:
# Check if the dataset has enough rows for splitting
if X.shape[0] < 10:  # This ensures that you have at least 10 rows in the dataset
    raise ValueError("Not enough data to perform train-test split")

# After splitting, check if both the training and testing sets have a reasonable number of rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

if X_train.shape[0] < 5 or X_test.shape[0] < 5:
    raise ValueError("Train-test split resulted in too few samples in either the training or testing set.")

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (1688, 28)
Test set size: (422, 28)


In [8]:
# Drop unnecessary columns: 'Name' and 'Patient_ID'
columns_to_drop = ['Name', 'Patient_ID']
existing_columns = [col for col in columns_to_drop if col in df.columns]

# Drop the columns if they exist
df = df.drop(columns=existing_columns, errors='ignore')  # 'ignore' ensures no error if columns are missing

# Check if the columns have been removed
print("Columns after dropping unnecessary columns:", df.columns.tolist())


Columns after dropping unnecessary columns: ['Age', 'Medication', 'Test Results', 'Scheduled Date', 'Hospital', 'Billing Amount', 'Email', 'Contact Number', 'Scheduled Days', 'Days Since Admission', 'Gender_Male', 'Blood Type_A-', 'Blood Type_AB+', 'Blood Type_AB-', 'Blood Type_B+', 'Blood Type_B-', 'Blood Type_O+', 'Blood Type_O-', 'Medical Condition_Asthma', 'Medical Condition_Cancer', 'Medical Condition_Diabetes', 'Medical Condition_Hypertension', 'Medical Condition_Obesity', 'Admission Type_Emergency', 'Admission Type_Urgent']


In [9]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor  # Import XGBoost

# Correctly format the file path
file_path = "C:\\Users\\user\\Desktop\\Healthcare\\healthcare dataset.csv"

# Load the dataset
df = pd.read_csv(file_path)

# Convert date columns to datetime format, use 'coerce' to convert invalid dates to NaT
df['Admission Date'] = pd.to_datetime(df['Admission Date'], format='%d-%m-%Y', errors='coerce')
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'], format='%d-%m-%Y', errors='coerce')
df['Scheduled Date'] = pd.to_datetime(df['Scheduled Date'], format='%d-%m-%Y', errors='coerce')

# Handle missing values in date columns (forward fill)
df['Scheduled Date'] = df['Scheduled Date'].ffill()
df['Admission Date'] = df['Admission Date'].ffill()

# Recheck for NaT values
print(f"Null values in 'Scheduled Date' after filling: {df['Scheduled Date'].isnull().sum()}")
print(f"Null values in 'Admission Date' after filling: {df['Admission Date'].isnull().sum()}")

# Feature Engineering: Create Year, Month, Day features from dates
df['Admission Year'] = df['Admission Date'].dt.year
df['Admission Month'] = df['Admission Date'].dt.month
df['Admission Day'] = df['Admission Date'].dt.day

df['Scheduled Year'] = df['Scheduled Date'].dt.year
df['Scheduled Month'] = df['Scheduled Date'].dt.month
df['Scheduled Day'] = df['Scheduled Date'].dt.day

# Create a 'Days Since Admission' feature based on the difference in days between 'Scheduled Date' and 'Admission Date'
df['Days Since Admission'] = (df['Scheduled Date'] - df['Admission Date']).dt.days

# Fill any remaining missing values in 'Days Since Admission' with 0
df['Days Since Admission'] = df['Days Since Admission'].fillna(0).astype('int64')

# Handle other missing values in the dataset (other than date columns)
# Using median for numerical columns and mode for categorical columns
for col in df.select_dtypes(include=['number']).columns:
    df[col] = df[col].fillna(df[col].median())  # For numerical columns

for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])  # For categorical columns

# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Check the number of unique values in each categorical column
high_cardinality_cols = [col for col in categorical_cols if df[col].nunique() > 50]
low_cardinality_cols = [col for col in categorical_cols if df[col].nunique() <= 50]

print(f"High-cardinality categorical columns: {high_cardinality_cols}")
print(f"Low-cardinality categorical columns: {low_cardinality_cols}")

# Perform one-hot encoding only on low-cardinality columns
if low_cardinality_cols:
    df = pd.get_dummies(df, columns=low_cardinality_cols, drop_first=True)

# For high-cardinality columns, use label encoding
for col in high_cardinality_cols:
    df[col] = df[col].astype('category').cat.codes  # Label encoding

# Drop non-relevant columns (including original date columns)
columns_to_drop = ['Name', 'Patient_ID', 'Admission Date', 'Discharge Date']  # Remove original date columns
df = df.drop(columns=columns_to_drop, errors='ignore')

# Define the target variable (y) and feature set (X)
# Ensure 'Scheduled Date' is in the dataset before using it as target
if 'Scheduled Date' in df.columns:
    y = df['Scheduled Date'].values  # Use the original Scheduled Date as target variable
    reference_date = pd.to_datetime('2020-01-01')  # Reference date to convert Scheduled Date to number of days
    y = (df['Scheduled Date'] - reference_date).dt.days.values  # Convert 'Scheduled Date' to number of days
    X = df.drop(columns=['Scheduled Date'], errors='ignore')  # Drop 'Scheduled Date' column from features
else:
    raise KeyError("'Scheduled Date' column is missing and cannot be used as the target variable.")

# Check for NaN or infinite values in the dataset
if X.isnull().sum().sum() > 0:
    print("Data contains NaN values. Filling them with the median.")
    X = X.fillna(X.median())

if np.any(np.isinf(X)):
    print("Data contains infinite values. Replacing them with 0.")
    X.replace([np.inf, -np.inf], 0, inplace=True)

# Check if features and target are correctly defined
print("Feature set shape:", X.shape)
print("Target variable shape:", y.shape)  # This should display (number of rows,)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning (Optional) using GridSearchCV with a simplified grid for XGBoost
param_grid = {
    'n_estimators': [50, 100],  # Number of boosting rounds
    'max_depth': [3, 6],  # Maximum depth of the trees
    'learning_rate': [0.01, 0.1],  # Learning rate
    'subsample': [0.8, 1.0],  # Subsample ratio of the training instances
    'colsample_bytree': [0.8, 1.0]  # Subsample ratio of columns when constructing each tree
}

# GridSearchCV for XGBRegressor
grid_search = GridSearchCV(estimator=XGBRegressor(random_state=42), param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

# Best hyperparameters found
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Initialize and train the model with the best hyperparameters
xgb_model = XGBRegressor(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_depth=grid_search.best_params_['max_depth'],
    learning_rate=grid_search.best_params_['learning_rate'],
    subsample=grid_search.best_params_['subsample'],
    colsample_bytree=grid_search.best_params_['colsample_bytree'],
    random_state=42
)

xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)


Null values in 'Scheduled Date' after filling: 0
Null values in 'Admission Date' after filling: 0
High-cardinality categorical columns: ['Name', 'Hospital', 'Email']
Low-cardinality categorical columns: ['Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Medication', 'Test Results']
Feature set shape: (2110, 33)
Target variable shape: (2110,)
Best Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'subsample': 1.0}
Mean Absolute Error (MAE): 1.6623044398158648


In [10]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Initialize and train the XGBoost model
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.257096634091924


In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# XGBClassifier without the deprecated parameter
xgb_model = XGBClassifier(eval_metric='logloss')


# Example dataset
from sklearn.datasets import make_classification

# Create a sample dataset
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "Gradient Boosting": GradientBoostingClassifier(),
    # "AdaBoost": AdaBoostClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    # "LightGBM": LGBMClassifier()
}

# Initialize results dictionary
results = []

# Evaluate each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_prob) if y_prob is not None else np.nan
    
    # Append to results
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "AUC-ROC": auc_roc
    })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Print the results
print(results_df)

# Select the best model based on F1-Score
best_model = results_df.loc[results_df['F1-Score'].idxmax()]
print("\nBest Model:")
print(best_model)


                 Model  Accuracy  Precision    Recall  F1-Score   AUC-ROC
0  Logistic Regression  0.850000   0.876712  0.825806  0.850498  0.914171
1        Random Forest  0.863333   0.895833  0.832258  0.862876  0.923915
2                  SVM  0.833333   0.857143  0.812903  0.834437  0.911190
3    Gradient Boosting  0.866667   0.885906  0.851613  0.868421  0.923471
4        Decision Tree  0.846667   0.881119  0.812903  0.845638  0.847831
5  K-Nearest Neighbors  0.803333   0.847826  0.754839  0.798635  0.873571
6          Naive Bayes  0.813333   0.877863  0.741935  0.804196  0.885206
7              XGBoost  0.896667   0.942857  0.851613  0.894915  0.943404

Best Model:
Model         XGBoost
Accuracy     0.896667
Precision    0.942857
Recall       0.851613
F1-Score     0.894915
AUC-ROC      0.943404
Name: 7, dtype: object


Parameters: { "use_label_encoder" } are not used.

