In [95]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import  GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [120]:
data = pd.read_csv('healthcare_dataset.csv', delimiter=',', encoding='ISO-8859-1')
data = data.drop(columns=['Name', 'Doctor'])

In [None]:
# Group less common categories into 'Other'
def group_rare_categories(column, threshold=0.01):
    counts = column.value_counts(normalize=True)
    return column.apply(lambda x: x if counts[x] > threshold else 'Other')

for col in ['Hospital', 'Insurance Provider']:
    data[col] = group_rare_categories(data[col])

# Encode categorical columns using OneHotEncoder for low cardinality and LabelEncoder for high cardinality
low_cardinality_cols = ['Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Medication']
high_cardinality_cols = ['Hospital', 'Insurance Provider']

In [105]:
# OneHotEncode low cardinality columns
data = pd.get_dummies(data, columns=low_cardinality_cols, drop_first=True)

# LabelEncode high cardinality columns
label_encoders = {}
for col in high_cardinality_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Use LabelEncoder for the target variable
label_encoder = LabelEncoder()
data['Test Results'] = label_encoder.fit_transform(data['Test Results'])

# Create new feature: Length of Stay
data['Date of Admission'] = pd.to_datetime(data['Date of Admission'])
data['Discharge Date'] = pd.to_datetime(data['Discharge Date'])
data['Length of Stay'] = (data['Discharge Date'] - data['Date of Admission']).dt.days
data = data.drop(['Date of Admission', 'Discharge Date', 'Hospital'], axis=1)


# Define features (X) and target (y)
X = data.drop('Test Results', axis=1)
y = data['Test Results']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

Accuracy: 0.3427927927927928
              precision    recall  f1-score   support

           0       0.34      0.43      0.38      3754
           1       0.33      0.27      0.30      3617
           2       0.35      0.32      0.34      3729

    accuracy                           0.34     11100
   macro avg       0.34      0.34      0.34     11100
weighted avg       0.34      0.34      0.34     11100



In [None]:
corr = abs(data.corr(method='pearson')*100)
plt.figure(figsize=(20,20))
sns.heatmap(corr, annot=True, linewidth=.1, vmin=0, vmax=100,
            fmt=".2f", cmap=sns.color_palette("flare", as_cmap=True))
plt.tight_layout()
plt.show()

In [115]:
# Group less common categories into 'Other'
def group_rare_categories(column, threshold=0.01):
    counts = column.value_counts(normalize=True)
    return column.apply(lambda x: x if counts[x] > threshold else 'Other')

for col in ['Hospital', 'Insurance Provider']:
    data[col] = group_rare_categories(data[col])

# Encode categorical columns using OneHotEncoder for low cardinality and LabelEncoder for high cardinality
low_cardinality_cols = ['Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Medication']
high_cardinality_cols = ['Hospital', 'Insurance Provider']

# OneHotEncode low cardinality columns
data = pd.get_dummies(data, columns=low_cardinality_cols, drop_first=True)

# LabelEncode high cardinality columns
label_encoders = {}
for col in high_cardinality_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Use LabelEncoder for the target variable
label_encoder = LabelEncoder()
data['Test Results'] = label_encoder.fit_transform(data['Test Results'])

# Create new feature: Length of Stay
data['Date of Admission'] = pd.to_datetime(data['Date of Admission'])
data['Discharge Date'] = pd.to_datetime(data['Discharge Date'])
data['Length of Stay'] = (data['Discharge Date'] - data['Date of Admission']).dt.days
data = data.drop(['Date of Admission', 'Discharge Date'], axis=1)

# Define features (X) and target (y)
X = data.drop('Test Results', axis=1)
y = data['Test Results']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

'''# Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3,
                           n_jobs=-1,
                           verbose=2)

grid_search.fit(X_train, y_train)

# Get the best parameters
print(f'Best Parameters: {grid_search.best_params_}')

# Use the best model to make predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))'''

'''
# Feature Importance
importances = best_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
'''

Accuracy: 0.43405405405405406
              precision    recall  f1-score   support

           0       0.43      0.45      0.44      3754
           1       0.44      0.42      0.43      3617
           2       0.44      0.44      0.44      3729

    accuracy                           0.43     11100
   macro avg       0.43      0.43      0.43     11100
weighted avg       0.43      0.43      0.43     11100

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.4372072072072072
              precision    recall  f1-score   support

           0       0.43      0.44      0.44      3754
           1       0.44      0.42      0.43      3617
           2       0.44      0.45      0.44      3729

    accuracy                           0.44     11100
   macro avg       0.44      0.44      0.44     11100
weighted avg       0.44      0.44      0.44     11100



"\n# Feature Importance\nimportances = best_model.feature_importances_\nfeature_names = X.columns\nfeature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})\nfeature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)\n"

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE


# Group less common categories into 'Other'
def group_rare_categories(column, threshold=0.01):
    counts = column.value_counts(normalize=True)
    return column.apply(lambda x: x if counts[x] > threshold else 'Other')

for col in ['Hospital', 'Insurance Provider']:
    data[col] = group_rare_categories(data[col])

# Encode categorical columns using OneHotEncoder for low cardinality and LabelEncoder for high cardinality
low_cardinality_cols = ['Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Medication']
high_cardinality_cols = ['Hospital', 'Insurance Provider']

# OneHotEncode low cardinality columns
data = pd.get_dummies(data, columns=low_cardinality_cols, drop_first=True)

# LabelEncode high cardinality columns
label_encoders = {}
for col in high_cardinality_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Use LabelEncoder for the target variable
label_encoder = LabelEncoder()
data['Test Results'] = label_encoder.fit_transform(data['Test Results'])

# Create new feature: Length of Stay
data['Date of Admission'] = pd.to_datetime(data['Date of Admission'])
data['Discharge Date'] = pd.to_datetime(data['Discharge Date'])
data['Length of Stay'] = (data['Discharge Date'] - data['Date of Admission']).dt.days
data = data.drop(['Date of Admission', 'Discharge Date'], axis=1)

# Define features (X) and target (y)
X = data.drop('Test Results', axis=1)
y = data['Test Results']

# Create polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Initialize and train the model
model = GradientBoostingClassifier(n_estimators=200, random_state=42)
model.fit(X_resampled, y_resampled)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

# Extensive Hyperparameter Tuning
param_dist = {
    'n_estimators': [300, 400, 500],
    'max_depth': [None, 7, 9, 11],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_search = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                                   param_distributions=param_dist,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   random_state=42,
                                   n_jobs=-1)

random_search.fit(X_resampled, y_resampled)

# Get the best parameters
print(f'Best Parameters: {random_search.best_params_}')

# Use the best model to make predictions
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

# Cross-Validation
cv_scores = cross_val_score(best_model, X_poly, y, cv=5)
print(f'Cross-Validation Accuracy: {np.mean(cv_scores)}')
