In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets
# Four CSVs read from the working directory: the appointment records, clinic
# details, planning-neighborhood details, and historical no-show records.
no_show = pd.read_csv('no_show.csv')
clinics = pd.read_csv('clinics.csv')
planning_neighborhoods = pd.read_csv('planning_neighborhoods.csv')
no_show_historical = pd.read_csv('no_show_historical.csv')

# Merging datasets
# Left joins keep every row of `no_show`, attaching clinic and neighborhood
# attributes and then the historical records keyed on 'Patient ID'.
data = pd.merge(no_show, clinics, left_on='Clinic Location', right_on='clinic', how='left')
data = pd.merge(data, planning_neighborhoods, left_on='Neighborhood', right_on='neighborho', how='left')
# NOTE(review): if no_show_historical holds more than one row per 'Patient ID',
# this left merge repeats appointment rows -- TODO confirm the key is unique.
data = pd.merge(data, no_show_historical, on='Patient ID', how='left')

# Combine no_show_x and no_show_y into a single column
# pandas only adds the _x/_y suffixes when BOTH merge inputs carry a 'no_show'
# column. Handle the un-suffixed case too, and fail here with a clear message
# instead of a KeyError on 'target_no_show' further down.
if 'no_show_x' in data.columns and 'no_show_y' in data.columns:
    # Prefer the left-hand value; fall back to the historical one where missing.
    data['target_no_show'] = data['no_show_x'].fillna(data['no_show_y'])
    data.drop(columns=['no_show_x', 'no_show_y'], inplace=True)
elif 'target_no_show' not in data.columns:
    if 'no_show' in data.columns:
        # Only one input had the column, so the merge left its name untouched.
        data = data.rename(columns={'no_show': 'target_no_show'})
    else:
        raise KeyError(
            "Merged data has no 'no_show', 'no_show_x'/'no_show_y' or "
            "'target_no_show' column; cannot build the target."
        )

# Preprocessing
# Encode categoricals as integers. Series.map leaves any value that is not a
# key of the mapping as NaN.
data['Gender'] = data['Gender'].map({'M': 0, 'F': 1})
mapping_dict = {'0/week': 0, '1/week': 1, '5/week': 2, '10/week': 3, '> 14/week': 4}
data['Alcohol Consumption'] = data['Alcohol Consumption'].map(mapping_dict)
data['target_no_show'] = data['target_no_show'].astype(int)
data['Hypertension'] = data['Hypertension'].astype(int)
data['Diabetes'] = data['Diabetes'].astype(int)

# Bucket ages. With the default right-closed bins, a lower edge of 0 excludes
# Age == 0 and a finite upper edge of 100 excludes anything above it; those
# rows became NaN and, after drop_first one-hot encoding, were indistinguishable
# from the '<30' baseline. include_lowest=True and an open upper edge keep them
# in the first and last bucket respectively. Labels (and therefore the dummy
# column names) are unchanged.
data['age_group'] = pd.cut(
    data['Age'],
    bins=[0, 30, 40, 50, 60, np.inf],
    labels=['<30', '30-40', '40-50', '50-60', '>60'],
    include_lowest=True,
)
data = pd.get_dummies(data, columns=['age_group'], drop_first=True)

# Expand both dates into calendar parts, plus the lead time in whole days
# between scheduling and the appointment.
data['Appointment Date'] = pd.to_datetime(data['Appointment Date'])
data['Schedule Date'] = pd.to_datetime(data['Schedule Date'])
data['appointment_day'] = data['Appointment Date'].dt.day
data['appointment_month'] = data['Appointment Date'].dt.month
data['appointment_year'] = data['Appointment Date'].dt.year
data['appointment_dayofweek'] = data['Appointment Date'].dt.dayofweek
data['schedule_day'] = data['Schedule Date'].dt.day
data['schedule_month'] = data['Schedule Date'].dt.month
data['schedule_year'] = data['Schedule Date'].dt.year
data['schedule_dayofweek'] = data['Schedule Date'].dt.dayofweek
data['days_until_appointment'] = (data['Appointment Date'] - data['Schedule Date']).dt.days
# The raw date and age columns are replaced by the derived features above.
data.drop(columns=['Appointment Date', 'Schedule Date', 'Age'], inplace=True)

# One-hot encode the clinic, dropping the first level as the baseline.
categorical_columns = ['Clinic Location']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Identifiers, free-text fields and the join/geometry columns brought in by the
# merges are not used as model features.
non_numeric_columns = ['Patient ID', 'Appointment ID', 'Appointment Reason', 'Specialty', 'Neighborhood', 'name', 'type', 'addr1', 'addr2', 'lat', 'long', 'the_geom', 'neighborho', 'clinic']
data.drop(columns=non_numeric_columns, inplace=True)

# get_dummies may emit boolean columns; cast them to 0/1 integers.
for col in data.select_dtypes(include='bool').columns:
    data[col] = data[col].astype(int)
# Separate features from the target.
X = data.drop(columns=['target_no_show'])
y = data['target_no_show']

# Split BEFORE imputing: filling NaNs with means computed over the full dataset
# would leak test-set statistics into the training features. The split itself
# depends only on y and random_state, so the same rows land in each partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Impute every partition with column means learned from the training rows only.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
# Keep the full feature frame NaN-free as well, using the same training means.
X = X.fillna(train_means)

# Oversample the minority class in the training partition only; the test
# partition keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
# Search space for the randomized hyper-parameter search.
# 'auto' is intentionally absent from max_features: for classifiers it was a
# deprecated alias of 'sqrt' and newer scikit-learn releases reject it, which
# turns every candidate that samples it into a warning or a failed fit.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
clf = BalancedRandomForestClassifier(random_state=42)

# 100 sampled candidates x 3 folds = 300 fits, run on all available cores.
# refit=True (the default, spelled out here) retrains the best candidate on the
# full data passed to fit(), so best_estimator_ is ready to use afterwards.
# NOTE(review): X_train_balanced has already been SMOTE-oversampled, so the CV
# folds contain synthetic rows derived from each other and best_score_ is
# probably optimistic -- consider resampling inside an imblearn Pipeline.
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    refit=True,
)
random_search.fit(X_train_balanced, y_train_balanced)
print("Best parameters found: ", random_search.best_params_)
print("Best score found: ", random_search.best_score_)

# Already fitted by the search's refit step; fitting it again on the same data
# with the same seed would only repeat that work.
best_clf = random_search.best_estimator_
# Evaluate the tuned model on the held-out test partition.
y_pred = best_clf.predict(X_test)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

# ROC-AUC is computed from the predicted probability of the second class.
positive_class_proba = best_clf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, positive_class_proba)
print("ROC-AUC Score:", roc_auc)

# Confusion-matrix heatmap: rows are actual labels, columns are predictions.
class_labels = ['Show', 'No Show']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()
# Pair each feature column with the importance reported by the fitted forest
# and rank them from most to least important.
feature_names = X.columns
feature_importances = best_clf.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
top_features = importance_df.iloc[:10]
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_features, x='Importance', y='Feature', ax=ax)
ax.set_title('Top 10 Feature Importances')
plt.show()


Fitting 3 folds for each of 100 candidates, totalling 300 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


KeyboardInterrupt: 