In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [54]:
# Load datasets
no_show = pd.read_csv('no_show.csv')
clinics = pd.read_csv('clinics.csv')
planning_neighborhoods = pd.read_csv('planning_neighborhoods.csv')
no_show_historical = pd.read_csv('no_show_historical.csv')

In [55]:
# Display column names
print("No Show Columns:", no_show.columns)
print("Clinics Columns:", clinics.columns)
print("Planning Neighborhoods Columns:", planning_neighborhoods.columns)
print("No Show Historical Columns:", no_show_historical.columns)

No Show Columns: Index(['no_show', 'Patient ID', 'Appointment ID', 'Gender', 'Age',
       'Alcohol Consumption', 'Hypertension', 'Diabetes', 'Appointment Date',
       'Schedule Date', 'Appointment Reason', 'Clinic Location', 'Specialty',
       'Neighborhood'],
      dtype='object')
Clinics Columns: Index(['name', 'type', 'addr1', 'addr2', 'clinic', 'lat', 'long'], dtype='object')
Planning Neighborhoods Columns: Index(['the_geom', 'neighborho'], dtype='object')
No Show Historical Columns: Index(['Patient ID', 'no_show'], dtype='object')


In [56]:
# Merging datasets
data = pd.merge(no_show, clinics, left_on='Clinic Location', right_on='clinic', how='left')
data = pd.merge(data, planning_neighborhoods, left_on='Neighborhood', right_on='neighborho', how='left')
data = pd.merge(data, no_show_historical, on='Patient ID', how='left')

In [57]:
data.columns

Index(['no_show_x', 'Patient ID', 'Appointment ID', 'Gender', 'Age',
       'Alcohol Consumption', 'Hypertension', 'Diabetes', 'Appointment Date',
       'Schedule Date', 'Appointment Reason', 'Clinic Location', 'Specialty',
       'Neighborhood', 'name', 'type', 'addr1', 'addr2', 'clinic', 'lat',
       'long', 'the_geom', 'neighborho', 'no_show_y'],
      dtype='object')

In [58]:
data.head()

Unnamed: 0,no_show_x,Patient ID,Appointment ID,Gender,Age,Alcohol Consumption,Hypertension,Diabetes,Appointment Date,Schedule Date,...,name,type,addr1,addr2,clinic,lat,long,the_geom,neighborho,no_show_y
0,False,649e3901-e56b-41d9-b2d3-f61ce708a415,659a5257-c2f0-4eda-a2bb-ebc4bd9ce4e4,F,43,5/week,False,False,2021-01-14T10:30:00,2020-10-26T00:00:00,...,Parnassus,Hospitals & Clinics,400 Parnassus Ave.,"San Francisco, CA 94143",Mission Bay,37.764124,-122.456572,MULTIPOLYGON (((-122.42095167789303 37.8089665...,Russian Hill,
1,False,649e3901-e56b-41d9-b2d3-f61ce708a415,659a5257-c2f0-4eda-a2bb-ebc4bd9ce4e4,F,43,5/week,False,False,2021-01-14T10:30:00,2020-10-26T00:00:00,...,Mount Zion,Hospitals & Clinics,1600 Divisadero St.,"San Francisco, CA 94115",Mission Bay,37.784882,-122.438723,MULTIPOLYGON (((-122.42095167789303 37.8089665...,Russian Hill,
2,False,649e3901-e56b-41d9-b2d3-f61ce708a415,659a5257-c2f0-4eda-a2bb-ebc4bd9ce4e4,F,43,5/week,False,False,2021-01-14T10:30:00,2020-10-26T00:00:00,...,Mission Bay,Hospitals & Clinics,1825 Fourth St.,"San Francisco, CA 94158",Mission Bay,37.766561,-122.389948,MULTIPOLYGON (((-122.42095167789303 37.8089665...,Russian Hill,
3,False,3028fd02-a20a-4233-ac16-b571dde4540c,7ae6e7f8-3788-48d2-9fbc-11114ec28bfe,F,37,0/week,False,True,2021-02-17T14:00:00,2021-01-25T00:00:00,...,Parnassus,Hospitals & Clinics,400 Parnassus Ave.,"San Francisco, CA 94143",Mission Bay,37.764124,-122.456572,MULTIPOLYGON (((-122.45912298496032 37.7082180...,Ocean View,
4,False,3028fd02-a20a-4233-ac16-b571dde4540c,7ae6e7f8-3788-48d2-9fbc-11114ec28bfe,F,37,0/week,False,True,2021-02-17T14:00:00,2021-01-25T00:00:00,...,Mount Zion,Hospitals & Clinics,1600 Divisadero St.,"San Francisco, CA 94115",Mission Bay,37.784882,-122.438723,MULTIPOLYGON (((-122.45912298496032 37.7082180...,Ocean View,


In [60]:
# Combine no_show_x and no_show_y into a single column
if 'no_show_x' in data.columns and 'no_show_y' in data.columns:
    data['target_no_show'] = data['no_show_x'].fillna(data['no_show_y'])
    print('target column updated')
    
    # Drop the no_show_x and no_show_y columns
    data.drop(columns=['no_show_x', 'no_show_y'], inplace=True)
else:
    raise KeyError("One or both of 'no_show_x' and 'no_show_y' columns not found in the data.")


KeyError: "One or both of 'no_show_x' and 'no_show_y' columns not found in the data."

In [61]:
data.head()

Unnamed: 0,Patient ID,Appointment ID,Gender,Age,Alcohol Consumption,Hypertension,Diabetes,Appointment Date,Schedule Date,Appointment Reason,...,name,type,addr1,addr2,clinic,lat,long,the_geom,neighborho,target_no_show
0,649e3901-e56b-41d9-b2d3-f61ce708a415,659a5257-c2f0-4eda-a2bb-ebc4bd9ce4e4,F,43,5/week,False,False,2021-01-14T10:30:00,2020-10-26T00:00:00,CHIROPRACT MANJ 3-4 REGIONS,...,Parnassus,Hospitals & Clinics,400 Parnassus Ave.,"San Francisco, CA 94143",Mission Bay,37.764124,-122.456572,MULTIPOLYGON (((-122.42095167789303 37.8089665...,Russian Hill,False
1,649e3901-e56b-41d9-b2d3-f61ce708a415,659a5257-c2f0-4eda-a2bb-ebc4bd9ce4e4,F,43,5/week,False,False,2021-01-14T10:30:00,2020-10-26T00:00:00,CHIROPRACT MANJ 3-4 REGIONS,...,Mount Zion,Hospitals & Clinics,1600 Divisadero St.,"San Francisco, CA 94115",Mission Bay,37.784882,-122.438723,MULTIPOLYGON (((-122.42095167789303 37.8089665...,Russian Hill,False
2,649e3901-e56b-41d9-b2d3-f61ce708a415,659a5257-c2f0-4eda-a2bb-ebc4bd9ce4e4,F,43,5/week,False,False,2021-01-14T10:30:00,2020-10-26T00:00:00,CHIROPRACT MANJ 3-4 REGIONS,...,Mission Bay,Hospitals & Clinics,1825 Fourth St.,"San Francisco, CA 94158",Mission Bay,37.766561,-122.389948,MULTIPOLYGON (((-122.42095167789303 37.8089665...,Russian Hill,False
3,3028fd02-a20a-4233-ac16-b571dde4540c,7ae6e7f8-3788-48d2-9fbc-11114ec28bfe,F,37,0/week,False,True,2021-02-17T14:00:00,2021-01-25T00:00:00,OFFICE/OUTPATIENT VISIT EST,...,Parnassus,Hospitals & Clinics,400 Parnassus Ave.,"San Francisco, CA 94143",Mission Bay,37.764124,-122.456572,MULTIPOLYGON (((-122.45912298496032 37.7082180...,Ocean View,False
4,3028fd02-a20a-4233-ac16-b571dde4540c,7ae6e7f8-3788-48d2-9fbc-11114ec28bfe,F,37,0/week,False,True,2021-02-17T14:00:00,2021-01-25T00:00:00,OFFICE/OUTPATIENT VISIT EST,...,Mount Zion,Hospitals & Clinics,1600 Divisadero St.,"San Francisco, CA 94115",Mission Bay,37.784882,-122.438723,MULTIPOLYGON (((-122.45912298496032 37.7082180...,Ocean View,False


In [62]:
# Preprocessing
# Preprocessing
data['Gender'] = data['Gender'].map({'M': 0, 'F': 1})

# Map 'Alcohol Consumption' to numeric values
mapping_dict = {'0/week': 0, '1/week': 1, '5/week': 2, '10/week': 3, '> 14/week': 4}
data['Alcohol Consumption'] = data['Alcohol Consumption'].map(mapping_dict)

# Convert 'target_no_show' to integer
data['target_no_show'] = data['target_no_show'].astype(int)

# Convert 'Hypertension' and 'Diabetes' to integer
data['Hypertension'] = data['Hypertension'].astype(int)
data['Diabetes'] = data['Diabetes'].astype(int)

# Create age group categories and convert to one-hot encoding
data['age_group'] = pd.cut(data['Age'], bins=[0, 30, 40, 50, 60, 100], labels=['<30', '30-40', '40-50', '50-60', '>60'])
data = pd.get_dummies(data, columns=['age_group'], drop_first=True)

# Ensure 'target_no_show' column is of integer type
data['target_no_show'] = data['target_no_show'].astype(int)


In [63]:
# Convert date columns to datetime format and extract features
data['Appointment Date'] = pd.to_datetime(data['Appointment Date'])
data['Schedule Date'] = pd.to_datetime(data['Schedule Date'])
data['appointment_day'] = data['Appointment Date'].dt.day
data['appointment_month'] = data['Appointment Date'].dt.month
data['appointment_year'] = data['Appointment Date'].dt.year
data['appointment_dayofweek'] = data['Appointment Date'].dt.dayofweek
data['schedule_day'] = data['Schedule Date'].dt.day
data['schedule_month'] = data['Schedule Date'].dt.month
data['schedule_year'] = data['Schedule Date'].dt.year
data['schedule_dayofweek'] = data['Schedule Date'].dt.dayofweek
data['days_until_appointment'] = (data['Appointment Date'] - data['Schedule Date']).dt.days

# Drop the original date columns if they are not needed anymore
data.drop(columns=['Appointment Date', 'Schedule Date'], inplace=True)

In [64]:
# Drop unnecessary columns
non_numeric_columns = ['Patient ID', 'Appointment ID', 'Appointment Reason', 'Specialty', 'Neighborhood', 'name', 'type', 'addr1', 'addr2', 'lat', 'long', 'the_geom', 'neighborho', 'clinic']
data.drop(columns=non_numeric_columns, inplace=True)

In [20]:
# Ensure all boolean columns are converted to integers
for col in data.select_dtypes(include='bool').columns:
    data[col] = data[col].astype(int)

In [36]:
# Define features and target
X = data.drop(columns=['target_no_show'])
y = data['target_no_show']
X.head()

Unnamed: 0,no_show_x,Gender,Age,Alcohol Consumption,Hypertension,Diabetes,Clinic Location,no_show_y,age_group_30-40,age_group_40-50,...,age_group_>60,appointment_day,appointment_month,appointment_year,appointment_dayofweek,schedule_day,schedule_month,schedule_year,schedule_dayofweek,days_until_appointment
0,False,1,43,2,0,0,Mission Bay,,False,True,...,False,14,1,2021,3,26,10,2020,0,80
1,False,1,43,2,0,0,Mission Bay,,False,True,...,False,14,1,2021,3,26,10,2020,0,80
2,False,1,43,2,0,0,Mission Bay,,False,True,...,False,14,1,2021,3,26,10,2020,0,80
3,False,1,37,0,0,1,Mission Bay,,True,False,...,False,17,2,2021,2,25,1,2021,0,23
4,False,1,37,0,0,1,Mission Bay,,True,False,...,False,17,2,2021,2,25,1,2021,0,23


In [65]:
data.head()

Unnamed: 0,Gender,Age,Alcohol Consumption,Hypertension,Diabetes,Clinic Location,target_no_show,age_group_30-40,age_group_40-50,age_group_50-60,age_group_>60,appointment_day,appointment_month,appointment_year,appointment_dayofweek,schedule_day,schedule_month,schedule_year,schedule_dayofweek,days_until_appointment
0,1,43,2,0,0,Mission Bay,0,False,True,False,False,14,1,2021,3,26,10,2020,0,80
1,1,43,2,0,0,Mission Bay,0,False,True,False,False,14,1,2021,3,26,10,2020,0,80
2,1,43,2,0,0,Mission Bay,0,False,True,False,False,14,1,2021,3,26,10,2020,0,80
3,1,37,0,0,1,Mission Bay,0,True,False,False,False,17,2,2021,2,25,1,2021,0,23
4,1,37,0,0,1,Mission Bay,0,True,False,False,False,17,2,2021,2,25,1,2021,0,23


In [66]:
# Encode categorical variables using one-hot encoding
categorical_columns = ['Clinic Location']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

In [67]:
data.head()

Unnamed: 0,Gender,Age,Alcohol Consumption,Hypertension,Diabetes,target_no_show,age_group_30-40,age_group_40-50,age_group_50-60,age_group_>60,...,schedule_dayofweek,days_until_appointment,Clinic Location_Daniel Burnham Court,Clinic Location_Geary Boulevard,Clinic Location_Lakeshore,Clinic Location_Laurel Village,Clinic Location_Mission Bay,Clinic Location_Montgomery Street,Clinic Location_Mount Zion,Clinic Location_Parnassus
0,1,43,2,0,0,0,False,True,False,False,...,0,80,False,False,False,False,True,False,False,False
1,1,43,2,0,0,0,False,True,False,False,...,0,80,False,False,False,False,True,False,False,False
2,1,43,2,0,0,0,False,True,False,False,...,0,80,False,False,False,False,True,False,False,False
3,1,37,0,0,1,0,True,False,False,False,...,0,23,False,False,False,False,True,False,False,False
4,1,37,0,0,1,0,True,False,False,False,...,0,23,False,False,False,False,True,False,False,False


In [68]:
data=data.drop(columns=['Age'])

In [69]:
data.head()

Unnamed: 0,Gender,Alcohol Consumption,Hypertension,Diabetes,target_no_show,age_group_30-40,age_group_40-50,age_group_50-60,age_group_>60,appointment_day,...,schedule_dayofweek,days_until_appointment,Clinic Location_Daniel Burnham Court,Clinic Location_Geary Boulevard,Clinic Location_Lakeshore,Clinic Location_Laurel Village,Clinic Location_Mission Bay,Clinic Location_Montgomery Street,Clinic Location_Mount Zion,Clinic Location_Parnassus
0,1,2,0,0,0,False,True,False,False,14,...,0,80,False,False,False,False,True,False,False,False
1,1,2,0,0,0,False,True,False,False,14,...,0,80,False,False,False,False,True,False,False,False
2,1,2,0,0,0,False,True,False,False,14,...,0,80,False,False,False,False,True,False,False,False
3,1,0,0,1,0,True,False,False,False,17,...,0,23,False,False,False,False,True,False,False,False
4,1,0,0,1,0,True,False,False,False,17,...,0,23,False,False,False,False,True,False,False,False


In [70]:
# Define features and target
X = data.drop(columns=['target_no_show'])
y = data['target_no_show']

In [73]:
# Ensure all columns are numeric
print("Data types of features:\n", X.dtypes)

Data types of features:
 Gender                                  int64
Alcohol Consumption                     int64
Hypertension                            int64
Diabetes                                int64
age_group_30-40                          bool
age_group_40-50                          bool
age_group_50-60                          bool
age_group_>60                            bool
appointment_day                         int32
appointment_month                       int32
appointment_year                        int32
appointment_dayofweek                   int32
schedule_day                            int32
schedule_month                          int32
schedule_year                           int32
schedule_dayofweek                      int32
days_until_appointment                  int64
Clinic Location_Daniel Burnham Court     bool
Clinic Location_Geary Boulevard          bool
Clinic Location_Lakeshore                bool
Clinic Location_Laurel Village           bool
Clinic Lo

In [74]:
# Convert boolean columns to integers
bool_columns = X.select_dtypes(include='bool').columns
X[bool_columns] = X[bool_columns].astype(int)

# Verify the changes
print("Data types of features after conversion:\n", X.dtypes)


Data types of features after conversion:
 Gender                                  int64
Alcohol Consumption                     int64
Hypertension                            int64
Diabetes                                int64
age_group_30-40                         int64
age_group_40-50                         int64
age_group_50-60                         int64
age_group_>60                           int64
appointment_day                         int32
appointment_month                       int32
appointment_year                        int32
appointment_dayofweek                   int32
schedule_day                            int32
schedule_month                          int32
schedule_year                           int32
schedule_dayofweek                      int32
days_until_appointment                  int64
Clinic Location_Daniel Burnham Court    int64
Clinic Location_Geary Boulevard         int64
Clinic Location_Lakeshore               int64
Clinic Location_Laurel Village        

In [75]:
# Check for non-numeric columns
non_numeric_columns = [col for col in X.columns if not np.issubdtype(X[col].dtype, np.number)]
if non_numeric_columns:
    print("Non-numeric columns found after preprocessing:", non_numeric_columns)
    # raise ValueError("Not all columns are numeric after preprocessing.")

In [76]:
# Handling missing values
X.fillna(X.mean(), inplace=True)

In [77]:
# Split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [78]:
# Initialize the scaler
scaler = StandardScaler()

In [79]:
# Fit the scaler on the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [80]:
# Initialize SMOTE and fit on the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

In [81]:
# Define the model
model = RandomForestClassifier(random_state=42)

In [82]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [83]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=StratifiedKFold(n_splits=5), n_jobs=-1, scoring='f1')

In [None]:
# Fit the GridSearchCV to the training data
grid_search.fit(X_train_res, y_train_res)

In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load datasets
no_show = pd.read_csv('no_show.csv')
clinics = pd.read_csv('clinics.csv')
planning_neighborhoods = pd.read_csv('planning_neighborhoods.csv')
no_show_historical = pd.read_csv('no_show_historical.csv')

# Display column names
print("No Show Columns:", no_show.columns)
print("Clinics Columns:", clinics.columns)
print("Planning Neighborhoods Columns:", planning_neighborhoods.columns)
print("No Show Historical Columns:", no_show_historical.columns)

# Merging datasets
data = pd.merge(no_show, clinics, left_on='Clinic Location', right_on='clinic', how='left')
data = pd.merge(data, planning_neighborhoods, left_on='Neighborhood', right_on='neighborho', how='left')
data = pd.merge(data, no_show_historical, on='Patient ID', how='left')

# Combine no_show_x and no_show_y into a single column
if 'no_show_x' in data.columns and 'no_show_y' in data.columns:
    data['target_no_show'] = data['no_show_x'].fillna(data['no_show_y'])
    print('target column updated')
    # Drop the no_show_x and no_show_y columns
    data.drop(columns=['no_show_x', 'no_show_y'], inplace=True)
else:
    raise KeyError("One or both of 'no_show_x' and 'no_show_y' columns not found in the data.")

# Preprocessing
data['Gender'] = data['Gender'].map({'M': 0, 'F': 1})

# Map 'Alcohol Consumption' to numeric values
mapping_dict = {'0/week': 0, '1/week': 1, '5/week': 2, '10/week': 3, '> 14/week': 4}
data['Alcohol Consumption'] = data['Alcohol Consumption'].map(mapping_dict)

# Convert 'target_no_show' to integer
data['target_no_show'] = data['target_no_show'].astype(int)

# Convert 'Hypertension' and 'Diabetes' to integer
data['Hypertension'] = data['Hypertension'].astype(int)
data['Diabetes'] = data['Diabetes'].astype(int)

# Create age group categories and convert to one-hot encoding
data['age_group'] = pd.cut(data['Age'], bins=[0, 30, 40, 50, 60, 100], labels=['<30', '30-40', '40-50', '50-60', '>60'])
data = pd.get_dummies(data, columns=['age_group'], drop_first=True)

# Convert date columns to datetime format and extract features
data['Appointment Date'] = pd.to_datetime(data['Appointment Date'])
data['Schedule Date'] = pd.to_datetime(data['Schedule Date'])
data['appointment_day'] = data['Appointment Date'].dt.day
data['appointment_month'] = data['Appointment Date'].dt.month
data['appointment_year'] = data['Appointment Date'].dt.year
data['appointment_dayofweek'] = data['Appointment Date'].dt.dayofweek
data['schedule_day'] = data['Schedule Date'].dt.day
data['schedule_month'] = data['Schedule Date'].dt.month
data['schedule_year'] = data['Schedule Date'].dt.year
data['schedule_dayofweek'] = data['Schedule Date'].dt.dayofweek
data['days_until_appointment'] = (data['Appointment Date'] - data['Schedule Date']).dt.days

# Drop the original date columns if they are not needed anymore
data.drop(columns=['Appointment Date', 'Schedule Date', 'Age'], inplace=True)

# Encode categorical variables using one-hot encoding
categorical_columns = ['Clinic Location']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Drop unnecessary columns
non_numeric_columns = ['Patient ID', 'Appointment ID', 'Appointment Reason', 'Specialty', 'Neighborhood', 'name', 'type', 'addr1', 'addr2', 'lat', 'long', 'the_geom', 'neighborho', 'clinic']
data.drop(columns=non_numeric_columns, inplace=True)

# Ensure all boolean columns are converted to integers
for col in data.select_dtypes(include='bool').columns:
    data[col] = data[col].astype(int)

# Define features and target
X = data.drop(columns=['target_no_show'])
y = data['target_no_show']

# Ensure all columns are numeric
print("Data types of features:\n", X.dtypes)

# Convert boolean columns to integers
bool_columns = X.select_dtypes(include='bool').columns
X[bool_columns] = X[bool_columns].astype(int)

# Verify the changes
print("Data types of features after conversion:\n", X.dtypes)

# Check for non-numeric columns
non_numeric_columns = [col for col in X.columns if not np.issubdtype(X[col].dtype, np.number)]
if non_numeric_columns:
    print("Non-numeric columns found after preprocessing:", non_numeric_columns)
    # raise ValueError("Not all columns are numeric after preprocessing.")

# Handling missing values
X.fillna(X.mean(), inplace=True)

# Split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize SMOTE and fit on the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# Define the model
model = RandomForestClassifier(random_state=42)

# Define the reduced parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize GridSearchCV with fewer folds
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=StratifiedKFold(n_splits=3), n_jobs=-1, scoring='f1')

# Fit the GridSearchCV to the training data
grid_search.fit(X_train_res, y_train_res)

# Make predictions on the test data
y_pred = grid_search.predict(X_test_scaled)

# Evaluate the model
print("Best parameters found:", grid_search.best_params_)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


No Show Columns: Index(['no_show', 'Patient ID', 'Appointment ID', 'Gender', 'Age',
       'Alcohol Consumption', 'Hypertension', 'Diabetes', 'Appointment Date',
       'Schedule Date', 'Appointment Reason', 'Clinic Location', 'Specialty',
       'Neighborhood'],
      dtype='object')
Clinics Columns: Index(['name', 'type', 'addr1', 'addr2', 'clinic', 'lat', 'long'], dtype='object')
Planning Neighborhoods Columns: Index(['the_geom', 'neighborho'], dtype='object')
No Show Historical Columns: Index(['Patient ID', 'no_show'], dtype='object')
target column updated
Data types of features:
 Gender                                  int64
Alcohol Consumption                     int64
Hypertension                            int64
Diabetes                                int64
age_group_30-40                         int64
age_group_40-50                         int64
age_group_50-60                         int64
age_group_>60                           int64
appointment_day                         

In [88]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load datasets
no_show = pd.read_csv('no_show.csv')
clinics = pd.read_csv('clinics.csv')
planning_neighborhoods = pd.read_csv('planning_neighborhoods.csv')
no_show_historical = pd.read_csv('no_show_historical.csv')

# Display column names
print("No Show Columns:", no_show.columns)
print("Clinics Columns:", clinics.columns)
print("Planning Neighborhoods Columns:", planning_neighborhoods.columns)
print("No Show Historical Columns:", no_show_historical.columns)

# Merging datasets
data = pd.merge(no_show, clinics, left_on='Clinic Location', right_on='clinic', how='left')
data = pd.merge(data, planning_neighborhoods, left_on='Neighborhood', right_on='neighborho', how='left')
data = pd.merge(data, no_show_historical, on='Patient ID', how='left')

# Combine no_show_x and no_show_y into a single column
if 'no_show_x' in data.columns and 'no_show_y' in data.columns:
    data['target_no_show'] = data['no_show_x'].fillna(data['no_show_y'])
    print('target column updated')
    # Drop the no_show_x and no_show_y columns
    data.drop(columns=['no_show_x', 'no_show_y'], inplace=True)
else:
    raise KeyError("One or both of 'no_show_x' and 'no_show_y' columns not found in the data.")

# Preprocessing
data['Gender'] = data['Gender'].map({'M': 0, 'F': 1})

# Map 'Alcohol Consumption' to numeric values
mapping_dict = {'0/week': 0, '1/week': 1, '5/week': 2, '10/week': 3, '> 14/week': 4}
data['Alcohol Consumption'] = data['Alcohol Consumption'].map(mapping_dict)

# Convert 'target_no_show' to integer
data['target_no_show'] = data['target_no_show'].astype(int)

# Convert 'Hypertension' and 'Diabetes' to integer
data['Hypertension'] = data['Hypertension'].astype(int)
data['Diabetes'] = data['Diabetes'].astype(int)

# Create age group categories and convert to one-hot encoding
data['age_group'] = pd.cut(data['Age'], bins=[0, 30, 40, 50, 60, 100], labels=['<30', '30-40', '40-50', '50-60', '>60'])
data = pd.get_dummies(data, columns=['age_group'], drop_first=True)

# Convert date columns to datetime format and extract features
data['Appointment Date'] = pd.to_datetime(data['Appointment Date'])
data['Schedule Date'] = pd.to_datetime(data['Schedule Date'])
data['appointment_day'] = data['Appointment Date'].dt.day
data['appointment_month'] = data['Appointment Date'].dt.month
data['appointment_year'] = data['Appointment Date'].dt.year
data['appointment_dayofweek'] = data['Appointment Date'].dt.dayofweek
data['schedule_day'] = data['Schedule Date'].dt.day
data['schedule_month'] = data['Schedule Date'].dt.month
data['schedule_year'] = data['Schedule Date'].dt.year
data['schedule_dayofweek'] = data['Schedule Date'].dt.dayofweek
data['days_until_appointment'] = (data['Appointment Date'] - data['Schedule Date']).dt.days

# Drop the original date columns if they are not needed anymore
data.drop(columns=['Appointment Date', 'Schedule Date', 'Age'], inplace=True)

# Encode categorical variables using one-hot encoding
categorical_columns = ['Clinic Location']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Drop unnecessary columns
non_numeric_columns = ['Patient ID', 'Appointment ID', 'Appointment Reason', 'Specialty', 'Neighborhood', 'name', 'type', 'addr1', 'addr2', 'lat', 'long', 'the_geom', 'neighborho', 'clinic']
data.drop(columns=non_numeric_columns, inplace=True)

# Ensure all boolean columns are converted to integers
for col in data.select_dtypes(include='bool').columns:
    data[col] = data[col].astype(int)

# Define features and target
X = data.drop(columns=['target_no_show'])
y = data['target_no_show']

# Ensure all columns are numeric
print("Data types of features:\n", X.dtypes)

# Convert boolean columns to integers
bool_columns = X.select_dtypes(include='bool').columns
X[bool_columns] = X[bool_columns].astype(int)

# Verify the changes
print("Data types of features after conversion:\n", X.dtypes)

# Check for non-numeric columns
non_numeric_columns = [col for col in X.columns if not np.issubdtype(X[col].dtype, np.number)]
if non_numeric_columns:
    print("Non-numeric columns found after preprocessing:", non_numeric_columns)
    # raise ValueError("Not all columns are numeric after preprocessing.")

# Handling missing values
X.fillna(X.mean(), inplace=True)

# Split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize SMOTE and fit on the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# Define the model
model = RandomForestClassifier(random_state=42)

# Define the reduced parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced', 'balanced_subsample', None]
}

# Initialize GridSearchCV with fewer folds
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=StratifiedKFold(n_splits=3), n_jobs=-1, scoring='f1')

# Fit the GridSearchCV to the training data
grid_search.fit(X_train_res, y_train_res)

# Make predictions on the test data
y_pred = grid_search.predict(X_test_scaled)

# Evaluate the model
print("Best parameters found:", grid_search.best_params_)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


No Show Columns: Index(['no_show', 'Patient ID', 'Appointment ID', 'Gender', 'Age',
       'Alcohol Consumption', 'Hypertension', 'Diabetes', 'Appointment Date',
       'Schedule Date', 'Appointment Reason', 'Clinic Location', 'Specialty',
       'Neighborhood'],
      dtype='object')
Clinics Columns: Index(['name', 'type', 'addr1', 'addr2', 'clinic', 'lat', 'long'], dtype='object')
Planning Neighborhoods Columns: Index(['the_geom', 'neighborho'], dtype='object')
No Show Historical Columns: Index(['Patient ID', 'no_show'], dtype='object')
target column updated
Data types of features:
 Gender                                  int64
Alcohol Consumption                     int64
Hypertension                            int64
Diabetes                                int64
age_group_30-40                         int64
age_group_40-50                         int64
age_group_50-60                         int64
age_group_>60                           int64
appointment_day                         

In [89]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load datasets
no_show = pd.read_csv('no_show.csv')
clinics = pd.read_csv('clinics.csv')
planning_neighborhoods = pd.read_csv('planning_neighborhoods.csv')
no_show_historical = pd.read_csv('no_show_historical.csv')

# Display column names
print("No Show Columns:", no_show.columns)
print("Clinics Columns:", clinics.columns)
print("Planning Neighborhoods Columns:", planning_neighborhoods.columns)
print("No Show Historical Columns:", no_show_historical.columns)

# Merging datasets
data = pd.merge(no_show, clinics, left_on='Clinic Location', right_on='clinic', how='left')
data = pd.merge(data, planning_neighborhoods, left_on='Neighborhood', right_on='neighborho', how='left')
data = pd.merge(data, no_show_historical, on='Patient ID', how='left')

# Combine no_show_x and no_show_y into a single column
if 'no_show_x' in data.columns and 'no_show_y' in data.columns:
    data['target_no_show'] = data['no_show_x'].fillna(data['no_show_y'])
    print('target column updated')
    # Drop the no_show_x and no_show_y columns
    data.drop(columns=['no_show_x', 'no_show_y'], inplace=True)
else:
    raise KeyError("One or both of 'no_show_x' and 'no_show_y' columns not found in the data.")

# Preprocessing
data['Gender'] = data['Gender'].map({'M': 0, 'F': 1})

# Map 'Alcohol Consumption' to numeric values
mapping_dict = {'0/week': 0, '1/week': 1, '5/week': 2, '10/week': 3, '> 14/week': 4}
data['Alcohol Consumption'] = data['Alcohol Consumption'].map(mapping_dict)

# Convert 'target_no_show' to integer
data['target_no_show'] = data['target_no_show'].astype(int)

# Convert 'Hypertension' and 'Diabetes' to integer
data['Hypertension'] = data['Hypertension'].astype(int)
data['Diabetes'] = data['Diabetes'].astype(int)

# Create age group categories and convert to one-hot encoding
data['age_group'] = pd.cut(data['Age'], bins=[0, 30, 40, 50, 60, 100], labels=['<30', '30-40', '40-50', '50-60', '>60'])
data = pd.get_dummies(data, columns=['age_group'], drop_first=True)

# Convert date columns to datetime format and extract features
data['Appointment Date'] = pd.to_datetime(data['Appointment Date'])
data['Schedule Date'] = pd.to_datetime(data['Schedule Date'])
data['appointment_day'] = data['Appointment Date'].dt.day
data['appointment_month'] = data['Appointment Date'].dt.month
data['appointment_year'] = data['Appointment Date'].dt.year
data['appointment_dayofweek'] = data['Appointment Date'].dt.dayofweek
data['schedule_day'] = data['Schedule Date'].dt.day
data['schedule_month'] = data['Schedule Date'].dt.month
data['schedule_year'] = data['Schedule Date'].dt.year
data['schedule_dayofweek'] = data['Schedule Date'].dt.dayofweek
data['days_until_appointment'] = (data['Appointment Date'] - data['Schedule Date']).dt.days

# Drop the original date columns if they are not needed anymore
data.drop(columns=['Appointment Date', 'Schedule Date', 'Age'], inplace=True)

# Encode categorical variables using one-hot encoding
categorical_columns = ['Clinic Location']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Drop unnecessary columns
non_numeric_columns = ['Patient ID', 'Appointment ID', 'Appointment Reason', 'Specialty', 'Neighborhood', 'name', 'type', 'addr1', 'addr2', 'lat', 'long', 'the_geom', 'neighborho', 'clinic']
data.drop(columns=non_numeric_columns, inplace=True)

# Ensure all boolean columns are converted to integers
for col in data.select_dtypes(include='bool').columns:
    data[col] = data[col].astype(int)

# Define features and target
X = data.drop(columns=['target_no_show'])
y = data['target_no_show']

# Ensure all columns are numeric
print("Data types of features:\n", X.dtypes)

# Convert boolean columns to integers
bool_columns = X.select_dtypes(include='bool').columns
X[bool_columns] = X[bool_columns].astype(int)

# Verify the changes
print("Data types of features after conversion:\n", X.dtypes)

# Check for non-numeric columns
non_numeric_columns = [col for col in X.columns if not np.issubdtype(X[col].dtype, np.number)]
if non_numeric_columns:
    print("Non-numeric columns found after preprocessing:", non_numeric_columns)
    # raise ValueError("Not all columns are numeric after preprocessing.")

# Handling missing values
X.fillna(X.mean(), inplace=True)

# Split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize SMOTE and fit on the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# Define the model
model = RandomForestClassifier(random_state=42)

# Define the parameter grid for Random Search
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample', None]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=50, cv=StratifiedKFold(n_splits=3), n_jobs=-1, scoring='f1', random_state=42)

# Fit the RandomizedSearchCV to the training data
random_search.fit(X_train_res, y_train_res)

# Make predictions on the test data
y_pred = random_search.predict(X_test_scaled)

# Evaluate the model
print("Best parameters found:", random_search.best_params_)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


No Show Columns: Index(['no_show', 'Patient ID', 'Appointment ID', 'Gender', 'Age',
       'Alcohol Consumption', 'Hypertension', 'Diabetes', 'Appointment Date',
       'Schedule Date', 'Appointment Reason', 'Clinic Location', 'Specialty',
       'Neighborhood'],
      dtype='object')
Clinics Columns: Index(['name', 'type', 'addr1', 'addr2', 'clinic', 'lat', 'long'], dtype='object')
Planning Neighborhoods Columns: Index(['the_geom', 'neighborho'], dtype='object')
No Show Historical Columns: Index(['Patient ID', 'no_show'], dtype='object')
target column updated
Data types of features:
 Gender                                  int64
Alcohol Consumption                     int64
Hypertension                            int64
Diabetes                                int64
age_group_30-40                         int64
age_group_40-50                         int64
age_group_50-60                         int64
age_group_>60                           int64
appointment_day                         

In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load datasets
no_show = pd.read_csv('no_show.csv')
clinics = pd.read_csv('clinics.csv')
planning_neighborhoods = pd.read_csv('planning_neighborhoods.csv')
no_show_historical = pd.read_csv('no_show_historical.csv')

# Display column names
print("No Show Columns:", no_show.columns)
print("Clinics Columns:", clinics.columns)
print("Planning Neighborhoods Columns:", planning_neighborhoods.columns)
print("No Show Historical Columns:", no_show_historical.columns)

# Merging datasets
data = pd.merge(no_show, clinics, left_on='Clinic Location', right_on='clinic', how='left')
data = pd.merge(data, planning_neighborhoods, left_on='Neighborhood', right_on='neighborho', how='left')
data = pd.merge(data, no_show_historical, on='Patient ID', how='left')


# Combine no_show_x and no_show_y into a single column
if 'no_show_x' in data.columns and 'no_show_y' in data.columns:
    data['target_no_show'] = data['no_show_x'].fillna(data['no_show_y'])
    print('target column updated')
    # Drop the no_show_x and no_show_y columns
    data.drop(columns=['no_show_x', 'no_show_y'], inplace=True)
else:
    raise KeyError("One or both of 'no_show_x' and 'no_show_y' columns not found in the data.")



No Show Columns: Index(['no_show', 'Patient ID', 'Appointment ID', 'Gender', 'Age',
       'Alcohol Consumption', 'Hypertension', 'Diabetes', 'Appointment Date',
       'Schedule Date', 'Appointment Reason', 'Clinic Location', 'Specialty',
       'Neighborhood'],
      dtype='object')
Clinics Columns: Index(['name', 'type', 'addr1', 'addr2', 'clinic', 'lat', 'long'], dtype='object')
Planning Neighborhoods Columns: Index(['the_geom', 'neighborho'], dtype='object')
No Show Historical Columns: Index(['Patient ID', 'no_show'], dtype='object')
target column updated


In [92]:
data.head()

Unnamed: 0,Patient ID,Appointment ID,Gender,Age,Alcohol Consumption,Hypertension,Diabetes,Appointment Date,Schedule Date,Appointment Reason,...,name,type,addr1,addr2,clinic,lat,long,the_geom,neighborho,target_no_show
0,649e3901-e56b-41d9-b2d3-f61ce708a415,659a5257-c2f0-4eda-a2bb-ebc4bd9ce4e4,F,43,5/week,False,False,2021-01-14T10:30:00,2020-10-26T00:00:00,CHIROPRACT MANJ 3-4 REGIONS,...,Parnassus,Hospitals & Clinics,400 Parnassus Ave.,"San Francisco, CA 94143",Mission Bay,37.764124,-122.456572,MULTIPOLYGON (((-122.42095167789303 37.8089665...,Russian Hill,False
1,649e3901-e56b-41d9-b2d3-f61ce708a415,659a5257-c2f0-4eda-a2bb-ebc4bd9ce4e4,F,43,5/week,False,False,2021-01-14T10:30:00,2020-10-26T00:00:00,CHIROPRACT MANJ 3-4 REGIONS,...,Mount Zion,Hospitals & Clinics,1600 Divisadero St.,"San Francisco, CA 94115",Mission Bay,37.784882,-122.438723,MULTIPOLYGON (((-122.42095167789303 37.8089665...,Russian Hill,False
2,649e3901-e56b-41d9-b2d3-f61ce708a415,659a5257-c2f0-4eda-a2bb-ebc4bd9ce4e4,F,43,5/week,False,False,2021-01-14T10:30:00,2020-10-26T00:00:00,CHIROPRACT MANJ 3-4 REGIONS,...,Mission Bay,Hospitals & Clinics,1825 Fourth St.,"San Francisco, CA 94158",Mission Bay,37.766561,-122.389948,MULTIPOLYGON (((-122.42095167789303 37.8089665...,Russian Hill,False
3,3028fd02-a20a-4233-ac16-b571dde4540c,7ae6e7f8-3788-48d2-9fbc-11114ec28bfe,F,37,0/week,False,True,2021-02-17T14:00:00,2021-01-25T00:00:00,OFFICE/OUTPATIENT VISIT EST,...,Parnassus,Hospitals & Clinics,400 Parnassus Ave.,"San Francisco, CA 94143",Mission Bay,37.764124,-122.456572,MULTIPOLYGON (((-122.45912298496032 37.7082180...,Ocean View,False
4,3028fd02-a20a-4233-ac16-b571dde4540c,7ae6e7f8-3788-48d2-9fbc-11114ec28bfe,F,37,0/week,False,True,2021-02-17T14:00:00,2021-01-25T00:00:00,OFFICE/OUTPATIENT VISIT EST,...,Mount Zion,Hospitals & Clinics,1600 Divisadero St.,"San Francisco, CA 94115",Mission Bay,37.784882,-122.438723,MULTIPOLYGON (((-122.45912298496032 37.7082180...,Ocean View,False
