In [43]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
import tensorflow as tf
from tensorflow.python.keras import layers, models
from sklearn.preprocessing import LabelEncoder

In [44]:
# Load the data
data = pd.read_csv('reduced_data.csv')

# Check for and handle missing values (if any)
data = data.dropna()

# Convert 'target_no_show' from bool to int
data['target_no_show'] = data['target_no_show'].astype(int)

# Verify data types before processing
print("Initial Data Types:\n", data.dtypes)

Initial Data Types:
 Gender                 object
Age                     int64
Alcohol_Consumption    object
Hypertension             bool
Diabetes                 bool
Appointment_Date       object
Schedule_Date          object
Clinic_Location        object
Specialty              object
Neighborhood           object
target_no_show          int64
dtype: object


In [45]:
# Create age group categories and one-hot encoding
data['age_group'] = pd.cut(data['Age'], bins=[0, 30, 40, 50, 60, 100], labels=['<30', '30-40', '40-50', '50-60', '>60'])
data = pd.get_dummies(data, columns=['age_group'], drop_first=True)
data.drop('Age', axis='columns', inplace=True)

# Map 'Alcohol Consumption' to numeric values
mapping_dict = {'0/week': 0, '1/week': 1, '5/week': 2, '10/week': 3, '> 14/week': 4}
data['Alcohol_Consumption'] = data['Alcohol_Consumption'].map(mapping_dict)

# Convert other necessary columns to integers
data['Hypertension'] = data['Hypertension'].astype(int)
data['Diabetes'] = data['Diabetes'].astype(int)

# Convert date columns to datetime format and extract features
data['Appointment_Date'] = pd.to_datetime(data['Appointment_Date'])
data['Schedule_Date'] = pd.to_datetime(data['Schedule_Date'])
data['days_until_appointment'] = (data['Appointment_Date'] - data['Schedule_Date']).dt.days

# Drop the original date columns if not needed anymore
data.drop(columns=['Appointment_Date', 'Schedule_Date'], inplace=True)

# Apply One-Hot Encoding for nominal variables
data = pd.get_dummies(data, columns=['Clinic_Location', 'Specialty', 'Neighborhood'])

# Apply Label Encoding for binary variables
le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])

In [46]:
# Convert boolean columns to integers
bool_columns = data.select_dtypes(include=['bool']).columns
for column in bool_columns:
    data[column] = data[column].astype(int)

# Verify data types after conversion
print("\nData Types After Conversion:\n", data.dtypes)


Data Types After Conversion:
 Gender                              int64
Alcohol_Consumption                 int64
Hypertension                        int64
Diabetes                            int64
target_no_show                      int64
                                    ...  
Neighborhood_Treasure Island/YBI    int64
Neighborhood_Twin Peaks             int64
Neighborhood_Visitacion Valley      int64
Neighborhood_West of Twin Peaks     int64
Neighborhood_Western Addition       int64
Length: 323, dtype: object


In [47]:
# Ensure all data is numeric
assert data.apply(lambda x: np.issubdtype(x.dtype, np.number)).all(), "Non-numeric data found in dataset"


In [48]:
# Create feature and target arrays
X = data.drop(columns=['target_no_show']).values
y = data['target_no_show'].values

In [49]:
# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

In [50]:
# Convert to TensorFlow-compatible format and ensure data type consistency
X_smote = np.array(X_smote, dtype=np.float32)
y_smote = np.array(y_smote, dtype=np.float32)

In [51]:
# Verify shapes and types of the arrays
print("\nShapes and Types of Arrays:")
print(f"X_smote type: {type(X_smote)}, dtype: {X_smote.dtype}, shape: {X_smote.shape}")
print(f"y_smote type: {type(y_smote)}, dtype: {y_smote.dtype}, shape: {y_smote.shape}")



Shapes and Types of Arrays:
X_smote type: <class 'numpy.ndarray'>, dtype: float32, shape: (53704, 322)
y_smote type: <class 'numpy.ndarray'>, dtype: float32, shape: (53704,)


In [52]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

In [53]:
# Print shapes and types for debugging
print(f"\nX_train type: {type(X_train)}, dtype: {X_train.dtype}, shape: {X_train.shape}")
print(f"y_train type: {type(y_train)}, dtype: {y_train.dtype}, shape: {y_train.shape}")


X_train type: <class 'numpy.ndarray'>, dtype: float32, shape: (42963, 322)
y_train type: <class 'numpy.ndarray'>, dtype: float32, shape: (42963,)


In [54]:
# Define a simple TensorFlow model
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [55]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [56]:
# Train the model
try:
    history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)
except Exception as e:
    print(f"Error during model training: {e}")

Epoch 1/10
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.6812 - loss: 0.6108 - val_accuracy: 0.8122 - val_loss: 0.4096
Epoch 2/10
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8125 - loss: 0.4060 - val_accuracy: 0.8147 - val_loss: 0.4129
Epoch 3/10
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8161 - loss: 0.3797 - val_accuracy: 0.8263 - val_loss: 0.3672
Epoch 4/10
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8284 - loss: 0.3591 - val_accuracy: 0.7855 - val_loss: 0.4187
Epoch 5/10
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8280 - loss: 0.3568 - val_accuracy: 0.8278 - val_loss: 0.3519
Epoch 6/10
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8364 - loss: 0.3387 - val_accuracy: 0.7820 - val_loss: 0.4216
Epoch 7/10
[1m1

In [57]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy:.2f}")

[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 623us/step - accuracy: 0.8616 - loss: 0.2877

Test Accuracy: 0.86


In [58]:
# Predict the labels for the test set
y_pred = (model.predict(X_test) > 0.5).astype("int32")

[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 749us/step


In [59]:
# Calculate and print classification metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Precision: 0.87
Recall: 0.84
F1 Score: 0.86
Accuracy: 0.86


In [60]:
# Generate and print the full classification report
LABELS = ["Show", "No Show"]
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=LABELS))


Classification Report:
              precision    recall  f1-score   support

        Show       0.85      0.88      0.86      5407
     No Show       0.87      0.84      0.86      5334

    accuracy                           0.86     10741
   macro avg       0.86      0.86      0.86     10741
weighted avg       0.86      0.86      0.86     10741



In [41]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder
 
# Load the data
data = pd.read_csv('reduced_data.csv')
 
# Check for and handle missing values (if any)
data = data.dropna()
 
# Convert 'target_no_show' from bool to int
data['target_no_show'] = data['target_no_show'].astype(int)
 
# Verify data types before processing
print("Initial Data Types:\n", data.dtypes)
 
# Create age group categories and one-hot encoding
data['age_group'] = pd.cut(data['Age'], bins=[0, 30, 40, 50, 60, 100], labels=['<30', '30-40', '40-50', '50-60', '>60'])
data = pd.get_dummies(data, columns=['age_group'], drop_first=True)
data.drop('Age', axis='columns', inplace=True)
 
# Map 'Alcohol Consumption' to numeric values
mapping_dict = {'0/week': 0, '1/week': 1, '5/week': 2, '10/week': 3, '> 14/week': 4}
data['Alcohol_Consumption'] = data['Alcohol_Consumption'].map(mapping_dict)
 
# Convert other necessary columns to integers
data['Hypertension'] = data['Hypertension'].astype(int)
data['Diabetes'] = data['Diabetes'].astype(int)
 
# Convert date columns to datetime format and extract features
data['Appointment_Date'] = pd.to_datetime(data['Appointment_Date'])
data['Schedule_Date'] = pd.to_datetime(data['Schedule_Date'])
data['days_until_appointment'] = (data['Appointment_Date'] - data['Schedule_Date']).dt.days
 
# Drop the original date columns if not needed anymore
data.drop(columns=['Appointment_Date', 'Schedule_Date'], inplace=True)
 
# Apply One-Hot Encoding for nominal variables
data = pd.get_dummies(data, columns=['Clinic_Location', 'Specialty', 'Neighborhood'])
 
# Apply Label Encoding for binary variables
le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])
 
# Convert boolean columns to integers
bool_columns = data.select_dtypes(include=['bool']).columns
for column in bool_columns:
    data[column] = data[column].astype(int)
 
# Verify data types after conversion
print("\nData Types After Conversion:\n", data.dtypes)
 
# Ensure all data is numeric
assert data.apply(lambda x: np.issubdtype(x.dtype, np.number)).all(), "Non-numeric data found in dataset"
 
# Create feature and target arrays
X = data.drop(columns=['target_no_show']).values
y = data['target_no_show'].values
 
# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)
 
# Convert to TensorFlow-compatible format and ensure data type consistency
X_smote = np.array(X_smote, dtype=np.float32)
y_smote = np.array(y_smote, dtype=np.float32)
 
# Verify shapes and types of the arrays
print("\nShapes and Types of Arrays:")
print(f"X_smote type: {type(X_smote)}, dtype: {X_smote.dtype}, shape: {X_smote.shape}")
print(f"y_smote type: {type(y_smote)}, dtype: {y_smote.dtype}, shape: {y_smote.shape}")
 
 
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)
 
# Print shapes and types for debugging
print(f"\nX_train type: {type(X_train)}, dtype: {X_train.dtype}, shape: {X_train.shape}")
print(f"y_train type: {type(y_train)}, dtype: {y_train.dtype}, shape: {y_train.shape}")
 
# Define a simple TensorFlow model
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
 
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
 
# Train the model
try:
    history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)
except Exception as e:
    print(f"Error during model training: {e}")
 
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy:.2f}")
 
# Predict the labels for the test set
y_pred = (model.predict(X_test) > 0.5).astype("int32")
 
# Calculate and print classification metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
 
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
 
# Generate and print the full classification report
LABELS = ["Show", "No Show"]
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=LABELS))
 

Initial Data Types:
 Gender                 object
Age                     int64
Alcohol_Consumption    object
Hypertension             bool
Diabetes                 bool
Appointment_Date       object
Schedule_Date          object
Clinic_Location        object
Specialty              object
Neighborhood           object
target_no_show          int64
dtype: object

Data Types After Conversion:
 Gender                              int64
Alcohol_Consumption                 int64
Hypertension                        int64
Diabetes                            int64
target_no_show                      int64
                                    ...  
Neighborhood_Treasure Island/YBI    int64
Neighborhood_Twin Peaks             int64
Neighborhood_Visitacion Valley      int64
Neighborhood_West of Twin Peaks     int64
Neighborhood_Western Addition       int64
Length: 323, dtype: object

Shapes and Types of Arrays:
X_smote type: <class 'numpy.ndarray'>, dtype: float32, shape: (53704, 322)
y_smote typ

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7015 - loss: 0.5970 - val_accuracy: 0.6972 - val_loss: 0.7273
Epoch 2/10
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8050 - loss: 0.4220 - val_accuracy: 0.8181 - val_loss: 0.4136
Epoch 3/10
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8163 - loss: 0.3832 - val_accuracy: 0.7906 - val_loss: 0.4071
Epoch 4/10
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8219 - loss: 0.3695 - val_accuracy: 0.8281 - val_loss: 0.3744
Epoch 5/10
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8340 - loss: 0.3434 - val_accuracy: 0.8277 - val_loss: 0.3418
Epoch 6/10
[1m1075/1075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8377 - loss: 0.3290 - val_accuracy: 0.8455 - val_loss: 0.3279
Epoch 7/10
[1m1075/1075[0