In [1]:
import torch

In [None]:
# checking the gpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


### Data Preprocessing

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [10]:
# loading the data
df=pd.read_csv('KaggleV2-May-2016.csv')
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


#### Dropping unwanted columns

In [12]:
df.drop(columns=['PatientId','AppointmentID','ScheduledDay','AppointmentDay'],inplace=True)
df.head()

Unnamed: 0,Gender,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,F,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,M,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,F,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,F,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,F,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [13]:
df['No-show'].value_counts()
# The target class is imbalanced in the ratio of 4:1 which is high class imbalance in the dataset

No-show
No     88208
Yes    22319
Name: count, dtype: int64

#### Label Encoding 

In [14]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['Gender']=le.fit_transform(df['Gender'])
df['No-show']=le.fit_transform(df['No-show'])
df['Neighbourhood']=le.fit_transform(df['Neighbourhood'])

In [15]:
# separating the features and target
X=df.drop(columns=['No-show'])
y=df['No-show']

X.shape,y.shape

((110527, 9), (110527,))

#### Feature Scaling

In [16]:
from sklearn.preprocessing import StandardScaler
scalar=StandardScaler()
X=scalar.fit_transform(X)

#### Splitting into train and validation dataset

In [17]:
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2,stratify=y)
X_train.shape,X_val.shape,y_train.shape,y_val.shape

((88421, 9), (22106, 9), (88421,), (22106,))

#### Part 1: From Scratch Implementation
with using class weights to address the class imbalance for the model

In [18]:
input_size=X_train.shape[1]

In [19]:
X_train.shape,X_val.shape,y_train.shape,y_val.shape

((88421, 9), (22106, 9), (88421,), (22106,))

In [47]:
def initialize_parameters(input_size, hidden_layers=[128, 64]):
     
    weights = []
    biases = []
    layers = [input_size] + hidden_layers + [1]  
    
    for i in range(len(layers)-1):
        weights.append(np.random.randn(layers[i], layers[i+1])* 0.01)
        biases.append(np.zeros((1, layers[i+1])))
    
    return weights, biases

In [21]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def ReLU(x):
    return np.maximum(0,x)

In [43]:
def compute_weighted_loss(y_true, y_pred, class_weights):
    
    y_true = np.asarray(y_true).reshape(-1, 1)
    y_pred = np.asarray(y_pred).reshape(-1, 1)
    sample_weights = np.where(y_true == 1, class_weights[1], class_weights[0])
    return np.mean(sample_weights * (y_true - y_pred)**2)

In [48]:
def forward_propagation(X, weights, biases):

    z_layers = []
    activations = [X]
    
    for i in range(len(weights)-1):
        z = np.dot(activations[-1], weights[i]) + biases[i]
        a = ReLU(z)  
        z_layers.append(z)
        activations.append(a)
    
     
    z = np.dot(activations[-1], weights[-1]) + biases[-1]
    a = sigmoid(z)
    z_layers.append(z)
    activations.append(a)
    
    return activations, z_layers

In [49]:
def backward_propagation(X, y, activations, z_layers, weights, biases, class_weights, learning_rate=0.001):
    
    m = X.shape[0]
    y = y.reshape(-1, 1)
    
    sample_weights = np.where(y == 1, class_weights[1], class_weights[0])
    
    grad_w = [np.zeros_like(w) for w in weights]
    grad_b = [np.zeros_like(b) for b in biases]
    
    dA = -(sample_weights * (y - activations[-1]) / m)
    dZ = dA * (activations[-1] * (1 - activations[-1]))  
    
    grad_w[-1] = np.dot(activations[-2].T, dZ)
    grad_b[-1] = np.sum(dZ, axis=0, keepdims=True)
    
    for l in range(len(weights)-2, -1, -1):
        dA = np.dot(dZ, weights[l+1].T)
        dZ = dA * (z_layers[l] > 0)  
        grad_w[l] = np.dot(activations[l].T, dZ)
        grad_b[l] = np.sum(dZ, axis=0, keepdims=True)
    
    for i in range(len(weights)):
        weights[i] -= learning_rate * grad_w[i]
        biases[i] -= learning_rate * grad_b[i]
    
    return weights, biases
     

In [22]:
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights = {0: weights[0], 1: weights[1]}

In [50]:
learning_rate=0.001
X_train = np.array(X_train, dtype=np.float32)
y_train = np.array(y_train).reshape(-1, 1)
X_val = np.array(X_val, dtype=np.float32)
y_val = np.array(y_val).reshape(-1, 1)

 
hidden_layers=[128,64]
epochs=100

weights, biases = initialize_parameters(input_size, hidden_layers)

best_weights, best_biases = None, None
best_val_loss = float('inf')

for epoch in range(epochs):
    
    activations, z_layers = forward_propagation(X_train, weights, biases)
        
    train_loss = compute_weighted_loss(y_train, activations[-1], class_weights)
    val_activations, _ = forward_propagation(X_val, weights, biases)
    val_loss = compute_weighted_loss(y_val, val_activations[-1], class_weights)
        
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_weights = [w.copy() for w in weights]
        best_biases = [b.copy() for b in biases]
    
    weights, biases = backward_propagation(X_train, y_train, activations, z_layers, weights, biases, class_weights,learning_rate=learning_rate)
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

print(f"Training complete. Best Val Loss: {best_val_loss:.4f}")

Epoch 0: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 10: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 20: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 30: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 40: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 50: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 60: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 70: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 80: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 90: Train Loss = 0.2500, Val Loss = 0.2500
Training complete. Best Val Loss: 0.2500


In [51]:
from sklearn.metrics import classification_report, f1_score, roc_auc_score, confusion_matrix
def evaluate_model(X_test, y_test, weights, biases, threshold=0.5):
     
    activations, _ = forward_propagation(X_test, weights, biases)
    y_pred_cont = activations[-1]
    
    y_pred_bin = (y_pred_cont > threshold).astype(int)
    
    report= classification_report(y_test, y_pred_bin)
    f1score=f1_score(y_test, y_pred_bin)
    roc=roc_auc_score(y_test, y_pred_cont)
    matrix= confusion_matrix(y_test, y_pred_bin)
    metrics = {
        'classification_report':report,
        'f1_score': f1score,
        'roc_auc': roc,   
        'confusion_matrix': matrix
    }
    
    return metrics


In [52]:
# 4. Evaluate
metrics = evaluate_model(X_val, y_val, best_weights, best_biases)
print("Classification Report:\n", metrics['classification_report'])
print("F1 Score:", metrics['f1_score'])
print("ROC AUC:", metrics['roc_auc'])
print("Confusion Matrix:\n", metrics['confusion_matrix'])

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.14      0.24     17642
           1       0.21      0.89      0.34      4464

    accuracy                           0.29     22106
   macro avg       0.53      0.52      0.29     22106
weighted avg       0.71      0.29      0.26     22106

F1 Score: 0.3386481214485625
ROC AUC: 0.5633827500681617
Confusion Matrix:
 [[ 2517 15125]
 [  471  3993]]


#### Part 2: PyTorch Implementation

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim

class ANN_model(nn.Module):

  def __init__(self, input_size):
        super().__init__()
        self.net = nn.Sequential(
            
            nn.Linear(input_size, 128),
            nn.ReLU(),
            
            nn.Linear(128, 64),
            nn.ReLU(),

            nn.Linear(64,1),
            nn.Sigmoid()
        )
    
  def forward(self, x):
      return self.net(x)


In [None]:
from sklearn.utils.class_weight import compute_class_weight

y_train_np = y_train.values if hasattr(y_train, 'values') else np.array(y_train)
y_train_np = y_train_np.ravel()  

classes = np.unique(y_train_np)
weights = compute_class_weight('balanced', classes=classes, y=y_train_np)

class_weights = torch.tensor(weights, dtype=torch.float).to('cpu')
sample_weights = class_weights[y_train_np].reshape(-1, 1)

In [34]:
input_shape=X_train.shape[1]
model=ANN_model(input_shape)
optimizer=optim.Adam(model.parameters(),lr=0.001)
criterion=nn.BCELoss(weight=sample_weights)

In [36]:
num_epochs =100
loss_his=[]
for epoch in range(num_epochs):

    model.train()
    inputs = torch.from_numpy(X_train).float()
    labels = torch.from_numpy(y_train).float().view(-1, 1)

    outputs = model(inputs)
    loss = criterion(outputs, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    loss_his.append(loss)
    if epoch%10==0:
        print(f"epoch is {epoch} and loss is {loss.item()}")

epoch is 0 and loss is 0.6962325572967529
epoch is 10 and loss is 0.6788383722305298
epoch is 20 and loss is 0.6770852208137512
epoch is 30 and loss is 0.6758477687835693
epoch is 40 and loss is 0.6752893924713135
epoch is 50 and loss is 0.674804151058197
epoch is 60 and loss is 0.6744301319122314
epoch is 70 and loss is 0.6740926504135132
epoch is 80 and loss is 0.6737724542617798
epoch is 90 and loss is 0.6734555959701538


In [None]:
from sklearn.metrics import classification_report, f1_score, roc_auc_score,confusion_matrix
model.eval()
with torch.no_grad():
    inputs=torch.from_numpy(X_val).float()
    outputs=model(inputs)
    y_pred_prob=outputs.numpy()
    y_pred=(y_pred_prob>0.5)

report=classification_report(y_val,y_pred)
f1score=f1_score(y_val,y_pred)
roc=roc_auc_score(y_val,y_pred)
matrix=confusion_matrix(y_val,y_pred)
print("classification report",report)
print("f1 Score",f1score)
print("ROC AUC Score",roc)
print("Confusion Matrix\n",matrix)

classification report               precision    recall  f1-score   support

           0       0.83      0.70      0.76     17642
           1       0.27      0.46      0.34      4464

    accuracy                           0.65     22106
   macro avg       0.55      0.58      0.55     22106
weighted avg       0.72      0.65      0.67     22106

f1 Score 0.3431025770591208
ROC AUC Score 0.5758799489366163
Confusion Matrix
 [[12269  5373]
 [ 2427  2037]]


#### Adding Useful Features
I noticed that adding useful features is significantly improving the model performance even for a imbalanced dataset
SO I decided to add more useful features to distinguish between the patient who would show up versus those who would miss their appointments.

In [76]:
df = pd.read_csv("KaggleV2-May-2016.csv")

df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'])
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'])

df['days_between'] = (df['AppointmentDay'] - df['ScheduledDay']).dt.days
df['days_between'] = np.where(df['days_between'] < 0, 0, df['days_between'])

df['waiting_category'] = pd.cut(df['days_between'],
    bins=[-1, 0, 1, 3, 7, 14, 30, 1000],
    labels=['same_day', '1_day', '2-3_days', '4-7_days', '8-14_days', '15-30_days', '30+_days'])

In [77]:
df['is_weekend'] = (df['AppointmentDay'].dt.dayofweek >= 5).astype(int)
df['month'] = df['AppointmentDay'].dt.month
df['hour_of_day'] = df['ScheduledDay'].dt.hour
df['appointment_dayofweek'] = df['AppointmentDay'].dt.dayofweek

In [78]:
patient_history = df.groupby('PatientId').agg(
    total_appointments=('AppointmentID', 'count'),
    prev_no_shows=('No-show', lambda x: sum(x == 'Yes'))
)
df = df.merge(patient_history, on='PatientId', how='left')
df['no_show_rate'] = df['prev_no_shows'] / df['total_appointments']

In [79]:
df = df.sort_values(['PatientId', 'AppointmentDay'])
df['days_since_last'] = df.groupby('PatientId')['AppointmentDay'].diff().dt.days
df['days_since_last'] = df['days_since_last'].fillna(0)

In [80]:
conditions = ['Hipertension', 'Diabetes', 'Alcoholism', 'Handcap']
df['health_risk_score'] = df[conditions].sum(axis=1)
df['senior_with_condition'] = ((df['Age'] >= 60) & (df[conditions].sum(axis=1) > 0)).astype(int)

df['last_minute'] = (df['days_between'] <= 1).astype(int)
df['appt_freq'] = df.groupby('PatientId')['AppointmentDay'].transform('count')
df['prev_same_day'] = df.groupby(['PatientId', df['AppointmentDay'].dt.date])['AppointmentID'].transform('count') - 1

In [81]:
df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['month']/12)

In [82]:
bins = [0, 12, 19, 30, 50, 70, 120]
labels = ['child', 'teen', 'young_adult', 'adult', 'senior', 'elderly']
df['age_group'] = pd.cut(df['Age'], bins=bins, labels=labels)

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df['age_group']=le.fit_transform(df['age_group'])
df['No-show']=le.fit_transform(df['No-show'])

In [None]:
from category_encoders import TargetEncoder

encoder = TargetEncoder(cols=['Neighbourhood', 'waiting_category'])
df[['Neighbourhood', 'waiting_category']] = encoder.fit_transform(df[['Neighbourhood', 'waiting_category']],df['No-show'])

df=df.drop(columns=['ScheduledDay', 'AppointmentDay'])

In [None]:
X=df.drop(columns=['No-show'])
y=df['No-show']

from sklearn.preprocessing import StandardScaler
scalar=StandardScaler()
X=scalar.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [65]:
X_train.shape,X_val.shape,y_train.shape,y_val.shape

((88421, 29), (22106, 29), (88421,), (22106,))

In [66]:
from sklearn.utils.class_weight import compute_class_weight

y_train_np = y_train.values if hasattr(y_train, 'values') else np.array(y_train)
y_train_np = y_train_np.ravel()  

classes = np.unique(y_train_np)
weights = compute_class_weight('balanced', classes=classes, y=y_train_np)

class_weights = torch.tensor(weights, dtype=torch.float).to('cpu')
sample_weights = class_weights[y_train_np].reshape(-1, 1)


In [67]:
learning_rate=0.001
X_train = np.array(X_train, dtype=np.float32)
y_train = np.array(y_train).reshape(-1, 1)
X_val = np.array(X_val, dtype=np.float32)
y_val = np.array(y_val).reshape(-1, 1)

input_size=X_train.shape[1]
hidden_layers=[128,64]
epochs=100

weights, biases = initialize_parameters(input_size, hidden_layers)

best_weights, best_biases = None, None
best_val_loss = float('inf')

for epoch in range(epochs):
    
    activations, z_layers = forward_propagation(X_train, weights, biases)
        
    train_loss = compute_weighted_loss(y_train, activations[-1], class_weights)
    val_activations, _ = forward_propagation(X_val, weights, biases)
    val_loss = compute_weighted_loss(y_val, val_activations[-1], class_weights)
        
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_weights = [w.copy() for w in weights]
        best_biases = [b.copy() for b in biases]
    
    weights, biases = backward_propagation(X_train, y_train, activations, z_layers, weights, biases, class_weights,learning_rate=learning_rate)
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

print(f"Training complete. Best Val Loss: {best_val_loss:.4f}")

Epoch 0: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 10: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 20: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 30: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 40: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 50: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 60: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 70: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 80: Train Loss = 0.2500, Val Loss = 0.2500
Epoch 90: Train Loss = 0.2500, Val Loss = 0.2500
Training complete. Best Val Loss: 0.2500


In [68]:
metrics = evaluate_model(X_val, y_val, best_weights, best_biases)
print("Classification Report:\n", metrics['classification_report'])
print("F1 Score:", metrics['f1_score'])
print("ROC AUC:", metrics['roc_auc'])
print("Confusion Matrix:\n", metrics['confusion_matrix'])

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.90      0.85     17642
           1       0.23      0.12      0.15      4464

    accuracy                           0.74     22106
   macro avg       0.52      0.51      0.50     22106
weighted avg       0.69      0.74      0.71     22106

F1 Score: 0.1540532455877954
ROC AUC: 0.44500260101444133
Confusion Matrix:
 [[15935  1707]
 [ 3949   515]]


In [69]:
import torch
import torch.nn as nn
import torch.optim as optim

class ANN_model(nn.Module):

  def __init__(self, input_size):
        super().__init__()
        self.net = nn.Sequential(
            
            nn.Linear(input_size, 128),
            nn.ReLU(),
            
            nn.Linear(128, 64),
            nn.ReLU(),

            nn.Linear(64,1),
            nn.Sigmoid()
        )
    
  def forward(self, x):
      return self.net(x)
  


In [None]:
input_shape=X_train.shape[1]
model=ANN_model(input_shape)
optimizer=optim.Adam(model.parameters(),lr=0.001)
criterion=nn.BCELoss(weight=sample_weights)

In [73]:
num_epochs =75
loss_his=[]
for epoch in range(num_epochs):

    model.train()
    inputs = torch.from_numpy(X_train).float()
    labels = torch.from_numpy(y_train).float().view(-1, 1)

    outputs = model(inputs)
    loss = criterion(outputs, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    loss_his.append(loss)
    if epoch%10==0:
        print(f"epoch is {epoch} and loss is {loss.item()}")

epoch is 0 and loss is 0.6948327422142029
epoch is 10 and loss is 0.5904402136802673
epoch is 20 and loss is 0.4579855501651764
epoch is 30 and loss is 0.33583006262779236
epoch is 40 and loss is 0.2652541399002075
epoch is 50 and loss is 0.2411290854215622
epoch is 60 and loss is 0.23280633985996246
epoch is 70 and loss is 0.22654058039188385


In [74]:
from sklearn.metrics import classification_report, roc_auc_score, f1_score, confusion_matrix
model.eval()
with torch.no_grad():
    inputs=torch.from_numpy(X_val).float()
    outputs=model(inputs)
    y_pred_prob=outputs.numpy()
    y_pred=(y_pred_prob>0.5)

report=classification_report(y_val,y_pred)
f1score=f1_score(y_val,y_pred)
roc=roc_auc_score(y_val,y_pred)
matrix=confusion_matrix(y_val,y_pred)
print("classification report",report)
print("f1 Score",f1score)
print("ROC AUC Score",roc)
print("Confusion Matrix\n",matrix)

classification report               precision    recall  f1-score   support

           0       0.98      0.88      0.93     17642
           1       0.66      0.91      0.76      4464

    accuracy                           0.89     22106
   macro avg       0.82      0.90      0.84     22106
weighted avg       0.91      0.89      0.89     22106

f1 Score 0.7643252368001501
ROC AUC Score 0.8962319549226573
Confusion Matrix
 [[15518  2124]
 [  389  4075]]
