In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv('data/data/KaggleV2-May-2016.csv')
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [32]:
df['No-show'] = df['No-show'].apply(lambda x: 1 if x == 'Yes' else 0)
df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'])
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'])
df['WaitingTime'] = (df['ScheduledDay'] - df['AppointmentDay']).dt.days
df['ApptDayOfWeek'] = df['AppointmentDay'].dt.dayofweek
df['ScheduledDayOfWeek'] = df['ScheduledDay'].dt.dayofweek

In [33]:
df = df[(df['WaitingTime']) >= 0 & (df['Age'] >= 0)& (df['Age'] <= 120)]
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WaitingTime,ApptDayOfWeek,ScheduledDayOfWeek
0,29872500000000.0,5642903,F,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,JARDIM DA PENHA,0,1,0,0,0,0,0,0,4,4
1,558997800000000.0,5642503,M,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,0,0,0,0,0,0,0,4,4
2,4262962000000.0,5642549,F,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,MATA DA PRAIA,0,0,0,0,0,0,0,0,4,4
3,867951200000.0,5642828,F,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,PONTAL DE CAMBURI,0,0,0,0,0,0,0,0,4,4
4,8841186000000.0,5642494,F,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,1,1,0,0,0,0,0,4,4


In [34]:
# Feature selection
from scipy.stats import chi2_contingency, pearsonr
# Pearson correlation for numeric features
for col in ["Age", "WaitingTime"]:
    corr, p = pearsonr(df[col], df["No-show"])
    print(f"{col}: Pearson r = {corr:.3f}, p = {p:.4f}")

# Chi-squared test for categorical/binary features
for col in ["Scholarship", "Hipertension", "Diabetes", "Alcoholism", "SMS_received", "ApptDayOfWeek"]:
    table = pd.crosstab(df[col], df["No-show"])
    chi2, p, dof, ex = chi2_contingency(table)
    print(f"{col}: chi2 = {chi2:.3f}, p = {p:.4f}")

Age: Pearson r = -0.058, p = 0.0000
WaitingTime: Pearson r = 0.036, p = 0.0000
Scholarship: chi2 = 14.635, p = 0.0001
Hipertension: chi2 = 25.102, p = 0.0000
Diabetes: chi2 = 9.065, p = 0.0026
Alcoholism: chi2 = 3.504, p = 0.0612
SMS_received: chi2 = 0.000, p = 1.0000
ApptDayOfWeek: chi2 = 47.621, p = 0.0000


In [35]:
# Data preprocessing
features = ['Age', 'WaitingTime', 'Scholarship', 'Hipertension', 'Diabetes',
            'ApptDayOfWeek']
X = df[features]
y = df['No-show']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train=y_train.to_numpy().reshape(-1, 1)
y_test=y_test.to_numpy().reshape(-1, 1)



In [None]:
from train import main
results = main(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_type="regression",
    stepsize_strategy="fixed",
    num_epochs=100,
    batch_size=1
)
print(results)


{'model_type': 'regression', 'stepsize_strategy': 'fixed', 'num_epochs': 100, 'batch_size': 1, 'hidden_dim': -1, 'test_loss': 0.04173346355378949, 'train_loss': 0.02326072931324537, 'grad_norm': 0.0028818804216644753, 'dist_to_opt': 0.9141870410133073}
