# Testing ML Models

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported modules (e.g. the local `models` package used below) are picked up
# live instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2 

In [2]:
from models.linear_regression   import LinearRegression
from models.logistic_regression import LogisticRegression
from models.decision_tree       import DecisionTree

## Creating a fake dataset

In [3]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the synthetic data is identical on every run.
np.random.seed(42)

num_samples = 500


def _binary_draws(p_zero, p_one):
    """Return `num_samples` random 0/1 values with P(0)=p_zero and P(1)=p_one."""
    return np.random.choice([0, 1], size=num_samples, p=[p_zero, p_one])


# Symptom flags are drawn independently of each other. The order of the draws
# matters: every call consumes the seeded random stream, so reordering these
# lines would produce a different dataset.
patient_ids = np.arange(1, num_samples + 1)
fever = _binary_draws(0.4, 0.6)
cough = _binary_draws(0.5, 0.5)
breathing = _binary_draws(0.3, 0.7)

data = {
    "id": patient_ids,
    "has_fever": fever,
    "has_cough": cough,
    "has_breathing_issues": breathing,
}

# Infection label, loosely tied to the symptoms: rows showing all three symptoms
# take their label from an 80%-positive draw, every other row from a
# 30%-positive draw. Both candidate arrays are drawn for all rows; np.where
# then picks one of the two per row.
all_symptoms = (fever == 1) & (cough == 1) & (breathing == 1)
label_if_all_symptoms = _binary_draws(0.2, 0.8)
label_otherwise = _binary_draws(0.7, 0.3)
data["infected"] = np.where(all_symptoms, label_if_all_symptoms, label_otherwise)

# Tabular view of the generated columns
df = pd.DataFrame(data)

df

Unnamed: 0,id,has_fever,has_cough,has_breathing_issues,infected
0,1,0,1,0,0
1,2,1,1,1,1
2,3,1,0,1,1
3,4,1,1,1,1
4,5,0,1,1,0
...,...,...,...,...,...
495,496,0,0,1,0
496,497,1,1,1,1
497,498,0,0,1,1
498,499,1,1,1,0


## Data processing

In [4]:
# Columns fed to the models, and the column they are asked to predict.
feature_names = ['has_fever', 'has_cough', 'has_breathing_issues']
target_name   = 'infected'

# Plain NumPy arrays taken from the frame: a 2-D feature matrix (one column per
# entry in feature_names) and a 1-D label vector.
X_classification = df.loc[:, feature_names].to_numpy()
y_classification = df.loc[:, target_name].to_numpy()

## Decision Tree

In [5]:
# Decision tree classifier from the local `models` package, limited to depth 5.
# (The original note describes it as ID3 using entropy; that cannot be
# confirmed from this cell — see models.decision_tree.)
clf = DecisionTree(max_depth=5, feature_names=feature_names)
clf.fit(X_classification, y_classification)

# Scored on the very same samples used for fitting, so this is a training-set
# accuracy rather than an estimate of generalization.
tree_accuracy = clf.score(X_classification, y_classification)
print(f"Classification Accuracy: {tree_accuracy:.2f}")

# NOTE(review): in the saved output the splits on these 0/1 features are
# labelled "≤ 1.000" / "> 1.000"; confirm which comparison print_tree reports.
clf.print_tree()

Classification Accuracy: 0.74
├── has_cough ≤ 1.000
  🎯 Class: 0
└── has_cough > 1.000
  ├── has_fever ≤ 1.000
    🎯 Class: 0
  └── has_fever > 1.000
    ├── has_breathing_issues ≤ 1.000
      🎯 Class: 0
    └── has_breathing_issues > 1.000
      🎯 Class: 1


## Logistic Regression

In [6]:
# Fit the logistic regression on the full symptom dataset.
# Note: `clf` is rebound here and no longer refers to the decision tree.
clf = LogisticRegression()
clf.fit(X_classification, y_classification, epochs=5000, learning_rate=0.01)

# Predictions and accuracy are computed on the same samples used for fitting.
y_proba = clf.predict_prob(X_classification)
y_pred = clf.predict(X_classification)
accuracy = clf.score(X_classification, y_classification)

divider = "-----------------------------"

# Optional, verbose diagnostics kept from the original cell (disabled):
#   print(f"Input data:\n{X_classification}")
#   print(f"Predicted probabilities: {y_proba}")
report_lines = [
    f"\n{divider}",
    f"Actual labels: {y_classification}",
    f"Predicted labels: {y_pred}",
    f"clf Accuracy: {accuracy:.2f}",
    divider,
    "Logistic Regression clf parameters:",
    f"Weights: {clf.weights}",
    f"Bias: {clf.bias}",
    divider,
]
print("\n".join(report_lines))

Epoch 0, loss 0.6931471785599453
Epoch 300, loss 0.6654828348188853
Epoch 600, loss 0.655846180116607
Epoch 900, loss 0.6482625755848527
Epoch 1200, loss 0.6418689493163338
Epoch 1500, loss 0.6364359426284852
Epoch 1800, loss 0.6317977282689398
Epoch 2100, loss 0.6278211537221035
Epoch 2400, loss 0.6243982547054788
Epoch 2700, loss 0.6214410252191751
Epoch 3000, loss 0.6188773435701448
Epoch 3300, loss 0.6166477846451202
Epoch 3600, loss 0.6147031217949815
Epoch 3900, loss 0.613002363447676
Epoch 4200, loss 0.6115112039140987
Epoch 4500, loss 0.6102007953619346
Epoch 4800, loss 0.6090467694526367

-----------------------------
Actual labels: [0 1 1 1 0 1 0 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0
 0 0 0 1 0 0 1 1 1 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1
 1 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 1 0 1 0
 0 0 1 1 1 0 0 0 1 1 0 1 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 0 1 1 0 1 0 0 1 0 1
 0 1 0 0 1 1 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0