<a href="https://colab.research.google.com/github/dornercr/INFO371/blob/main/INFO371_week9_nlp_lab_healthcare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# INFO 371: NLP Lab – Healthcare Dataset (Generated Manually)
# Author: Charles Dorner, EdD (Candidate)

!pip install torchdata --quiet

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
import random

# 🔧 Tensor Demos
# Three ways of creating a tensor: from a Python list, from uniform random
# samples, and from a numeric range. Each is printed with its caption.
demo_tensors = (
    ("Manual tensor:", torch.tensor([1, 2, 3])),
    ("Random 2x3 tensor:", torch.rand(2, 3)),
    ("Range tensor:", torch.arange(0, 6, dtype=torch.float32)),
)
for caption, demo in demo_tensors:
    print(caption, demo)

# 📥 Generate healthcare-style dataset (truthful vs. misinformation)
# Ten hand-written statements that the dataset builder below labels 1 ("true").
true_claims = [
    "Vaccines prevent serious illness and save lives.",
    "Washing hands reduces the spread of infection.",
    "Exercise improves cardiovascular health.",
    "Wearing seat belts reduces fatalities in car accidents.",
    "Early cancer screening increases survival rates.",
    "Flu shots are recommended annually for most people.",
    "Smoking increases the risk of lung cancer.",
    "Obesity is a risk factor for type 2 diabetes.",
    "Proper hydration supports kidney function.",
    "Sleep deprivation affects immune response."
]

# Ten hand-written statements that the dataset builder below labels 0 ("false").
false_claims = [
    "Vaccines cause autism in children.",
    "Drinking bleach cures viral infections.",
    "You can detox your body with foot pads.",
    "Microwaving food removes its nutrients.",
    "Essential oils cure cancer.",
    "5G towers cause COVID-19.",
    "Wearing wet socks cures the flu.",
    "Alkaline water neutralizes all disease.",
    "Sunlight alone can replace a healthy diet.",
    "Ear candles remove brain toxins."
]

# Build dataset of 5000 entries (balanced)
# Every pass adds one sampled true claim (label 1) and then one sampled false
# claim (label 0), so each class ends up with exactly 2500 rows.
combined = []
for _ in range(2500):
    combined.append((random.choice(true_claims), 1))   # True
    combined.append((random.choice(false_claims), 0))  # False

# Shuffle
random.shuffle(combined)
reviews, sentiments = zip(*combined)

# Create DataFrame
df = pd.DataFrame(combined, columns=["review", "sentiment"])

# Split manually
# The first 4000 shuffled rows are used for training, the remaining rows for
# testing.
# NOTE(review): all rows are sampled from the same 20 sentences, so the test
# split repeats sentences that also occur in the training split; the test
# score therefore reflects memorisation rather than generalisation.
train_df, test_df = df.iloc[:4000], df.iloc[4000:]
train_texts, train_labels = train_df["review"], train_df["sentiment"]
test_texts, test_labels = test_df["review"], test_df["sentiment"]
label_names = ["false", "true"]  # position in the list == integer label

# 🔠 Tokenizer and Vocabulary
# `tokenizer` is a callable applied to one text string at a time; its output
# is what the vocabulary is built from and what `encode` converts to ids.
tokenizer = get_tokenizer("basic_english")
def yield_tokens(data):
    """Lazily yield the tokenizer output for each text in ``data``, in order."""
    yield from map(tokenizer, data)

# Build the vocabulary from the training texts only; "<pad>" is registered as
# a special token so that an id is available for padding batches.
vocab = build_vocab_from_iterator(yield_tokens(train_texts), specials=["<pad>"])
# Tokens missing from the vocabulary are looked up as the "<pad>" id instead
# of raising, i.e. unknown words and padding share one index.
vocab.set_default_index(vocab["<pad>"])

def encode(text):
    """Tokenize ``text`` and return its vocabulary ids as a 1-D long tensor."""
    token_ids = vocab(tokenizer(text))
    return torch.tensor(token_ids, dtype=torch.long)

def collate_batch(batch):
    """Turn a list of ``(text, label)`` samples into padded batch tensors.

    Returns:
        A pair ``(token_ids, labels)``: ``token_ids`` holds the encoded texts
        right-padded with the "<pad>" id to the longest text in the batch
        (batch dimension first); ``labels`` is a 1-D long tensor with one
        entry per sample.
    """
    # Single pass over the batch, encoding the text and wrapping the label.
    converted = [
        (encode(sample_text), torch.tensor(sample_label, dtype=torch.long))
        for sample_text, sample_label in batch
    ]
    encoded_texts = [pair[0] for pair in converted]
    targets = [pair[1] for pair in converted]
    padded = pad_sequence(
        encoded_texts, batch_first=True, padding_value=vocab["<pad>"]
    )
    return padded, torch.stack(targets)

# 📦 Dataset and DataLoader
class TextDataset(Dataset):
    """Map-style dataset that pairs each text with its label by position.

    Both containers are read through ``.iloc``, so they are expected to be
    pandas objects such as Series. The dataset length is taken from ``texts``.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        return text, label

# Wrap each split in a TextDataset and batch it 32 samples at a time with
# collate_batch. Only the training loader reshuffles; the test loader keeps
# the row order of the test split.
loader_options = {"batch_size": 32, "collate_fn": collate_batch}
train_ds = TextDataset(texts=train_texts, labels=train_labels)
test_ds = TextDataset(texts=test_texts, labels=test_labels)
train_dl = DataLoader(train_ds, shuffle=True, **loader_options)
test_dl = DataLoader(test_ds, shuffle=False, **loader_options)

# 🧠 Model Definition
class TextClassifier(nn.Module):
    """Bag-of-embeddings classifier: mean-pooled token embeddings -> 2-layer MLP.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Width of the hidden linear layer.
        num_classes: Number of output logits.
        pad_idx: Token id that marks padding. Positions holding this id are
            excluded from the mean so that a sequence produces the same
            logits no matter how much padding its batch added. Defaults to
            0, the id a torchtext vocab assigns to its first special token.
            Pass ``None`` to average over every position, padding included.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        # padding_idx keeps the padding row at zero and out of the gradient.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits for a ``(batch, seq_len)`` tensor of token ids."""
        embedded = self.embedding(x)  # (batch, seq_len, embed_dim)
        if self.pad_idx is None:
            mean_emb = embedded.mean(dim=1)
        else:
            # 1.0 for real tokens, 0.0 for padding; broadcast over embed_dim.
            mask = (x != self.pad_idx).unsqueeze(-1).to(embedded.dtype)
            # clamp avoids 0/0 when a row consists of padding only.
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            mean_emb = (embedded * mask).sum(dim=1) / token_counts
        x = self.relu(self.fc1(mean_emb))
        return self.fc2(x)

# ⚙️ Training Setup
# Use the GPU when one is visible, otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# 64-dim embeddings, a 32-unit hidden layer and 2 output classes.
model = TextClassifier(
    vocab_size=len(vocab), embed_dim=64, hidden_dim=32, num_classes=2
)
model = model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 🔁 Training Loop
# Three passes over the training loader. The loss of every 50th batch is
# logged, and the summed batch losses are reported at the end of each epoch.
for epoch in range(3):
    epoch_no = epoch + 1
    total_loss = 0
    model.train()
    for batch, (X, y) in enumerate(train_dl):
        X = X.to(device)
        y = y.to(device)

        # Clear gradients left over from the previous step before the new
        # forward/backward pass.
        optimizer.zero_grad()
        logits = model(X)
        loss = loss_fn(logits, y)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        if batch % 50 == 0:
            print(f"Epoch {epoch_no}, Batch {batch}, Loss: {batch_loss:.4f}")
    print(f"Epoch {epoch_no} complete | Total Loss: {total_loss:.4f}")

# 💾 Save and Reload Model
# Round-trip the trained weights through disk, then switch to inference mode.
checkpoint_path = "healthcare_sentiment_model.pt"
torch.save(model.state_dict(), checkpoint_path)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime still loads on a CPU-only one.
state_dict = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(state_dict)
model.eval()

# 🧪 Evaluation
# Collect the gold labels and the arg-max predictions for the whole test
# loader without tracking gradients, then print the sklearn metrics.
y_true, y_pred = [], []
with torch.no_grad():
    for features, targets in test_dl:
        logits = model(features.to(device))
        y_pred += logits.argmax(dim=1).cpu().tolist()
        y_true += targets.tolist()

accuracy = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred, target_names=label_names)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

# 🔍 Misclassifications
# Walk the whole test set (not only its first 10 rows) and print at most
# `max_shown` wrong predictions. The test loader is not shuffled, so y_true
# and y_pred line up position-by-position with test_texts. Zipping also keeps
# this safe when the test set has fewer than 10 rows.
max_shown = 10
shown = 0
for text, actual, predicted in zip(test_texts, y_true, y_pred):
    if actual == predicted:
        continue
    print(f"❌ Predicted {label_names[predicted]}, Actual {label_names[actual]}: {text[:100]}")
    shown += 1
    if shown >= max_shown:
        break


Manual tensor: tensor([1, 2, 3])
Random 2x3 tensor: tensor([[0.7209, 0.7654, 0.4252],
        [0.8179, 0.0247, 0.5336]])
Range tensor: tensor([0., 1., 2., 3., 4., 5.])
Epoch 1, Batch 0, Loss: 0.7188
Epoch 1, Batch 50, Loss: 0.4466
Epoch 1, Batch 100, Loss: 0.0934
Epoch 1 complete | Total Loss: 42.8276
Epoch 2, Batch 0, Loss: 0.0457
Epoch 2, Batch 50, Loss: 0.0133
Epoch 2, Batch 100, Loss: 0.0056
Epoch 2 complete | Total Loss: 1.6946
Epoch 3, Batch 0, Loss: 0.0042
Epoch 3, Batch 50, Loss: 0.0035
Epoch 3, Batch 100, Loss: 0.0019
Epoch 3 complete | Total Loss: 0.3514
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

       false       1.00      1.00      1.00       512
        true       1.00      1.00      1.00       488

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

