In [31]:
# Import necessary libraries
import pandas as pd
import re
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, BertModel, AdamW
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np


In [25]:
# Load the dataset
# NOTE: this is an absolute Colab path; point it at your local copy when
# running the notebook anywhere else.
file_path = "/content/healthcare_dataset.csv"
healthcare_data = pd.read_csv(file_path)

# Display the first few rows
print("Dataset successfully loaded.")
print(healthcare_data.head())

# Display dataset info
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a trailing "None").
healthcare_data.info()


Dataset successfully loaded.
            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medica

In [38]:
# Check the column names in the DataFrame
print("Columns in DataFrame:", healthcare_data.columns)

# Rename the 'Medical Condition' column to 'text' if it hasn't been renamed
# (the guard keeps this cell safe to re-run).
if 'Medical Condition' in healthcare_data.columns:
    healthcare_data.rename(columns={'Medical Condition': 'text'}, inplace=True)

# Convert subtopics to numeric labels
subtopic_mapping = {
    "nutrition": 0,
    "exercise": 1,
    "mental health": 2,
    "physical therapy": 3
}

# The 'subtopic' column is not created anywhere in this cell; fail with an
# explicit message instead of a bare KeyError if it is missing.
if 'subtopic' not in healthcare_data.columns:
    raise KeyError(
        "Column 'subtopic' not found in healthcare_data. "
        "Create it before running this cell."
    )

# Accept both the subtopic names and their numeric codes, so re-running this
# cell on an already-encoded column is a no-op instead of turning it into NaN.
_subtopic_lookup = dict(subtopic_mapping)
_subtopic_lookup.update({code: code for code in subtopic_mapping.values()})

# Encode into a temporary Series and validate it BEFORE touching the frame, so
# a failed mapping does not destroy the original column.
_subtopic_raw = healthcare_data['subtopic']
_subtopic_codes = _subtopic_raw.map(lambda value: _subtopic_lookup.get(value))

if _subtopic_codes.isnull().any():
    _unmapped = sorted(_subtopic_raw[_subtopic_codes.isnull()].astype(str).unique())
    raise ValueError(
        "Some subtopics could not be mapped. Ensure all subtopics are included in the mapping. "
        f"Unmapped values: {_unmapped}"
    )

healthcare_data['subtopic'] = _subtopic_codes

# Display the updated DataFrame
print("\nUpdated DataFrame with Subtopics:")
print(healthcare_data[['text', 'subtopic']].head())


Columns in DataFrame: Index(['Name', 'Age', 'Gender', 'Blood Type', 'text', 'Date of Admission',
       'Doctor', 'Hospital', 'Insurance Provider', 'Billing Amount',
       'Room Number', 'Admission Type', 'Discharge Date', 'Medication',
       'Test Results', 'subtopic'],
      dtype='object')

Updated DataFrame with Subtopics:
       text  subtopic
0    Cancer         0
1   Obesity         1
2   Obesity         1
3  Diabetes         0
4    Cancer         0


In [39]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
train_data, test_data = train_test_split(
    healthcare_data,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary (capped at 500 terms) on the training split only
# and reuse it unchanged for the test split.
vectorizer = TfidfVectorizer(max_features=500)
X_train = vectorizer.fit_transform(train_data['text'])
X_test = vectorizer.transform(test_data['text'])

# Fit a logistic-regression classifier on the TF-IDF features.
# (fit() returns the estimator itself, so it can be chained.)
classifier = LogisticRegression().fit(X_train, train_data['subtopic'])

# Mean accuracy on the held-out split.
accuracy = classifier.score(X_test, test_data['subtopic'])
print(f"\nClassifier Accuracy: {accuracy:.2f}")

# Test the classifier with a new query
def preprocess_text(text):
    """Normalise a free-text string for vectorisation.

    Every non-word character is replaced by a space, runs of whitespace are
    collapsed to a single space, and the result is lower-cased with leading
    and trailing whitespace removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    return re.sub(r'\s+', ' ', re.sub(r'\W', ' ', text)).lower().strip()

# Classify one free-text query: clean it, project it into the fitted TF-IDF
# space, and ask the classifier for its predicted subtopic code.
test_query = "ways to improve sleep"
cleaned_test_query = preprocess_text(test_query)
test_query_vec = vectorizer.transform([cleaned_test_query])
predicted_subtopic = classifier.predict(test_query_vec)
print(f"Predicted Subtopic for '{test_query}': {predicted_subtopic[0]}")


Classifier Accuracy: 1.00
Predicted Subtopic for 'ways to improve sleep': 0


In [40]:
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import numpy as np

# Function to generate embeddings using BERT
def get_embedding(text, tokenizer, model):
    """Embed text as the mean of the model's last-layer token vectors.

    Padding positions are excluded from the mean by weighting each token
    vector with the tokenizer's attention mask. For a single string (no
    padding) this equals the plain mean over all tokens; for a padded batch
    it stops the padding vectors from diluting the shorter sequences.

    Args:
        text: A string (or a batch of strings) passed straight to ``tokenizer``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=True, padding=True)``
            that returns a mapping of tensors accepted by ``model``.
        model: Callable invoked as ``model(**tokens)`` whose result exposes
            ``last_hidden_state`` with the token axis at dimension 1.

    Returns:
        A NumPy array of the pooled embedding with singleton dimensions
        squeezed out (1-D for a single input).
    """
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        output = model(**tokens)  # Use BertModel
    hidden = output.last_hidden_state

    attention_mask = tokens.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to the unweighted mean over tokens.
        pooled = hidden.mean(dim=1)
    else:
        weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
        # clamp() guards against division by zero for an all-padding row.
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy()  # Pooling embeddings

# Load the pretrained BERT tokenizer and the bare encoder (no classification
# head) used to produce embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')  # Use BertModel for embeddings

# Embed the query as a single-row 2-D array, the shape cosine_similarity expects.
query_embedding = get_embedding("ways to manage anxiety", tokenizer, model).reshape(1, -1)

# Embed the first five test documents, one row per document.
sample_documents = test_data['text'].iloc[:5]
doc_embeddings = np.vstack(
    [get_embedding(doc, tokenizer, model) for doc in sample_documents]
)

# Cosine similarity of the query against every document: shape (1, n_docs).
similarities = cosine_similarity(query_embedding, doc_embeddings)

# Document positions ordered from most to least similar.
ranked_indices = similarities.argsort()[0][::-1]  # Descending order

# Show each document next to its similarity score, best match first.
print("\nRanked Documents by Relevance:")
for idx in ranked_indices:
    print(f"Document: {test_data['text'].iloc[idx]}, Similarity: {similarities[0][idx]:.2f}")




Ranked Documents by Relevance:
Document: Cancer, Similarity: 0.55
Document: Diabetes, Similarity: 0.55
Document: Asthma, Similarity: 0.52
Document: Arthritis, Similarity: 0.45
Document: Arthritis, Similarity: 0.45


In [37]:
# Define a custom dataset class
class HealthDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Texts to encode. A pandas Series/DataFrame column (read
            positionally through ``.iloc``) or any other indexable sequence
            such as a list.
        labels: Numeric class labels aligned positionally with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=max_len,
            padding="max_length", truncation=True, return_tensors="pt")`` that
            returns a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors carrying a leading batch axis of size 1.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    @staticmethod
    def _item_at(values, index):
        """Fetch the element at positional ``index`` from a pandas object or a plain sequence."""
        if hasattr(values, "iloc"):
            return values.iloc[index]
        return values[index]

    def __getitem__(self, index):
        """Encode the text at position ``index``.

        Returns:
            A dict with ``"input_ids"`` and ``"attention_mask"`` (batch axis
            removed) and ``"label"`` as a ``torch.long`` scalar tensor.

        Raises:
            ValueError: If the label at ``index`` is a string.
        """
        text = self._item_at(self.texts, index)
        label = self._item_at(self.labels, index)
        # Ensure label is numeric
        if isinstance(label, str):
            raise ValueError(f"Label is a string: {label}. Ensure labels are numeric before creating the dataset.")
        label = int(label)
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also collapse the sequence axis when max_len == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }



In [41]:
# Initialize tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 128  # every sequence is padded/truncated to this many tokens

# (The former `'bert_tokenizer' not in globals()` guard was removed: the name
# is assigned just above, so that branch could never execute.)

# Create datasets
train_dataset = HealthDataset(
    texts=train_data['text'],
    labels=train_data['subtopic'],
    tokenizer=bert_tokenizer,
    max_len=max_len
)

test_dataset = HealthDataset(
    texts=test_data['text'],
    labels=test_data['subtopic'],
    tokenizer=bert_tokenizer,
    max_len=max_len
)

# Create data loaders: shuffle only the training batches; keep the test order
# fixed so evaluation output lines up with test_data.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

print("\nDataLoaders created successfully.")



DataLoaders created successfully.


In [42]:
# Initialize BERT model for classification; the output layer gets one logit
# per entry in subtopic_mapping.
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(subtopic_mapping))

# Initialize optimizer
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the values the transformers
# version defaulted to (1e-6 and 0.0), so the optimisation behaviour is kept.
optimizer = torch.optim.AdamW(
    bert_model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)

print("\nModel and optimizer initialized successfully.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model and optimizer initialized successfully.




In [43]:
# Fine-tune the classifier: one optimisation step per mini-batch.
for epoch in range(1):  # Adjust number of epochs as needed
    bert_model.train()  # training mode (enables dropout)
    for batch in train_loader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # The dataset emits the target under "label"; the model expects "labels".
        inputs = dict(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["label"],
        )

        # Passing labels makes the model compute the loss itself.
        outputs = bert_model(**inputs)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        print(f"Training loss: {loss.item()}")

Training loss: 1.5674004554748535
Training loss: 1.3021866083145142
Training loss: 1.1926870346069336


KeyboardInterrupt: 

In [44]:
# Ranking with BERT embeddings
def get_embedding(text, tokenizer, model):
    """Embed text as the mean of the model's last-layer token vectors.

    NOTE: this redefines the ``get_embedding`` declared in an earlier cell;
    keep the two definitions in sync (or drop one of them).

    Padding positions are excluded from the mean by weighting each token
    vector with the tokenizer's attention mask. For a single string (no
    padding) this equals the plain mean over all tokens; for a padded batch
    it stops the padding vectors from diluting the shorter sequences.

    Args:
        text: A string (or a batch of strings) passed straight to ``tokenizer``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=True, padding=True)``
            that returns a mapping of tensors accepted by ``model``.
        model: Callable invoked as ``model(**tokens)`` whose result exposes
            ``last_hidden_state`` with the token axis at dimension 1.

    Returns:
        A NumPy array of the pooled embedding with singleton dimensions
        squeezed out (1-D for a single input).
    """
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        output = model(**tokens)  # Use BertModel for embeddings
    hidden = output.last_hidden_state

    attention_mask = tokens.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to the unweighted mean over tokens.
        pooled = hidden.mean(dim=1)
    else:
        weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
        # clamp() guards against division by zero for an all-padding row.
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy()  # Pooling embeddings

# Bare BERT encoder (no classification head) used only to produce embeddings.
bert_base_model = BertModel.from_pretrained('bert-base-uncased')

# Query embedding as a single-row 2-D array, the shape cosine_similarity expects.
query_embedding = get_embedding("ways to manage anxiety", bert_tokenizer, bert_base_model).reshape(1, -1)

# One embedding row for each of the first five test documents.
first_five_docs = test_data['text'].iloc[:5]
doc_embeddings = np.vstack(
    [get_embedding(doc, bert_tokenizer, bert_base_model) for doc in first_five_docs]
)

# Score every document against the query, then order positions best-first.
similarities = cosine_similarity(query_embedding, doc_embeddings)
ranked_indices = similarities.argsort()[0][::-1]  # Descending order

# Print each document with its score, most similar first.
print("\nRanked Documents by Relevance:")
for idx in ranked_indices:
    print(f"Document: {test_data['text'].iloc[idx]}, Similarity: {similarities[0][idx]:.2f}")


Ranked Documents by Relevance:
Document: Cancer, Similarity: 0.55
Document: Diabetes, Similarity: 0.55
Document: Asthma, Similarity: 0.52
Document: Arthritis, Similarity: 0.45
Document: Arthritis, Similarity: 0.45


In [None]:
def process_query(query, classifier, vectorizer, tokenizer, model, documents):
    """Classify a query's subtopic and print the five most similar documents.

    Args:
        query: Free-text query string.
        classifier: Fitted estimator exposing ``predict``; called on the
            vectorized, cleaned query.
        vectorizer: Fitted vectorizer exposing ``transform``.
        tokenizer: Tokenizer forwarded to ``get_embedding``.
        model: Embedding model forwarded to ``get_embedding``.
        documents: DataFrame with a ``'text'`` column holding the candidate
            documents.

    Returns:
        None. The query, the predicted subtopic and the top-ranked documents
        are printed.

    Raises:
        ValueError: If ``documents['text']`` is empty.
    """
    # Step 1: Predict Subtopic
    query_clean = preprocess_text(query)
    query_vec = vectorizer.transform([query_clean])
    predicted_subtopic = classifier.predict(query_vec)[0]

    # Step 2: Rank Documents by Relevance
    doc_texts = documents['text'].tolist()
    if not doc_texts:
        # np.vstack([]) would fail with an unrelated-looking message.
        raise ValueError("No documents to rank: documents['text'] is empty.")

    query_embedding = get_embedding(query, tokenizer, model).reshape(1, -1)

    # Embed each distinct text only once: the text column can contain the same
    # string many times, and every get_embedding call is a full forward pass.
    embedding_by_text = {}
    for doc in doc_texts:
        if doc not in embedding_by_text:
            embedding_by_text[doc] = get_embedding(doc, tokenizer, model)
    doc_embeddings = np.vstack([embedding_by_text[doc] for doc in doc_texts])

    similarities = cosine_similarity(query_embedding, doc_embeddings)
    ranked_indices = similarities.argsort()[0][::-1]

    # Step 3: Display Results
    print(f"\nQuery: {query}")
    print(f"Predicted Subtopic: {predicted_subtopic}")
    print("\nTop Ranked Documents:")
    for idx in ranked_indices[:5]:  # Top 5 results
        print(f"- {documents['text'].iloc[idx]} (Score: {similarities[0][idx]:.2f})")

# Example usage: classify the query with the TF-IDF model and rank every
# held-out document against it using the bare BERT encoder.
process_query(
    query="how to reduce obesity",
    classifier=classifier,
    vectorizer=vectorizer,
    tokenizer=bert_tokenizer,
    model=bert_base_model,
    documents=test_data,
)
