In [1]:
# Mount Google Drive at /content/drive so files stored there (e.g. the dataset
# read later from /content/drive/MyDrive/...) are accessible from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK corpora used by the preprocessing step
# (only needs to succeed once per runtime)
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Shared preprocessing resources: English stopword set and WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text preprocessing function
def preprocess_text(text):
    """Normalize a free-text symptom description for tokenization.

    Steps: lowercase, replace every character that is not an ASCII letter or
    whitespace with a space, split on whitespace, drop English stopwords and
    lemmatize the remaining words.

    Args:
        text: Raw symptom description. Non-string values (e.g. the NaN that
            pandas produces for empty CSV cells, or None) are treated as
            empty text.

    Returns:
        str: The cleaned tokens joined by single spaces; '' when nothing
        usable remains.
    """
    # Missing values arrive as float NaN / None and have no .lower()
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Replace special characters, numbers, and punctuation with a space rather
    # than deleting them, so "fever,cough" stays two words instead of fusing
    # into "fevercough"
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Tokenize, remove stopwords, apply lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Load the dataset; the CSV must provide 'symptoms' and 'disease' columns
data = pd.read_csv('/content/drive/MyDrive/CURA GPT/LSTM model/health_queries.csv')
# Drop incomplete rows up front: a missing 'symptoms' cell is a float NaN that
# breaks string preprocessing, and a missing 'disease' cell would become an
# all-zero one-hot target (get_dummies ignores NaN) while still showing up in
# data['disease'].unique().
data = data.dropna(subset=['symptoms', 'disease']).reset_index(drop=True)
# Apply preprocessing to the symptoms column
data['symptoms'] = data['symptoms'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
# Fit the tokenizer on the cleaned symptom texts (vocabulary capped via
# num_words=5000, unknown words mapped to "<OOV>") and encode each text
tokenizer = Tokenizer(oov_token="<OOV>", num_words=5000)
tokenizer.fit_on_texts(data['symptoms'])
sequences = tokenizer.texts_to_sequences(data['symptoms'])

# Pad at the end of each sequence up to the length of the longest one
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_len)

# One-hot encode the target labels (one column per distinct disease)
disease_labels = pd.get_dummies(data['disease']).to_numpy()


In [4]:
# Hold out 20% of the samples; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    disease_labels,
    random_state=42,
    test_size=0.2,
)


In [5]:
# Build the LSTM model: embedding -> two stacked LSTMs -> dense softmax
# Size the embedding table from the tokenizer's vocabulary cap instead of
# repeating the literal 5000, so the two cannot drift apart. If no cap is set,
# fall back to the full vocabulary (+1 because index 0 is the padding value).
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

model = Sequential()
# `input_length` is deprecated in Keras 3 and not required: the sequence
# length is inferred from the data on the first call.
model.add(Embedding(input_dim=vocab_size, output_dim=64))
model.add(LSTM(128, return_sequences=True))  # emit the full sequence for the next LSTM
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
# One output unit per one-hot label column
model.add(Dense(disease_labels.shape[1], activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model. Keep the History object for later inspection instead of
# letting its repr become the cell output.
# NOTE(review): the held-out test split doubles as validation data here, so
# val_accuracy is not an independent estimate if it is used for tuning.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/10




[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 111ms/step - accuracy: 0.0675 - loss: 3.3931 - val_accuracy: 0.3211 - val_loss: 2.0374
Epoch 2/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 86ms/step - accuracy: 0.4425 - loss: 1.7859 - val_accuracy: 0.6931 - val_loss: 1.1170
Epoch 3/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 85ms/step - accuracy: 0.7040 - loss: 1.0032 - val_accuracy: 0.8232 - val_loss: 0.6751
Epoch 4/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 63ms/step - accuracy: 0.8147 - loss: 0.6166 - val_accuracy: 0.8841 - val_loss: 0.3522
Epoch 5/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 72ms/step - accuracy: 0.8821 - loss: 0.3770 - val_accuracy: 0.9248 - val_loss: 0.3484
Epoch 6/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 86ms/step - accuracy: 0.9414 - loss: 0.2496 - val_accuracy: 0.9736 - val_loss: 0.0966
Epoch 7/10
[1m123/123[0m

<keras.src.callbacks.history.History at 0x7bbd1ed0fe80>

In [None]:
# Function to predict disease based on user input
def predict_disease(symptoms_text):
    """Predict the disease label for a free-text symptom description.

    Args:
        symptoms_text: Raw symptom description typed by the user.

    Returns:
        The label from the 'disease' column whose softmax score is highest.
    """
    # Preprocess the input text exactly like the training data
    processed_text = preprocess_text(symptoms_text)
    # Tokenize and pad the sequence to the training length
    seq = tokenizer.texts_to_sequences([processed_text])
    padded_seq = pad_sequences(seq, maxlen=max_len, padding='post')
    # Predict the disease (verbose=0 keeps the progress bar out of the chat)
    prediction = model.predict(padded_seq, verbose=0)
    predicted_index = int(np.argmax(prediction, axis=1)[0])
    # The model's output units follow the column order of
    # pd.get_dummies(data['disease']) (sorted categories), NOT the
    # first-appearance order returned by data['disease'].unique(). Rebuild the
    # same columns so the index maps to the label the model was trained on.
    class_names = pd.get_dummies(data['disease']).columns
    return class_names[predicted_index]

# Chatbot interaction loop: read symptoms until the user types 'quit' or 'exit'
print("Hello! I'm your medical assistant. Describe your symptoms, and I'll try to predict the disease.")
while True:
    try:
        # Strip so that " quit " or a trailing newline still matches below
        symptoms_input = input("You:").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or a closed input stream ends the chat cleanly
        print("Goodbye! Take care.")
        break
    if symptoms_input.lower() in ['quit', 'exit']:
        print("Goodbye! Take care.")
        break
    if not symptoms_input:
        # A blank line would be fed to the model as an all-padding sequence
        # and yield an arbitrary prediction; ask again instead.
        print("Chatbot: Please describe at least one symptom.")
        continue
    disease_prediction = predict_disease(symptoms_input)
    # NOTE(review): the recorded output shows labels that are already full
    # sentences ("This could be a sign of ..."), which reads awkwardly inside
    # this template; confirm the label format before rewording.
    print(f"Chatbot: Based on your symptoms, you might have {disease_prediction}. Please consult a doctor for a thorough diagnosis.")


Hello! I'm your medical assistant. Describe your symptoms, and I'll try to predict the disease.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 435ms/step
Chatbot: Based on your symptoms, you might have This could be a sign of Typhoid.. Please consult a doctor for a thorough diagnosis.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Chatbot: Based on your symptoms, you might have This could be a sign of Hyperthyroidism.. Please consult a doctor for a thorough diagnosis.
