In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

In [3]:
data = pd.read_csv('dataset.csv')

# Clean data: Convert symptoms to lowercase and ensure the disease column is clean
data['symptoms'] = data['symptoms'].str.lower()
data['disease'] = data['disease'].str.strip()

# Map the "disease" to a numeric target (since it's a classification problem)
disease_classes = data['disease'].unique()
disease_mapping = {disease: idx for idx, disease in enumerate(disease_classes)}
data['disease_label'] = data['disease'].map(disease_mapping)

# Prepare features (symptoms) and labels (disease)
X = data['symptoms']
y = data['disease_label']


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# 1. CountVectorizer (Bag of Words)
count_vectorizer = CountVectorizer(stop_words='english')
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

In [6]:
# 2. TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [13]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Support Vector Machine': SVC(kernel='linear', random_state=42)
}

In [11]:
def evaluate_model(model, X_train, X_test, y_train, y_test, vectorizer_name):
    print(f"Evaluating {model.__class__.__name__} using {vectorizer_name}")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    
    # Print unique classes in y_test and y_pred
    print("Unique classes in y_test:", np.unique(y_test))
    print("Unique classes in y_pred:", np.unique(y_pred))
    
    # Ensure target_names match the unique classes in y_test
    disease_classes = list(data['disease'].unique())  # or get unique diseases directly from the dataset
    print("Number of unique diseases:", len(disease_classes))
    
    # Performance metrics
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    
    # Use target_names to ensure that it matches the number of unique classes in y_test
    print(classification_report(y_test, y_pred, target_names=disease_classes))
    print("="*60)

In [14]:
# Evaluate models with CountVectorizer features
for model_name, model in models.items():
    evaluate_model(model, X_train_count, X_test_count, y_train, y_test, "CountVectorizer")

# Evaluate models with TfidfVectorizer features
for model_name, model in models.items():
    evaluate_model(model, X_train_tfidf, X_test_tfidf, y_train, y_test, "TfidfVectorizer")

Evaluating LogisticRegression using CountVectorizer
Unique classes in y_test: [ 0  4  5 10 12 16 20 27 35 37 39 42 56 58 62 65 72 75 80 82]
Unique classes in y_pred: [ 2  4  5  8 15 26 28 32 36 38 40 45 52 63 66 79 81]
Number of unique diseases: 84
Accuracy: 0.1000
Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]]
Classification Report:


ValueError: Number of classes, 35, does not match size of target_names, 84. Try specifying the labels parameter

In [None]:
accuracies_count = {}
accuracies_tfidf = {}

for model_name, model in models.items():
    model.fit(X_train_count, y_train)
    y_pred_count = model.predict(X_test_count)
    accuracies_count[model_name] = accuracy_score(y_test, y_pred_count)
    
    model.fit(X_train_tfidf, y_train)
    y_pred_tfidf = model.predict(X_test_tfidf)
    accuracies_tfidf[model_name] = accuracy_score(y_test, y_pred_tfidf)

# Print out comparison of accuracies
print("\nAccuracy Comparison (CountVectorizer vs. TfidfVectorizer):")
for model_name in models.keys():
    print(f"{model_name}: CountVectorizer Accuracy = {accuracies_count[model_name]:.4f}, TfidfVectorizer Accuracy = {accuracies_tfidf[model_name]:.4f}")

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Sample dataset (replace with your actual dataset)
data = pd.DataFrame({
    'disease': ['flu', 'bronchitis', 'pneumonia', 'heart attack', 'stroke'],
    'symptoms': [
        'fever,cough,sore throat,runny or stuffy nose,muscle aches,headache,fatigue',
        'cough,mucus production,shortness of breath,chest pain',
        'fever,cough,shortness of breath,chest pain,fatigue',
        'chest pain,shortness of breath,nausea,vomiting,lightheadedness,sweating',
        'sudden weakness,numbness on one side of the body,confusion,difficulty speaking,trouble seeing in one eye,severe headache'
    ]
})

# Assuming the 'disease' column is the target and 'symptoms' is the feature
X = data['symptoms']
y = data['disease']

# Convert the target 'disease' labels to numerical labels
y = pd.factorize(y)[0]

# Train-test split (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)  # Unigrams and bigrams, ignore terms that appear in fewer than 1 document

# Transform the text data
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

# Initialize Logistic Regression model with class weighting to handle imbalanced classes
model = LogisticRegression(class_weight='balanced', max_iter=1000)

# Define the function to evaluate the model
def evaluate_model(model, X_train, X_test, y_train, y_test, vectorizer_name):
    print(f"Evaluating {model.__class__.__name__} using {vectorizer_name}")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Debugging: Check unique classes in y_test and y_pred
    print("Unique classes in y_test:", np.unique(y_test))
    print("Unique classes in y_pred:", np.unique(y_pred))
    
    # Performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    # Ensure that we are accounting for all classes in the evaluation
    all_classes = list(range(len(np.unique(y))))  # Generate the full list of class labels (0 to 83 if you have 84 unique diseases)
    print("Classification Report:")
    print(classification_report(y_test, y_pred, labels=all_classes))  # Explicitly include all possible labels in the report
    print("="*60)

# Call the function to evaluate the model
evaluate_model(model, X_train_count, X_test_count, y_train, y_test, "CountVectorizer")


Evaluating LogisticRegression using CountVectorizer
Unique classes in y_test: [1]
Unique classes in y_pred: [2]
Accuracy: 0.0000
Confusion Matrix:
[[0 1]
 [0 0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       1.0
           2       0.00      0.00      0.00       0.0
           3       0.00      0.00      0.00       0.0
           4       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
