In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import joblib  # Library for saving/loading models

def train_symptom_analyzer(data_path='symptom_data.csv'):
    """Trains the symptom analysis model and saves it to disk."""
    print("🚀 Starting symptom analyzer training...")

    # Load and prepare data
    df = pd.read_csv('/content/large_symptom_data.csv')
    le = LabelEncoder()
    df['condition_encoded'] = le.fit_transform(df['label'])

    X = df['symptoms_text']
    y = df['condition_encoded']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Create the model pipeline
    model_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
        ('xgb', xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', use_label_encoder=False))
    ])

    # Train the model
    model_pipeline.fit(X_train, y_train)

    accuracy = model_pipeline.score(X_test, y_test)
    print(f"✅ Training complete! Accuracy: {accuracy:.4f}")

    # --- Save the trained model and the label encoder to files ---
    joblib.dump(model_pipeline, 'symptom_analyzer_model.joblib')
    joblib.dump(le, 'symptom_label_encoder.joblib')
    print("💾 Model and label encoder saved to 'symptom_analyzer_model.joblib' and 'symptom_label_encoder.joblib'")

def test_symptom_analyzer():
    """Loads the trained model from disk and tests it."""
    print("\n🧪 --- Testing Symptom Analyzer ---")
    try:
        # --- Load the model and encoder from files ---
        model = joblib.load('symptom_analyzer_model.joblib')
        le = joblib.load('symptom_label_encoder.joblib')
    except FileNotFoundError:
        print("Model files not found. Please run the training function first.")
        return

    test_symptoms = [
        "I have a high temperature and my whole body hurts.",
        "My skin is red and very itchy.",
        "I can't stop sneezing and my nose is runny."
    ]

    predictions_proba = model.predict_proba(test_symptoms)

    for i, text in enumerate(test_symptoms):
        pred_index = predictions_proba[i].argmax()
        confidence = predictions_proba[i][pred_index]
        condition_name = le.inverse_transform([pred_index])[0]

        print(f"\nInput Symptoms: '{text}'")
        print(f"  -> Predicted Condition: {condition_name}")
        print(f"  -> Confidence: {confidence:.2%}")

# To run this file:
# 1. Make sure you have symptom_data.csv in the same directory.
# 2. Run `python symptom_analyzer.py` in your terminal.
if __name__ == '__main__':
    train_symptom_analyzer()
    test_symptom_analyzer()

🚀 Starting symptom analyzer training...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Training complete! Accuracy: 0.9885
💾 Model and label encoder saved to 'symptom_analyzer_model.joblib' and 'symptom_label_encoder.joblib'

🧪 --- Testing Symptom Analyzer ---

Input Symptoms: 'I have a high temperature and my whole body hurts.'
  -> Predicted Condition: Viral Infection
  -> Confidence: 76.12%

Input Symptoms: 'My skin is red and very itchy.'
  -> Predicted Condition: Viral Infection
  -> Confidence: 76.12%

Input Symptoms: 'I can't stop sneezing and my nose is runny.'
  -> Predicted Condition: Viral Infection
  -> Confidence: 76.12%


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import joblib

def train_risk_assessor(data_path='risk_data.csv'):
    """Trains the risk assessment model and saves it to disk."""
    print("🚀 Starting risk assessor training...")

    # Load data
    df = pd.read_csv('/content/large_risk_data.csv')
    X = df[['age', 'chest', 'sob']]
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Initialize and train the model
    model = LogisticRegression(random_state=42)
    model.fit(X_train, y_train)

    accuracy = model.score(X_test, y_test)
    print(f"✅ Training complete! Accuracy: {accuracy:.4f}")

    # --- Save the trained model to a file ---
    joblib.dump(model, 'risk_assessor_model.joblib')
    print("💾 Model saved to 'risk_assessor_model.joblib'")

def test_risk_assessor():
    """Loads the trained model from disk and tests it."""
    print("\n🧪 --- Testing Risk Assessor ---")
    try:
        # --- Load the model from file ---
        model = joblib.load('risk_assessor_model.joblib')
    except FileNotFoundError:
        print("Model file not found. Please run the training function first.")
        return

    test_patients = [[75, 1, 1], [35, 0, 0], [60, 1, 0]]

    for patient_data in test_patients:
        patient_df = pd.DataFrame([patient_data], columns=['age', 'chest', 'sob'])

        prediction = model.predict(patient_df)[0]
        probabilities = model.predict_proba(patient_df)[0]
        class_index = list(model.classes_).index(prediction)
        risk_score = probabilities[class_index]

        print(f"\nInput: Age={patient_data[0]}, Chest Pain={bool(patient_data[1])}, SOB={bool(patient_data[2])}")
        print(f"  -> Predicted Risk: {prediction}")
        print(f"  -> Score: {risk_score:.2%}")

# To run this file:
# 1. Make sure you have risk_data.csv in the same directory.
# 2. Run `python risk_assessor.py` in your terminal.
if __name__ == '__main__':
    train_risk_assessor()
    test_risk_assessor()

🚀 Starting risk assessor training...
✅ Training complete! Accuracy: 1.0000
💾 Model saved to 'risk_assessor_model.joblib'

🧪 --- Testing Risk Assessor ---

Input: Age=75, Chest Pain=True, SOB=True
  -> Predicted Risk: high
  -> Score: 100.00%

Input: Age=35, Chest Pain=False, SOB=False
  -> Predicted Risk: low
  -> Score: 100.00%

Input: Age=60, Chest Pain=True, SOB=False
  -> Predicted Risk: high
  -> Score: 99.93%


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import joblib

def train_doctor_recommender(data_path='specialist_data.csv'):
    """Trains the doctor recommendation model and saves it to disk."""
    print("🚀 Starting doctor recommender training...")

    # Load and prepare data
    df = pd.read_csv('/content/large_doctor_data.csv')
    le = LabelEncoder()
    df['specialization_encoded'] = le.fit_transform(df['label'])

    X = df['symptoms_text']
    y = df['specialization_encoded']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Create the model pipeline
    model_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('nb', MultinomialNB())
    ])

    # Train the model
    model_pipeline.fit(X_train, y_train)

    accuracy = model_pipeline.score(X_test, y_test)
    print(f"✅ Training complete! Accuracy: {accuracy:.4f}")

    # --- Save the trained model and the label encoder to files ---
    joblib.dump(model_pipeline, 'doctor_recommender_model.joblib')
    joblib.dump(le, 'specialist_label_encoder.joblib')
    print("💾 Model and label encoder saved to 'doctor_recommender_model.joblib' and 'specialist_label_encoder.joblib'")

def test_doctor_recommender():
    """Loads the trained model from disk and tests it."""
    print("\n🧪 --- Testing Doctor Recommender ---")
    try:
        # --- Load the model and encoder from files ---
        model = joblib.load('doctor_recommender_model.joblib')
        le = joblib.load('specialist_label_encoder.joblib')
    except FileNotFoundError:
        print("Model files not found. Please run the training function first.")
        return

    test_symptoms = [
        "My heart feels like it's beating too fast and I have chest pains.",
        "I broke my arm playing football.",
        "I have a strange red rash on my back."
    ]

    predictions_proba = model.predict_proba(test_symptoms)

    for i, text in enumerate(test_symptoms):
        pred_index = predictions_proba[i].argmax()
        confidence = predictions_proba[i][pred_index]
        specialist_name = le.inverse_transform([pred_index])[0]

        print(f"\nInput Symptoms: '{text}'")
        print(f"  -> Recommended Specialization: {specialist_name}")
        print(f"  -> Confidence: {confidence:.2%}")

# To run this file:
# 1. Make sure you have specialist_data.csv in the same directory.
# 2. Run `python doctor_recommender.py` in your terminal.
if __name__ == '__main__':
    train_doctor_recommender()
    test_doctor_recommender()

🚀 Starting doctor recommender training...
✅ Training complete! Accuracy: 1.0000
💾 Model and label encoder saved to 'doctor_recommender_model.joblib' and 'specialist_label_encoder.joblib'

🧪 --- Testing Doctor Recommender ---

Input Symptoms: 'My heart feels like it's beating too fast and I have chest pains.'
  -> Recommended Specialization: Cardiology
  -> Confidence: 99.98%

Input Symptoms: 'I broke my arm playing football.'
  -> Recommended Specialization: Cardiology
  -> Confidence: 40.14%

Input Symptoms: 'I have a strange red rash on my back.'
  -> Recommended Specialization: Dermatology
  -> Confidence: 99.98%
