In [13]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Path that train_model passes to joblib.dump when saving the fitted
# (classifier, vectorizer) tuple.
MODEL_FILE = "doctor_recommendation.joblib"

def train_model(df, test_size=0.2, random_state=42):
    """Train a TF-IDF + random-forest text classifier and save it to MODEL_FILE.

    The raw text is split into train/test partitions *before* the vectorizer
    is fitted, so the vocabulary and IDF weights are learned from the training
    rows only and the printed report is not inflated by test-set leakage.

    Args:
        df: DataFrame with a "symptoms_text" column (input documents) and a
            "label" column (target classes).
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed forwarded to the train/test split and to the
            classifier so repeated runs give the same report. Pass None to
            restore unseeded behavior.

    Returns:
        Tuple ``(clf, tfidf)``: the fitted RandomForestClassifier and the
        fitted TfidfVectorizer. The same tuple is written to MODEL_FILE.
    """
    texts, labels = df["symptoms_text"].values, df["label"].values

    # Split first: fitting the vectorizer on all rows would let held-out
    # documents influence the features used for training.
    texts_tr, texts_te, ytr, yte = train_test_split(
        texts, labels, test_size=test_size, random_state=random_state
    )

    tfidf = TfidfVectorizer()
    Xtr = tfidf.fit_transform(texts_tr)
    # Test documents are only transformed with the training vocabulary.
    Xte = tfidf.transform(texts_te)

    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(Xtr, ytr)
    print(classification_report(yte, clf.predict(Xte)))

    # Save both classifier and vectorizer as a single joblib file
    joblib.dump((clf, tfidf), MODEL_FILE, compress=3)
    print(f"Model and vectorizer saved to {MODEL_FILE}")
    return clf, tfidf

if __name__ == "__main__":
    # Read the doctor-recommendation dataset from its CSV file.
    df = pd.read_csv(filepath_or_buffer="/content/large_doctor_data.csv")

    # Fit the model; train_model also writes it to disk.
    train_model(df)


                  precision    recall  f1-score   support

      Cardiology       1.00      1.00      1.00      7975
     Dermatology       1.00      1.00      1.00      5986
General Medicine       1.00      1.00      1.00      6039

        accuracy                           1.00     20000
       macro avg       1.00      1.00      1.00     20000
    weighted avg       1.00      1.00      1.00     20000

Model and vectorizer saved to doctor_recommendation.joblib


In [12]:
import pandas as pd
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Path that train_model passes to joblib.dump when saving the fitted classifier.
MODEL_FILE = "risk_assessment.joblib"

def train_model(df, test_size=0.2, random_state=42):
    """Train a gradient-boosting risk classifier and save it to MODEL_FILE.

    Args:
        df: DataFrame with feature columns "age", "chest" and "sob" and a
            "label" column. Rows whose label equals "high" become the
            positive class (1); every other label becomes 0.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed forwarded to the train/test split and to the
            classifier so repeated runs give the same report. Pass None to
            restore unseeded behavior.

    Returns:
        The fitted GradientBoostingClassifier, which is also written to
        MODEL_FILE.
    """
    features = df[["age", "chest", "sob"]].values
    target = (df["label"] == "high").astype(int).values  # Convert label to 0/1

    Xtr, Xte, ytr, yte = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    clf = GradientBoostingClassifier(random_state=random_state)
    clf.fit(Xtr, ytr)
    print(classification_report(yte, clf.predict(Xte)))

    joblib.dump(clf, MODEL_FILE)
    print(f"Risk assessment model saved to {MODEL_FILE}")
    return clf

if __name__ == "__main__":
    # Read the risk dataset, then print its column index and first rows
    # before handing it to train_model.
    df = pd.read_csv(filepath_or_buffer="/content/large_risk_data.csv")
    print("Columns in dataset:", df.columns)
    print(df.head(5))
    train_model(df)


Columns in dataset: Index(['age', 'chest', 'sob', 'label'], dtype='object')
   age  chest  sob label
0   25      0    0   low
1   66      0    0   low
2   51      0    0   low
3   78      0    0   low
4   50      1    0   low
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18017
           1       1.00      1.00      1.00      1983

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Risk assessment model saved to risk_assessment.joblib


In [15]:
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Path that train_model passes to joblib.dump when saving the fitted pipeline.
MODEL_FILE = "symptom_analyzer.joblib"

def symptoms_to_text(symptoms):
    """Concatenate the items of ``symptoms`` into one string, separated by single spaces."""
    separator = " "
    joined = separator.join(symptoms)
    return joined

def train_model(df, test_size=0.2, random_state=42):
    """Train a TF-IDF + logistic-regression pipeline and save it to MODEL_FILE.

    The vectorizer lives inside the pipeline, so it is fitted on the training
    partition only when ``pipeline.fit`` is called.

    Args:
        df: DataFrame with a "symptoms_text" column (input documents) and a
            "label" column (target classes).
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed forwarded to the train/test split so repeated runs
            evaluate on the same held-out rows. Pass None to restore unseeded
            behavior.

    Returns:
        The fitted Pipeline, which is also written to MODEL_FILE.
    """
    texts, labels = df["symptoms_text"].values, df["label"].values

    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression(max_iter=1000)),
    ])

    Xtr, Xte, ytr, yte = train_test_split(
        texts, labels, test_size=test_size, random_state=random_state
    )
    pipeline.fit(Xtr, ytr)
    print(classification_report(yte, pipeline.predict(Xte)))

    joblib.dump(pipeline, MODEL_FILE, compress=3)
    print(f"Symptom Analyzer model saved to {MODEL_FILE}")
    return pipeline

if __name__ == "__main__":
    # Read the symptom dataset and print its first rows.
    df = pd.read_csv(filepath_or_buffer="/content/large_symptom_data.csv")
    print(df.head(5))

    # Fit the pipeline; train_model also writes it out as a joblib file.
    train_model(df)


                          symptoms_text            label
0              nausea light sensitivity         Migraine
1                              headache         Migraine
2  nausea shortness of breath dizziness    Cardiac Event
3                           cough fever  Viral Infection
4           light sensitivity dizziness         Migraine
                     precision    recall  f1-score   support

Allergic Dermatitis       1.00      1.00      1.00      3086
      Cardiac Event       1.00      0.78      0.88      1006
           Migraine       0.95      1.00      0.97      3950
    Viral Infection       1.00      1.00      1.00     11958

           accuracy                           0.99     20000
          macro avg       0.99      0.95      0.96     20000
       weighted avg       0.99      0.99      0.99     20000

Symptom Analyzer model saved to symptom_analyzer.joblib
