In [2]:
import pandas as pd

# Source CSVs on the Hugging Face Hub (read through the hf:// filesystem protocol).
UNLABELED_CSV = "hf://datasets/segfz/chatbot_medical_fr/dataset1.csv"
LABELED_CSV = "hf://datasets/keivalya/MedQuad-MedicalQnADataset/medDataset_processed.csv"

# `df` is the frame whose 'Description' column is language-checked and classified below.
df = pd.read_csv(UNLABELED_CSV)
# `df_labeled` supplies the 'Question' / 'qtype' pairs used to train the classifier.
df_labeled = pd.read_csv(LABELED_CSV)

In [3]:
import pandas as pd
from langdetect import detect

def is_english(text):
    """Return True when langdetect classifies `text` as English ('en').

    Parameters
    ----------
    text : object
        The value to test. Anything that is not a non-blank string
        (e.g. None or a NaN from a missing CSV cell) is treated as
        "not English" without calling the detector.

    Returns
    -------
    bool
        True if `detect(text)` returns 'en'; False for non-string or blank
        input, or when langdetect cannot detect a language.

    Notes
    -----
    NOTE(review): langdetect documents its algorithm as non-deterministic
    unless `DetectorFactory.seed` is set before the first detection, and
    short texts can be misclassified — consider seeding it for reproducible
    results.
    """
    # Non-strings and blank strings carry no detectable language; answer
    # directly instead of relying on an exception from the detector.
    if not isinstance(text, str) or not text.strip():
        return False

    # Local import keeps this function self-contained; langdetect is already
    # a dependency of this notebook (see `from langdetect import detect`).
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Detection failed (e.g. no usable features in the text). Catching
        # only this exception lets KeyboardInterrupt and genuine bugs
        # propagate instead of being silently turned into False.
        return False

# Flag every row by whether its 'Description' text is detected as English
df['is_english'] = df['Description'].apply(is_english)

# True only when no row was flagged as non-English
all_english = df['is_english'].all()

if not all_english:
    print("There are non-English questions in the dataset.")
    # Show the descriptions that failed the English check
    non_english_rows = df.loc[~df['is_english'], ['Description']]
    print(non_english_rows)
else:
    print("All questions in the dataset are in English.")


There are non-English questions in the dataset.
                                             Description
3                 Suggested treatment for mental illness
6                 Suggested treatment for mental illness
9                                      Feeling depressed
27          Suggest diet plan to increase stamina in men
37       Suggest treatment for misbehavior of my husband
...                                                  ...
40010  Q. Les changements hormonaux peuvent-ils provo...
40011  Q. Mon urètre et mes lèvres sont soudainement ...
40012  Q. Les problèmes de colère, la peur de l'aband...
40013  Q. Il y a un faible TLC après la livraison. Po...
40014  Q. Je souffre de diabète de type 1 et j'essaie...

[25484 rows x 1 columns]


In [4]:
# Keep only the rows flagged as English, then drop the helper column.
# The column check makes the cell safe to re-run: once 'is_english' has been
# dropped, a second execution is a no-op instead of raising KeyError.
if 'is_english' in df.columns:
    df = df[df['is_english']].drop(columns=['is_english'])

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


# Step 1: Remove the labeled rows whose question type is out of scope
excluded_types = [
    "genetic changes",
    "support groups",
    "exams and tests",
    "research",
    "outlook",
    "frequency",
    "considerations",
    "susceptibility",
    "prevention",
]
df_filtered = df_labeled.loc[~df_labeled['qtype'].isin(excluded_types)]

# Features are the question text; targets are the question-type labels
X, y = df_filtered['Question'], df_filtered['qtype']

# Step 2: Hold out 20% of the labeled data for validation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: TF-IDF features feeding a logistic-regression classifier
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000)),
])

# Step 4: Fit on the training split
pipeline.fit(X_train, y_train)

# Report mean accuracy on the held-out split.
# NOTE(review): the recorded run printed 1.00 — worth confirming that
# duplicate or templated questions are not shared between train and test.
accuracy = pipeline.score(X_test, y_test)
print(f"Model accuracy: {accuracy:.2f}")

# Step 5: Label the unlabeled dataset with the trained model
df['predicted_qtype'] = pipeline.predict(df['Description'])

# Preview the first few predictions
print(df.loc[:, ['Description', 'predicted_qtype']].head())


Model accuracy: 1.00
                                         Description predicted_qtype
0  What is the opinion about traveling of a perso...     information
1       Suggest any solution for my crave for blood.       treatment
2        Suggest treatment for wet bedding in adults       treatment
4  Feeling presence of something moving around st...        symptoms
5  Should I go for embolization for varicocele an...       treatment


In [6]:
# Draw up to two example descriptions per predicted question type.
# `Series.sample(2)` raises ValueError when a group has fewer than two rows,
# so the sample size is capped at the group size; groups with two or more
# rows are sampled exactly as before (same seed, same n).
sample_questions = df.groupby('predicted_qtype')['Description'].apply(
    lambda x: x.sample(min(len(x), 2), random_state=42)
)

sample_questions

predicted_qtype       
causes           29848    What causes attention deviation, anxiety and a...
                 15793                 Q. What causes unexplained bruising?
information      2985     Swelling under arm. Ultrasound taking too long...
                 1694     Anxiety attacks, shaking and nervous when in a...
stages           29242          What are the stages of Alzheimer s disease?
                 19238          What are the stages of Alzheimer's disease?
symptoms         2762     What is the possibility of changing the mental...
                 25332        Q. What does abutment of the nerve root mean?
treatment        28314    Suggest an alternative medication for Celia pr...
                 4785           Suggest home remedies for pernicious anemia
Name: Description, dtype: object

In [7]:
import pandas as pd

# Flag rows whose `Description` mentions 'depression' (case-insensitive; missing text counts as no match)
df['depression_related'] = df['Description'].str.contains('depression', case=False, na=False)

# Build each filter condition separately, then combine them
is_depression = df['depression_related']
short_description = df['Description'].str.len() < 200
short_patient = df['Patient'].str.len() < 300
filtered_df = df[is_depression & short_description & short_patient]

# Keep at most the first 10 depression-related rows for each predicted question type
result_df = filtered_df.groupby('predicted_qtype').head(10)

result_df


Unnamed: 0.1,Unnamed: 0,Description,Patient,Doctor,predicted_qtype,depression_related
17,17,Have depression after taking medication,Hi was on Effexor for 9 years and finally got ...,"HIThanks for using healthcare magicI think, yo...",information,True
29,29,What causes the symptoms of intrusive thoughts...,"Hello, I believe I might be experiencing signs...","Hi, It seems you are suffering with depression...",symptoms,True
34,34,"Is there a relationship between stress, depres...","Hello, My wife was emotionally stressed yester...","HIThanks for using healthcare magicI think, sh...",information,True
125,125,How to treat a person with severe depression w...,My daughter has severe depression and refuses ...,Hello and welcome to Healthcare Magic. Thanks ...,information,True
149,149,What causes sleepiness and weakness having Neu...,I am a grade 4/5 teacher and no matter how muc...,HIThanks for using healthcare magicFluoxetine ...,causes,True
162,162,"What causes depression, sleeplessness and weak...","am not getting sleeping well,if i take libotry...",HiThanks for using healthcare magicIf you want...,causes,True
210,210,What could be the reason for the depression an...,MY HUSBAND GETS REALLY MAD ABOUT LITTLE THINai...,Hello and welcome to Healthcare Magic. Thanks ...,treatment,True
243,243,"Need medication for severe anxiety, depression...","i have severe depression,anxiety and panic dis...","Hello,Thanks for choosing health care magic fo...",information,True
256,256,What could be the cause of the constant belchi...,Mother is going through depression and has anx...,"Hello,Thanks for choosing health care magic fo...",symptoms,True
266,266,What causes fatigue and tiredness leading to d...,ve been feeling very tired and tried regardles...,"Hello,Thanks for choosing health care magic fo...",causes,True


In [8]:
# Write the selected rows to disk as UTF-8 CSV, without the DataFrame index
result_df.to_csv(path_or_buf='filtered_questions.csv', encoding='utf-8', index=False)

In [7]:
# Re-load the CSV written by `result_df.to_csv(..., encoding='utf-8')`.
# It must be decoded with the same encoding it was written in: reading UTF-8
# bytes as ISO-8859-1 turns every non-ASCII character into mojibake, which
# would then be saved permanently into the "utf8" copy below.
# NOTE: this rebinds `df` to the small filtered table.
df = pd.read_csv("filtered_questions.csv", encoding='utf-8')

df.to_csv("filtered_questions_utf8.csv", index=False, encoding='utf-8')