In [10]:
import pandas as pd
import re

# Read the disease -> symptom table and the drug -> medical-condition table.
# Missing cells are replaced by '' in both frames right after loading.
df_disease_symptom = pd.read_csv('dataset.csv').fillna('')
df_drugs_disease = pd.read_csv('drugs_side_effects_drugs_com.csv').fillna('')

# The symptom table spreads symptoms over the columns Symptom_1 .. Symptom_17;
# flatten those cells (row by row) into one long 1-D array.
symptom_columns = ['Symptom_' + str(i) for i in range(1, 18)]
all_symptoms = df_disease_symptom[symptom_columns].to_numpy().ravel()

# Keep every distinct non-empty value, in order of first appearance
unique_symptoms = pd.Series(all_symptoms).loc[lambda cells: cells != ''].unique()

# Plain Python list of the distinct symptoms
unique_symptoms_list = unique_symptoms.tolist()

# Print the list of unique symptoms
# print(unique_symptoms_list)

# Convert disease names to lowercase and strip leading/trailing whitespace
# df_disease_symptom['Disease'] = df_disease_symptom['Disease'].str.lower().str.strip()
# df_drugs_disease['medical_condition'] = df_drugs_disease['medical_condition'].str.lower().str.strip()

# df_merged = pd.merge(df_disease_symptom, df_drugs_disease, left_on='Disease', right_on='medical_condition', how='inner')

# # Drop the redundant 'medical_condition' column after merging
# df_merged.drop('medical_condition', axis=1, inplace=True)

# # Fill any remaining missing values if necessary
# df_merged.fillna('', inplace=True)

# # Save the merged dataset to a new CSV file
# df_merged.to_csv('merged_dataset.csv', index=False)

# # Display the first few rows of the merged dataset
# print(df_merged.head())

In [11]:
# Function to extract symptoms from medical condition description
def extract_symptoms(description, symptoms_list):
    """Return the entries of ``symptoms_list`` that occur in ``description``.

    Matching is case-insensitive and whole-term: a symptom only counts when it
    is not directly preceded or followed by a word character, so ``"fever"``
    is not found inside ``"feverish"``.

    Args:
        description: Free-text description to scan. Anything that is not a
            non-empty string (e.g. ``None`` or NaN) yields an empty result.
        symptoms_list: Iterable of symptom strings to look for. Entries that
            are not strings, or that are empty after stripping whitespace,
            are skipped.

    Returns:
        list: The matching entries of ``symptoms_list``, unchanged and in
        their original order.
    """
    if not isinstance(description, str) or not description:
        return []

    text = description.lower()
    found_symptoms = []
    for symptom in symptoms_list:
        if not isinstance(symptom, str):
            continue
        # Compare in lowercase because the description was lowercased, and
        # ignore padding around the symptom so it does not take part in the match.
        needle = symptom.strip().lower()
        if not needle:
            # An empty pattern would match almost any text.
            continue
        # Lookarounds instead of \b: \b never matches next to a symptom that
        # starts or ends with a non-word character such as '(' or '+'.
        pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
        if re.search(pattern, text):
            found_symptoms.append(symptom)
    return found_symptoms

In [14]:
# Scan every condition description for the known symptom names and store the
# matches (one list per drug row) in a new 'extracted_symptoms' column.
df_drugs_disease['extracted_symptoms'] = df_drugs_disease['medical_condition_description'].map(
    lambda text: extract_symptoms(text, unique_symptoms_list)
)

# Expand each list of matches into positional columns (0, 1, 2, ...)
df_exploded_symptoms = df_drugs_disease['extracted_symptoms'].apply(pd.Series)


In [17]:
# Collect the non-empty Symptom_1 .. Symptom_17 cells of every row into a
# single list stored in the new 'all_symptoms' column.
df_disease_symptom['all_symptoms'] = df_disease_symptom.apply(
    lambda record: [
        record[f'Symptom_{n}'] for n in range(1, 18) if record[f'Symptom_{n}']
    ],
    axis=1,
)

# Normalise the join keys on both sides: lowercase, no surrounding whitespace
df_disease_symptom['Disease'] = df_disease_symptom['Disease'].str.lower().str.strip()
df_drugs_disease['medical_condition'] = df_drugs_disease['medical_condition'].str.lower().str.strip()

# Inner join: keep only drug rows whose condition also appears as a disease.
# NOTE(review): if a disease occupies several rows of df_disease_symptom, each
# matching drug row is repeated once per such row — confirm this is intended.
df_merged = df_drugs_disease.merge(
    df_disease_symptom,
    how='inner',
    left_on='medical_condition',
    right_on='Disease',
)

# Combine symptoms from both datasets
def combine_symptoms(row):
    """Merge a row's two symptom lists into one list without duplicates.

    Args:
        row: Mapping-like object (DataFrame row or dict) providing the
            sequences ``row['extracted_symptoms']`` and ``row['all_symptoms']``.

    Returns:
        list: Every distinct symptom, ordered by first appearance — the
        extracted symptoms first, then the remaining ones from
        ``all_symptoms``.
    """
    # dict.fromkeys de-duplicates while keeping insertion order. A set would
    # give an arbitrary order that can change between runs, moving symptoms
    # between the positional Symptom_N columns built from this list.
    merged = [*row['extracted_symptoms'], *row['all_symptoms']]
    return list(dict.fromkeys(merged))

# Build the combined symptom list for every merged row. Iterating over plain
# records (rather than DataFrame.apply(axis=1)) also behaves when the merge
# produced no rows.
df_merged['combined_symptoms'] = [
    combine_symptoms(record)
    for record in df_merged[['extracted_symptoms', 'all_symptoms']].to_dict('records')
]

# Split the combined symptoms into separate positional columns in one pass;
# shorter lists are padded with missing values.
df_combined_symptoms = pd.DataFrame(
    df_merged['combined_symptoms'].tolist(), index=df_merged.index
)

# Rename the columns to Symptom_1, Symptom_2, etc.
df_combined_symptoms.columns = [f'Symptom_{i+1}' for i in range(df_combined_symptoms.shape[1])]

# Concatenate the combined symptom columns with the merged dataframe.
# df_merged still carries the original Symptom_N columns from the disease
# table; drop them first, otherwise the result would contain two different
# columns under each Symptom_N label.
df_final = pd.concat(
    [
        df_merged.drop(columns=df_merged.filter(regex=r'^Symptom_\d+$').columns),
        df_combined_symptoms,
    ],
    axis=1,
)

# Drop helper columns that are no longer needed ('Disease' duplicates
# 'medical_condition' after the inner join on those two keys)
df_final = df_final.drop(
    columns=['extracted_symptoms', 'all_symptoms', 'combined_symptoms', 'Disease']
)

# Save the final merged dataset to a new CSV file
df_final.to_csv('final_merged_dataset.csv', index=False)

# Display the first few rows of the final merged dataset
print(df_final.head())

     drug_name medical_condition  \
0  doxycycline              acne   
1  doxycycline              acne   
2  doxycycline              acne   
3  doxycycline              acne   
4  doxycycline              acne   

                                        side_effects generic_name  \
0  (hives, difficult breathing, swelling in your ...  doxycycline   
1  (hives, difficult breathing, swelling in your ...  doxycycline   
2  (hives, difficult breathing, swelling in your ...  doxycycline   
3  (hives, difficult breathing, swelling in your ...  doxycycline   
4  (hives, difficult breathing, swelling in your ...  doxycycline   

                                 drug_classes  \
0  Miscellaneous antimalarials, Tetracyclines   
1  Miscellaneous antimalarials, Tetracyclines   
2  Miscellaneous antimalarials, Tetracyclines   
3  Miscellaneous antimalarials, Tetracyclines   
4  Miscellaneous antimalarials, Tetracyclines   

                                         brand_names activity rx_otc  \
0