In [None]:
import pandas as pd

# Drug-centric table: provides medical_condition, drug_name and the free-text
# medical_condition_description used by later cells
df_drugs_disease = pd.read_csv(filepath_or_buffer='drugs_side_effects_drugs_com.csv')

# Disease -> drug pairs (columns 'disease' and 'drug' are used by later cells)
df_disease_drug = pd.read_csv(filepath_or_buffer='final.csv')

# Disease -> symptom table ('Disease' key column plus Symptom_N columns)
df_disease_symptoms = pd.read_csv(filepath_or_buffer='dataset.csv')


In [None]:
# Normalise the disease/condition names (lower-case, surrounding whitespace
# removed) so the three tables can be joined on them.
df_drugs_disease['medical_condition'] = (
    df_drugs_disease['medical_condition'].str.lower().str.strip()
)
df_disease_drug['disease'] = df_disease_drug['disease'].str.lower().str.strip()

# The symptoms table names its key column 'Disease'. Replace it with a
# normalised 'disease' column (appended as the last column) so it lines up with
# df_disease_drug. The guard keeps this cell re-runnable: once 'Disease' has
# been replaced, running the cell again is a no-op instead of a KeyError.
if 'Disease' in df_disease_symptoms.columns:
    df_disease_symptoms = (
        df_disease_symptoms
        .assign(disease=df_disease_symptoms['Disease'].str.lower().str.strip())
        .drop(columns=['Disease'])
    )


In [None]:
import re

# Trigger keyword (optionally plural), then at least one separator character,
# then the captured word. The separator (\W+) is essential: a word boundary
# right after the keyword means the next character is NOT a word character, so
# a pattern that tries to match \w immediately after that boundary can never
# succeed.
_SYMPTOM_PATTERN = re.compile(
    r'\b(?:symptoms?|conditions?|manifests?|presents?|complains?|finds?)\b'
    r'\W+(\w+)',
    re.IGNORECASE,
)


def extract_symptoms(description):
    """Return the word that follows each trigger keyword in ``description``.

    Trigger keywords (case-insensitive, whole words only): symptom(s),
    condition(s), manifest(s), present(s), complain(s), find(s).

    Parameters
    ----------
    description : str or any
        Free-text description. Non-string values (``None``, NaN from missing
        CSV cells, ...) are treated as "no text".

    Returns
    -------
    list of str
        One entry per non-overlapping keyword occurrence, in order of
        appearance; empty if nothing matches or the input is not a string.

    Notes
    -----
    NOTE(review): this is a crude heuristic - the captured word is simply
    whatever follows the keyword (e.g. "include" in "symptoms include fever"),
    not necessarily a symptom name.
    """
    # Missing descriptions arrive as float NaN from pandas; re would raise TypeError.
    if not isinstance(description, str):
        return []
    # With a single capturing group, findall returns the captured words directly.
    return _SYMPTOM_PATTERN.findall(description)

# Run extract_symptoms over every medical_condition_description value and store
# the resulting per-row list in a new 'extracted_symptoms' column
df_drugs_disease['extracted_symptoms'] = (
    df_drugs_disease['medical_condition_description'].map(extract_symptoms)
)


In [28]:
# Keep only the (condition, drug) pairs that appear in both drug tables:
# inner join on the normalised condition name and the drug name.
df_merged = pd.merge(
    df_drugs_disease,
    df_disease_drug,
    left_on=['medical_condition', 'drug_name'],
    right_on=['disease', 'drug'],
    how='inner',
)

# Attach the symptom columns for each disease. The left join keeps pairs whose
# disease has no entry in the symptoms table.
# NOTE(review): the recorded output of this cell shows the same drug/disease
# pair repeated with different symptom sets, so the symptoms table apparently
# holds several rows per disease and this join multiplies rows - confirm that
# this fan-out is intended.
df_final = pd.merge(df_merged, df_disease_symptoms, on='disease', how='left')

# Symptom_1 .. Symptom_17: replace missing values (e.g. diseases without a
# match in the left join) with empty strings.
symptom_columns = [f'Symptom_{i}' for i in range(1, 18)]
df_final[symptom_columns] = df_final[symptom_columns].fillna('')

# Only drug_name, disease and the symptom columns are kept. Everything else,
# including 'extracted_symptoms', is dropped here, so no per-row combination of
# extracted and tabulated symptoms is computed: it would be discarded by this
# selection.
columns_to_keep = ['drug_name', 'disease'] + symptom_columns
df_final = df_final[columns_to_keep]

# Save the final dataset to a CSV file
df_final.to_csv('final_dataset_with_symptoms.csv', index=False)

# Display the first few rows of the final dataset
print(df_final.head())


     drug_name disease            Symptom_1            Symptom_2    Symptom_3  \
0  doxycycline    acne            skin_rash   pus_filled_pimples   blackheads   
1  doxycycline    acne            skin_rash   pus_filled_pimples   blackheads   
2  doxycycline    acne   pus_filled_pimples           blackheads     scurring   
3  doxycycline    acne            skin_rash           blackheads     scurring   
4  doxycycline    acne            skin_rash   pus_filled_pimples     scurring   

   Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9 Symptom_10  \
0   scurring                                                                
1   scurring                                                                
2                                                                           
3                                                                           
4                                                                           

  Symptom_11 Symptom_12 Symptom_13 Symptom_14 Symp

In [27]:
# Requires df_final from the merge cell to exist in the kernel.

# Number of rows in the final dataset (len of a DataFrame is its row count)
total_rows = len(df_final)

print(f"Total number of rows in the final dataset: {total_rows}")


Total number of rows in the final dataset: 23437
