<a href="https://colab.research.google.com/github/g-eez/capstone-project/blob/Mohamed/respiratory_summary_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Respiratory Diseases — Dataset-based Summary (Colab)
This notebook loads the provided CSV, keeps only the respiratory diseases, and writes a concise, dataset-only PDF summary for each of them.
**Output is strictly based on fields present in the CSV** (Symptoms, Treatment, Age, Sex, Nature).


In [24]:
# Setup
from xml.sax.saxutils import escape

import pandas as pd
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

In [25]:
# Load dataset
csv_path = "/content/respiratory_symptoms_and_treatment.csv"
# Read the CSV and strip leading/trailing whitespace from every column name.
df = pd.read_csv(csv_path).rename(columns=str.strip)

In [26]:
# Filtering for respiratory diseases
# Substrings matched case-insensitively against the disease name. A few entries are
# already covered by shorter ones (e.g. "chronic bronchitis" by "bronchitis"); they
# are harmless and kept as written.
respiratory_keywords = [
    "asthma", "bronchitis", "bronchiolitis", "pneumonia", "influenza",
    "tuberculosis", "asbestosis", "aspergillosis", "pneumothorax",
    "pulmonary", "respiratory", "mesothelioma", "chronic bronchitis",
    "chronic obstructive pulmonary disease", "sleep apnea", "acute respiratory distress"
]


def _is_respiratory(name):
    """Return True when `name` is a string containing any respiratory keyword.

    Non-string values (for example a missing disease name that survived the
    string conversion as NaN) never match, so such rows are dropped instead of
    raising a TypeError in the substring test.
    """
    return isinstance(name, str) and any(keyword in name for keyword in respiratory_keywords)


# Lower-case the names once so the keyword match is case-insensitive.
# NOTE(review): whether astype(str) renders NaN as the text "nan" or keeps it missing
# appears to depend on the pandas version; _is_respiratory excludes the row either way.
disease_names = df["Disease"].astype(str).str.lower()
# astype(bool) keeps the mask boolean even when the frame has no rows.
df = df[disease_names.map(_is_respiratory).astype(bool)]

In [27]:
# Aggregate concise summary
# Strings that stand for "no value" once a cell has been stringified and lower-cased.
_EMPTY_TOKENS = {"nan", "none", ""}


def _unique_clean(values):
    """Return a group's distinct values as a sorted list of stripped strings.

    Missing entries are dropped, every remaining value is converted with str()
    (so non-string cells cannot break .strip()), and placeholder strings
    ('nan', 'none', '') are skipped. An empty list therefore means nothing
    usable was recorded for the group.
    """
    cleaned = {str(value).strip() for value in values.dropna()}
    return sorted(value for value in cleaned if value.lower() not in _EMPTY_TOKENS)


def _age_range(values):
    """Return {'min': ..., 'max': ...} for a group's ages; a bound is None when it is missing."""
    lowest, highest = values.min(), values.max()
    return {
        "min": int(lowest) if pd.notna(lowest) else None,
        "max": int(highest) if pd.notna(highest) else None,
    }


# One row per disease. All text columns share the same cleaning rule so the
# summary treats Symptoms exactly like Treatment, Nature and Sex.
agg = (
    df.groupby("Disease", dropna=False)
      .agg({
          "Symptoms": _unique_clean,
          "Treatment": _unique_clean,
          "Nature": _unique_clean,
          "Age": _age_range,
          "Sex": _unique_clean,
      })
      .reset_index()
)

In [28]:
# Generate PDF
pdf_path = "/content/Respiratory_Disease_Summary.pdf"

# Paragraph styles derived from ReportLab's sample stylesheet.
styles = getSampleStyleSheet()
title_style = ParagraphStyle('TitleStyle', parent=styles['Heading1'], fontSize=18, spaceAfter=12)
subheading_style = ParagraphStyle('SubHeading', parent=styles['Heading2'], fontSize=14, spaceAfter=6)
text_style = ParagraphStyle('BodyText', parent=styles['Normal'], fontSize=11, leading=15)


def _joined_or(values, fallback):
    """Return `values` joined with ', ' and markup-escaped, or `fallback` when there are none.

    Paragraph text is interpreted as inline markup (this cell uses <b> tags), so
    '&', '<' and '>' coming from the dataset are escaped to be rendered literally.
    """
    return escape(', '.join(values)) if values else fallback


def _age_note(age):
    """Format the age-range note for one disease.

    `age` is expected to be a {'min': ..., 'max': ...} dict. Bounds are tested
    with `is not None` rather than truthiness because an age of 0 is a real
    value and must not be reported as "not recorded".
    """
    if isinstance(age, dict) and (age.get('min') is not None or age.get('max') is not None):
        return f"Age range: {age.get('min')} — {age.get('max')}"
    return "Age range: not recorded"


doc = SimpleDocTemplate(pdf_path, pagesize=A4)
story = []

# Document header.
story.append(Paragraph("Respiratory Diseases — Dataset-Based Summary", title_style))
story.append(Paragraph("This professional summary was generated strictly from the dataset fields (Symptoms, Treatments, Age, Sex, Nature).", text_style))
story.append(Spacer(1, 0.2 * inch))

# One section per aggregated disease row.
for _, row in agg.iterrows():
    disease = escape(str(row['Disease']))
    symptoms = _joined_or(row['Symptoms'], 'No symptoms recorded.')
    treatments = _joined_or(row['Treatment'], 'No treatments recorded.')
    nature = _joined_or(row['Nature'], 'None')
    age_note = _age_note(row['Age'])
    sex = _joined_or(row['Sex'], 'Not recorded')

    story.append(Paragraph(f"Disease: {disease}", subheading_style))
    story.append(Paragraph(f"<b>Symptoms:</b> {symptoms}", text_style))
    story.append(Paragraph(f"<b>Treatments:</b> {treatments}", text_style))
    story.append(Paragraph(f"<b>Dataset Notes:</b> {age_note}; Sex: {sex}; Nature: {nature}", text_style))
    story.append(Spacer(1, 0.15 * inch))

doc.build(story)
print(f"PDF summary saved to: {pdf_path}")

PDF summary saved to: /content/Respiratory_Disease_Summary.pdf
