# Importer les bibliothèques nécessaires

In [15]:
import os
import pandas as pd
from collections import Counter
from monai.transforms import LoadImage, Compose, EnsureChannelFirst, ScaleIntensity
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
from typing import List

# 1. Récupérer les données cliniques des patients

In [16]:
## Clinical data as a DataFrame (the label columns live in the same sheet)
clinical_xlsx_path = '/Users/constance/Documents/Project_lung_cancer/NIH dataset_raw/statistics-clinical-20201221.xlsx'
clinical_data = pd.read_excel(clinical_xlsx_path)

# Descriptive statistics for every column, numeric or not.
# Columns: No. ; NewPatientID ; Sex ; Age ; weight (kg) ; T-Stage ; N-Stage ;
# Ｍ-Stage ; Histopathological grading ; Smoking History
# (the "Ｍ-Stage" header is written with a fullwidth "Ｍ"; later lookups must use
# that exact spelling)
summary_stats = clinical_data.describe(include='all')
print(summary_stats)

# Number of missing values in each column
missing_per_column = clinical_data.isna().sum()
print(missing_per_column)

               No. NewPatientID  Sex         Age  weight (kg) T-Stage  \
count   355.000000          355  355  354.000000   351.000000     355   
unique         NaN          355    2         NaN          NaN      10   
top            NaN        A0001    M         NaN          NaN      1c   
freq           NaN            1  190         NaN          NaN     127   
mean    178.000000          NaN  NaN   61.242938    65.315100     NaN   
std     102.623909          NaN  NaN    9.987968    11.839154     NaN   
min       1.000000          NaN  NaN   28.000000    35.500000     NaN   
25%      89.500000          NaN  NaN   55.000000    57.650000     NaN   
50%     178.000000          NaN  NaN   62.000000    65.000000     NaN   
75%     266.500000          NaN  NaN   68.000000    72.750000     NaN   
max     355.000000          NaN  NaN   90.000000   103.000000     NaN   

           N-Stage  Ｍ-Stage Histopathological grading  Smoking History  
count   355.000000    355.0                       

1.A)1. Définir une classe patient qui servira pour l'entrainement (il faudra lier les données aux CT)

In [17]:
@dataclass
class Patient:
    """Clinical record of one patient together with the paths of their images.

    The record is meant to be linked to the CT data later on: ``images`` starts
    out as whatever list the caller passes in and is extended with
    ``add_image``.
    """
    PatientID: str
    Sex: str
    Age: int
    Weight: float  # rendered with a "kg" suffix by __str__
    SmokingHistory: int
    images: List[str]  # paths of the images associated with this patient

    def __str__(self):
        """Return a one-line, human-readable summary of the record.

        Only declared fields are used. The previous version read ``self.No``,
        which is not a field of this dataclass, so every ``str(patient)`` call
        raised ``AttributeError``.
        """
        return (
            f"Patient ID: {self.PatientID}, "
            f"Sex: {self.Sex}, Age: {self.Age}, Weight: {self.Weight}kg, "
            f"Smoking History: {self.SmokingHistory}, "
            f"Images: {len(self.images)}"
        )

    def add_image(self, image_path: str):
        """Append ``image_path`` to this patient's list of images."""
        self.images.append(image_path)


1.A)2. Gérer les valeurs manquantes dans les données

In [18]:
# Column means for age and weight; Series.mean() skips NaN by default
mean_age = clinical_data['Age'].mean()
mean_weight = clinical_data['weight (kg)'].mean()

# Replace the missing entries of each column with that column's mean
for column, fill_value in (('Age', mean_age), ('weight (kg)', mean_weight)):
    clinical_data[column] = clinical_data[column].fillna(fill_value)

In [19]:
def _patient_from_row(record):
    """Build a Patient from one row of the clinical table.

    Age is converted with int() and weight with float(); the smoking history
    is passed through as stored in the row. The image list starts empty.
    """
    return Patient(
        PatientID=record['NewPatientID'],
        Sex=record['Sex'],
        Age=int(record['Age']),
        Weight=float(record['weight (kg)']),
        SmokingHistory=record['Smoking History'],
        images=[],  # filled in later, one path per image of this patient
    )


# Patient records keyed by NewPatientID
patients = {
    record['NewPatientID']: _patient_from_row(record)
    for _, record in clinical_data.iterrows()
}

1.B) Créer une classe diagnostic pour les labels des patients

In [20]:
class Diagnostic:
    """Label record of one patient: T/N/M stage and histopathological grading.

    Args:
        patient_id: identifier of the patient the labels belong to.
        t_stage: T-Stage value.
        n_stage: N-Stage value.
        m_stage: M-Stage value.
        hispastological_grading: histopathological grading value. The
            parameter keeps its historical spelling because callers pass it
            by keyword.

    Each argument is stored unchanged on the attribute of the same name.
    """

    def __init__(self, patient_id: str, t_stage: str, n_stage: str, m_stage: str, hispastological_grading: str):
        self.patient_id = patient_id
        self.t_stage = t_stage
        self.n_stage = n_stage
        self.m_stage = m_stage
        self.hispastological_grading = hispastological_grading

    def __str__(self):
        """Return a one-line summary with every field separated by ", "."""
        return (
            f"Patient ID: {self.patient_id}, T-Stage: {self.t_stage}, "
            # The trailing ", " was missing here, which glued the M-Stage value
            # to the "Histopathological Grading" label.
            f"N-Stage: {self.n_stage}, M-Stage: {self.m_stage}, "
            f"Histopathological Grading: {self.hispastological_grading}"
        )

In [21]:
def _diagnostic_from_row(record):
    """Build a Diagnostic from the label columns of one clinical-table row."""
    return Diagnostic(
        patient_id=record['NewPatientID'],
        t_stage=record['T-Stage'],
        n_stage=record['N-Stage'],
        # This key is spelled with a fullwidth "Ｍ"; keep it exactly as is.
        m_stage=record['Ｍ-Stage'],
        hispastological_grading=record['Histopathological grading'],
    )


# Diagnostic records keyed by NewPatientID
diagnostic = {
    record['NewPatientID']: _diagnostic_from_row(record)
    for _, record in clinical_data.iterrows()
}

In [22]:
# Tally how often each value occurs, one Counter per label field
all_diagnostics = list(diagnostic.values())
t_stage_counts = Counter(entry.t_stage for entry in all_diagnostics)
n_stage_counts = Counter(entry.n_stage for entry in all_diagnostics)
m_stage_counts = Counter(entry.m_stage for entry in all_diagnostics)
grading_counts = Counter(entry.hispastological_grading for entry in all_diagnostics)

# Print the results
print("Occurrences par valeur :")
# T-Stage: tumour size
print("T-Stage:")
for val, count in t_stage_counts.items():
    print("  {}: {}".format(val, count))

Occurrences par valeur :
T-Stage:
  2b: 15
  1c: 127
  1b: 29
  3: 57
  2: 53
  2a: 37
  is: 3
  1a: 9
  4: 23
  1: 2


In [23]:
# N-Stage, as described by the notebook author — 0: no metastases,
# 1: peribronchial lymph-node metastases including direct extension,
# 2: lymph-node metastases, 3: lymph-node metastases.
# NOTE(review): this wording does not distinguish 2 from 3 — confirm against
# the staging reference in use.
print("N-Stage:")
for val, count in n_stage_counts.items():
    print("  {}: {}".format(val, count))

N-Stage:
  3: 78
  1: 85
  0: 184
  2: 8


In [24]:
# M-Stage, as described by the notebook author — 0: no distant metastases,
# 1: distant metastases.
# NOTE(review): the data also holds other values (1a, 1b, 1c, 2, 3) that this
# description does not cover — confirm their meaning.
print("M-Stage:")
for val, count in m_stage_counts.items():
    print("  {}: {}".format(val, count))

M-Stage:
  1b: 26
  0: 230
  1: 53
  1a: 30
  1c: 13
  3: 2
  2: 1


In [25]:
# Occurrences of each histopathological grading value (missing values included)
print("Histopathological Grading:")
for val, count in grading_counts.items():
    print("  {}: {}".format(val, count))

Histopathological Grading:
  G3: 61
  nan: 210
  G2: 27
  G1: 11
  G2-3: 34
  G1-2: 7
   G3: 1
  G2-G3: 1
   G1: 1
  G1-G2: 2


# 2. Charger les données DICOM

In [26]:
# Step 1 — collect the paths of the files that can actually be loaded
def get_valid_image_paths(root_dir, file_extension='.dcm'):
    """Walk ``root_dir`` and return the paths of the files ``LoadImage`` can read.

    Args:
        root_dir: directory searched recursively with ``os.walk``.
        file_extension: suffix a file name must end with. The comparison is
            case-insensitive on both sides, so ``'.DCM'`` and ``'.dcm'`` select
            the same files.

    Returns:
        List of file paths for which a test load raised no exception.
        Directories and file names are visited in sorted order, so the result
        does not depend on the filesystem's listing order.

    Files whose test load raises are skipped (best effort) and reported with a
    ``logging`` warning instead of being dropped silently.
    """
    import logging  # local import: not part of the notebook's import cell

    log = logging.getLogger(__name__)
    # The file name is lowercased below, so the suffix must be lowercased too;
    # otherwise an argument such as '.DCM' would never match anything.
    suffix = file_extension.lower()
    loader = LoadImage()
    valid_paths = []

    for dirpath, dirnames, filenames in os.walk(root_dir):
        dirnames.sort()  # in place, so os.walk descends in a deterministic order
        for filename in sorted(filenames):
            if not filename.lower().endswith(suffix):
                continue
            file_path = os.path.join(dirpath, filename)
            try:
                loader(file_path)  # load test only; the loaded data is discarded
            except Exception as exc:
                log.warning("Skipping unreadable image %s: %s", file_path, exc)
                continue
            valid_paths.append(file_path)
    return valid_paths

In [27]:
# Step 2 — custom dataset
class DicomDataset(Dataset):
    """Dataset that turns a list of image paths into transformed images.

    Args:
        file_paths: indexable, sized collection of image paths.
        transform: callable applied to a path to produce the item. When it is
            ``None`` the default pipeline ``LoadImage`` → ``EnsureChannelFirst``
            → ``ScaleIntensity`` is used.

    Items are the transformed image only; no label is returned.
    """

    def __init__(self, file_paths, transform=None):
        self.file_paths = file_paths
        # Compare against None explicitly: ``transform or default`` would throw
        # away a caller-supplied transform that happens to be falsy (for
        # example a pipeline object whose __len__ is 0).
        if transform is None:
            transform = Compose([
                LoadImage(),
                EnsureChannelFirst(),
                ScaleIntensity(),
            ])
        self.transform = transform

    def __len__(self):
        """Return the number of image paths."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return the result of applying the transform to the path at ``idx``."""
        img_path = self.file_paths[idx]
        image = self.transform(img_path)
        return image

In [None]:
# Step 3 — feed the dataset to a DataLoader

root_directory = '/Users/constance/Documents/Project_lung_cancer/NIH dataset_raw/manifest-1608669183333/Lung-PET-CT-Dx'
# Every matching file under the root is test-loaded here before it is kept
valid_paths = get_valid_image_paths(root_dir=root_directory)

dataset = DicomDataset(file_paths=valid_paths)
# Shuffled batches of 8 items
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)