In [1]:
import os
import fitz  # PyMuPDF
import pandas as pd
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Trying with pdf

In [2]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""

    for page in doc:
        text += page.get_text()
    doc.close()
    print(text)
    return text


def find_data_in_text(text):

    # Multiline regex patterns for Nom, Prénom, Date de Naissance, and Sexe
    nom_pattern = r"(?i)NOM\s*:\s*([A-Z]+)"
    prenom_pattern = r"Prénom\s*:\s*([A-Z\s]+)"

    date_naissance_pattern = r"Date de naissance\s*:\s*(\d{2}/\d{2}/\d{4})"
    sexe_pattern = r"Sexe\s*:\s*([MF])"

    # Using re.DOTALL to ensure dot matches across newlines
    nom_match = re.search(nom_pattern, text, re.DOTALL)
    prenom_match = re.search(prenom_pattern, text, re.DOTALL)
    date_naissance_match = re.search(date_naissance_pattern, text, re.DOTALL)
    sexe_match = re.search(sexe_pattern, text, re.DOTALL)

    # Extracting matched groups, handling None
    nom = nom_match.group(1).strip() if nom_match else None
    prenom = prenom_match.group(1).strip() if prenom_match else None
    date_naissance = date_naissance_match.group(1) if date_naissance_match else None
    sexe = sexe_match.group(1) if sexe_match else None

    return nom, prenom, date_naissance, sexe


def process_pdf_files_in_directory(directory_path):
    data = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):
            # Extract IPP from the filename
            ipp = filename[:9]

            pdf_path = os.path.join(directory_path, filename)
            text = extract_text_from_pdf(pdf_path)

            # Extract required information using the updated function
            nom, prenom, date_naissance, sexe = find_data_in_text(text)

            # Append the extracted data including IPP
            data.append([filename, ipp, nom, prenom, date_naissance, sexe])

    return pd.DataFrame(
        data, columns=["Filename", "IPP", "Nom", "Prénom", "Date de Naissance", "Sexe"]
    )

In [3]:
directory_path = "./data/data_extractMoustapha_2020:2021"
df = process_pdf_files_in_directory(directory_path)

df

1/LAVAGE BRONCHIOLO-ALVEOLAIRE
Technique : cytocentrifugation. Colorations : MGG, Papanicolaou, Perls, Grocott, Ziehl
Volume :15ml. Aspect : trouble
Numération : 180000 éléments/ml D=0
Formule 
Macrophages : 75%. Sidérophages (Perls) : 02%. Intensité: 1. Score de Golde : 02
Lymphocytes : 16%
Polynucléaires neutrophiles : 08%
Polynucléaires éosinophiles : 01%
Autres éléments : Hématies : + ; Cellules malpighiennes oro-pharyngées : +.
Agents pathogènes :
Pneumocystis Jorivecii : Absence
Mycoses (Grocott) : Absence
Mycobactéries (Ziehl) : Absence
CMV : Absence
Eléments bactériens : Présence de quelques germes extracellulaires (contamination? cf. Microbiologie).
Commentaires : Grocott et Ziehl négatifs.
Prescripteur :
Sexe : M
11/10/1964
Date de naissance : 
N° de demande : NN8990-21C01
Service d'Anatomie et de Cytologie Pathologiques
40 rue Worth - BP 36 - 91151 SURESNES Cedex
Tel : 01.46.25.23.12 - Fax : 01.46.25.26.45 - anapath@hopital-foch.com
N° Finess 92 0000 650
IPP : 0300682227
Pre

Unnamed: 0,Filename,IPP,Nom,Prénom,Date de Naissance,Sexe
0,300682227_13592AE5-5BE0-4E8D-9D80-8A293331E16B...,300682227,PHILIPPE,PHILIPPE\nP,,M
1,300675753_24DC775E-103F-4F65-A0DC-7A495F0AF034...,300675753,KEBE,ABDOURAKHMANE \n \n \nKEBE \n \n \nD,20/04/1968,M
2,300694221_AFC7C13D-868B-451E-A237-23B84F0C3C86...,300694221,JEAN,JEAN FRANCIS\nR,,
3,300554737_2DC418F3-6E5A-40CB-BAB7-2D243535195D...,300554737,CYRIL,CYRIL\nR,,
4,300650079_D2D9735F-56F8-424F-AFD7-C26667993382...,300650079,DELOBEL,SOPHIE \n \nN,04/08/1994,F
...,...,...,...,...,...,...
1049,300375539_AC10B1F6-478E-4A33-9B6D-BA3A5D0C463B...,300375539,CHRISTOPHE,CHRISTOPHE\nR,,
1050,300742430_D183372A-7A28-41D0-8551-F49667D72F1A...,300742430,JEAN,JEAN LOUIS\nR,,
1051,300576539_9198748A-AF2C-4528-A742-5A3772FB51E3...,300576539,LE,CAMILLE \n \nN,05/02/2000,F
1052,300128818_F235BDCD-B101-4140-A6A2-1C9BEDF1D39A...,300128818,FLORENCE,FLORENCE\nR,,


In [4]:
df["Prénom"]

0                                 PHILIPPE\nP
1       ABDOURAKHMANE \n  \n \nKEBE \n \n \nD
2                             JEAN FRANCIS\nR
3                                    CYRIL\nR
4                              SOPHIE \n  \nN
                        ...                  
1049                            CHRISTOPHE\nR
1050                            JEAN LOUIS\nR
1051                          CAMILLE \n  \nN
1052                              FLORENCE\nR
1053                          VALERIE \n  \nN
Name: Prénom, Length: 1054, dtype: object

## Going for txt files

In [5]:
def read_text_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

In [6]:
def find_data_in_text(text):

    # Multiline regex patterns for Nom, Prénom, Date de Naissance, and Sexe
    nom_pattern = r"(?i)NOM\s*:\s*([A-Z]+)"
    prenom_pattern = r"Prénom\s*:\s*([A-Z\s]+)"

    date_naissance_pattern = r"Date de naissance\s*:\s*(\d{2}/\d{2}/\d{4})"
    sexe_pattern = r"Sexe\s*:\s*([MF])"

    # Using re.DOTALL to ensure dot matches across newlines
    nom_match = re.search(nom_pattern, text, re.DOTALL)
    prenom_match = re.search(prenom_pattern, text, re.DOTALL)
    date_naissance_match = re.search(date_naissance_pattern, text, re.DOTALL)
    sexe_match = re.search(sexe_pattern, text, re.DOTALL)

    # Extracting matched groups, handling None
    nom = nom_match.group(1).strip() if nom_match else None
    prenom = prenom_match.group(1).strip() if prenom_match else None
    date_naissance = date_naissance_match.group(1) if date_naissance_match else None
    sexe = sexe_match.group(1) if sexe_match else None

    return nom, prenom, date_naissance, sexe

In [24]:
import os
import re


def extract_prenom_before_docteur(text):
    pattern = r"Prénom\s*:\s*([A-Z\s]+)"
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        # Extract the captured group before "Docteur"
        before_docteur = match.group(1)
        first_name = before_docteur.split()[0] if before_docteur else None
        return first_name
    return None


def extract_nom(text):
    nom_pattern = r"(?i)(^Nom\s*:\s*|Nom[\s+]*usuel\s*:\s*|Nom\s*:\s*)([A-Z]+)"
    nom_match = re.search(nom_pattern, text, re.DOTALL)
    return nom_match.group(2).strip() if nom_match else None


def extract_ddn(text):
    date_naissance_pattern = (
        r"(?i)(Date[\s+]*de[\s+]*naissance\s*:\s*)(\d{2}/\d{2}/\d{4})"
    )
    date_naissance_match = re.search(date_naissance_pattern, text, re.DOTALL)
    return date_naissance_match.group(2).strip() if date_naissance_match else None


def extract_sex(text):
    sexe_pattern = r"Sexe\s*:\s*([MF])"
    sexe_match = re.search(sexe_pattern, text, re.DOTALL)
    return sexe_match.group(1) if sexe_match else None


def extract_ddprelevement(text):
    date_prelevement_pattern = (
        r"(?i)(Prélevé le \s*:\s*|Prélevé[\s+]*le\s*:\s*)(\d{2}/\d{2}/\d{4})"
    )
    date_prelevement_match = re.search(date_prelevement_pattern, text, re.DOTALL)
    return date_prelevement_match.group(2) if date_prelevement_match else None


def extract_technique(text):
    # Most found pattern
    technique_pattern = r"(2\.|II\.|I\.|2/|2°/)[\s+]*(Biopsies\s+trans[ -]*bronchiques|Biopsies\s+transbronchiques|Biospies\s+transbronchiques|BTB)[\s\S+]*?Technique[^\S\r\n]*:[^\S\r\n]*([^;]+)"
    technique_match = re.search(technique_pattern, text, re.DOTALL | re.IGNORECASE)

    if not technique_match:
        # Sometimes, there isn't the numbers, only "Biopsie" in text etc..
        fallback_technique_pattern = r"(Biopsies\s+trans[ -]*bronchiques|Biopsies\s+transbronchiques|Biospies\s+transbronchiques|BTB)[\s\S+]*?Technique\s*:\s*([^;]+)"
        fallback_match = re.search(fallback_technique_pattern, text, re.DOTALL)
        if fallback_match:
            return fallback_match.group(2).strip() 
        
        else:
            #No mention of biopsie or lavage, just the mention of techniques two times. The BTB technique is always second.
            parts = re.split(r"(?=Technique\s*:)", text, flags=re.IGNORECASE)
            two_parts_fallback_technique_pattern = r"Technique\s*:\s*([^;]+)"
            if len(parts) > 2:
                two_parts_fallback_match = re.search(two_parts_fallback_technique_pattern, parts[-1], re.DOTALL)
                if two_parts_fallback_match:
                    return two_parts_fallback_match.group(1).strip() 
            
            #Else, look at any mention of technique in the document
            else:
                last_fallback_technique_pattern = r"Technique\s*:\s*([^;]+)"
                last_fallback_match = re.search(last_fallback_technique_pattern, text, re.DOTALL)
                if last_fallback_match:
                    return last_fallback_match.group(1).strip()

                    
    return technique_match.group(3).strip() if technique_match else None

def extract_niveaux_coupes(text):
    """
    In this function we just reuse the extract_technique patterns to extract the group after the semi-colon
    """
    # Most found pattern
    technique_pattern = r"(2\.|II\.|I\.|2/|2°/)[\s+]*(Biopsies\s+trans[ -]*bronchiques|Biopsies\s+transbronchiques|Biospies\s+transbronchiques|BTB)[\s\S+]*?Technique\s*:\s*([^;]+);\s*([^n]+)"
    technique_match = re.search(technique_pattern, text, re.DOTALL | re.IGNORECASE)

    if not technique_match:
        # Sometimes, there isn't the numbers, only "Biopsie" in text etc..
        fallback_technique_pattern = r"(Biopsies\s+trans[ -]*bronchiques|Biopsies\s+transbronchiques|Biospies\s+transbronchiques|BTB)[\s\S+]*?Technique\s*:\s*([^;]+);\s*([^n]+)"
        fallback_match = re.search(fallback_technique_pattern, text, re.DOTALL)
        if fallback_match:
            return fallback_match.group(3).strip() 
        
        else:
            #No mention of biopsie or lavage, just the mention of techniques two times. The BTB technique is always second.
            parts = re.split(r"(?=Technique\s*:)", text, flags=re.IGNORECASE)
            two_parts_fallback_technique_pattern = r"Technique\s*:\s*([^;]+);\s*([^n]+)"
            if len(parts) > 2:
                two_parts_fallback_match = re.search(two_parts_fallback_technique_pattern, parts[-1], re.DOTALL)
                if two_parts_fallback_match:
                    return two_parts_fallback_match.group(2).strip() 
            
            #Else, look at any mention of technique in the document
            else:
                last_fallback_technique_pattern = r"Technique\s*:\s*([^;]+);\s*([^n]+)"
                last_fallback_match = re.search(last_fallback_technique_pattern, text, re.DOTALL)
                if last_fallback_match:
                    return last_fallback_match.group(2).strip()

    return technique_match.group(4).strip() if technique_match else None

def extract_site(text):
    site_pattern = r"(Site[\s\xa0]*:)([\S]*[^\n]+)"
    site_match = re.search(site_pattern, text, re.DOTALL)
    print(site_match)
    return site_match.group(2).strip() if site_match else None


In [25]:


def process_text_files_in_directory(directory_path):
    data = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            ipp = filename[:9]
            file_path = os.path.join(directory_path, filename)
            text = read_text_file(file_path)
            nom = extract_nom(text)
            prenom = extract_prenom_before_docteur(text)
            ddn = extract_ddn(text)
            sexe = extract_sex(text)
            date_prelevement = extract_ddprelevement(text)
            technique = extract_technique(text)
            niveaux = extract_niveaux_coupes(text)
            site = extract_site(text)
            data.append([filename, ipp, nom, prenom, ddn, sexe, date_prelevement, technique,niveaux,site])

    return pd.DataFrame(
        data,
        columns=[
            "Filename",
            "IPP",
            "Nom",
            "Prénom",
            "Date de naissance",
            "Sexe",
            "Date de Prélèvement",
            "Technique",
            "Niveaux de coupes",
            "Site",
        ],
    )


# Replace 'path_to_your_directory_with_txts' with your directory's path
directory_path = "./data/data_extractMoustapha_2020:2021"
df = process_text_files_in_directory(directory_path)

<re.Match object; span=(5774, 5851), match='Site\xa0:\xa0non\xa0précisé                      >
<re.Match object; span=(5158, 5235), match='Site\xa0:\xa0LID\xa0+\xa0LM                      >
<re.Match object; span=(5147, 5214), match='Site :                                           >
<re.Match object; span=(2078, 2155), match='Site\xa0:\xa0LIG                                 >
<re.Match object; span=(4998, 5065), match='Site :                                           >
<re.Match object; span=(4850, 4927), match='Site\xa0:\xa0LIG                                 >
<re.Match object; span=(2614, 2681), match='Site : LID   LM                                  >
<re.Match object; span=(5928, 6005), match='Site\xa0:\xa0LID/LM                              >
<re.Match object; span=(5158, 5235), match='Site\xa0:\xa0Non\xa0précisé                      >
<re.Match object; span=(2078, 2155), match='Site\xa0:\xa0LID\xa0/\xa0LM                      >
<re.Match object; span=(2465, 2532), match='Site :

In [26]:
df

Unnamed: 0,Filename,IPP,Nom,Prénom,Date de naissance,Sexe,Date de Prélèvement,Technique,Niveaux de coupes,Site
0,300531886_69018907-395A-434F-9EAB-B2A9642866EA...,300531886,BOUTARD,BRUNO,04/05/1961,M,31/03/2021,HES,16,non précisé
1,300701581_C8216C08-75B0-4744-AB27-3AE0A89B94C4...,300701581,CAZEAU,Jean,28/05/1958,M,24/10/2021,HES,16,LID + LM
2,300676564_7278152A-DC79-4712-9856-19494341CAB2...,300676564,ROLI,JULIEN,08/06/1981,M,28/05/2020,HES,16,
3,300720938_6C09898C-B815-432C-96F4-9759FBAA74CB...,300720938,GUERN,LAURENCE,01/12/1970,F,13/09/2021,HES,16,LIG
4,300554737_3D17E344-E143-4B89-BCF4-30D55CBB1FF4...,300554737,CHARNAY,CYRIL,06/06/1993,M,26/03/2020,HES,16,
...,...,...,...,...,...,...,...,...,...,...
1049,300553661_4C54ECD5-BFBD-43AA-8295-39F30E33A3C8...,300553661,NUSSBAUMER,MYRTILLE,23/04/2001,F,24/11/2021,HES,16,LID + LM
1050,300670280_5A7A38FB-F732-4E34-898E-72D44F214041...,300670280,ARMAND,DANIEL,15/05/1956,M,03/03/2020,HES,16,LID
1051,300732450_DEBAB80B-F589-41AE-AA08-5F2E0D83659F...,300732450,BOYER,FABIOLA,26/02/1988,F,06/07/2021,HES,16,LID
1052,300409902_C9E9AFE4-1FBE-4C3E-9E2E-31B1706285AA...,300409902,BOGE,GILLES,14/10/1956,M,04/08/2020,HES,16,LM/LID


In [27]:
with pd.option_context('display.max_rows', None):
    print(df["Site"].value_counts())


Site
LID                                     291
-                                       104
LID+LM                                   76
LID   + LM                               62
                                         60
LIG                                      59
LID + LM                                 53
LID/LM                                   40
non  précisé                             35
non précisé                              34
Non précisé                              30
NC                                       22
/                                        15
LM                                       14
LID / LM                                 13
LM/LID                                   10
LM   +  LID                               8
non  indiqué                              7
LM+LID                                    6
Non précisé.                              6
droit                                     5
LIG + lingula                             5
LIG / Lingula              

In [28]:
df[df["Site"].isna()]

Unnamed: 0,Filename,IPP,Nom,Prénom,Date de naissance,Sexe,Date de Prélèvement,Technique,Niveaux de coupes,Site
298,300451464_E07BF54B-AF9E-4D04-9A64-985361191543...,300451464,ROBICHON,CAROLE,25/02/1969,F,11/12/2019,HES,16.0,
601,300471009_AB4043B5-9672-456B-B6DB-7D4838872372...,300471009,DJAE,DJAMA,01/08/1958,M,16/12/2020,,,
799,300191107_E1545F45-7487-4BD5-BC32-B5226285AB75...,300191107,MONTALENT,PIERRE,15/04/1978,M,11/12/2019,HES,16.0,
860,300620686_FFE01B22-AC6A-4309-B2C1-416ED7A7CBE3...,300620686,DAVID,MORGAN,06/06/1999,M,12/12/2019,HES,16.0,
